Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 113 additions & 0 deletions scripts/build_msigdb_cache.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#!/usr/bin/env Rscript

suppressPackageStartupMessages({
library(dplyr)
library(msigdbr)
})

# Species for which every MSigDB collection is cached.
species_to_cache <- c("Homo sapiens", "Mus musculus")

# The cache location can be overridden through the MSIGDB_CACHE_DIR
# environment variable; otherwise the default below is used.
default_output_dir <- file.path("shiny", "data", "msigdb_genesets")
output_dir <- Sys.getenv("MSIGDB_CACHE_DIR", unset = default_output_dir)

# A variable that is set but empty would otherwise yield "" and make
# file.path() build paths under the filesystem root.
if (!nzchar(output_dir)) {
  output_dir <- default_output_dir
}

# showWarnings = FALSE silences the "already exists" warning, but it also hides
# real failures, so verify the directory afterwards and stop before any data is
# fetched if it is not there.
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
if (!dir.exists(output_dir)) {
  stop("Could not create MSigDB cache directory: ", output_dir, call. = FALSE)
}

# Turn arbitrary labels into file-name-safe slugs: every run of characters
# outside A-Z, a-z and 0-9 becomes a single underscore, and one leading and one
# trailing underscore are stripped. Vectorised over `x`.
slugify <- function(x) {
  underscored <- gsub("_+", "_", gsub("[^A-Za-z0-9]+", "_", x))
  gsub("^_|_$", "", underscored)
}

# Fetch the gene sets of one MSigDB collection (optionally narrowed to a single
# subcollection) for a species.
#
# The argument list is assembled from whichever filter names the installed
# msigdbr::msigdbr() exposes: `collection`/`subcollection` if present,
# otherwise `category`/`subcategory`.
#
# Arguments:
#   species        Species name handed to msigdbr::msigdbr().
#   collection     Collection code, e.g. "H" or "C2".
#   subcollection  Subcollection code. "", NA and NULL all mean "do not filter
#                  by subcollection".
#
# Returns whatever msigdbr::msigdbr() returns for those filters.
fetch_msigdb <- function(species, collection, subcollection = "") {
  msigdbr_args <- list(species = species)
  msigdbr_formals <- names(formals(msigdbr::msigdbr))

  if ("collection" %in% msigdbr_formals) {
    msigdbr_args$collection <- collection
  } else if ("category" %in% msigdbr_formals) {
    msigdbr_args$category <- collection
  }

  # nzchar(NULL) is logical(0) (which makes `if` fail) and nzchar(NA) is TRUE,
  # so both have to be ruled out before the emptiness test.
  has_subcollection <- !is.null(subcollection) &&
    !anyNA(subcollection) &&
    nzchar(subcollection)

  if (has_subcollection) {
    if ("subcollection" %in% msigdbr_formals) {
      msigdbr_args$subcollection <- subcollection
    } else if ("subcategory" %in% msigdbr_formals) {
      msigdbr_args$subcategory <- subcollection
    }
  }

  do.call(msigdbr::msigdbr, msigdbr_args)
}

# Build the path of the .rds cache file for one species / collection /
# subcollection combination, located inside the global `output_dir`.
#
# File names have the form "<species>__<collection>__<subcollection>.rds".
# Every component is slugified so characters such as ":" never reach the file
# system; for the current collection codes ("H", "C1", ...) slugifying is a
# no-op, so existing file names are unchanged.
#
# Arguments:
#   species        Species name, e.g. "Homo sapiens".
#   collection     Collection code, e.g. "C2".
#   subcollection  Subcollection code. "", NA and NULL map to the "all" suffix.
#
# Returns a length-one character path.
geneset_cache_path <- function(species, collection, subcollection = "") {
  # nzchar(NA) is TRUE, which would otherwise produce a "...__NA.rds" file name.
  has_subcollection <- !is.null(subcollection) &&
    !anyNA(subcollection) &&
    nzchar(subcollection)
  subcollection_slug <- if (has_subcollection) slugify(subcollection) else "all"

  file.path(
    output_dir,
    sprintf(
      "%s__%s__%s.rds",
      slugify(species),
      slugify(collection),
      subcollection_slug
    )
  )
}

# Every distinct (collection, subcollection) pair reported by the installed
# msigdbr release, ordered by collection and then subcollection.
collections <- msigdbr::msigdbr_collections()
collections <- dplyr::select(collections, gs_collection, gs_subcollection)
collections <- dplyr::distinct(collections)
collections <- dplyr::arrange(collections, gs_collection, gs_subcollection)

# One single-row data frame per cache file written; the slots are filled in
# loop order and bound into one table once both loops have finished.
manifest <- vector("list", length(species_to_cache) * nrow(collections))
manifest_slot <- 0L

for (species in species_to_cache) {
  message("Building MSigDB cache for: ", species)

  for (row_idx in seq_len(nrow(collections))) {
    collection <- collections[["gs_collection"]][[row_idx]]
    subcollection <- collections[["gs_subcollection"]][[row_idx]]
    cache_file <- geneset_cache_path(species, collection, subcollection)

    # Progress line: "<collection>[ / <subcollection>] -> <file>".
    progress_label <- if (nzchar(subcollection)) {
      paste0(collection, " / ", subcollection)
    } else {
      collection
    }
    message(" - ", progress_label, " -> ", cache_file)

    fetched <- fetch_msigdb(
      species = species,
      collection = collection,
      subcollection = subcollection
    )
    fetched <- dplyr::select(fetched, gs_name, gene_symbol)

    # Named list: gene-set name -> unique gene symbols.
    symbols_by_set <- lapply(
      split(fetched$gene_symbol, fetched$gs_name),
      unique
    )

    saveRDS(symbols_by_set, cache_file, compress = "xz")

    manifest_slot <- manifest_slot + 1L
    manifest[[manifest_slot]] <- data.frame(
      species = species,
      gs_collection = collection,
      gs_subcollection = subcollection,
      file = basename(cache_file),
      genesets = length(symbols_by_set),
      genes = length(unique(unlist(symbols_by_set, use.names = FALSE))),
      stringsAsFactors = FALSE
    )

    # Drop the per-collection objects before the next fetch.
    rm(fetched, symbols_by_set)
    gc(verbose = FALSE)
  }
}

# Write the manifest next to the cache files, both as .rds and as .csv.
manifest_df <- do.call(rbind, manifest)
manifest_file <- file.path(output_dir, "manifest.rds")
manifest_csv <- file.path(output_dir, "manifest.csv")

saveRDS(manifest_df, manifest_file, compress = "xz")
utils::write.csv(manifest_df, manifest_csv, row.names = FALSE)

message("MSigDB cache complete.")
message("Manifest: ", manifest_file)
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
51 changes: 51 additions & 0 deletions shiny/data/msigdb_genesets/manifest.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"species","gs_collection","gs_subcollection","file","genesets","genes"
"Homo sapiens","C1","","Homo_sapiens__C1__all.rds",302,43321
"Homo sapiens","C2","CGP","Homo_sapiens__C2__CGP.rds",3538,21705
"Homo sapiens","C2","CP","Homo_sapiens__C2__CP.rds",19,349
"Homo sapiens","C2","CP:BIOCARTA","Homo_sapiens__C2__CP_BIOCARTA.rds",292,1509
"Homo sapiens","C2","CP:KEGG_LEGACY","Homo_sapiens__C2__CP_KEGG_LEGACY.rds",186,5245
"Homo sapiens","C2","CP:KEGG_MEDICUS","Homo_sapiens__C2__CP_KEGG_MEDICUS.rds",658,2788
"Homo sapiens","C2","CP:PID","Homo_sapiens__C2__CP_PID.rds",196,2534
"Homo sapiens","C2","CP:REACTOME","Homo_sapiens__C2__CP_REACTOME.rds",1787,11369
"Homo sapiens","C2","CP:WIKIPATHWAYS","Homo_sapiens__C2__CP_WIKIPATHWAYS.rds",885,9127
"Homo sapiens","C3","MIR:MIRDB","Homo_sapiens__C3__MIR_MIRDB.rds",2377,16652
"Homo sapiens","C3","MIR:MIR_LEGACY","Homo_sapiens__C3__MIR_MIR_LEGACY.rds",221,7450
"Homo sapiens","C3","TFT:GTRD","Homo_sapiens__C3__TFT_GTRD.rds",505,26928
"Homo sapiens","C3","TFT:TFT_LEGACY","Homo_sapiens__C3__TFT_TFT_LEGACY.rds",610,12779
"Homo sapiens","C4","3CA","Homo_sapiens__C4__3CA.rds",148,2992
"Homo sapiens","C4","CGN","Homo_sapiens__C4__CGN.rds",427,4883
"Homo sapiens","C4","CM","Homo_sapiens__C4__CM.rds",431,8359
"Homo sapiens","C5","GO:BP","Homo_sapiens__C5__GO_BP.rds",7583,18000
"Homo sapiens","C5","GO:CC","Homo_sapiens__C5__GO_CC.rds",1042,14869
"Homo sapiens","C5","GO:MF","Homo_sapiens__C5__GO_MF.rds",1855,15699
"Homo sapiens","C5","HPO","Homo_sapiens__C5__HPO.rds",5748,5112
"Homo sapiens","C6","","Homo_sapiens__C6__all.rds",189,10927
"Homo sapiens","C7","IMMUNESIGDB","Homo_sapiens__C7__IMMUNESIGDB.rds",4872,20440
"Homo sapiens","C7","VAX","Homo_sapiens__C7__VAX.rds",347,13431
"Homo sapiens","C8","","Homo_sapiens__C8__all.rds",866,20533
"Homo sapiens","H","","Homo_sapiens__H__all.rds",50,4384
"Mus musculus","C1","","Mus_musculus__C1__all.rds",297,17961
"Mus musculus","C2","CGP","Mus_musculus__C2__CGP.rds",3537,17514
"Mus musculus","C2","CP","Mus_musculus__C2__CP.rds",19,350
"Mus musculus","C2","CP:BIOCARTA","Mus_musculus__C2__CP_BIOCARTA.rds",292,1513
"Mus musculus","C2","CP:KEGG_LEGACY","Mus_musculus__C2__CP_KEGG_LEGACY.rds",186,5032
"Mus musculus","C2","CP:KEGG_MEDICUS","Mus_musculus__C2__CP_KEGG_MEDICUS.rds",658,2718
"Mus musculus","C2","CP:PID","Mus_musculus__C2__CP_PID.rds",196,2546
"Mus musculus","C2","CP:REACTOME","Mus_musculus__C2__CP_REACTOME.rds",1787,10688
"Mus musculus","C2","CP:WIKIPATHWAYS","Mus_musculus__C2__CP_WIKIPATHWAYS.rds",885,8516
"Mus musculus","C3","MIR:MIRDB","Mus_musculus__C3__MIR_MIRDB.rds",2377,15499
"Mus musculus","C3","MIR:MIR_LEGACY","Mus_musculus__C3__MIR_MIR_LEGACY.rds",221,7303
"Mus musculus","C3","TFT:GTRD","Mus_musculus__C3__TFT_GTRD.rds",502,16143
"Mus musculus","C3","TFT:TFT_LEGACY","Mus_musculus__C3__TFT_TFT_LEGACY.rds",610,12357
"Mus musculus","C4","3CA","Mus_musculus__C4__3CA.rds",148,2875
"Mus musculus","C4","CGN","Mus_musculus__C4__CGN.rds",427,4751
"Mus musculus","C4","CM","Mus_musculus__C4__CM.rds",431,8079
"Mus musculus","C5","GO:BP","Mus_musculus__C5__GO_BP.rds",7580,15844
"Mus musculus","C5","GO:CC","Mus_musculus__C5__GO_CC.rds",1042,13330
"Mus musculus","C5","GO:MF","Mus_musculus__C5__GO_MF.rds",1852,14226
"Mus musculus","C5","HPO","Mus_musculus__C5__HPO.rds",5748,5027
"Mus musculus","C6","","Mus_musculus__C6__all.rds",189,10255
"Mus musculus","C7","IMMUNESIGDB","Mus_musculus__C7__IMMUNESIGDB.rds",4872,17329
"Mus musculus","C7","VAX","Mus_musculus__C7__VAX.rds",346,11828
"Mus musculus","C8","","Mus_musculus__C8__all.rds",866,15683
"Mus musculus","H","","Mus_musculus__H__all.rds",50,4393
Binary file added shiny/data/msigdb_genesets/manifest.rds
Binary file not shown.
158 changes: 143 additions & 15 deletions shiny/modules/hypeR_module.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,90 @@ msigdb_collection_metadata <- data.frame(
)


#' Fetch one MSigDB collection as a four-column table
#'
#' Calls `msigdbr::msigdbr()` with whichever filter argument names the
#' installed release exposes (`collection`/`subcollection`, otherwise
#' `category`/`subcategory`) and keeps only the columns used to build genesets.
#'
#' @param species Species name passed to `msigdbr::msigdbr()`.
#' @param collection Collection code to filter on.
#' @param subcollection Subcollection code; `NULL` or `""` applies no
#'   subcollection filter.
#' @return The msigdbr result restricted to `gs_name`, `gs_collection`,
#'   `gs_subcollection` and `gene_symbol`. Stops with an error if any of those
#'   columns is absent from the result.
fetch_msigdb_table <- function(species, collection, subcollection = "") {
  supported_args <- names(formals(msigdbr::msigdbr))
  call_args <- list(species = species)

  # The first supported name wins, preferring the newer spelling.
  collection_arg <- intersect(c("collection", "category"), supported_args)
  if (length(collection_arg) > 0) {
    call_args[[collection_arg[[1]]]] <- collection
  }

  wants_subcollection <- !is.null(subcollection) && nzchar(subcollection)
  if (wants_subcollection) {
    subcollection_arg <- intersect(
      c("subcollection", "subcategory"),
      supported_args
    )
    if (length(subcollection_arg) > 0) {
      call_args[[subcollection_arg[[1]]]] <- subcollection
    }
  }

  fetched <- do.call(msigdbr::msigdbr, call_args)

  wanted_columns <- c("gs_name", "gs_collection", "gs_subcollection", "gene_symbol")
  absent_columns <- wanted_columns[!wanted_columns %in% names(fetched)]

  if (length(absent_columns) > 0) {
    problem <- sprintf(
      "MSigDB result is missing required columns: %s",
      paste(absent_columns, collapse = ", ")
    )
    stop(problem, call. = FALSE)
  }

  dplyr::select(
    fetched,
    gs_name,
    gs_collection,
    gs_subcollection,
    gene_symbol
  )
}


#' Directory holding the pre-built MSigDB geneset cache
#'
#' Reads the `MSIGDB_CACHE_DIR` environment variable on every call and falls
#' back to `shiny/data/msigdb_genesets` when the variable is unset or empty.
#' An empty value is treated as unset because `file.path("", ...)` would
#' otherwise resolve cache files under the filesystem root.
#'
#' @return A length-one character path.
msigdb_cache_dir <- function() {
  default_dir <- file.path("shiny", "data", "msigdb_genesets")
  configured_dir <- Sys.getenv("MSIGDB_CACHE_DIR", unset = default_dir)

  if (nzchar(configured_dir)) configured_dir else default_dir
}


#' Convert labels into file-name-safe slugs
#'
#' Applies three substitutions in order: runs of characters outside
#' `A-Z`, `a-z`, `0-9` become `_`, repeated underscores collapse to one, and a
#' leading and a trailing underscore are removed.
#'
#' @param x Character vector of labels.
#' @return Character vector of slugs, the same length as `x`.
msigdb_slugify <- function(x) {
  substitutions <- list(
    c("[^A-Za-z0-9]+", "_"),
    c("_+", "_"),
    c("^_|_$", "")
  )

  for (rule in substitutions) {
    x <- gsub(rule[[1]], rule[[2]], x)
  }

  x
}


#' Path of the cache file for one species / collection / subcollection
#'
#' File names have the form `<species>__<collection>__<subcollection>.rds`
#' inside `msigdb_cache_dir()`. All three components are slugified so that a
#' value arriving from a UI input cannot introduce path separators; for the
#' existing collection codes (`"H"`, `"C1"`, ...) slugifying is a no-op, so the
#' file names are unchanged.
#'
#' @param species Species name, e.g. `"Homo sapiens"`.
#' @param collection Collection code, e.g. `"C2"`.
#' @param subcollection Subcollection code; `NULL`, `NA` or `""` selects the
#'   `all` file of the collection.
#' @return A length-one character path. The file is not required to exist.
msigdb_cache_file <- function(species, collection, subcollection = "") {
  # nzchar(NA) is TRUE, so NA has to be excluded explicitly alongside NULL.
  has_subcollection <- !is.null(subcollection) &&
    !anyNA(subcollection) &&
    nzchar(subcollection)

  subcollection_slug <- if (has_subcollection) {
    msigdb_slugify(subcollection)
  } else {
    "all"
  }

  file.path(
    msigdb_cache_dir(),
    sprintf(
      "%s__%s__%s.rds",
      msigdb_slugify(species),
      msigdb_slugify(collection),
      subcollection_slug
    )
  )
}


#' Load pre-built genesets from the local cache, if present
#'
#' @param species Species name used to locate the cache file.
#' @param collection Collection code used to locate the cache file.
#' @param subcollection Subcollection code; `""` selects the collection-wide
#'   file.
#' @return The object stored in the cache file, or `NULL` when no cache file
#'   exists for this combination. Errors raised by `readRDS()` propagate to the
#'   caller.
load_cached_msigdb_genesets <- function(species, collection, subcollection = "") {
  rds_path <- msigdb_cache_file(species, collection, subcollection)

  if (file.exists(rds_path)) readRDS(rds_path) else NULL
}


# hypeR genesets UI rewrite
#' Shiny UI for MSigDB subcategory selection
#'
Expand Down Expand Up @@ -136,18 +220,61 @@ genesets_hypeR_Server <- function(id, species, clean = FALSE) {
req(input$collection)
req(!is.null(input$subcategory))

filtered_tbl <- msigdbr::msigdbr(species = species()) |>
dplyr::select(
gs_name,
gs_collection,
gs_subcollection,
gene_symbol
) |>
dplyr::filter(gs_collection == input$collection)

if (!identical(input$subcategory, "")) {
filtered_tbl <- filtered_tbl |>
dplyr::filter(gs_subcollection == input$subcategory)
selected_genesets(list())

cached_genesets <- tryCatch(
load_cached_msigdb_genesets(
species = species(),
collection = input$collection,
subcollection = input$subcategory
),
error = function(err) {
showNotification(
sprintf("Failed to load cached genesets: %s", conditionMessage(err)),
type = "error",
duration = 10
)
NULL
}
)

if (!is.null(cached_genesets)) {
if (clean) {
names(cached_genesets) <- clean_genesets(names(cached_genesets))
}

selected_genesets(cached_genesets)
showNotification("Loaded genesets from local cache.", type = "message")
return()
}

filtered_tbl <- tryCatch(
{
shiny::withProgress(
message = "Fetching MSigDB genesets...",
value = 0.25,
{
fetch_msigdb_table(
species = species(),
collection = input$collection,
subcollection = input$subcategory
)
}
)
},
error = function(err) {
showNotification(
sprintf("Failed to fetch genesets: %s", conditionMessage(err)),
type = "error",
duration = 10
)
NULL
}
)

if (is.null(filtered_tbl)) {
selected_genesets(list())
return()
}

if (nrow(filtered_tbl) == 0) {
Expand All @@ -156,15 +283,16 @@ genesets_hypeR_Server <- function(id, species, clean = FALSE) {
return()
}

gs <- filtered_tbl |>
(\(df) split(df, df$gs_name))() |>
(\(lst) lapply(lst, function(x) unique(x$gene_symbol)))()
gs <- split(filtered_tbl$gene_symbol, filtered_tbl$gs_name)
gs <- lapply(gs, unique)

if (clean) {
names(gs) <- clean_genesets(names(gs))
}

selected_genesets(gs)
rm(filtered_tbl)
gc(verbose = FALSE)
})

# Status message
Expand Down
Loading