# Helper functions transform_ensembl_2_entrez <- function(Data){ gene_ids <- rownames(Data) from_ID_type = "ENSEMBL" to_ID_type = "ENTREZID" #check if ENSEMBL ID contain ".", if yes, split the ID by "." and only take the first part for matching if(grepl("\\.",Data %>% rownames) %>% all & from_ID_type == "ENSEMBL") { gene_ids = gene_ids %>% stringr::word( sep = '\\.') } gene_name <- gene_ids %>% clusterProfiler::bitr(fromType = from_ID_type, toType = to_ID_type, OrgDb = org.Hs.eg.db) %>% filter(!duplicated(across(from_ID_type))) %>% filter(!duplicated(across(to_ID_type))) %>% dplyr::rename_with(~"gene",from_ID_type) suppressMessages(logcpmx <- Data %>% as_tibble(rownames = "gene") %>% mutate(gene = stringr::word(gene, sep = '\\.')) %>% filter(gene %in% gene_name$gene) %>% left_join(gene_name) %>% dplyr::select(-gene) ) logcpmxx <- logcpmx %>% tibble::column_to_rownames(var = "ENTREZID") %>% as.matrix() return(logcpmxx) } check_gene_name_match <- function(genes_in_Gene_sets, genes_in_Data, genes_in_Direction_matrix){ pct_overlap_gene_sets <- length(intersect(genes_in_Gene_sets, genes_in_Data))/length(genes_in_Gene_sets) pct_overlap_direction_matrix <- length(intersect(genes_in_Direction_matrix, genes_in_Gene_sets))/length(genes_in_Gene_sets) if(pct_overlap_gene_sets < 0.1){ stop("The gene ids in the gene sets and the data matrix do not match well. Please check if data has the same gene id type as the gene sets.")} else if(pct_overlap_direction_matrix < 0.1){ stop("The gene ids in the gene sets and the direction matrix do not match well. Please check if direction_matrix has the same gene id type as the gene sets.")} } check_genes_missing <- function(Gene_sets, Data, Direction_matrix){ check_in_fun <- function(x, y){ return(x[!(x %in% y)]) } PKnames <- names(Gene_sets) Gene_sets_data_missing <- lapply(Gene_sets[PKnames], check_in_fun, y = rownames(Data)) Gene_sets_direction_missing <- lapply(Gene_sets[PKnames], check_in_fun, y = Direction_matrix$gene) pct_missingsets_data <- round(length(vctrs::list_drop_empty(Gene_sets_data_missing))/length(PKnames)*100, 2) pct_missingsets_direction <- round(length(vctrs::list_drop_empty(Gene_sets_direction_missing))/length(PKnames)*100, 2) writeLines(paste0("Warning messages: \n", pct_missingsets_data, "% (", length(vctrs::list_drop_empty(Gene_sets_data_missing)),"/", length(PKnames),") of gene sets have information missing in the data matrix for at least one gene; \n", pct_missingsets_direction, "% (", length(vctrs::list_drop_empty(Gene_sets_direction_missing)),"/", length(PKnames),") of gene sets have information missing in the direction matrix for at least one gene. \n", sep = "")) Gene_sets_data_missing <- vctrs::list_drop_empty(Gene_sets_data_missing) PKnames_data_missing <- names(Gene_sets_data_missing) Gene_sets_direction_missing <- vctrs::list_drop_empty(Gene_sets_direction_missing) PKnames_direction_missing <- names(Gene_sets_direction_missing) if ((length(Gene_sets_data_missing)>0)&(length(Gene_sets_data_missing)<10)){ pct_missing_data <- round(lengths(Gene_sets_data_missing)/lengths(Gene_sets[PKnames_data_missing])*100,2) writeLines(paste0(PKnames_data_missing, ": \n ", pct_missing_data, "% of genes (", lengths(Gene_sets_data_missing), "/", lengths(Gene_sets[PKnames_data_missing]), ") in this gene set have information missing in the data matrix; \n", sep = "")) if (length((names(pct_missing_data)[pct_missing_data == 100]))>0){ writeLines(paste0("Gene set: '", names(pct_missing_data)[pct_missing_data == 100], "' has 100% information missing in data matrix.", sep = "")) } } else { writeLines("Note since more than 10 gene sets have missing information in the data matrix, no detailed individual gene set missing information will be reported.") } if ((length(Gene_sets_direction_missing)>0)&(length(Gene_sets_direction_missing)<10)){ pct_missing_direction <- round(lengths(Gene_sets_direction_missing)/lengths(Gene_sets[PKnames_direction_missing])*100,2) writeLines(paste0(PKnames_direction_missing, ": \n ", pct_missing_direction, "% of genes (", lengths(Gene_sets_direction_missing), "/", lengths(Gene_sets[PKnames_direction_missing]), ") in this gene set have information missing in the direction matrix; \n", sep = "")) if (length((names(pct_missing_direction)[pct_missing_direction == 100]))>0){ writeLines(paste0("Gene set: '", names(pct_missing_direction)[pct_missing_direction == 100], "' has 100% information missing in direction matrix.", sep = "")) } } else { writeLines("Note since more than 10 gene sets have missing information in the direction matrix, no detailed individual gene set missing information will be reported.") } } ## AVG cpm method avg_expression <- function(Data, pathway.db){ cal.avg.logcpm <- function(pathway.set, df){ dat.sub <- Data %>% as_tibble(rownames = "gene") %>% dplyr::filter(gene %in% pathway.set) %>% dplyr::select(-gene) %>% colMeans() # use colMeans() for column-wise means of a matrix out <- c(names(pathway.set), dat.sub) } result <- purrr::map(pathway.db, cal.avg.logcpm, df = Data) %>% bind_rows(.id = "pathway") %>% tibble::column_to_rownames(var = "pathway") return(result) } ## MEDIAN cpm method median_expression <- function(Data, pathway.db){ cal.med.logcpm <- function(pathway.set, df){ dat.sub <- Data %>% as_tibble(rownames = "gene") %>% filter(gene %in% pathway.set) %>% dplyr::select(-gene) %>% apply(., 2, median) # as.matrix() %>% # robustbase::colMedians() # use colMedians() for column-wise medians of a matrix out <- c(names(pathway.set), dat.sub) } result <- purrr::map(pathway.db, cal.med.logcpm, df = Data) %>% bind_rows(.id = "pathway") %>% tibble::column_to_rownames(var = "pathway") return(result) } check_gene_name_match_noDir <- function(genes_in_Gene_sets, genes_in_Data){ pct_overlap_gene_sets <- length(intersect(genes_in_Gene_sets, genes_in_Data))/length(genes_in_Gene_sets) if(pct_overlap_gene_sets < 0.1){ stop("The gene ids in the gene sets and the data matrix do not match well. Please check if data has the same gene id type as the gene sets.") } } check_genes_missing_noDir <- function(Gene_sets, Data){ check_in_fun <- function(x, y){ return(x[!(x %in% y)]) } PKnames <- names(Gene_sets) Gene_sets_data_missing <- lapply(Gene_sets[PKnames], check_in_fun, y = rownames(Data)) pct_missingsets_data <- round(length(vctrs::list_drop_empty(Gene_sets_data_missing))/length(PKnames)*100, 2) writeLines(paste0("Warning messages: \n", pct_missingsets_data, "% (", length(vctrs::list_drop_empty(Gene_sets_data_missing)),"/", length(PKnames),") of gene sets have information missing in the data matrix for at least one gene.", sep = "")) Gene_sets_data_missing <- vctrs::list_drop_empty(Gene_sets_data_missing) PKnames_data_missing <- names(Gene_sets_data_missing) if ((length(Gene_sets_data_missing)>0)&(length(Gene_sets_data_missing)<10)){ pct_missing_data <- round(lengths(Gene_sets_data_missing)/lengths(Gene_sets[PKnames_data_missing])*100,2) writeLines(paste0(PKnames_data_missing, ": \n ", pct_missing_data, "% of genes (", lengths(Gene_sets_data_missing), "/", lengths(Gene_sets[PKnames_data_missing]), ") in this gene set have information missing in the data matrix; \n", sep = "")) if (length((names(pct_missing_data)[pct_missing_data == 100]))>0){ writeLines(paste0("Gene set: '", names(pct_missing_data)[pct_missing_data == 100], "' has 100% information missing in data matrix.", sep = "")) } }else { writeLines("Note since more than 10 gene sets have missing information in the data matrix, no detailed individual gene set missing information will be reported.") } } check_genes_missing_total <- function(Gene_sets, Data){ check_in_fun <- function(x, y){ return(x[!(x %in% y)]) } PKnames <- names(Gene_sets) Gene_sets_data_missing <- lapply(Gene_sets[PKnames], check_in_fun, y = rownames(Data)) Gene_sets_data_missing <- vctrs::list_drop_empty(Gene_sets_data_missing) PKnames_data_missing <- names(Gene_sets_data_missing) Gene_names_data_100 <- c() if (length(Gene_sets_data_missing)>0){ pct_missing_data <- round(lengths(Gene_sets_data_missing)/lengths(Gene_sets[PKnames_data_missing])*100,2) if (length((names(pct_missing_data)[pct_missing_data == 100]))>0){ Gene_names_data_100 <- names(pct_missing_data)[pct_missing_data == 100] } } return(list(Total_missing_in_data_matrix = Gene_names_data_100)) } ## ssGSA ## Main ssGSA function ssGSA <- function(Data, # Input data matrix here Gene_sets, # Input gene sets here GSA_weight = "equal_weighted", # Options are: "equal_weighted" or "group_weighted" GSA_weighted_by = "sum.ES", # Options are: "sum.ES", "avg.ES" or "median.ES" GSA_method = "gsva", # Options are: "gsva","ssgsea","zscore","avg.exprs", or "median.exprs" min.sz = 6, # GSVA parameter max.sz = 2000, # GSVA parameter mx.diff = TRUE # GSVA parameter ){ #0. Check parameter input: #Check input and see if character input is correct #Check GSA_method if(!(GSA_method %in% c("gsva","ssgsea","zscore","avg.exprs", "median.exprs"))){ stop("The parameter input 'GSA_method' is not correct. Please choose from 'gsva','ssgsea','zscore','avg.exprs', 'median.exprs'.")} #Check GSA_weight if(!(GSA_weight %in% c("equal_weighted","group_weighted"))){ stop("The parameter input 'GSA_weight' is not correct. Please choose from 'equal_weighted','group_weighted'.")} #Check GSA_weighted_by if(!(GSA_weighted_by %in% c("sum.ES","avg.ES","median.ES"))){ stop("The parameter input 'GSA_weighted_by' is not correct. Please choose one from 'sum.ES','avg.ES','median.ES'.")} #1.1 Check if the row names of data matrix (Data) and gene sets (Gene_sets) are using the same gene ID genes_in_Gene_sets <- unlist(Gene_sets) %>% as.vector genes_in_Data <- rownames(Data) %>% as.vector check_gene_name_match_noDir(genes_in_Gene_sets, genes_in_Data) #1.2 Report the percent of genes in each gene set that do not have information in the data matrix check_genes_missing_noDir(Gene_sets, Data) #2. Run GSVA for each gene set # Add a message to state the running of gsva writeLines(paste0("Note: Running GSVA for ", length(names(Gene_sets)), " gene sets using '", GSA_method, "' method with '", GSA_weight, "' weights.", sep = "")) if(GSA_method %in% c("gsva","ssgsea","zscore")){ # "gsva", "ssgsea", "zscore" suppressMessages(GS_final <- gsva(Data, Gene_sets, method = GSA_method, # alternatives are "ssgsea", "zscore" min.sz = min.sz, max.sz = max.sz, mx.diff = mx.diff)) # "avg.exprs" }else if(GSA_method == "avg.exprs"){ GS_final <- avg_expression(Data, Gene_sets) # "median.exprs" }else if(GSA_method == "median.exprs"){ GS_final <- median_expression(Data, Gene_sets) } return(GS_final) }