## Main ssdGSA function ssdGSA <- function(Data, # Input data matrix here Gene_sets, # Input gene sets here Direction_matrix = NULL, # Input direction matrix here GSA_weight = "equal_weighted", # Options are: "equal_weighted" or "group_weighted" GSA_weighted_by = "sum.ES", # Options are: "sum.ES", "avg.ES" or "median.ES" GSA_method = "gsva", # Options are: "gsva","ssgsea","zscore","avg.exprs", or "median.exprs" min.sz = 1, # GSVA parameter max.sz = 2000, # GSVA parameter mx.diff = TRUE # GSVA parameter ){ #check if Data and Gene_sets are formated if(!is.matrix(Data)){stop("Please check the format of the data matrix.")} if(!is.list(Gene_sets)){stop("Please check the format of the gene sets.")} #the case when Direction_matrix is empty if(is.null(Direction_matrix)){ GS_final = ssGSA(Data, Gene_sets, GSA_weight, GSA_weighted_by, GSA_method, min.sz, max.sz, mx.diff) }else{ # Format the input data Direction_matrix <- Direction_matrix %>% mutate(across(c(1), as.character)) %>% dplyr::rename(ES = c(2)) ### name gene!! as.character(first column) #0. Check parameter input: #Check input and see if character input is correct #Check GSA_method if(!(GSA_method %in% c("gsva","ssgsea","zscore","avg.exprs", "median.exprs"))){ stop("The parameter input 'GSA_method' is not correct. Please choose from 'gsva','ssgsea','zscore','avg.exprs', 'median.exprs'.")} #Check GSA_weight if(!(GSA_weight %in% c("equal_weighted","group_weighted"))){ stop("The parameter input 'GSA_weight' is not correct. Please choose from 'equal_weighted','group_weighted'.")} #Check GSA_weighted_by if(!(GSA_weighted_by %in% c("sum.ES","avg.ES","median.ES"))){ stop("The parameter input 'GSA_weighted_by' is not correct. Please choose one from 'sum.ES','avg.ES','median.ES'.")} #1.1 Check if the row names of data matrix (Data), gene sets (Gene_sets), and direction matrix (Direction_matrix) are using the same gene ID genes_in_Gene_sets <- unlist(Gene_sets) %>% as.vector genes_in_Data <- rownames(Data) %>% as.vector genes_in_Direction_matrix <- Direction_matrix$gene %>% as.vector check_gene_name_match(genes_in_Gene_sets, genes_in_Data, genes_in_Direction_matrix) #1.2 Report the percent of genes in each gene set that do not have information in the data matrix or direction matrix check_genes_missing(Gene_sets, Data, Direction_matrix) #2. Split the gene sets based on the directionality of genes with information in the direction matrix up_genes <- Direction_matrix %>% filter(ES > 0) %>% pull(gene) down_genes <- Direction_matrix %>% filter(ES < 0) %>% pull(gene) Gene_sets_pov <- lapply(Gene_sets, intersect, y = up_genes) Gene_sets_neg <- lapply(Gene_sets, intersect, y = down_genes) #2.1 Check whether there are gene sets that contain both positively and negatively correlated genes missing in data matrix if ((!is.null(check_genes_missing_total(Gene_sets_pov, Data)$Total_missing_in_data_matrix))& (!is.null(check_genes_missing_total(Gene_sets_pov, Data)$Total_missing_in_data_matrix))){ Missing_Genes <- intersect(check_genes_missing_total(Gene_sets_pov, Data)$Total_missing_in_data_matrix, check_genes_missing_total(Gene_sets_neg, Data)$Total_missing_in_data_matrix) if (length(Missing_Genes)>0){ writeLines(paste0("Both positively and negatively correlated genes in gene set(s): ", Missing_Genes, " are missing in data matrix, so these gene sets will not be calculated in the following analysis.", sep = "")) Gene_sets_pov <- Gene_sets_pov[names(Gene_sets_pov)[!(names(Gene_sets_pov) %in% Missing_Genes)]] Gene_sets_neg <- Gene_sets_neg[names(Gene_sets_neg)[!(names(Gene_sets_neg) %in% Missing_Genes)]] } } #3. Run GSA for each gene set # Add a message to state the running of gsva writeLines(paste0("Note: Running GSA for ", length(names(Gene_sets)), " gene sets using '", GSA_method, "' method with '", GSA_weight, "' weights.", sep = "")) if(GSA_method %in% c("gsva","ssgsea","zscore")){ # "gsva", "ssgsea", "zscore" suppressMessages(GS_up <- gsva(Data, Gene_sets_pov, method = GSA_method, # alternatives are "ssgsea", "zscore" min.sz = min.sz, max.sz = max.sz, mx.diff = mx.diff)) # if (length(Gene_sets_pov)!= dim(GS_up)[1]){ # writeLines(paste0("Warning messages: \n", "There are ", length(Gene_sets_pov), " positively correlated gene sets, but ", dim(GS_up)[1], " gene sets are calculated gsva scores.\n0 will be assigned as gsva scores for the remaining gene sets.", sep = "")) # } suppressMessages(GS_down <- gsva(Data, Gene_sets_neg, method = GSA_method, # alternatives are "ssgsea", "zscore" min.sz = min.sz, max.sz = max.sz, mx.diff = mx.diff)) # if (length(Gene_sets_neg)!= dim(GS_down)[1]){ # writeLines(paste0("Warning messages: \n", "There are ", length(Gene_sets_neg), " negatively correlated gene sets, but ", dim(GS_down)[1], " gene sets are calculated gsva scores.\n0 will be assigned as gsva scores for the remaining gene sets.", sep = "")) # } # "avg.exprs" }else if(GSA_method == "avg.exprs"){ GS_up <- avg_expression(Data, Gene_sets_pov) GS_down <- avg_expression(Data, Gene_sets_neg) # "median.exprs" }else if(GSA_method == "median.exprs"){ GS_up <- median_expression(Data, Gene_sets_pov) GS_down <- median_expression(Data, Gene_sets_neg) } # Check if the dimension of GS_up and GS_down is the same if ((dim(GS_up)[1] != dim(GS_down)[1])|(dim(GS_up)[2] != dim(GS_down)[2])) warning("Please check if there are enough genes in the up-regulated or down-regulated gene sets") #4. Calculate final score for each gene set if(GSA_weight == "equal_weighted"){ suppressMessages(GS_up <- tibble(GS = names(Gene_sets_pov)) %>% left_join(., GS_up %>% as.data.frame()%>% tibble::rownames_to_column(var="GS"), by = "GS") %>% replace(is.na(.), 0) %>% tibble::column_to_rownames(var = "GS")) suppressMessages(GS_down <- tibble(GS = names(Gene_sets_neg)) %>% left_join(., GS_down %>% as.data.frame()%>% tibble::rownames_to_column(var="GS"), by = "GS") %>% replace(is.na(.), 0) %>% tibble::column_to_rownames(var = "GS")) GS_final = GS_up - GS_down }else if(GSA_weight == "group_weighted"){ # function to get the sum/avg/median of ES suppressMessages(GS_pov_temp <- Gene_sets_pov %>% utils::stack() %>% rename_at(1, ~"gene") %>% rename_at(2, ~"GS_pov" ) %>% left_join(Direction_matrix) %>% group_by(GS_pov)) suppressMessages(GS_neg_temp <- Gene_sets_neg %>% utils::stack() %>% rename_at(1, ~"gene") %>% rename_at(2, ~"GS_neg" ) %>% left_join(Direction_matrix) %>% group_by(GS_neg)) if(GSA_weighted_by == "sum.ES"){ GS_pov <- GS_pov_temp %>% summarise(cal_ES = sum(ES)) GS_neg <- GS_neg_temp %>% summarise(cal_ES = sum(ES)) } else if(GSA_weighted_by == "avg.ES") { GS_pov <- GS_pov_temp %>% summarise(cal_ES = mean(ES)) GS_neg <- GS_neg_temp %>% summarise(cal_ES = mean(ES)) } else if(GSA_weighted_by == "median.ES") { GS_pov <- GS_pov_temp %>% summarise(cal_ES = median(ES)) GS_neg <- GS_neg_temp %>% summarise(cal_ES = median(ES)) } # If there are missing information in the direction matrix, we need to modify that. In this case, we assign ES 0 to those # gene sets which have missing values in direction matrix. suppressMessages(GS_pov2 <- tibble(GS_pov = names(Gene_sets_pov)) %>% left_join(., GS_pov) %>% replace(is.na(.), 0) ) suppressMessages(GS_neg2 <- tibble(GS_neg = names(Gene_sets_neg)) %>% left_join(., GS_neg) %>% replace(is.na(.), 0) ) # To modify the case when length(Gene_sets_pov)!= dim(GS_up)[1]. In this case, we assign scores 0 to those # gene sets which have not been calculated GSA scores. GS_up2 <- GS_up %>% as.data.frame() %>% tibble::rownames_to_column(var = "GS_pov") %>% left_join(GS_pov2,., by = "GS_pov") %>% replace(is.na(.), 0) %>% dplyr::select(-cal_ES) %>% as.data.frame() %>% tibble::column_to_rownames(., "GS_pov") GS_down2 <- GS_down %>% as.data.frame() %>% tibble::rownames_to_column(var = "GS_neg") %>% left_join(GS_neg2,., by = "GS_neg") %>% replace(is.na(.), 0) %>% dplyr::select(-cal_ES) %>% as.data.frame() %>% tibble::column_to_rownames(., "GS_neg") GS_final = 2 * (abs(GS_pov2$cal_ES) * GS_up2 - abs(GS_neg2$cal_ES) * GS_down2) / (abs(GS_pov2$cal_ES) + abs(GS_neg2$cal_ES)) } } return(GS_final) }