# Helper functions


transform_ensembl_2_entrez <- function(Data){

  gene_ids <- rownames(Data)
  from_ID_type = "ENSEMBL"
  to_ID_type = "ENTREZID"

  #check if ENSEMBL ID contain ".", if yes, split the ID by "." and only take the first part for matching
  if(grepl("\\.",Data %>% rownames) %>% all & from_ID_type == "ENSEMBL") {
    gene_ids = gene_ids %>% stringr::word( sep = '\\.')
  }

  gene_name <- gene_ids %>%
    clusterProfiler::bitr(fromType = from_ID_type, toType = to_ID_type, OrgDb = org.Hs.eg.db) %>%
    filter(!duplicated(across(from_ID_type))) %>%
    filter(!duplicated(across(to_ID_type))) %>%
    dplyr::rename_with(~"gene",from_ID_type)



  suppressMessages(logcpmx <- Data %>%
                     as_tibble(rownames = "gene") %>%
                     mutate(gene = stringr::word(gene, sep = '\\.')) %>%
                     filter(gene %in% gene_name$gene) %>%
                     left_join(gene_name) %>%
                     dplyr::select(-gene) )

  logcpmxx <- logcpmx %>%
    tibble::column_to_rownames(var = "ENTREZID") %>%
    as.matrix()

  return(logcpmxx)
}




check_gene_name_match <- function(genes_in_Gene_sets,
                                  genes_in_Data,
                                  genes_in_Direction_matrix){
  pct_overlap_gene_sets <- length(intersect(genes_in_Gene_sets, genes_in_Data))/length(genes_in_Gene_sets)
  pct_overlap_direction_matrix <- length(intersect(genes_in_Direction_matrix, genes_in_Gene_sets))/length(genes_in_Gene_sets)

  if(pct_overlap_gene_sets < 0.1){
    stop("The gene ids in the gene sets and the data matrix do not match well. Please check if data has the same gene id type as the gene sets.")}
  else if(pct_overlap_direction_matrix < 0.1){
    stop("The gene ids in the gene sets and the direction matrix do not match well. Please check if direction_matrix has the same gene id type as the gene sets.")}

}




check_genes_missing <- function(Gene_sets, Data, Direction_matrix){
  check_in_fun <- function(x, y){
    return(x[!(x %in% y)])
  }
  PKnames <- names(Gene_sets)
  Gene_sets_data_missing <- lapply(Gene_sets[PKnames], check_in_fun, y = rownames(Data))
  Gene_sets_direction_missing <- lapply(Gene_sets[PKnames], check_in_fun, y = Direction_matrix$gene)
  pct_missingsets_data <- round(length(vctrs::list_drop_empty(Gene_sets_data_missing))/length(PKnames)*100, 2)
  pct_missingsets_direction <- round(length(vctrs::list_drop_empty(Gene_sets_direction_missing))/length(PKnames)*100, 2)
  writeLines(paste0("Warning messages: \n", pct_missingsets_data, "% (", length(vctrs::list_drop_empty(Gene_sets_data_missing)),"/", length(PKnames),") of gene sets have information missing in the data matrix for at least one gene; \n",
                    pct_missingsets_direction, "% (", length(vctrs::list_drop_empty(Gene_sets_direction_missing)),"/", length(PKnames),") of gene sets have information missing in the direction matrix for at least one gene. \n", sep = ""))

  Gene_sets_data_missing <- vctrs::list_drop_empty(Gene_sets_data_missing)
  PKnames_data_missing <- names(Gene_sets_data_missing)
  Gene_sets_direction_missing <- vctrs::list_drop_empty(Gene_sets_direction_missing)
  PKnames_direction_missing <- names(Gene_sets_direction_missing)

  if ((length(Gene_sets_data_missing)>0)&(length(Gene_sets_data_missing)<10)){
    pct_missing_data <- round(lengths(Gene_sets_data_missing)/lengths(Gene_sets[PKnames_data_missing])*100,2)
    writeLines(paste0(PKnames_data_missing, ": \n ", pct_missing_data, "% of genes (",
                      lengths(Gene_sets_data_missing), "/", lengths(Gene_sets[PKnames_data_missing]),
                      ") in this gene set have information missing in the data matrix; \n", sep = ""))
    if (length((names(pct_missing_data)[pct_missing_data == 100]))>0){
      writeLines(paste0("Gene set: '", names(pct_missing_data)[pct_missing_data == 100], "' has 100% information missing in data matrix.", sep = ""))
    }
  } else {
    writeLines("Note since more than 10 gene sets have missing information in the data matrix, no detailed individual gene set missing information will be reported.")
  }

  if ((length(Gene_sets_direction_missing)>0)&(length(Gene_sets_direction_missing)<10)){
    pct_missing_direction <- round(lengths(Gene_sets_direction_missing)/lengths(Gene_sets[PKnames_direction_missing])*100,2)
    writeLines(paste0(PKnames_direction_missing, ": \n ", pct_missing_direction,
                      "% of genes (", lengths(Gene_sets_direction_missing), "/", lengths(Gene_sets[PKnames_direction_missing]),
                      ") in this gene set have information missing in the direction matrix; \n", sep = ""))
    if (length((names(pct_missing_direction)[pct_missing_direction == 100]))>0){
      writeLines(paste0("Gene set: '", names(pct_missing_direction)[pct_missing_direction == 100], "' has 100% information missing in direction matrix.", sep = ""))
    }
  } else {
    writeLines("Note since more than 10 gene sets have missing information in the direction matrix, no detailed individual gene set missing information will be reported.")
  }
}


## AVG cpm method
avg_expression <- function(Data, pathway.db){
  cal.avg.logcpm <- function(pathway.set, df){
    dat.sub <- Data %>%
      as_tibble(rownames = "gene") %>%
      dplyr::filter(gene %in% pathway.set) %>%
      dplyr::select(-gene) %>%
      colMeans() # use colMeans() for column-wise means of a matrix
    out <- c(names(pathway.set), dat.sub)
  }
  result <- purrr::map(pathway.db, cal.avg.logcpm, df = Data) %>%
    bind_rows(.id = "pathway") %>%
    tibble::column_to_rownames(var = "pathway")
  return(result)
}




## MEDIAN cpm method
median_expression <- function(Data, pathway.db){
  cal.med.logcpm <- function(pathway.set, df){
    dat.sub <- Data %>%
      as_tibble(rownames = "gene") %>%
      filter(gene %in% pathway.set) %>%
      dplyr::select(-gene) %>%
      apply(., 2, median)
    # as.matrix() %>%
    # robustbase::colMedians() # use colMedians() for column-wise medians of a matrix
    out <- c(names(pathway.set), dat.sub)
  }
  result <- purrr::map(pathway.db, cal.med.logcpm, df = Data) %>%
    bind_rows(.id = "pathway") %>%
    tibble::column_to_rownames(var = "pathway")
  return(result)
}




check_gene_name_match_noDir <- function(genes_in_Gene_sets,
                                        genes_in_Data){
  pct_overlap_gene_sets <- length(intersect(genes_in_Gene_sets, genes_in_Data))/length(genes_in_Gene_sets)

  if(pct_overlap_gene_sets < 0.1){
    stop("The gene ids in the gene sets and the data matrix do not match well. Please check if data has the same gene id type as the gene sets.")
  }
}




check_genes_missing_noDir <- function(Gene_sets, Data){
  check_in_fun <- function(x, y){
    return(x[!(x %in% y)])
  }
  PKnames <- names(Gene_sets)
  Gene_sets_data_missing <- lapply(Gene_sets[PKnames], check_in_fun, y = rownames(Data))
  pct_missingsets_data <- round(length(vctrs::list_drop_empty(Gene_sets_data_missing))/length(PKnames)*100, 2)
  writeLines(paste0("Warning messages: \n", pct_missingsets_data, "% (", length(vctrs::list_drop_empty(Gene_sets_data_missing)),"/", length(PKnames),") of gene sets have information missing in the data matrix for at least one gene.", sep = ""))

  Gene_sets_data_missing <- vctrs::list_drop_empty(Gene_sets_data_missing)
  PKnames_data_missing <- names(Gene_sets_data_missing)

  if ((length(Gene_sets_data_missing)>0)&(length(Gene_sets_data_missing)<10)){
    pct_missing_data <- round(lengths(Gene_sets_data_missing)/lengths(Gene_sets[PKnames_data_missing])*100,2)
    writeLines(paste0(PKnames_data_missing, ": \n ", pct_missing_data, "% of genes (",
                      lengths(Gene_sets_data_missing), "/", lengths(Gene_sets[PKnames_data_missing]),
                      ") in this gene set have information missing in the data matrix; \n", sep = ""))
    if (length((names(pct_missing_data)[pct_missing_data == 100]))>0){
      writeLines(paste0("Gene set: '", names(pct_missing_data)[pct_missing_data == 100], "' has 100% information missing in data matrix.", sep = ""))
    }
  }else {
    writeLines("Note since more than 10 gene sets have missing information in the data matrix, no detailed individual gene set missing information will be reported.")
  }
}



check_genes_missing_total <- function(Gene_sets, Data){
  check_in_fun <- function(x, y){
    return(x[!(x %in% y)])
  }
  PKnames <- names(Gene_sets)
  Gene_sets_data_missing <- lapply(Gene_sets[PKnames], check_in_fun, y = rownames(Data))

  Gene_sets_data_missing <- vctrs::list_drop_empty(Gene_sets_data_missing)
  PKnames_data_missing <- names(Gene_sets_data_missing)
  Gene_names_data_100 <- c()
  if (length(Gene_sets_data_missing)>0){
    pct_missing_data <- round(lengths(Gene_sets_data_missing)/lengths(Gene_sets[PKnames_data_missing])*100,2)
    if (length((names(pct_missing_data)[pct_missing_data == 100]))>0){
      Gene_names_data_100 <- names(pct_missing_data)[pct_missing_data == 100]
    }
  }
  return(list(Total_missing_in_data_matrix = Gene_names_data_100))
}


## ssGSA
## Main ssGSA function
ssGSA <- function(Data, # Input data matrix here
                  Gene_sets, # Input gene sets here
                  GSA_weight = "equal_weighted", # Options are: "equal_weighted" or "group_weighted"
                  GSA_weighted_by = "sum.ES", # Options are: "sum.ES", "avg.ES" or "median.ES"
                  GSA_method = "gsva", # Options are: "gsva","ssgsea","zscore","avg.exprs", or "median.exprs"
                  min.sz = 6,  # GSVA parameter
                  max.sz = 2000, # GSVA parameter
                  mx.diff = TRUE # GSVA parameter
){
  #0. Check parameter input:
  #Check input and see if character input is correct
  #Check GSA_method
  if(!(GSA_method %in% c("gsva","ssgsea","zscore","avg.exprs", "median.exprs"))){
    stop("The parameter input 'GSA_method' is not correct. Please choose from 'gsva','ssgsea','zscore','avg.exprs', 'median.exprs'.")}
  #Check GSA_weight
  if(!(GSA_weight %in% c("equal_weighted","group_weighted"))){
    stop("The parameter input 'GSA_weight' is not correct. Please choose from 'equal_weighted','group_weighted'.")}
  #Check GSA_weighted_by
  if(!(GSA_weighted_by %in% c("sum.ES","avg.ES","median.ES"))){
    stop("The parameter input 'GSA_weighted_by' is not correct. Please choose one from 'sum.ES','avg.ES','median.ES'.")}


  #1.1 Check if the row names of data matrix (Data) and gene sets (Gene_sets) are using the same gene ID
  genes_in_Gene_sets <- unlist(Gene_sets) %>% as.vector
  genes_in_Data <- rownames(Data) %>% as.vector

  check_gene_name_match_noDir(genes_in_Gene_sets,
                              genes_in_Data)

  #1.2 Report the percent of genes in each gene set that do not have information in the data matrix
  check_genes_missing_noDir(Gene_sets, Data)



  #2. Run GSVA for each gene set

  # Add a message to state the running of gsva
  writeLines(paste0("Note: Running GSVA for ", length(names(Gene_sets)), " gene sets using '", GSA_method, "' method with '", GSA_weight, "' weights.", sep = ""))
  if(GSA_method %in% c("gsva","ssgsea","zscore")){

    # "gsva", "ssgsea", "zscore"
    suppressMessages(GS_final <- gsva(Data,
                                      Gene_sets,
                                      method = GSA_method, # alternatives are "ssgsea", "zscore"
                                      min.sz = min.sz,
                                      max.sz = max.sz,
                                      mx.diff = mx.diff))

    # "avg.exprs"
  }else if(GSA_method == "avg.exprs"){
    GS_final <- avg_expression(Data, Gene_sets)

    # "median.exprs"
  }else if(GSA_method == "median.exprs"){
    GS_final <- median_expression(Data, Gene_sets)
  }
  return(GS_final)
}