# cmahalanobis.R utils::globalVariables(c("Species", "Distance", "Comparison")) utils::globalVariables(c("Var1", "Var2", "value", "element_text")) utils::globalVariables(c("p_values", "distances")) #' @importFrom stats mahalanobis #' @importFrom mice mice #' @importFrom mice complete #' @importFrom stats cov #' @importFrom stats mahalanobis #' @importFrom stats cov #' @importFrom ggplot2 geom_bar #' @importFrom ggplot2 labs #' @importFrom ggplot2 theme_minimal #' @importFrom reshape2 melt #' @importFrom reshape2 melt #' #' @name cmahalanobis #' @title Calculate the Mahalanobis distance for each species. #' #' @description #' This function takes a dataframe and a factor in input, and returns a matrix with the Mahalanobis distances about it. #' #' #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Mahalanobis distances matrix. #' @param plot Logical, if TRUE, a plot of Mahalanobis distances matrix is displayed. #' @param plot_title The title to be used for the plot if plot is TRUE. #' @return A matrix containing Mahalanobis distances between each pair of groups and the plot. #' #' #' @examples #' # Example with the iris dataset #' #' data(iris) #' #' # Calculate the Mahalanobis distance with the cmahalanobis function #' cmahalanobis(iris, ~Species, plot = TRUE, plot_title = "Mahalanobis Distance Between Groups") #' #' # Example with the mtcars dataset #' data(mtcars) #' #' # Calculate the Mahalanobis distance with the cmahalanobis function #' cmahalanobis(mtcars, ~am, plot = TRUE, plot_title = "Mahalanobis Distance Between Groups") #' #' @export cmahalanobis <- function(dataset, formula, plot = TRUE, plot_title = "Mahalanobis Distance Between Groups") { # Verify that the input is a data frame if (!is.data.frame(dataset)) { stop("The input must be a dataframe") } # Extract the response and predictor variables from the formula response <- all.vars(formula)[1] predictors <- all.vars(formula)[-1] # Split the data into groups based on the response variable groups <- split(dataset, dataset[[response]]) # Select only numeric variables in each group groups <- lapply(groups, function(df) { df <- df[, sapply(df, is.numeric)] # Select only numeric columns return(df) }) # Replace missing values with the multiple imputation in each dataframe into the list groups <- lapply(groups, impute_with_mean) # Obtain the number of groups n <- length(groups) # Precalculate means and covariances means <- lapply(groups, colMeans) covariances <- lapply(groups, function(df) { cov_matrix <- cov(df) n <- nrow(cov_matrix) reg <- 0.01 # Regularization value cov_matrix <- cov_matrix + diag(reg, nrow = n) return(cov_matrix) }) # Create an empty matrix to store distances distances <- matrix(0, nrow = n, ncol = n) # Calculate Mahalanobis distance between each couple of groups for (i in 1:n) { mean_i <- means[[i]] # Precalculated mean of the group i cov_i <- covariances[[i]] for (j in 1:n) { if (i != j) { distances[i, j] <- mean(mahalanobis(x = groups[[j]], center = mean_i, cov = cov_i)) } } } # If plot is TRUE, call the "plot_mahalanobis_distances" function and print the plot if (plot) { print(plot_mahalanobis_distances(distances, plot_title)) } # Return a list containing distances return(list(distances = distances)) } impute_with_mean <- function(df) { # Calculate mean for each columns, ignoring missing values means <- colMeans(df, na.rm = TRUE) # Replace missing values with the corresponding mean for (i in 1:ncol(df)) { df[is.na(df[, i]), i] <- means[i] } return(df) } # Auxiliary function to print the Mahalanobis distances plot plot_mahalanobis_distances <- function(distances, plot_title) { requireNamespace("ggplot2") requireNamespace("reshape2") output_df <- as.data.frame(distances) output_df$Species <- rownames(output_df) output_df_long <- reshape2::melt(output_df, id.vars = "Species") colnames(output_df_long) <- c("Species", "Comparison", "Distance") ggplot2::ggplot(output_df_long, ggplot2::aes(x = Species, y = Distance, fill = Comparison)) + ggplot2::geom_bar(stat = "identity", position = "dodge") + ggplot2::labs(title = plot_title, x = "Species", y = "Distance", fill = "Comparison") + ggplot2::theme_minimal() } #' @name generate_report_cmahalanobis #' @title Generate a Microsoft Word document about Mahalanobis distance matrix and p-values matrix with corresponding plots. #' #' @description #' This function takes a dataframe, a factor and returns a Microsoft Word document about Mahalanobis distance matrix and p-values matrix with corresponding plots. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate Mahalanobis distances matrix and p_values matrix. #' @param pvalue.method A method with which you want to calculate pvalue matrix.The default method is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations A number of permutations to define if you choose "permutation". #' @param num.bootstraps A number of bootstrap to define if you choose "bootstrap". #' @return A Microsoft Word document about Mahalanobis distances matrix and p_values matrix. #' @examples #' # Generate a report about "Species" factor in iris dataset #' generate_report_cmahalanobis(iris, ~Species) #' #' # Generate a report about "am" factor in mtcars dataset #' generate_report_cmahalanobis(mtcars, ~am) #' #' @export generate_report_cmahalanobis <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 10, num.bootstraps = 10) { requireNamespace("rmarkdown") cmahalanobis_results <- cmahalanobis(dataset, formula) distances <- cmahalanobis_results if (pvalue.method == "chisq") { p_values <- pvaluescmaha(dataset, formula, pvalue.method = "chisq") } else if (pvalue.method == "permutation") { p_values <- pvaluescmaha(dataset, formula, pvalue.method = "permutation") } else if (pvalue.method == "bootstrap") { p_values <- pvaluescmaha(dataset, formula, pvalue.method = "bootstrap") } dir_path <- file.path(getwd(), "reportcmahalanobis_files/figure-docx") if (!dir.exists(dir_path)) { dir.create(dir_path, recursive = TRUE) } template_path <- system.file("rmarkdown", "template_report_cmahalanobis.Rmd", package = "cmahalanobis") if (file.exists(template_path)) { message("Template found at: ", template_path) } else { stop("Template not found!") } output_file <- "reportcmahalanobis.docx" tryCatch({ rmarkdown::render(template_path, params = list(distances = distances, p_values = p_values), output_file = output_file) }, error = function(e) { message("Error during rendering: ", e$message) }) get_desktop_path <- function() { home <- Sys.getenv("HOME") sysname <- Sys.info()["sysname"] if (sysname == "Windows") { return(file.path(Sys.getenv("USERPROFILE"), "Desktop")) } else if (sysname == "Darwin") { return(file.path(home, "Desktop")) } else if (sysname == "Linux") { return(file.path(home, "Desktop")) } else { stop("Unsupported OS") } } desktop_path <- get_desktop_path() file.rename(output_file, file.path(desktop_path, output_file)) unlink("reportcmahalanobis_files", recursive = TRUE) } #' @name pvaluescmaha #' @title Calculate p_values matrix for each species, using Mahalanobis distance as a base. #' #' @description #' This function takes a dataset, a factor, a p_value method, number of bootstraps and permutation when necessary, and returns a p_values matrix between each pair of the species and a plot if the user select TRUE using Mahalanobis distance for distances calculation. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Mahalanobis distances matrix. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq".Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @param plot if TRUE, plot a p_values heatmap. The default value is TRUE. #' @return A list containing the p-values matrix and, optionally, the plot. #' @examples #' # Calculate p_values of "Species" variable in iris dataset #' pvaluescmaha(iris,~Species, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' # Calculate p_values of "am" variable in mtcars dataset #' pvaluescmaha(mtcars,~am, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' @export pvaluescmaha <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10, plot = TRUE) { # Verify that input is a dataframe if (!is.data.frame(dataset)) { stop("The input must be a dataframe") } # Extract the response variable name by the formula response <- all.vars(formula)[1] # Verify the presence of the response variable if (!response %in% names(dataset)) { stop(paste("Response variable", response, "is absent in the dataset")) } # Calculate Mahalanobis distances mahalanobis_results <- cmahalanobis(dataset, formula, plot = FALSE) distances <- mahalanobis_results$distances # Obtain the number of groups n <- nrow(distances) # Initialize p_value matrix p_values <- matrix(NA, nrow = n, ncol = n) # Calculate p_values for (i in 1:n) { df <- ncol(dataset) - 1 # Degrees of freedom (adjusted for covariance matrix estimate) for (j in 1:n) { if (i != j) { if (pvalue.method == "chisq") { p_values[i, j] <- pchisq(distances[i, j], df, lower.tail = FALSE, log.p = TRUE) } else if (pvalue.method == "permutation") { observed_distance <- distances[i, j] permutation_distances <- replicate(num.permutations, { permuted_data <- dataset permuted_data[[response]] <- sample(dataset[[response]]) permuted_results <- cmahalanobis(permuted_data, formula, plot = FALSE) permuted_distances <- permuted_results$distances return(permuted_distances[i, j]) }) p_values[i, j] <- mean(permutation_distances >= observed_distance) } else if (pvalue.method == "bootstrap") { observed_distance <- distances[i, j] bootstrap_distances <- replicate(num.bootstraps, { bootstrap_sample <- sample(nrow(dataset), replace = TRUE) bootstrap_data <- dataset[bootstrap_sample, ] bootstrap_results <- cmahalanobis(bootstrap_data, formula, plot = FALSE) bootstrap_distance <- bootstrap_results$distances[i, j] return(bootstrap_distance) }) p_values[i, j] <- mean(abs(bootstrap_distances) >= abs(observed_distance)) } else { stop("p_values method calculation not supported. Use 'chisq', 'permutation' or 'bootstrap'.") } } } } # Create heatmap if plot is TRUE if (plot) { requireNamespace("reshape2") requireNamespace("ggplot2") p_values_melt <- reshape2::melt(p_values, na.rm = TRUE) colnames(p_values_melt) <- c("Var1", "Var2", "value") print(ggplot2::ggplot(data = p_values_melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) + ggplot2::geom_tile() + ggplot2::scale_fill_gradient(low = "white", high = "red") + ggplot2::labs(title = "p_values heatmap", x = "", y = "", fill = "p_value") + ggplot2::theme_minimal() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))) } return(p_values) } #' Calculate Euclidean distance #' #' @name ceuclide #' @title Calculate the Euclidean distance of a factor in a dataframe. #' @description #' This function takes a dataframe and a factor in input, and returns a matrix with the Euclidean distances about it. #' @param dataset A dataframe. #' @param formula The factor which you want to calculate the Euclidean distances matrix. #' @param plot If TRUE, shows a plot of the Euclidean distances matrix. #' @param plot_title The title of the plot. #' @return The matrix containing distances. #' @examples #' #' # Example with iris dataset #' #' ceuclide(iris, ~Species, plot = TRUE, plot_title = "Euclidean Distance Between Groups") #' #' # Example with mtcars dataset #' #' ceuclide(mtcars, ~am, plot = TRUE, plot_title = "Euclidean Distance Between Groups") #' #' @export ceuclide <- function(dataset, formula, plot = TRUE, plot_title = "Euclidean Distance Between Groups") { # Verify that the input is a data frame if (!is.data.frame(dataset)) { stop("The input must be a dataframe") } # Extract the response and predictor variables from the formula response <- all.vars(formula)[1] predictors <- all.vars(formula)[-1] # Split the data into groups based on the response variable groups <- split(dataset, dataset[[response]]) # Select only numeric variables in each group groups <- lapply(groups, function(df) { df <- df[, sapply(df, is.numeric)] # Select only numeric columns return(df) }) # Replace missing values with the multiple imputation in each dataframe into the list groups <- lapply(groups, impute_with_multiple_imputation) # Obtain the number of groups n <- length(groups) # Precalculate means means <- lapply(groups, colMeans) # Create an empty matrix to store distances distances <- matrix(0, nrow = n, ncol = n) # Calculate Euclidean distance between each pair of groups for (i in 1:n) { mean_i <- means[[i]] # Precalculated mean of group i for (j in 1:n) { if (i != j) { distances[i, j] <- sqrt(sum((mean_i - means[[j]])^2)) } } } # If plot is TRUE, call the "plot_euclidean_distances" function and print the plot if (plot) { print(plot_euclidean_distances(distances, plot_title)) } # Return a matrix containing distances return(list(distances = distances)) } # Auxiliary function to impute missing values with the multiple imputation impute_with_multiple_imputation <- function(df, m = 5, method = "cart") { # Multiple imputation using mice imp <- mice(df, m = m, method = method) imputedData <- complete(imp, action = 1) return(imputedData) } # Auxiliary function to print the Euclide distances plot plot_euclidean_distances <- function(distances, plot_title) { requireNamespace("ggplot2") requireNamespace("reshape2") output_df <- as.data.frame(distances) output_df$Species <- rownames(output_df) output_df_long <- reshape2::melt(output_df, id.vars = "Species") colnames(output_df_long) <- c("Species", "Comparison", "Distance") ggplot2::ggplot(output_df_long, ggplot2::aes(x = Species, y = Distance, fill = Comparison)) + ggplot2::geom_bar(stat = "identity", position = "dodge") + ggplot2::labs(title = plot_title, x = "Species", y = "Distance", fill = "Comparison") + ggplot2::theme_minimal() } #' @name generate_report_ceuclide #' @title Generate a Microsoft Word document about the Euclidean distance matrix and the p-values matrix with relative plots. #' #' @description #' This function takes a dataframe, a factor and returns a Microsoft Word document about the Euclidean distance matrix and the p-values matrix with relative plots. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Euclidean distance matrix and the p_values matrix. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq".Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @return A Microsoft Word document about the Euclidean distance matrix and the p_values matrix. #' @examples #' # Generate a report about "Species" factor in iris dataset #' generate_report_ceuclide(iris, ~Species) #' #' # Generate a report about "am" factor in mtcars dataset #' generate_report_ceuclide(mtcars, ~am) #' #' @export generate_report_ceuclide <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 10, num.bootstraps = 10) { requireNamespace("rmarkdown") ceuclide_results <- ceuclide(dataset, formula) distances <- ceuclide_results if (pvalue.method == "chisq") { p_values <- pvaluesceucl(dataset, formula, pvalue.method = "chisq") } else if (pvalue.method == "permutation") { p_values <- pvaluesceucl(dataset, formula, pvalue.method = "permutation") } else if (pvalue.method == "bootstrap") { p_values <- pvaluesceucl(dataset, formula, pvalue.method = "bootstrap") } dir_path <- file.path(getwd(), "reportceuclide_files/figure-docx") if (!dir.exists(dir_path)) { dir.create(dir_path, recursive = TRUE) } template_path <- system.file("rmarkdown", "template_report_ceuclide.Rmd", package = "cmahalanobis") if (file.exists(template_path)) { message("Template found at: ", template_path) } else { stop("Template not found!") } output_file <- "reportceuclide.docx" tryCatch({ rmarkdown::render(template_path, params = list(distances = distances, p_values = p_values), output_file = output_file) }, error = function(e) { message("Error during rendering: ", e$message) }) get_desktop_path <- function() { home <- Sys.getenv("HOME") sysname <- Sys.info()["sysname"] if (sysname == "Windows") { return(file.path(Sys.getenv("USERPROFILE"), "Desktop")) } else if (sysname == "Darwin") { return(file.path(home, "Desktop")) } else if (sysname == "Linux") { return(file.path(home, "Desktop")) } else { stop("Unsupported OS") } } desktop_path <- get_desktop_path() file.rename(output_file, file.path(desktop_path, output_file)) unlink("reportceuclide_files", recursive = TRUE) } #' @name pvaluesceucl #' @title Calculate the p_values matrix for each species, using the Euclidean distance as a base. #' @description #' This function takes a dataset, a factor, a p_value method, number of bootstraps and permutation when necessary, and returns a p_values matrix between each pair of species and a plot if the user select TRUE using Euclidean distance for the distances calculation. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Euclidean distances. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @param plot if TRUE, plot the p_values heatmap. The default value is TRUE. #' @return A list containing the p_values matrix and, optionally, the plot. #' #' @examples #' # Calculate p_values of "Species" variable in iris dataset #' pvaluesceucl(iris,~Species, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' # Calculate p_values of "am" variable in mtcars dataset #' pvaluesceucl(mtcars,~am, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' #' @export pvaluesceucl <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10, plot = TRUE) { # Verify that input is a dataframe if (!is.data.frame(dataset)) { stop("dataset must be a dataframe") } # Extract the response variable name by the formula response <- all.vars(formula)[1] # Verify the presence of the response variable if (!response %in% names(dataset)) { stop(paste("The response variable", response, "isn't present")) } # Calculate Euclidean distances using "ceuclide" function ceuclide_results <- ceuclide(dataset, formula, plot = FALSE) distances <- ceuclide_results$distances # Obtain the groups number n <- nrow(distances) # Initialize p_value matrix p_values <- matrix(NA, nrow = n, ncol = n) # Calculate p_values for (i in 1:n) { df <- ncol(dataset) - 1 # Degrees of freedom (adjusted for matrix covariance estimate) for (j in 1:n) { if (i != j) { # Choose p_values method calculation based user inputs if (pvalue.method == "chisq") { p_values[i, j] <- pchisq(distances[i, j], df, lower.tail = FALSE, log.p = TRUE) } else if (pvalue.method == "permutation") { # Permutation test observed_distance <- distances[i, j] permutation_distances <- replicate(num.permutations, { # Permute labels group and recalculate the distance permuted_data <- dataset permuted_data[[response]] <- sample(dataset[[response]]) permuted_results <- ceuclide(permuted_data, formula, plot = FALSE) permuted_distances <- permuted_results$distances return(permuted_distances[i, j]) }) p_values[i, j] <- mean(permutation_distances >= observed_distance) } else if (pvalue.method == "bootstrap") { # Bootstrap observed_distance <- distances[i, j] bootstrap_distances <- replicate(num.bootstraps, { # Extract a sample with repetition bootstrap_sample <- sample(nrow(dataset), replace = TRUE) bootstrap_data <- dataset[bootstrap_sample, ] bootstrap_results <- ceuclide(bootstrap_data, formula, plot = FALSE) bootstrap_distance <- bootstrap_results$distances[i, j] return(bootstrap_distance) }) p_values[i, j] <- mean(abs(bootstrap_distances) >= abs(observed_distance)) } else { stop("p_values calculation method not supported. Use 'chisq', 'permutation' or 'bootstrap'.") } } } } # Create heatmap if plot is TRUE if (plot) { requireNamespace("reshape2") requireNamespace("ggplot2") p_values_melt <- reshape2::melt(p_values, na.rm = TRUE) colnames(p_values_melt) <- c("Var1", "Var2", "value") print(ggplot2::ggplot(data = p_values_melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) + ggplot2::geom_tile() + ggplot2::scale_fill_gradient(low = "white", high = "red") + ggplot2::labs(title = "p_values heatmap", x = "", y = "", fill = "p_value") + ggplot2::theme_minimal() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))) } return(p_values) } #' Calculate Manhattan distance #' @name cmanhattan #' @title Calculate a Manhattan distance of a factor in a dataframe. #' @description This function takes a dataframe and a factor in input, and returns a matrix with the Manhattan distances about it. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate Manhattan distance. #' @param plot If TRUE, show a plot of distances. #' @param plot_title The title of plot. #' @return A matrix containing distances. #' @examples #' # Example with iris dataset #' #' cmanhattan(iris, ~Species, plot = TRUE, plot_title = "Manhattan Distance Between Groups") #' #' # Example with mtcars dataset #' #' cmanhattan(mtcars, ~am, plot = TRUE, plot_title = "Manhattan Distance Between Groups") #' #' @export cmanhattan <- function(dataset, formula, plot = TRUE, plot_title = "Manhattan Distance Between Groups") { # Verify that the input is a data frame if (!is.data.frame(dataset)) { stop("The input must be a dataframe") } # Extract the response and predictor variables from the formula response <- all.vars(formula)[1] predictors <- all.vars(formula)[-1] # Split the data into groups based on the response variable groups <- split(dataset, dataset[[response]]) # Select only numeric variables in each group groups <- lapply(groups, function(df) { df <- df[, sapply(df, is.numeric)] # Select only numeric columns return(df) }) # Replace missing values with the multiple imputation in each dataframe into the list groups <- lapply(groups, impute_with_multiple_imputation) # Obtain the number of groups n <- length(groups) # Precalculate means means <- lapply(groups, colMeans) # Create an empty matrix to store distances distances <- matrix(0, nrow = n, ncol = n) # Calculate Manhattan distance between each pair of groups for (i in 1:n) { mean_i <- means[[i]] # Precalculated mean of group i for (j in 1:n) { if (i != j) { distances[i, j] <- sum(abs(mean_i - means[[j]])) } } } # If plot is TRUE, call the "plot_manhattan_distances" function and print the plot if (plot) { print(plot_manhattan_distances(distances, plot_title)) } # Return a matrix containing distances return(list(distances = distances)) } # Auxiliary function to impute missing values with the multiple imputation impute_with_multiple_imputation <- function(df, m = 5, method = "cart") { # Multiple imputation using mice imp <- mice(df, m = m, method = method) imputedData <- complete(imp, action = 1) return(imputedData) } # Auxiliary function to print the Manhattan distances plot plot_manhattan_distances <- function(distances, plot_title) { requireNamespace("ggplot2") requireNamespace("reshape2") output_df <- as.data.frame(distances) output_df$Species <- rownames(output_df) output_df_long <- reshape2::melt(output_df, id.vars = "Species") colnames(output_df_long) <- c("Species", "Comparison", "Distance") ggplot2::ggplot(output_df_long, ggplot2::aes(x = Species, y = Distance, fill = Comparison)) + ggplot2::geom_bar(stat = "identity", position = "dodge") + ggplot2::labs(title = plot_title, x = "Species", y = "Distance", fill = "Comparison") + ggplot2::theme_minimal() } #' @name generate_report_cmanhattan #' @title Generate a Microsoft Word document about the Manhattan distance and the p-values matrices with corresponding plots. #' #' @description #' This function takes a dataframe, a factor and returns a Microsoft Word document about the Manhattan distance matrix and the p-values matrix with corresponding plots. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Manhattan distance matrix and the p_values matrix. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq".Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @return A Microsoft Word document about the Manhattan distance matrix and the p_values matrix. #' @examples #' # Generate a report about "Species" factor in iris dataset #' generate_report_cmanhattan(iris, ~Species) #' #' # Generate a report about "am" factor in mtcars dataset #' generate_report_cmanhattan(mtcars, ~am) #' #' @export generate_report_cmanhattan <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 10, num.bootstraps = 10) { requireNamespace("rmarkdown") cmanhattan_results <- cmanhattan(dataset, formula) distances <- cmanhattan_results if (pvalue.method == "chisq") { p_values <- pvaluescmanh(dataset, formula, pvalue.method = "chisq") } else if (pvalue.method == "permutation") { p_values <- pvaluescmanh(dataset, formula, pvalue.method = "permutation") } else if (pvalue.method == "bootstrap") { p_values <- pvaluescmanh(dataset, formula, pvalue.method = "bootstrap") } dir_path <- file.path(getwd(), "reportcmanhattan_files/figure-docx") if (!dir.exists(dir_path)) { dir.create(dir_path, recursive = TRUE) } template_path <- system.file("rmarkdown", "template_report_cmanhattan.Rmd", package = "cmahalanobis") if (file.exists(template_path)) { message("Template found at: ", template_path) } else { stop("Template not found!") } output_file <- "reportcmanhattan.docx" tryCatch({ rmarkdown::render(template_path, params = list(distances = distances, p_values = p_values), output_file = output_file) }, error = function(e) { message("Error during rendering: ", e$message) }) get_desktop_path <- function() { home <- Sys.getenv("HOME") sysname <- Sys.info()["sysname"] if (sysname == "Windows") { return(file.path(Sys.getenv("USERPROFILE"), "Desktop")) } else if (sysname == "Darwin") { return(file.path(home, "Desktop")) } else if (sysname == "Linux") { return(file.path(home, "Desktop")) } else { stop("Unsupported OS") } } desktop_path <- get_desktop_path() file.rename(output_file, file.path(desktop_path, output_file)) unlink("reportcmanhattan_files", recursive = TRUE) } #' @name pvaluescmanh #' @title Calculate the p_values matrix for each species, using Manhattan distance as a base. #' @description #' This function takes a dataset, a factor, a p_value method, number of bootstraps and permutation when necessary, and returns a p_values matrix between each pair of species and a plot if the user select TRUE using Manhattan distance for the distances calculation. #' @param dataset A dataframe #' @param formula A factor which you want to calculate Manhattan distances. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @param plot if TRUE, plot the p_values heatmap. The default value is TRUE. #' @return A matrix containing a matrix of p_values and, optionally, the plot. #' @examples #' # Calculate p_values of "Species" variable in iris dataset #' pvaluescmanh(iris,~Species, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' # Calculate p_values of "am" variable in mtcars dataset #' pvaluescmanh(mtcars,~am, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' #' @export pvaluescmanh <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10, plot = TRUE) { # Verify that input is a dataframe if (!is.data.frame(dataset)) { stop("dataset must be a dataframe") } # Extract the response variable name by the formula response <- all.vars(formula)[1] # Verify the presence of the response variable if (!response %in% names(dataset)) { stop(paste("The response variable", response, "isn't present")) } # Calculate Manhattan distances using "cmanhattan" function cmanhattan_results <- cmanhattan(dataset, formula, plot = FALSE) distances <- cmanhattan_results$distances # Obtain the groups number n <- nrow(distances) # Initialize p_value matrix p_values <- matrix(NA, nrow = n, ncol = n) # Calculate p_values for (i in 1:n) { df <- ncol(dataset) - 1 # Degrees of freedom (adjusted for matrix covariance estimate) for (j in 1:n) { if (i != j) { # Choose p_values method calculation based user inputs if (pvalue.method == "chisq") { p_values[i, j] <- pchisq(distances[i, j], df, lower.tail = FALSE, log.p = TRUE) } else if (pvalue.method == "permutation") { # Permutation test observed_distance <- distances[i, j] permutation_distances <- replicate(num.permutations, { # Permute labels group and recalculate the distance permuted_data <- dataset permuted_data[[response]] <- sample(dataset[[response]]) permuted_results <- cmanhattan(permuted_data, formula, plot = FALSE) permuted_distances <- permuted_results$distances return(permuted_distances[i, j]) }) p_values[i, j] <- mean(permutation_distances >= observed_distance) } else if (pvalue.method == "bootstrap") { # Bootstrap observed_distance <- distances[i, j] bootstrap_distances <- replicate(num.bootstraps, { # Extract a sample with repetition bootstrap_sample <- sample(nrow(dataset), replace = TRUE) bootstrap_data <- dataset[bootstrap_sample, ] bootstrap_results <- cmanhattan(bootstrap_data, formula, plot = FALSE) bootstrap_distance <- bootstrap_results$distances[i, j] return(bootstrap_distance) }) p_values[i, j] <- mean(abs(bootstrap_distances) >= abs(observed_distance)) } else { stop("p_values calculation method not supported. Use 'chisq', 'permutation' or 'bootstrap'.") } } } } # Create heatmap if plot is TRUE if (plot) { requireNamespace("reshape2") requireNamespace("ggplot2") p_values_melt <- reshape2::melt(p_values, na.rm = TRUE) colnames(p_values_melt) <- c("Var1", "Var2", "value") print(ggplot2::ggplot(data = p_values_melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) + ggplot2::geom_tile() + ggplot2::scale_fill_gradient(low = "white", high = "red") + ggplot2::labs(title = "p_values heatmap", x = "", y = "", fill = "p_value") + ggplot2::theme_minimal() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))) } return(p_values) } #' @name cchebyshev #' @title Calculate the p_values matrix for each species, using Chebyshev distance as a base. #' @description #' This function takes a dataset, a factor, a p_value method, number of bootstraps and permutation when necessary, and returns a p_values matrix between each pair of species and a plot if the user select TRUE using the Chebyshev distance for the distances calculation. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate Chebyshev distance. #' @param plot If TRUE, displays a plot of distances. #' @param plot_title The title of plot. #' @return A matrix containing distances and, optionally, the plot. #' @examples #' # Example with iris dataset #' #' cchebyshev(iris, ~Species, plot = TRUE, plot_title = "Chebyshev Distance Between Groups") #' #' # Example with mtcars dataset #' #' cchebyshev(mtcars, ~am, plot = TRUE, plot_title = "Chebyshev Distance Between Groups") #' #' #' @export cchebyshev <- function(dataset, formula, plot = TRUE, plot_title = "Chebyshev Distance Between Groups") { # Verify that the input is a data frame if (!is.data.frame(dataset)) { stop("The input must be a dataframe") } # Extract the response and predictor variables from the formula response <- all.vars(formula)[1] predictors <- all.vars(formula)[-1] # Split the data into groups based on the response variable groups <- split(dataset, dataset[[response]]) # Select only numeric variables in each group groups <- lapply(groups, function(df) { df <- df[, sapply(df, is.numeric)] # Select only numeric columns return(df) }) # Replace missing values with the multiple imputation in each dataframe into the list groups <- lapply(groups, impute_with_multiple_imputation) # Obtain the number of groups n <- length(groups) # Precalculate means means <- lapply(groups, colMeans) # Create an empty matrix to store distances distances <- matrix(0, nrow = n, ncol = n) # Calculate Chebyshev distance between each pair of groups for (i in 1:n) { mean_i <- means[[i]] # Precalculated mean of group i for (j in 1:n) { if (i != j) { distances[i, j] <- max(abs(mean_i - means[[j]])) } } } # If plot is TRUE, call the "plot_chebyshev_distances" function and print the plot if (plot) { print(plot_chebyshev_distances(distances, plot_title)) } # Return a list containing distances return(list(distances = distances)) } # Auxiliary function to impute missing values with the multiple imputation impute_with_multiple_imputation <- function(df, m = 5, method = "cart") { # Multiple imputation using mice imp <- mice(df, m = m, method = method) imputedData <- complete(imp, action = 1) return(imputedData) } # Auxiliary function to print the Mahalanobis distances plot plot_chebyshev_distances <- function(distances, plot_title) { requireNamespace("ggplot2") requireNamespace("reshape2") output_df <- as.data.frame(distances) output_df$Species <- rownames(output_df) output_df_long <- reshape2::melt(output_df, id.vars = "Species") colnames(output_df_long) <- c("Species", "Comparison", "Distance") ggplot2::ggplot(output_df_long, ggplot2::aes(x = Species, y = Distance, fill = Comparison)) + ggplot2::geom_bar(stat = "identity", position = "dodge") + ggplot2::labs(title = plot_title, x = "Species", y = "Distance", fill = "Comparison") + ggplot2::theme_minimal() } #' @name generate_report_cchebyshev #' @title Generate a Microsoft Word document about the Chebyshev distance matrix and the p-values matrix with corresponding plots. #' #' @description #' This function takes a dataframe, a factor and returns a Microsoft Word document about the Chebyshev distance matrix and the p-values matrix with corresponding plots. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Chebyshev distance matrix and the p_values matrix. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @return A Microsoft Word document about the Chebyshev distance matrix and the p_values matrix. #' @examples #' # Generate a report about "Species" factor in iris dataset #' generate_report_cchebyshev(iris, ~Species) #' #' # Generate a report about "am" factor in mtcars dataset #' generate_report_cchebyshev(mtcars, ~am) #' #' @export generate_report_cchebyshev <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 10, num.bootstraps = 10) { requireNamespace("rmarkdown") cchebyshev_results <- cchebyshev(dataset, formula) distances <- cchebyshev_results if (pvalue.method == "chisq") { p_values <- pvaluesccheb(dataset, formula, pvalue.method = "chisq") } else if (pvalue.method == "permutation") { p_values <- pvaluesccheb(dataset, formula, pvalue.method = "permutation") } else if (pvalue.method == "bootstrap") { p_values <- pvaluesccheb(dataset, formula, pvalue.method = "bootstrap") } dir_path <- file.path(getwd(), "reportcchebyshev_files/figure-docx") if (!dir.exists(dir_path)) { dir.create(dir_path, recursive = TRUE) } template_path <- system.file("rmarkdown", "template_report_cchebyshev.Rmd", package = "cmahalanobis") if (file.exists(template_path)) { message("Template found at: ", template_path) } else { stop("Template not found!") } output_file <- "reportcchebyshev.docx" tryCatch({ rmarkdown::render(template_path, params = list(distances = distances, p_values = p_values), output_file = output_file) }, error = function(e) { message("Error during rendering: ", e$message) }) get_desktop_path <- function() { home <- Sys.getenv("HOME") sysname <- Sys.info()["sysname"] if (sysname == "Windows") { return(file.path(Sys.getenv("USERPROFILE"), "Desktop")) } else if (sysname == "Darwin") { return(file.path(home, "Desktop")) } else if (sysname == "Linux") { return(file.path(home, "Desktop")) } else { stop("Unsupported OS") } } desktop_path <- get_desktop_path() file.rename(output_file, file.path(desktop_path, output_file)) unlink("reportcchebyshev_files", recursive = TRUE) } #' @name pvaluesccheb #' @title Calculate the p_values matrix for each species, using Chebyshev distance as a base. #' @description #' This function takes a dataset, a factor, a p_value method, number of bootstraps and permutation when necessary, and returns a p_values matrix between each pair of species and a plot if the user select TRUE using Chebyshev distance for the distances calculation. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate Chebyshev distance. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @param plot if TRUE, plot the p_values heatmap. The default value is TRUE. #' @return A list containing a matrix of p_values and, optionally, the plot. #' @examples #' # Calculate p_values of "Species" variable in iris dataset #' pvaluesccheb(iris,~Species, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' # Calculate p_values of "am" variable in mtcars dataset #' pvaluesccheb(mtcars,~am, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' @export pvaluesccheb <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10, plot = TRUE) { # Verify that input is a dataframe if (!is.data.frame(dataset)) { stop("dataset must be a dataframe") } # Extract the response variable name by the formula response <- all.vars(formula)[1] # Verify the presence of the response variable if (!response %in% names(dataset)) { stop(paste("The response variable", response, "isn't present")) } # Calculate Chebyshev distances using "cchebyshev" function cchebyshev_results <- cchebyshev(dataset, formula, plot = FALSE) distances <- cchebyshev_results$distances # Obtain the groups number n <- nrow(distances) # Initialize p_value matrix p_values <- matrix(NA, nrow = n, ncol = n) # Calculate p_values for (i in 1:n) { df <- ncol(dataset) - 1 # Degrees of freedom (adjusted for matrix covariance estimate) for (j in 1:n) { if (i != j) { # Choose p_values method calculation based user inputs if (pvalue.method == "chisq") { p_values[i, j] <- pchisq(distances[i, j], df, lower.tail = FALSE, log.p = TRUE) } else if (pvalue.method == "permutation") { # Permutation test observed_distance <- distances[i, j] permutation_distances <- replicate(num.permutations, { # Permute labels group and recalculate the distance permuted_data <- dataset permuted_data[[response]] <- sample(dataset[[response]]) permuted_results <- cchebyshev(permuted_data, formula, plot = FALSE) permuted_distances <- permuted_results$distances return(permuted_distances[i, j]) }) p_values[i, j] <- mean(permutation_distances >= observed_distance) } else if (pvalue.method == "bootstrap") { # Bootstrap observed_distance <- distances[i, j] bootstrap_distances <- replicate(num.bootstraps, { # Extract a sample with repetition bootstrap_sample <- sample(nrow(dataset), replace = TRUE) bootstrap_data <- dataset[bootstrap_sample, ] bootstrap_results <- cchebyshev(bootstrap_data, formula, plot = FALSE) bootstrap_distance <- bootstrap_results$distances[i, j] return(bootstrap_distance) }) p_values[i, j] <- mean(abs(bootstrap_distances) >= abs(observed_distance)) } else { stop("p_values calculation method not supported. Use 'chisq', 'permutation' or 'bootstrap'.") } } } } # Create heatmap if plot is TRUE if (plot) { requireNamespace("reshape2") requireNamespace("ggplot2") p_values_melt <- reshape2::melt(p_values, na.rm = TRUE) colnames(p_values_melt) <- c("Var1", "Var2", "value") print(ggplot2::ggplot(data = p_values_melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) + ggplot2::geom_tile() + ggplot2::scale_fill_gradient(low = "white", high = "red") + ggplot2::labs(title = "p_values heatmap", x = "", y = "", fill = "p_value") + ggplot2::theme_minimal() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))) } return(p_values) } #' Calculate Hamming distance #' #' @name chamming #' @title Calculate the Hamming distance of a factor in a dataframe. #' @description #' This function takes a dataframe and a factor in input, and returns a matrix with the Hamming distances about it. #' @param dataset A dataframe. #' @param formula The factor which you want to calculate the Hamming distances matrix. #' @param plot If TRUE, shows a plot of the Hamming distances matrix. #' @param plot_title The title of the plot. #' @return The matrix containing distances. #' @examples #' #' # Example with iris dataset #' #' chamming(iris, ~Species, plot = TRUE, plot_title = "Hamming Distance Between Groups") #' #' # Example with mtcars dataset #' #' chamming(mtcars, ~am, plot = TRUE, plot_title = "Hamming Distance Between Groups") #' #' @export chamming <- function(dataset, formula, plot = TRUE, plot_title = "Hamming Distance Between Groups") { # Verify that the input is a data frame if (!is.data.frame(dataset)) { stop("The input must be a dataframe") } # Extract the response and predictor variables from the formula response <- all.vars(formula)[1] predictors <- all.vars(formula)[-1] # Split the data into groups based on the response variable groups <- split(dataset, dataset[[response]]) # Select only numeric variable in each group groups <- lapply(groups, function(df) { df <- df[, sapply(df, is.numeric)] # Select only numeric columns return(df) }) # Replace missing values with the multiple imputation in each dataframe into the list groups <- lapply(groups, impute_with_multiple_imputation) # Obtain the number of groups n <- length(groups) # Create an empty matrix to store distances distances <- matrix(0, nrow = n, ncol = n) # Calculate Hamming distance between each couple of groups for (i in 1:n) { for (j in 1:n) { if (i != j) { distances[i, j] <- mean(apply(groups[[i]], 1, function(row_i) { apply(groups[[j]], 1, function(row_j) { sum(row_i != row_j) }) })) } } } # If plot is TRUE, call the "plot_hamming_distances" function and print the plot if (plot) { print(plot_hamming_distances(distances, plot_title)) } # Return a list containing distances return(list(distances = distances)) } # Auxiliary function to impute missing values with the multiple imputation impute_with_multiple_imputation <- function(df, m = 5, method = "cart") { # Multiple imputation using mice imp <- mice(df, m = m, method = method) imputedData <- complete(imp, action = 1) return(imputedData) } # Auxiliary function to print the Hamming distances plot plot_hamming_distances <- function(distances, plot_title) { requireNamespace("ggplot2") requireNamespace("reshape2") output_df <- as.data.frame(distances) output_df$Species <- rownames(output_df) output_df_long <- reshape2::melt(output_df, id.vars = "Species") colnames(output_df_long) <- c("Species", "Comparison", "Distance") ggplot2::ggplot(output_df_long, ggplot2::aes(x = Species, y = Distance, fill = Comparison)) + ggplot2::geom_bar(stat = "identity", position = "dodge") + ggplot2::labs(title = plot_title, x = "Species", y = "Distance", fill = "Comparison") + ggplot2::theme_minimal() } #' @name pvalueschamm #' @title Calculate the p_values matrix for each species, using Hamming distance as a base. #' @description #' This function takes a dataset, a factor, a p_value method, number of bootstraps and permutation when necessary, and returns a p_values matrix between each pair of species and a plot if the user select TRUE using Hamming distance for the distances calculation. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate Hamming distance. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @param plot if TRUE, plot the p_values heatmap. The default value is TRUE. #' @return A list containing a matrix of p_values and, optionally, the plot. #' @examples #' # Calculate p_values of "Species" variable in iris dataset #' pvalueschamm(iris,~Species, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' # Calculate p_values of "am" variable in mtcars dataset #' pvalueschamm(mtcars,~am, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' @export pvalueschamm <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10, plot = TRUE) { # Verify that input is a dataframe if (!is.data.frame(dataset)) { stop("dataset must be a dataframe") } # Extract the response variable name by the formula response <- all.vars(formula)[1] # Verify the presence of the response variable if (!response %in% names(dataset)) { stop(paste("The response variable", response, "isn't present")) } # Calculate Hamming distances using "chamming" function chamming_results <- chamming(dataset, formula, plot = FALSE) distances <- chamming_results$distances # Obtain the groups number n <- nrow(distances) # Initialize p_value matrix p_values <- matrix(NA, nrow = n, ncol = n) # Calculate p_values for (i in 1:n) { df <- ncol(dataset) - 1 # Degrees of freedom (adjusted for matrix covariance estimate) for (j in 1:n) { if (i != j) { # Choose p_values method calculation based user inputs if (pvalue.method == "chisq") { p_values[i, j] <- pchisq(distances[i, j], df, lower.tail = FALSE, log.p = TRUE) } else if (pvalue.method == "permutation") { # Permutation test observed_distance <- distances[i, j] permutation_distances <- replicate(num.permutations, { # Permute labels group and recalculate the distance permuted_data <- dataset permuted_data[[response]] <- sample(dataset[[response]]) permuted_results <- chamming(permuted_data, formula, plot = FALSE) permuted_distances <- permuted_results$distances return(permuted_distances[i, j]) }) p_values[i, j] <- mean(permutation_distances >= observed_distance) } else if (pvalue.method == "bootstrap") { # Bootstrap observed_distance <- distances[i, j] bootstrap_distances <- replicate(num.bootstraps, { # Extract a sample with repetition bootstrap_sample <- sample(nrow(dataset), replace = TRUE) bootstrap_data <- dataset[bootstrap_sample, ] bootstrap_results <- chamming(bootstrap_data, formula, plot = FALSE) bootstrap_distance <- bootstrap_results$distances[i, j] return(bootstrap_distance) }) p_values[i, j] <- mean(abs(bootstrap_distances) >= abs(observed_distance)) } else { stop("p_values calculation method not supported. Use 'chisq', 'permutation' or 'bootstrap'.") } } } } # Create heatmap if plot is TRUE if (plot) { requireNamespace("reshape2") requireNamespace("ggplot2") p_values_melt <- reshape2::melt(p_values, na.rm = TRUE) colnames(p_values_melt) <- c("Var1", "Var2", "value") print(ggplot2::ggplot(data = p_values_melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) + ggplot2::geom_tile() + ggplot2::scale_fill_gradient(low = "white", high = "red") + ggplot2::labs(title = "p_values heatmap", x = "", y = "", fill = "p_value") + ggplot2::theme_minimal() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))) } return(p_values) } #' @name generate_report_chamming #' @title Generate a Microsoft Word document about the Hamming distance matrix and the p-values matrix with corresponding plots. #' #' @description #' This function takes a dataframe, a factor and returns a Microsoft Word document about the Hamming distance matrix and the p-values matrix with corresponding plots. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Hamming distance matrix and the p_values matrix. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @return A Microsoft Word document about the Hamming distance matrix and the p_values matrix. #' @examples #' # Generate a report about "Species" factor in iris dataset #' generate_report_chamming(iris, ~Species) #' #' # Generate a report about "am" factor in mtcars dataset #' generate_report_chamming(mtcars, ~am) #' #' @export generate_report_chamming <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 10, num.bootstraps = 10) { requireNamespace("rmarkdown") chamming_results <- chamming(dataset, formula) distances <- chamming_results if (pvalue.method == "chisq") { p_values <- pvalueschamm(dataset, formula, pvalue.method = "chisq") } else if (pvalue.method == "permutation") { p_values <- pvalueschamm(dataset, formula, pvalue.method = "permutation") } else if (pvalue.method == "bootstrap") { p_values <- pvalueschamm(dataset, formula, pvalue.method = "bootstrap") } dir_path <- file.path(getwd(), "reportchamming_files/figure-docx") if (!dir.exists(dir_path)) { dir.create(dir_path, recursive = TRUE) } template_path <- system.file("rmarkdown", "template_report_chamming.Rmd", package = "cmahalanobis") if (file.exists(template_path)) { message("Template found at: ", template_path) } else { stop("Template not found!") } output_file <- "reportchamming.docx" tryCatch({ rmarkdown::render(template_path, params = list(distances = distances, p_values = p_values), output_file = output_file) }, error = function(e) { message("Error during rendering: ", e$message) }) get_desktop_path <- function() { home <- Sys.getenv("HOME") sysname <- Sys.info()["sysname"] if (sysname == "Windows") { return(file.path(Sys.getenv("USERPROFILE"), "Desktop")) } else if (sysname == "Darwin") { return(file.path(home, "Desktop")) } else if (sysname == "Linux") { return(file.path(home, "Desktop")) } else { stop("Unsupported OS") } } desktop_path <- get_desktop_path() file.rename(output_file, file.path(desktop_path, output_file)) unlink("reportchamming_files", recursive = TRUE) } #' @name ccanberra #' @title Calculate the Canberra distance for each species. #' @description #' This function takes a dataframe and a factor in input, and returns a matrix with the Canberra distances about it. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Canberra distances matrix. #' @param plot Logical, if TRUE, a plot of Canberra distances matrix is displayed. #' @param plot_title The title to be used for the plot if plot is TRUE. #' @return A matrix containing Canberra distances between each pair of groups and the plot. #' #' #' @examples #' # Example with the iris dataset #' #' ccanberra(iris, ~Species, plot = TRUE, plot_title = "Canberra Distance Between Groups") #' #' # Example with the mtcars dataset #' ccanberra(mtcars, ~am, plot = TRUE, plot_title = "Canberra Distance Between Groups") #' #' @export ccanberra <- function(dataset, formula, plot = TRUE, plot_title = "Canberra Distance Between Groups") { # Verify that the input is a data frame if (!is.data.frame(dataset)) { stop("The input must be a dataframe") } # Extract the response and predictor variables from the formula response <- all.vars(formula)[1] predictors <- all.vars(formula)[-1] # Split the data into groups based on the response variable groups <- split(dataset, dataset[[response]]) # Select only numeric variables in each group groups <- lapply(groups, function(df) { df <- df[, sapply(df, is.numeric)] # Select only numeric columns return(df) }) # Replace missing values with the multiple imputation in each dataframe into the list groups <- lapply(groups, impute_with_multiple_imputation) # Obtain the number of groups n <- length(groups) # Precalculate means means <- lapply(groups, colMeans) # Create an empty matrix to store distances distances <- matrix(0, nrow = n, ncol = n) # Calculate the Canberra distance between each couple of groups for (i in 1:n) { mean_i <- means[[i]] # Precalculated mean of group i for (j in 1:n) { if (i != j) { distances[i, j] <- mean(apply(groups[[j]], 1, function(row) sum(abs(row - mean_i) / (abs(row) + abs(mean_i))))) } } } # If plot is TRUE, call the "plot_canberra_distances" and print the plot if (plot) { print(plot_canberra_distances(distances, plot_title)) } # Return a list containing distances return(list(distances = distances)) } # Auxiliary function to impute missing values with the multiple imputation impute_with_multiple_imputation <- function(df, m = 5, method = "cart") { # Multiple imputation using mice imp <- mice(df, m = m, method = method) imputedData <- complete(imp, action = 1) return(imputedData) } # Auxiliary function to print the Canberra distances plot plot_canberra_distances <- function(distances, plot_title) { requireNamespace("ggplot2") requireNamespace("reshape2") output_df <- as.data.frame(distances) output_df$Species <- rownames(output_df) output_df_long <- reshape2::melt(output_df, id.vars = "Species") colnames(output_df_long) <- c("Species", "Comparison", "Distance") ggplot2::ggplot(output_df_long, ggplot2::aes(x = Species, y = Distance, fill = Comparison)) + ggplot2::geom_bar(stat = "identity", position = "dodge") + ggplot2::labs(title = plot_title, x = "Species", y = "Distance", fill = "Comparison") + ggplot2::theme_minimal() } #' @name pvaluesccanb #' @title Calculate the p_values matrix for each species, using Canberra distance as a base. #' @description #' This function takes a dataset, a factor, a p_value method, number of bootstraps and permutation when necessary, and returns a p_values matrix between each pair of species and a plot if the user select TRUE using Canberra distance for the distances calculation. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate Canberra distance. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @param plot if TRUE, plot the p_values heatmap. The default value is TRUE. #' @return A list containing a matrix of p_values and, optionally, the plot. #' @examples #' # Calculate p_values of "Species" variable in iris dataset #' pvaluesccanb(iris,~Species, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' # Calculate p_values of "am" variable in mtcars dataset #' pvaluesccanb(mtcars,~am, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' @export pvaluesccanb <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10, plot = TRUE) { # Verify that input is a dataframe if (!is.data.frame(dataset)) { stop("dataset must be a dataframe") } # Extract the response variable name by the formula response <- all.vars(formula)[1] # Verify the presence of the response variable if (!response %in% names(dataset)) { stop(paste("The response variable", response, "isn't present")) } # Calculate Canberra distances using "ccanberra" function ccanberra_results <- ccanberra(dataset, formula, plot = FALSE) distances <- ccanberra_results$distances # Obtain the groups number n <- nrow(distances) # Initialize p_value matrix p_values <- matrix(NA, nrow = n, ncol = n) # Calculate p_values for (i in 1:n) { df <- ncol(dataset) - 1 # Degrees of freedom (adjusted for matrix covariance estimate) for (j in 1:n) { if (i != j) { # Choose p_values method calculation based user inputs if (pvalue.method == "chisq") { p_values[i, j] <- pchisq(distances[i, j], df, lower.tail = FALSE, log.p = TRUE) } else if (pvalue.method == "permutation") { # Permutation test observed_distance <- distances[i, j] permutation_distances <- replicate(num.permutations, { # Permute labels group and recalculate the distance permuted_data <- dataset permuted_data[[response]] <- sample(dataset[[response]]) permuted_results <- ccanberra(permuted_data, formula, plot = FALSE) permuted_distances <- permuted_results$distances return(permuted_distances[i, j]) }) p_values[i, j] <- mean(permutation_distances >= observed_distance) } else if (pvalue.method == "bootstrap") { # Bootstrap observed_distance <- distances[i, j] bootstrap_distances <- replicate(num.bootstraps, { # Extract a sample with repetition bootstrap_sample <- sample(nrow(dataset), replace = TRUE) bootstrap_data <- dataset[bootstrap_sample, ] bootstrap_results <- ccanberra(bootstrap_data, formula, plot = FALSE) bootstrap_distance <- bootstrap_results$distances[i, j] return(bootstrap_distance) }) p_values[i, j] <- mean(abs(bootstrap_distances) >= abs(observed_distance)) } else { stop("p_values calculation method not supported. Use 'chisq', 'permutation' or 'bootstrap'.") } } } } # Create heatmap if plot is TRUE if (plot) { requireNamespace("reshape2") requireNamespace("ggplot2") p_values_melt <- reshape2::melt(p_values, na.rm = TRUE) colnames(p_values_melt) <- c("Var1", "Var2", "value") print(ggplot2::ggplot(data = p_values_melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) + ggplot2::geom_tile() + ggplot2::scale_fill_gradient(low = "white", high = "red") + ggplot2::labs(title = "p_values heatmap", x = "", y = "", fill = "p_value") + ggplot2::theme_minimal() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))) } return(p_values) } #' @name generate_report_ccanberra #' @title Generate a Microsoft Word document about the Canberra distance matrix and the p-values matrix with corresponding plots. #' #' @description #' This function takes a dataframe, a factor and returns a Microsoft Word document about the Canberra distance matrix and the p-values matrix with corresponding plots. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Canberra distance matrix and the p_values matrix. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @return A Microsoft Word document about the Canberra distance matrix and the p_values matrix. #' @examples #' # Generate a report about "Species" factor in iris dataset #' generate_report_ccanberra(iris, ~Species) #' #' # Generate a report about "am" factor in mtcars dataset #' generate_report_ccanberra(mtcars, ~am) #' #' @export generate_report_ccanberra <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 10, num.bootstraps = 10) { requireNamespace("rmarkdown") ccanberra_results <- ccanberra(dataset, formula) distances <- ccanberra_results if (pvalue.method == "chisq") { p_values <- pvaluesccanb(dataset, formula, pvalue.method = "chisq") } else if (pvalue.method == "permutation") { p_values <- pvaluesccanb(dataset, formula, pvalue.method = "permutation") } else if (pvalue.method == "bootstrap") { p_values <- pvaluesccanb(dataset, formula, pvalue.method = "bootstrap") } dir_path <- file.path(getwd(), "reportccanberra_files/figure-docx") if (!dir.exists(dir_path)) { dir.create(dir_path, recursive = TRUE) } template_path <- system.file("rmarkdown", "template_report_ccanberra.Rmd", package = "cmahalanobis") if (file.exists(template_path)) { message("Template found at: ", template_path) } else { stop("Template not found!") } output_file <- "reportccanberra.docx" tryCatch({ rmarkdown::render(template_path, params = list(distances = distances, p_values = p_values), output_file = output_file) }, error = function(e) { message("Error during rendering: ", e$message) }) get_desktop_path <- function() { home <- Sys.getenv("HOME") sysname <- Sys.info()["sysname"] if (sysname == "Windows") { return(file.path(Sys.getenv("USERPROFILE"), "Desktop")) } else if (sysname == "Darwin") { return(file.path(home, "Desktop")) } else if (sysname == "Linux") { return(file.path(home, "Desktop")) } else { stop("Unsupported OS") } } desktop_path <- get_desktop_path() file.rename(output_file, file.path(desktop_path, output_file)) unlink("reportccanberra_files", recursive = TRUE) } #' Calculate Minkowski distance #' #' @name cminkowski #' @title Calculate the Minkowski distance of a factor in a dataframe. #' @description #' This function takes a dataframe and a factor in input, and returns a matrix with the Minkowski distances about it. #' @param dataset A dataframe. #' @param formula The factor which you want to calculate the Minkowski distances matrix. #' @param p Order of the Minkowski distance #' @param plot If TRUE, shows a plot of the Minkowski distances matrix. #' @param plot_title The title of the plot. #' @return The matrix containing distances. #' @examples #' #' # Example with iris dataset #' #' cminkowski(iris, ~Species, p = 3, plot = TRUE, plot_title = "Minkowski Distance Between Groups") #' #' # Example with mtcars dataset #' #' cminkowski(mtcars, ~am, p = 3, plot = TRUE, plot_title = "Minkowski Distance Between Groups") #' #' @export cminkowski <- function(dataset, formula, p = 3, plot = TRUE, plot_title = "Minkowski Distance Between Groups") { # Verify that the input is a data frame if (!is.data.frame(dataset)) { stop("The input must be a dataframe") } # Extract the response and predictor variables from the formula response <- all.vars(formula)[1] predictors <- all.vars(formula)[-1] # Split the data into groups based on the response variable groups <- split(dataset, dataset[[response]]) # Select only numeric variables in each group groups <- lapply(groups, function(df) { df <- df[, sapply(df, is.numeric)] # Select only numeric columns return(df) }) # Replace missing values with the multiple imputation in each dataframe into the list groups <- lapply(groups, impute_with_multiple_imputation) # Obtain the number of groups n <- length(groups) # Precalculate means means <- lapply(groups, colMeans) # Create an empty matrix to store distances distances <- matrix(0, nrow = n, ncol = n) # Calculate Minkowski distance between each couple of groups for (i in 1:n) { mean_i <- means[[i]] # Precalculated mean of group i for (j in 1:n) { if (i != j) { distances[i, j] <- mean(apply(groups[[j]], 1, function(row) { sum(abs(row - mean_i)^p)^(1/p) })) } } } # If plot is TRUE, call the function "plot_minkowski_distances" and print the plot if (plot) { print(plot_minkowski_distances(distances, plot_title)) } # Return a list containing distances return(list(distances = distances)) } # Auxiliary function to impute missing values with the multiple imputation impute_with_multiple_imputation <- function(df, m = 5, method = "cart") { # Multiple imputation using mice imp <- mice(df, m = m, method = method) imputedData <- complete(imp, action = 1) return(imputedData) } # Auxiliary function to print the Minkowski distances plot plot_minkowski_distances <- function(distances, plot_title) { requireNamespace("ggplot2") requireNamespace("reshape2") output_df <- as.data.frame(distances) output_df$Species <- rownames(output_df) output_df_long <- reshape2::melt(output_df, id.vars = "Species") colnames(output_df_long) <- c("Species", "Comparison", "Distance") ggplot2::ggplot(output_df_long, ggplot2::aes(x = Species, y = Distance, fill = Comparison)) + ggplot2::geom_bar(stat = "identity", position = "dodge") + ggplot2::labs(title = plot_title, x = "Species", y = "Distance", fill = "Comparison") + ggplot2::theme_minimal() } #' @name pvaluescmink #' @title Calculate the p_values matrix for each species, using Minkowski distance as a base. #' @description #' This function takes a dataset, a factor, a p_value method, number of bootstraps and permutation when necessary, and returns a p_values matrix between each pair of species and a plot if the user select TRUE using Minkowski distance for the distances calculation. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate Minkowski distance. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param p Order of the Minkowski distance #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @param plot if TRUE, plot the p_values heatmap. The default value is TRUE. #' @return A list containing a matrix of p_values and, optionally, the plot. #' @examples #' # Calculate p_values of "Species" variable in iris dataset #' pvaluescmink(iris,~Species, p = 3, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' # Calculate p_values of "am" variable in mtcars dataset #' pvaluescmink(mtcars,~am, p = 3, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' @export pvaluescmink <- function(dataset, formula, p = 3, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10, plot = TRUE) { # Verify that input is a dataframe if (!is.data.frame(dataset)) { stop("dataset must be a dataframe") } # Extract the response variable name by the formula response <- all.vars(formula)[1] # Verify the presence of the response variable if (!response %in% names(dataset)) { stop(paste("The response variable", response, "isn't present")) } # Calculate Minkowski distances using "cminkowski" function cminkowski_results <- cminkowski(dataset, formula, p = p, plot = FALSE) distances <- cminkowski_results$distances # Obtain the groups number n <- nrow(distances) # Initialize p_value matrix p_values <- matrix(NA, nrow = n, ncol = n) # Calculate p_values for (i in 1:n) { df <- ncol(dataset) - 1 # Degrees of freedom (adjusted for matrix covariance estimate) for (j in 1:n) { if (i != j) { # Choose p_values method calculation based user inputs if (pvalue.method == "chisq") { p_values[i, j] <- pchisq(distances[i, j], df, lower.tail = FALSE, log.p = TRUE) } else if (pvalue.method == "permutation") { # Permutation test observed_distance <- distances[i, j] permutation_distances <- replicate(num.permutations, { # Permute labels group and recalculate the distance permuted_data <- dataset permuted_data[[response]] <- sample(dataset[[response]]) permuted_results <- cminkowski(permuted_data, formula, p = p, plot = FALSE) permuted_distances <- permuted_results$distances return(permuted_distances[i, j]) }) p_values[i, j] <- mean(permutation_distances >= observed_distance) } else if (pvalue.method == "bootstrap") { # Bootstrap observed_distance <- distances[i, j] bootstrap_distances <- replicate(num.bootstraps, { # Extract a sample with repetition bootstrap_sample <- sample(nrow(dataset), replace = TRUE) bootstrap_data <- dataset[bootstrap_sample, ] bootstrap_results <- cminkowski(bootstrap_data, formula, p = p, plot = FALSE) bootstrap_distance <- bootstrap_results$distances[i, j] return(bootstrap_distance) }) p_values[i, j] <- mean(abs(bootstrap_distances) >= abs(observed_distance)) } else { stop("p_values calculation method not supported. Use 'chisq', 'permutation' or 'bootstrap'.") } } } } # Create heatmap if plot is TRUE if (plot) { requireNamespace("reshape2") requireNamespace("ggplot2") p_values_melt <- reshape2::melt(p_values, na.rm = TRUE) colnames(p_values_melt) <- c("Var1", "Var2", "value") print(ggplot2::ggplot(data = p_values_melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) + ggplot2::geom_tile() + ggplot2::scale_fill_gradient(low = "white", high = "red") + ggplot2::labs(title = "p_values heatmap", x = "", y = "", fill = "p_value") + ggplot2::theme_minimal() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))) } return(p_values) } #' @name generate_report_cminkowski #' @title Generate a Microsoft Word document about the Minkowski distance matrix and the p-values matrix with corresponding plots. #' #' @description #' This function takes a dataframe, a factor and returns a Microsoft Word document about the Minkowski distance matrix and the p-values matrix with corresponding plots. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Minkowski distance matrix and the p_values matrix. #' @param p Order of the Minkowski distance #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @return A Microsoft Word document about the Minkowski distance matrix and the p_values matrix. #' @examples #' # Generate a report about "Species" factor in iris dataset #' generate_report_cminkowski(iris, ~Species, p = 3) #' #' # Generate a report about "am" factor in mtcars dataset #' generate_report_cminkowski(mtcars, ~am, p = 3) #' #' @export generate_report_cminkowski <- function(dataset, formula, p = 3, pvalue.method = "chisq", num.permutations = 10, num.bootstraps = 10) { requireNamespace("rmarkdown") cminkowski_results <- cminkowski(dataset, formula, p = p) distances <- cminkowski_results if (pvalue.method == "chisq") { p_values <- pvaluescmink(dataset, formula, p = p, pvalue.method = "chisq") } else if (pvalue.method == "permutation") { p_values <- pvaluescmink(dataset, formula, p = p, pvalue.method = "permutation") } else if (pvalue.method == "bootstrap") { p_values <- pvaluescmink(dataset, formula, p = p, pvalue.method = "bootstrap") } dir_path <- file.path(getwd(), "reportcminkowski_files/figure-docx") if (!dir.exists(dir_path)) { dir.create(dir_path, recursive = TRUE) } template_path <- system.file("rmarkdown", "template_report_cminkowski.Rmd", package = "cmahalanobis") if (file.exists(template_path)) { message("Template found at: ", template_path) } else { stop("Template not found!") } output_file <- "reportcminkowski.docx" tryCatch({ rmarkdown::render(template_path, params = list(distances = distances, p_values = p_values), output_file = output_file) }, error = function(e) { message("Error during rendering: ", e$message) }) get_desktop_path <- function() { home <- Sys.getenv("HOME") sysname <- Sys.info()["sysname"] if (sysname == "Windows") { return(file.path(Sys.getenv("USERPROFILE"), "Desktop")) } else if (sysname == "Darwin") { return(file.path(home, "Desktop")) } else if (sysname == "Linux") { return(file.path(home, "Desktop")) } else { stop("Unsupported OS") } } desktop_path <- get_desktop_path() file.rename(output_file, file.path(desktop_path, output_file)) unlink("reportcminkowski_files", recursive = TRUE) } #' Calculate Cosine distance #' #' @name ccosine #' @title Calculate the Cosine distance of a factor in a dataframe. #' @description #' This function takes a dataframe and a factor in input, and returns a matrix with the Cosine distances about it. #' @param dataset A dataframe. #' @param formula The factor which you want to calculate the Cosine distances matrix. #' @param plot If TRUE, shows a plot of the Cosine distances matrix. #' @param plot_title The title of the plot. #' @return The matrix containing distances. #' @examples #' #' # Example with iris dataset #' #' ccosine(iris, ~Species, plot = TRUE, plot_title = "Cosine Distance Between Groups") #' #' # Example with mtcars dataset #' #' ccosine(mtcars, ~am, plot = TRUE, plot_title = "Cosine Distance Between Groups") #' #' @export ccosine <- function(dataset, formula, plot = TRUE, plot_title = "Cosine Distance Between Groups") { # Verify that the input is a data frame if (!is.data.frame(dataset)) { stop("The input must be a dataframe") } # Extract the response and predictor variables from the formula response <- all.vars(formula)[1] predictors <- all.vars(formula)[-1] # Split the data into groups based on the response variable groups <- split(dataset, dataset[[response]]) # Select only numeric variables in each group groups <- lapply(groups, function(df) { df <- df[, sapply(df, is.numeric)] # Select only numeric columns return(df) }) # Replace missing values with the multiple imputation in each dataframe into the list groups <- lapply(groups, impute_with_multiple_imputation) # Obtain the number of groups n <- length(groups) # Precalculate means means <- lapply(groups, colMeans) # Create an empty matrix to store distances distances <- matrix(0, nrow = n, ncol = n) # Calculate Cosine distance between each couple of groups for (i in 1:n) { for (j in 1:n) { if (i != j) { num <- sum(means[[i]] * means[[j]]) denom <- sqrt(sum(means[[i]]^2)) * sqrt(sum(means[[j]]^2)) distances[i, j] <- 1 - (num / denom) } } } # If plot is TRUE, call the "plot_cosine_distances" function and print the plot if (plot) { print(plot_cosine_distances(distances, plot_title)) } # Return a list containing distances return(list(distances = distances)) } # Auxiliary function to impute missing values with the multiple imputation impute_with_multiple_imputation <- function(df, m = 5, method = "cart") { # Multiple imputation using mice imp <- mice(df, m = m, method = method) imputedData <- complete(imp, action = 1) return(imputedData) } # Auxiliary function to print the Cosine distances plot plot_cosine_distances <- function(distances, plot_title) { requireNamespace("ggplot2") requireNamespace("reshape2") output_df <- as.data.frame(distances) output_df$Species <- rownames(output_df) output_df_long <- reshape2::melt(output_df, id.vars = "Species") colnames(output_df_long) <- c("Species", "Comparison", "Distance") ggplot2::ggplot(output_df_long, ggplot2::aes(x = Species, y = Distance, fill = Comparison)) + ggplot2::geom_bar(stat = "identity", position = "dodge") + ggplot2::labs(title = plot_title, x = "Species", y = "Distance", fill = "Comparison") + ggplot2::theme_minimal() } #' @name pvaluesccosi #' @title Calculate the p_values matrix for each species, using Cosine distance as a base. #' @description #' This function takes a dataset, a factor, a p_value method, number of bootstraps and permutation when necessary, and returns a p_values matrix between each pair of species and a plot if the user select TRUE using Cosine distance for the distances calculation. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate Cosine distance. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @param plot if TRUE, plot the p_values heatmap. The default value is TRUE. #' @return A list containing a matrix of p_values and, optionally, the plot. #' @examples #' # Calculate p_values of "Species" variable in iris dataset #' pvaluesccosi(iris,~Species, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' # Calculate p_values of "am" variable in mtcars dataset #' pvaluesccosi(mtcars,~am, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' @export pvaluesccosi <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10, plot = TRUE) { # Verify that input is a dataframe if (!is.data.frame(dataset)) { stop("dataset must be a dataframe") } # Extract the response variable name by the formula response <- all.vars(formula)[1] # Verify the presence of the response variable if (!response %in% names(dataset)) { stop(paste("The response variable", response, "isn't present")) } # Calculate Cosine distances using "ccosine" function ccosine_results <- ccosine(dataset, formula, plot = FALSE) distances <- ccosine_results$distances # Obtain the groups number n <- nrow(distances) # Initialize p_value matrix p_values <- matrix(NA, nrow = n, ncol = n) # Calculate p_values for (i in 1:n) { df <- ncol(dataset) - 1 # Degrees of freedom (adjusted for matrix covariance estimate) for (j in 1:n) { if (i != j) { # Choose p_values method calculation based user inputs if (pvalue.method == "chisq") { p_values[i, j] <- pchisq(distances[i, j], df, lower.tail = FALSE, log.p = TRUE) } else if (pvalue.method == "permutation") { # Permutation test observed_distance <- distances[i, j] permutation_distances <- replicate(num.permutations, { # Permute labels group and recalculate the distance permuted_data <- dataset permuted_data[[response]] <- sample(dataset[[response]]) permuted_results <- ccosine(permuted_data, formula, plot = FALSE) permuted_distances <- permuted_results$distances return(permuted_distances[i, j]) }) p_values[i, j] <- mean(permutation_distances >= observed_distance) } else if (pvalue.method == "bootstrap") { # Bootstrap observed_distance <- distances[i, j] bootstrap_distances <- replicate(num.bootstraps, { # Extract a sample with repetition bootstrap_sample <- sample(nrow(dataset), replace = TRUE) bootstrap_data <- dataset[bootstrap_sample, ] bootstrap_results <- ccosine(bootstrap_data, formula, plot = FALSE) bootstrap_distance <- bootstrap_results$distances[i, j] return(bootstrap_distance) }) p_values[i, j] <- mean(abs(bootstrap_distances) >= abs(observed_distance)) } else { stop("p_values calculation method not supported. Use 'chisq', 'permutation' or 'bootstrap'.") } } } } # Create heatmap if plot is TRUE if (plot) { requireNamespace("reshape2") requireNamespace("ggplot2") p_values_melt <- reshape2::melt(p_values, na.rm = TRUE) colnames(p_values_melt) <- c("Var1", "Var2", "value") print(ggplot2::ggplot(data = p_values_melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) + ggplot2::geom_tile() + ggplot2::scale_fill_gradient(low = "white", high = "red") + ggplot2::labs(title = "p_values heatmap", x = "", y = "", fill = "p_value") + ggplot2::theme_minimal() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))) } return(p_values) } #' @name generate_report_ccosine #' @title Generate a Microsoft Word document about the Cosine distance matrix and the p-values matrix with corresponding plots. #' #' @description #' This function takes a dataframe, a factor and returns a Microsoft Word document about the Cosine distance matrix and the p-values matrix with corresponding plots. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Cosine distance matrix and the p_values matrix. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @return A Microsoft Word document about the Cosine distance matrix and the p_values matrix. #' @examples #' # Generate a report about "Species" factor in iris dataset #' generate_report_ccosine(iris, ~Species) #' #' # Generate a report about "am" factor in mtcars dataset #' generate_report_ccosine(mtcars, ~am) #' #' @export generate_report_ccosine <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 10, num.bootstraps = 10) { requireNamespace("rmarkdown") ccosine_results <- ccosine(dataset, formula) distances <- ccosine_results if (pvalue.method == "chisq") { p_values <- pvaluesccosi(dataset, formula, pvalue.method = "chisq") } else if (pvalue.method == "permutation") { p_values <- pvaluesccosi(dataset, formula, pvalue.method = "permutation") } else if (pvalue.method == "bootstrap") { p_values <- pvaluesccosi(dataset, formula, pvalue.method = "bootstrap") } dir_path <- file.path(getwd(), "reportccosine_files/figure-docx") if (!dir.exists(dir_path)) { dir.create(dir_path, recursive = TRUE) } template_path <- system.file("rmarkdown", "template_report_ccosine.Rmd", package = "cmahalanobis") if (file.exists(template_path)) { message("Template found at: ", template_path) } else { stop("Template not found!") } output_file <- "reportccosine.docx" tryCatch({ rmarkdown::render(template_path, params = list(distances = distances, p_values = p_values), output_file = output_file) }, error = function(e) { message("Error during rendering: ", e$message) }) get_desktop_path <- function() { home <- Sys.getenv("HOME") sysname <- Sys.info()["sysname"] if (sysname == "Windows") { return(file.path(Sys.getenv("USERPROFILE"), "Desktop")) } else if (sysname == "Darwin") { return(file.path(home, "Desktop")) } else if (sysname == "Linux") { return(file.path(home, "Desktop")) } else { stop("Unsupported OS") } } desktop_path <- get_desktop_path() file.rename(output_file, file.path(desktop_path, output_file)) unlink("reportccosine_files", recursive = TRUE) } #' @name ccbhattacharyya #' @title Calculate the Bhattacharyya distance for each species. #' #' @description #' This function takes a dataframe and a factor in input, and returns a matrix with the Bhattacharyya distances about it. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Bhattacharyya distances matrix. #' @param plot Logical, if TRUE, a plot of Bhattacharyya distances matrix is displayed. #' @param plot_title The title to be used for the plot if plot is TRUE. #' @return A matrix containing Bhattacharyya distances between each pair of groups and the plot. #' @examples #' # Example with the iris dataset #' cbhattacharyya(iris, ~Species, plot = TRUE, plot_title = "Bhattacharyya Distance Between Groups") #' #' # Example with the mtcars dataset #' cbhattacharyya(mtcars, ~am, plot = TRUE, plot_title = "Bhattacharyya Distance Between Groups") #' #' @export cbhattacharyya <- function(dataset, formula, plot = TRUE, plot_title = "Bhattacharyya Distance Between Groups") { # Verify that the input is a data frame if (!is.data.frame(dataset)) { stop("The input must be a dataframe") } # Extract the response and predictor variables from the formula response <- all.vars(formula)[1] predictors <- all.vars(formula)[-1] # Split the data into groups based on the response variable groups <- split(dataset, dataset[[response]]) # Select only numeric variables in each group groups <- lapply(groups, function(df) { df <- df[, sapply(df, is.numeric)] # Select only numeric columns return(df) }) # Replace missing values with the multiple imputation in each dataframe into the list groups <- lapply(groups, impute_with_multiple_imputation) # Obtain the number of groups n <- length(groups) # Precalculate means and covariances means <- lapply(groups, colMeans) covariances <- lapply(groups, function(df) { cov_matrix <- cov(df) n <- nrow(cov_matrix) reg <- 0.01 # Regularization value cov_matrix <- cov_matrix + diag(reg, nrow = n) return(cov_matrix) }) # Create an empty matrix to store distances distances <- matrix(0, nrow = n, ncol = n) # Calculate Bhattacharyya distance between each couple of groups for (i in 1:n) { mean_i <- means[[i]] # Precalculated mean in the group i cov_i <- covariances[[i]] for (j in 1:n) { if (i != j) { mean_j <- means[[j]] cov_j <- covariances[[j]] # Calculate the average covariance matrix cov_mean <- (cov_i + cov_j) / 2 # Calculate the Bhattacharyya distance term1 <- 0.25 * t(mean_i - mean_j) %*% solve(cov_mean) %*% (mean_i - mean_j) term2 <- 0.5 * log(det(cov_mean) / sqrt(det(cov_i) * det(cov_j))) distances[i, j] <- term1 + term2 } } } # If plot is TRUE, call the "plot_bhattacharyya_distances" function and print the plot if (plot) { print(plot_bhattacharyya_distances(distances, plot_title)) } # Return a list containing distances return(list(distances = distances)) } # Auxiliary function to impute missing values with the multiple imputation impute_with_multiple_imputation <- function(df, m = 5, method = "cart") { # Multiple imputation using mice imp <- mice(df, m = m, method = method) imputedData <- complete(imp, action = 1) return(imputedData) } # Auxiliary function to print the Bhattacharyya distances plot plot_bhattacharyya_distances <- function(distances, plot_title) { requireNamespace("ggplot2") requireNamespace("reshape2") output_df <- as.data.frame(distances) output_df$Species <- rownames(output_df) output_df_long <- reshape2::melt(output_df, id.vars = "Species") colnames(output_df_long) <- c("Species", "Comparison", "Distance") ggplot2::ggplot(output_df_long, ggplot2::aes(x = Species, y = Distance, fill = Comparison)) + ggplot2::geom_bar(stat = "identity", position = "dodge") + ggplot2::labs(title = plot_title, x = "Species", y = "Distance", fill = "Comparison") + ggplot2::theme_minimal() } #' @name pvaluescbatt #' @title Calculate the p_values matrix for each species, using Bhattacharyya distance as a base. #' @description #' This function takes a dataset, a factor, a p_value method, number of bootstraps and permutation when necessary, and returns a p_values matrix between each pair of species and a plot if the user select TRUE using Bhattacharyya distance for the distances calculation. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate Bhattacharyya distance. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @param plot if TRUE, plot the p_values heatmap. The default value is TRUE. #' @return A list containing a matrix of p_values and, optionally, the plot. #' @examples #' # Calculate p_values of "Species" variable in iris dataset #' pvaluescbatt(iris,~Species, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' # Calculate p_values of "am" variable in mtcars dataset #' pvaluescbatt(mtcars,~am, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' @export pvaluescbatt <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10, plot = TRUE) { # Verify that input is a dataframe if (!is.data.frame(dataset)) { stop("dataset must be a dataframe") } # Extract the response variable name by the formula response <- all.vars(formula)[1] # Verify the presence of the response variable if (!response %in% names(dataset)) { stop(paste("The response variable", response, "isn't present")) } # Calculate Bhattacharyya distances using "cbhattacharyya" function cbhattacharyya_results <- cbhattacharyya(dataset, formula, plot = FALSE) distances <- cbhattacharyya_results$distances # Obtain the groups number n <- nrow(distances) # Initialize p_value matrix p_values <- matrix(NA, nrow = n, ncol = n) # Calculate p_values for (i in 1:n) { df <- ncol(dataset) - 1 # Degrees of freedom (adjusted for matrix covariance estimate) for (j in 1:n) { if (i != j) { # Choose p_values method calculation based user inputs if (pvalue.method == "chisq") { p_values[i, j] <- pchisq(distances[i, j], df, lower.tail = FALSE, log.p = TRUE) } else if (pvalue.method == "permutation") { # Permutation test observed_distance <- distances[i, j] permutation_distances <- replicate(num.permutations, { # Permute labels group and recalculate the distance permuted_data <- dataset permuted_data[[response]] <- sample(dataset[[response]]) permuted_results <- cbhattacharyya(permuted_data, formula, plot = FALSE) permuted_distances <- permuted_results$distances return(permuted_distances[i, j]) }) p_values[i, j] <- mean(permutation_distances >= observed_distance) } else if (pvalue.method == "bootstrap") { # Bootstrap observed_distance <- distances[i, j] bootstrap_distances <- replicate(num.bootstraps, { # Extract a sample with repetition bootstrap_sample <- sample(nrow(dataset), replace = TRUE) bootstrap_data <- dataset[bootstrap_sample, ] bootstrap_results <- cbhattacharyya(bootstrap_data, formula, plot = FALSE) bootstrap_distance <- bootstrap_results$distances[i, j] return(bootstrap_distance) }) p_values[i, j] <- mean(abs(bootstrap_distances) >= abs(observed_distance)) } else { stop("p_values calculation method not supported. Use 'chisq', 'permutation' or 'bootstrap'.") } } } } # Create heatmap if plot is TRUE if (plot) { requireNamespace("reshape2") requireNamespace("ggplot2") p_values_melt <- reshape2::melt(p_values, na.rm = TRUE) colnames(p_values_melt) <- c("Var1", "Var2", "value") print(ggplot2::ggplot(data = p_values_melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) + ggplot2::geom_tile() + ggplot2::scale_fill_gradient(low = "white", high = "red") + ggplot2::labs(title = "p_values heatmap", x = "", y = "", fill = "p_value") + ggplot2::theme_minimal() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))) } return(p_values) } #' @name generate_report_cbhattacharyya #' @title Generate a Microsoft Word document about the Bhattacharyya distance matrix and the p-values matrix with corresponding plots. #' #' @description #' This function takes a dataframe, a factor and returns a Microsoft Word document about the Bhattacharyya distance matrix and the p-values matrix with corresponding plots. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Bhattacharyya distance matrix and the p_values matrix. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @return A Microsoft Word document about the Bhattacharyya distance matrix and the p_values matrix. #' @examples #' # Generate a report about "Species" factor in iris dataset #' generate_report_cbhattacharyya(iris, ~Species) #' #' # Generate a report about "am" factor in mtcars dataset #' generate_report_cbhattacharyya(mtcars, ~am) #' #' @export generate_report_cbhattacharyya <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 10, num.bootstraps = 10) { requireNamespace("rmarkdown") cbhattacharyya_results <- cbhattacharyya(dataset, formula) distances <- cbhattacharyya_results if (pvalue.method == "chisq") { p_values <- pvaluescbatt(dataset, formula, pvalue.method = "chisq") } else if (pvalue.method == "permutation") { p_values <- pvaluescbatt(dataset, formula, pvalue.method = "permutation") } else if (pvalue.method == "bootstrap") { p_values <- pvaluescbatt(dataset, formula, pvalue.method = "bootstrap") } dir_path <- file.path(getwd(), "reportcbhattacharyya_files/figure-docx") if (!dir.exists(dir_path)) { dir.create(dir_path, recursive = TRUE) } template_path <- system.file("rmarkdown", "template_report_cbhattacharyya.Rmd", package = "cmahalanobis") if (file.exists(template_path)) { message("Template found at: ", template_path) } else { stop("Template not found!") } output_file <- "reportcbhattacharyya.docx" tryCatch({ rmarkdown::render(template_path, params = list(distances = distances, p_values = p_values), output_file = output_file) }, error = function(e) { message("Error during rendering: ", e$message) }) get_desktop_path <- function() { home <- Sys.getenv("HOME") sysname <- Sys.info()["sysname"] if (sysname == "Windows") { return(file.path(Sys.getenv("USERPROFILE"), "Desktop")) } else if (sysname == "Darwin") { return(file.path(home, "Desktop")) } else if (sysname == "Linux") { return(file.path(home, "Desktop")) } else { stop("Unsupported OS") } } desktop_path <- get_desktop_path() file.rename(output_file, file.path(desktop_path, output_file)) unlink("reportcbhattacharyya_files", recursive = TRUE) } #' @name cjaccard #' @title Calculate the Jaccard distance for each species. #' @description #' This function takes a dataframe and a factor in input, and returns a matrix with the Jaccard distances about it. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Jaccard distances matrix. #' @param plot Logical, if TRUE, a plot of Jaccard distances matrix is displayed. #' @param plot_title The title to be used for the plot if plot is TRUE. #' @return A matrix containing Jaccard distances between each pair of groups and the plot. #' #' #' @examples #' # Example with the iris dataset #' #' cjaccard(iris, ~Species, plot = TRUE, plot_title = "Jaccard Distance Between Groups") #' #' # Example with the mtcars dataset #' cjaccard(mtcars, ~am, plot = TRUE, plot_title = "Jaccard Distance Between Groups") #' #' @export cjaccard <- function(dataset, formula, plot = TRUE, plot_title = "Jaccard Distance Between Groups") { # Verify that the input is a data frame if (!is.data.frame(dataset)) { stop("The input must be a dataframe") } # Extract the response and predictor variables from the formula response <- all.vars(formula)[1] predictors <- all.vars(formula)[-1] # Split the data into groups based on the response variable groups <- split(dataset, dataset[[response]]) # Select only numeric variables in each group and binarize them binarize <- function(df) { df <- df[, sapply(df, is.numeric)] # Select only numeric columns df <- as.data.frame(lapply(df, function(x) as.integer(x > mean(x)))) return(df) } groups <- lapply(groups, binarize) # Replace missing values with the multiple imputation in each dataframe into the list groups <- lapply(groups, impute_with_multiple_imputation) # Obtain the number of groups n <- length(groups) # Create an empty matrix to store distances distances <- matrix(0, nrow = n, ncol = n) # Calculate Jaccard distance between each couple of groups for (i in 1:n) { for (j in 1:n) { if (i != j) { # Flatten groups to create binary vectors vector_i <- as.vector(unlist(groups[[i]])) vector_j <- as.vector(unlist(groups[[j]])) # Calculate Jaccard similarity intersection <- sum(vector_i & vector_j) union <- sum(vector_i | vector_j) jaccard_similarity <- intersection / union # Calculate Jaccard distance distances[i, j] <- 1 - jaccard_similarity } } } # If plot is TRUE, call the "plot_jaccard_distances" function and print the plot if (plot) { print(plot_jaccard_distances(distances, plot_title)) } # Return a list containing distances return(list(distances = distances)) } # Auxiliary function to impute missing values with the multiple imputation impute_with_multiple_imputation <- function(df, m = 5, method = "cart") { # Multiple imputation using mice imp <- mice(df, m = m, method = method) imputedData <- complete(imp, action = 1) return(imputedData) } # Auxiliary function to print the Jaccard distances plot plot_jaccard_distances <- function(distances, plot_title) { requireNamespace("ggplot2") requireNamespace("reshape2") output_df <- as.data.frame(distances) output_df$Species <- rownames(output_df) output_df_long <- reshape2::melt(output_df, id.vars = "Species") colnames(output_df_long) <- c("Species", "Comparison", "Distance") ggplot2::ggplot(output_df_long, ggplot2::aes(x = Species, y = Distance, fill = Comparison)) + ggplot2::geom_bar(stat = "identity", position = "dodge") + ggplot2::labs(title = plot_title, x = "Species", y = "Distance", fill = "Comparison") + ggplot2::theme_minimal() } #' @name pvaluescjacc #' @title Calculate the p_values matrix for each species, using Jaccard distance as a base. #' @description #' This function takes a dataset, a factor, a p_value method, number of bootstraps and permutation when necessary, and returns a p_values matrix between each pair of species and a plot if the user select TRUE using Jaccard distance for the distances calculation. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate Jaccard distance. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @param plot if TRUE, plot the p_values heatmap. The default value is TRUE. #' @return A list containing a matrix of p_values and, optionally, the plot. #' @examples #' # Calculate p_values of "Species" variable in iris dataset #' pvaluescjacc(iris,~Species, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' # Calculate p_values of "am" variable in mtcars dataset #' pvaluescjacc(mtcars,~am, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' @export pvaluescjacc <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10, plot = TRUE) { # Verify that input is a dataframe if (!is.data.frame(dataset)) { stop("dataset must be a dataframe") } # Extract the response variable name by the formula response <- all.vars(formula)[1] # Verify the presence of the response variable if (!response %in% names(dataset)) { stop(paste("The response variable", response, "isn't present")) } # Calculate Jaccard distances using "cjaccard" function cjaccard_results <- cjaccard(dataset, formula, plot = FALSE) distances <- cjaccard_results$distances # Obtain the groups number n <- nrow(distances) # Initialize p_value matrix p_values <- matrix(NA, nrow = n, ncol = n) # Calculate p_values for (i in 1:n) { df <- ncol(dataset) - 1 # Degrees of freedom (adjusted for matrix covariance estimate) for (j in 1:n) { if (i != j) { # Choose p_values method calculation based user inputs if (pvalue.method == "chisq") { p_values[i, j] <- pchisq(distances[i, j], df, lower.tail = FALSE, log.p = TRUE) } else if (pvalue.method == "permutation") { # Permutation test observed_distance <- distances[i, j] permutation_distances <- replicate(num.permutations, { # Permute labels group and recalculate the distance permuted_data <- dataset permuted_data[[response]] <- sample(dataset[[response]]) permuted_results <- cjaccard(permuted_data, formula, plot = FALSE) permuted_distances <- permuted_results$distances return(permuted_distances[i, j]) }) p_values[i, j] <- mean(permutation_distances >= observed_distance) } else if (pvalue.method == "bootstrap") { # Bootstrap observed_distance <- distances[i, j] bootstrap_distances <- replicate(num.bootstraps, { # Extract a sample with repetition bootstrap_sample <- sample(nrow(dataset), replace = TRUE) bootstrap_data <- dataset[bootstrap_sample, ] bootstrap_results <- cjaccard(bootstrap_data, formula, plot = FALSE) bootstrap_distance <- bootstrap_results$distances[i, j] return(bootstrap_distance) }) p_values[i, j] <- mean(abs(bootstrap_distances) >= abs(observed_distance)) } else { stop("p_values calculation method not supported. Use 'chisq', 'permutation' or 'bootstrap'.") } } } } # Create heatmap if plot is TRUE if (plot) { requireNamespace("reshape2") requireNamespace("ggplot2") p_values_melt <- reshape2::melt(p_values, na.rm = TRUE) colnames(p_values_melt) <- c("Var1", "Var2", "value") print(ggplot2::ggplot(data = p_values_melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) + ggplot2::geom_tile() + ggplot2::scale_fill_gradient(low = "white", high = "red") + ggplot2::labs(title = "p_values heatmap", x = "", y = "", fill = "p_value") + ggplot2::theme_minimal() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))) } return(p_values) } #' @name generate_report_cjaccard #' @title Generate a Microsoft Word document about the Jaccard distance matrix and the p-values matrix with corresponding plots. #' #' @description #' This function takes a dataframe, a factor and returns a Microsoft Word document about the Jaccard distance matrix and the p-values matrix with corresponding plots. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Jaccard distance matrix and the p_values matrix. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @return A Microsoft Word document about the Jaccard distance matrix and the p_values matrix. #' @examples #' # Generate a report about "Species" factor in iris dataset #' generate_report_cjaccard(iris, ~Species) #' #' # Generate a report about "am" factor in mtcars dataset #' generate_report_cjaccard(mtcars, ~am) #' #' @export generate_report_cjaccard <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 10, num.bootstraps = 10) { requireNamespace("rmarkdown") cjaccard_results <- cjaccard(dataset, formula) distances <- cjaccard_results if (pvalue.method == "chisq") { p_values <- pvaluescjacc(dataset, formula, pvalue.method = "chisq") } else if (pvalue.method == "permutation") { p_values <- pvaluescjacc(dataset, formula, pvalue.method = "permutation") } else if (pvalue.method == "bootstrap") { p_values <- pvaluescjacc(dataset, formula, pvalue.method = "bootstrap") } dir_path <- file.path(getwd(), "reportcjaccard_files/figure-docx") if (!dir.exists(dir_path)) { dir.create(dir_path, recursive = TRUE) } template_path <- system.file("rmarkdown", "template_report_cjaccard.Rmd", package = "cmahalanobis") if (file.exists(template_path)) { message("Template found at: ", template_path) } else { stop("Template not found!") } output_file <- "reportcjaccard.docx" tryCatch({ rmarkdown::render(template_path, params = list(distances = distances, p_values = p_values), output_file = output_file) }, error = function(e) { message("Error during rendering: ", e$message) }) get_desktop_path <- function() { home <- Sys.getenv("HOME") sysname <- Sys.info()["sysname"] if (sysname == "Windows") { return(file.path(Sys.getenv("USERPROFILE"), "Desktop")) } else if (sysname == "Darwin") { return(file.path(home, "Desktop")) } else if (sysname == "Linux") { return(file.path(home, "Desktop")) } else { stop("Unsupported OS") } } desktop_path <- get_desktop_path() file.rename(output_file, file.path(desktop_path, output_file)) unlink("reportcjaccard_files", recursive = TRUE) } #' @name chellinger #' @title Calculate the Hellinger distance for each species. #' @description #' This function takes a dataframe and a factor in input, and returns a matrix with the Hellinger distances about it. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Hellinger distances matrix. #' @param plot Logical, if TRUE, a plot of Hellinger distances matrix is displayed. #' @param plot_title The title to be used for the plot if plot is TRUE. #' @return A matrix containing Hellinger distances between each pair of groups and the plot. #' @examples #' # Example with the iris dataset #' #' chellinger(iris, ~Species, plot = TRUE, plot_title = "Hellinger Distance Between Groups") #' #' # Example with the mtcars dataset #' chellinger(mtcars, ~am, plot = TRUE, plot_title = "Hellinger Distance Between Groups") #' #' @export chellinger <- function(dataset, formula, plot = TRUE, plot_title = "Hellinger Distance Between Groups") { # Verify that the input is a data frame if (!is.data.frame(dataset)) { stop("The input must be a dataframe") } # Extract the response and predictor variables from the formula response <- all.vars(formula)[1] predictors <- all.vars(formula)[-1] # Split the data into groups based on the response variable groups <- split(dataset, dataset[[response]]) # Select only numeric variables in each group groups <- lapply(groups, function(df) { df <- df[, sapply(df, is.numeric)] # Select only numeric columns return(df) }) # Replace missing values with the multiple imputation in each dataframe into the list groups <- lapply(groups, impute_with_multiple_imputation) # Obtain the number of groups n <- length(groups) # Precalculate means and standard deviations means <- lapply(groups, colMeans) # Create an empty matrix to store distances distances <- matrix(0, nrow = n, ncol = n) # Calculate Hellinger distance between each couple of groups for (i in 1:n) { mean_i <- means[[i]] # Precalculated mean in group i for (j in 1:n) { if (i != j) { mean_j <- means[[j]] # Calculate Hellinger distance hellinger_distance <- sqrt(0.5 * sum((sqrt(mean_i) - sqrt(mean_j))^2)) distances[i, j] <- hellinger_distance } } } # If plot is TRUE, call the "plot_hellinger_distances" function and print the plot if (plot) { print(plot_hellinger_distances(distances, plot_title)) } # Return a list containing distances return(list(distances = distances)) } # Auxiliary function to impute missing values with the multiple imputation impute_with_multiple_imputation <- function(df, m = 5, method = "cart") { # Multiple imputation using mice imp <- mice(df, m = m, method = method) imputedData <- complete(imp, action = 1) return(imputedData) } # Auxiliary function to print the Hellinger distances plot plot_hellinger_distances <- function(distances, plot_title) { requireNamespace("ggplot2") requireNamespace("reshape2") output_df <- as.data.frame(distances) output_df$Species <- rownames(output_df) output_df_long <- reshape2::melt(output_df, id.vars = "Species") colnames(output_df_long) <- c("Species", "Comparison", "Distance") ggplot2::ggplot(output_df_long, ggplot2::aes(x = Species, y = Distance, fill = Comparison)) + ggplot2::geom_bar(stat = "identity", position = "dodge") + ggplot2::labs(title = plot_title, x = "Species", y = "Distance", fill = "Comparison") + ggplot2::theme_minimal() } #' @name pvalueschell #' @title Calculate the p_values matrix for each species, using Hellinger distance as a base. #' @description #' This function takes a dataset, a factor, a p_value method, number of bootstraps and permutation when necessary, and returns a p_values matrix between each pair of species and a plot if the user select TRUE using Hellinger distance for the distances calculation. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate Hellinger distance. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @param plot if TRUE, plot the p_values heatmap. The default value is TRUE. #' @return A list containing a matrix of p_values and, optionally, the plot. #' @examples #' # Calculate p_values of "Species" variable in iris dataset #' pvalueschell(iris,~Species, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' # Calculate p_values of "am" variable in mtcars dataset #' pvalueschell(mtcars,~am, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' @export pvalueschell <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10, plot = TRUE) { # Verify that input is a dataframe if (!is.data.frame(dataset)) { stop("dataset must be a dataframe") } # Extract the response variable name by the formula response <- all.vars(formula)[1] # Verify the presence of the response variable if (!response %in% names(dataset)) { stop(paste("The response variable", response, "isn't present")) } # Calculate Hellinger distances using "chellinger" function chellinger_results <- chellinger(dataset, formula, plot = FALSE) distances <- chellinger_results$distances # Obtain the groups number n <- nrow(distances) # Initialize p_value matrix p_values <- matrix(NA, nrow = n, ncol = n) # Calculate p_values for (i in 1:n) { df <- ncol(dataset) - 1 # Degrees of freedom (adjusted for matrix covariance estimate) for (j in 1:n) { if (i != j) { # Choose p_values method calculation based user inputs if (pvalue.method == "chisq") { p_values[i, j] <- pchisq(distances[i, j], df, lower.tail = FALSE, log.p = TRUE) } else if (pvalue.method == "permutation") { # Permutation test observed_distance <- distances[i, j] permutation_distances <- replicate(num.permutations, { # Permute labels group and recalculate the distance permuted_data <- dataset permuted_data[[response]] <- sample(dataset[[response]]) permuted_results <- chellinger(permuted_data, formula, plot = FALSE) permuted_distances <- permuted_results$distances return(permuted_distances[i, j]) }) p_values[i, j] <- mean(permutation_distances >= observed_distance) } else if (pvalue.method == "bootstrap") { # Bootstrap observed_distance <- distances[i, j] bootstrap_distances <- replicate(num.bootstraps, { # Extract a sample with repetition bootstrap_sample <- sample(nrow(dataset), replace = TRUE) bootstrap_data <- dataset[bootstrap_sample, ] bootstrap_results <- chellinger(bootstrap_data, formula, plot = FALSE) bootstrap_distance <- bootstrap_results$distances[i, j] return(bootstrap_distance) }) p_values[i, j] <- mean(abs(bootstrap_distances) >= abs(observed_distance)) } else { stop("p_values calculation method not supported. Use 'chisq', 'permutation' or 'bootstrap'.") } } } } # Create heatmap if plot is TRUE if (plot) { requireNamespace("reshape2") requireNamespace("ggplot2") p_values_melt <- reshape2::melt(p_values, na.rm = TRUE) colnames(p_values_melt) <- c("Var1", "Var2", "value") print(ggplot2::ggplot(data = p_values_melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) + ggplot2::geom_tile() + ggplot2::scale_fill_gradient(low = "white", high = "red") + ggplot2::labs(title = "p_values heatmap", x = "", y = "", fill = "p_value") + ggplot2::theme_minimal() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))) } return(p_values) } #' @name generate_report_chellinger #' @title Generate a Microsoft Word document about the Hellinger distance matrix and the p-values matrix with corresponding plots. #' #' @description #' This function takes a dataframe, a factor and returns a Microsoft Word document about the Hellinger distance matrix and the p-values matrix with corresponding plots. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Hellinger distance matrix and the p_values matrix. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @return A Microsoft Word document about the Hellinger distance matrix and the p_values matrix. #' @examples #' # Generate a report about "Species" factor in iris dataset #' generate_report_chellinger(iris, ~Species) #' #' # Generate a report about "am" factor in mtcars dataset #' generate_report_chellinger(mtcars, ~am) #' #' @export generate_report_chellinger <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 10, num.bootstraps = 10) { requireNamespace("rmarkdown") chellinger_results <- chellinger(dataset, formula) distances <- chellinger_results if (pvalue.method == "chisq") { p_values <- pvalueschell(dataset, formula, pvalue.method = "chisq") } else if (pvalue.method == "permutation") { p_values <- pvalueschell(dataset, formula, pvalue.method = "permutation") } else if (pvalue.method == "bootstrap") { p_values <- pvalueschell(dataset, formula, pvalue.method = "bootstrap") } dir_path <- file.path(getwd(), "reportchellinger_files/figure-docx") if (!dir.exists(dir_path)) { dir.create(dir_path, recursive = TRUE) } template_path <- system.file("rmarkdown", "template_report_chellinger.Rmd", package = "cmahalanobis") if (file.exists(template_path)) { message("Template found at: ", template_path) } else { stop("Template not found!") } output_file <- "reportchellinger.docx" tryCatch({ rmarkdown::render(template_path, params = list(distances = distances, p_values = p_values), output_file = output_file) }, error = function(e) { message("Error during rendering: ", e$message) }) get_desktop_path <- function() { home <- Sys.getenv("HOME") sysname <- Sys.info()["sysname"] if (sysname == "Windows") { return(file.path(Sys.getenv("USERPROFILE"), "Desktop")) } else if (sysname == "Darwin") { return(file.path(home, "Desktop")) } else if (sysname == "Linux") { return(file.path(home, "Desktop")) } else { stop("Unsupported OS") } } desktop_path <- get_desktop_path() file.rename(output_file, file.path(desktop_path, output_file)) unlink("reportchellinger_files", recursive = TRUE) } #' @name cbraycurtis #' @title Calculate the Bray-Curtis distance for each species. #' @description #' This function takes a dataframe and a factor in input, and returns a matrix with the Bray-Curtis distances about it. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Bray-Curtis distances matrix. #' @param plot Logical, if TRUE, a plot of Bray-Curtis distances matrix is displayed. #' @param plot_title The title to be used for the plot if plot is TRUE. #' @return A matrix containing Bray-Curtis distances between each pair of groups and the plot. #' @examples #' # Example with the iris dataset #' #' cbraycurtis(iris, ~Species, plot = TRUE, plot_title = "Bray-Curtis Distance Between Groups") #' #' # Example with the mtcars dataset #' cbraycurtis(mtcars, ~am, plot = TRUE, plot_title = "Bray-Curtis Distance Between Groups") #' #' @export cbraycurtis <- function(dataset, formula, plot = TRUE, plot_title = "Bray-Curtis Distance Between Groups") { # Verify that the input is a data frame if (!is.data.frame(dataset)) { stop("The input must be a dataframe") } # Extract the response and predictor variables from the formula response <- all.vars(formula)[1] predictors <- all.vars(formula)[-1] # Split the data into groups based on the response variable groups <- split(dataset, dataset[[response]]) # Select only numeric variables in each group groups <- lapply(groups, function(df) { df <- df[, sapply(df, is.numeric)] # Select only numeric columns }) # Replace missing values with the multiple imputation in each dataframe into the list groups <- lapply(groups, impute_with_multiple_imputation) # Obtain the number of groups n <- length(groups) # Create an empty matrix to store distances distances <- matrix(0, nrow = n, ncol = n) # Calculate Bray-Curtis distance between each couple of groups for (i in 1:n) { for (j in 1:n) { if (i != j) { common_columns <- intersect(colnames(groups[[i]]), colnames(groups[[j]])) group_i <- rowSums(groups[[i]][, common_columns, drop = FALSE]) group_j <- rowSums(groups[[j]][, common_columns, drop = FALSE]) sum_abs_min <- sum(abs(group_i - group_j)) sum_total <- sum(group_i) + sum(group_j) bray_curtis_distance <- sum_abs_min / sum_total distances[i, j] <- bray_curtis_distance } } } # If plot is TRUE, call the "plot_braycurtis_distances" function and print the plot if (plot) { print(plot_braycurtis_distances(distances, plot_title)) } # Return a list containing distances return(list(distances = distances)) } # Auxiliary function to impute missing values with the multiple imputation impute_with_multiple_imputation <- function(df, m = 5, method = "cart") { # Multiple imputation using mice imp <- mice(df, m = m, method = method) imputedData <- complete(imp, action = 1) } # Auxiliary function to print the Bray-Curtis distances plot plot_braycurtis_distances <- function(distances, plot_title) { requireNamespace("ggplot2") requireNamespace("reshape2") output_df <- as.data.frame(distances) output_df$Species <- rownames(output_df) output_df_long <- reshape2::melt(output_df, id.vars = "Species") colnames(output_df_long) <- c("Species", "Comparison", "Distance") ggplot2::ggplot(output_df_long, ggplot2::aes(x = Species, y = Distance, fill = Comparison)) + ggplot2::geom_bar(stat = "identity", position = "dodge") + ggplot2::labs(title = plot_title, x = "Species", y = "Distance", fill = "Comparison") + ggplot2::theme_minimal() } #' @name pvaluescbrcu #' @title Calculate the p_values matrix for each species, using Bray-Curtis distance as a base. #' @description #' This function takes a dataset, a factor, a p_value method, number of bootstraps and permutation when necessary, and returns a p_values matrix between each pair of species and a plot if the user select TRUE using Bray-Curtis distance for the distances calculation. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate Bray-Curtis distance. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @param plot if TRUE, plot the p_values heatmap. The default value is TRUE. #' @return A list containing a matrix of p_values and, optionally, the plot. #' @examples #' # Calculate p_values of "Species" variable in iris dataset #' pvaluescbrcu(iris,~Species, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' # Calculate p_values of "am" variable in mtcars dataset #' pvaluescbrcu(mtcars,~am, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' @export pvaluescbrcu <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10, plot = TRUE) { # Verify that input is a dataframe if (!is.data.frame(dataset)) { stop("dataset must be a dataframe") } # Extract the response variable name by the formula response <- all.vars(formula)[1] # Verify the presence of the response variable if (!response %in% names(dataset)) { stop(paste("The response variable", response, "isn't present")) } # Calculate Bray-Curtis distances using "cbraycurtis" function cbraycurtis_results <- cbraycurtis(dataset, formula, plot = FALSE) distances <- cbraycurtis_results$distances # Obtain the groups number n <- nrow(distances) # Initialize p_value matrix p_values <- matrix(NA, nrow = n, ncol = n) # Calculate p_values for (i in 1:n) { df <- ncol(dataset) - 1 # Degrees of freedom (adjusted for matrix covariance estimate) for (j in 1:n) { if (i != j) { # Choose p_values method calculation based user inputs if (pvalue.method == "chisq") { p_values[i, j] <- pchisq(distances[i, j], df, lower.tail = FALSE, log.p = TRUE) } else if (pvalue.method == "permutation") { # Permutation test observed_distance <- distances[i, j] permutation_distances <- replicate(num.permutations, { # Permute labels group and recalculate the distance permuted_data <- dataset permuted_data[[response]] <- sample(dataset[[response]]) permuted_results <- cbraycurtis(permuted_data, formula, plot = FALSE) permuted_distances <- permuted_results$distances return(permuted_distances[i, j]) }) p_values[i, j] <- mean(permutation_distances >= observed_distance) } else if (pvalue.method == "bootstrap") { # Bootstrap observed_distance <- distances[i, j] bootstrap_distances <- replicate(num.bootstraps, { # Extract a sample with repetition bootstrap_sample <- sample(nrow(dataset), replace = TRUE) bootstrap_data <- dataset[bootstrap_sample, ] bootstrap_results <- cbraycurtis(bootstrap_data, formula, plot = FALSE) bootstrap_distance <- bootstrap_results$distances[i, j] return(bootstrap_distance) }) p_values[i, j] <- mean(abs(bootstrap_distances) >= abs(observed_distance)) } else { stop("p_values calculation method not supported. Use 'chisq', 'permutation' or 'bootstrap'.") } } } } # Create heatmap if plot is TRUE if (plot) { requireNamespace("reshape2") requireNamespace("ggplot2") p_values_melt <- reshape2::melt(p_values, na.rm = TRUE) colnames(p_values_melt) <- c("Var1", "Var2", "value") print(ggplot2::ggplot(data = p_values_melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) + ggplot2::geom_tile() + ggplot2::scale_fill_gradient(low = "white", high = "red") + ggplot2::labs(title = "p_values heatmap", x = "", y = "", fill = "p_value") + ggplot2::theme_minimal() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))) } return(p_values) } #' @name generate_report_cbraycurtis #' @title Generate a Microsoft Word document about the Bray-Curtis distance matrix and the p-values matrix with corresponding plots. #' #' @description #' This function takes a dataframe, a factor and returns a Microsoft Word document about the Bray-Curtis distance matrix and the p-values matrix with corresponding plots. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Bray-Curtis distance matrix and the p_values matrix. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @return A Microsoft Word document about the Bray-Curtis distance matrix and the p_values matrix. #' @examples #' # Generate a report about "Species" factor in iris dataset #' generate_report_cbraycurtis(iris, ~Species) #' #' # Generate a report about "am" factor in mtcars dataset #' generate_report_cbraycurtis(mtcars, ~am) #' #' @export generate_report_cbraycurtis <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 10, num.bootstraps = 10) { requireNamespace("rmarkdown") cbraycurtis_results <- cbraycurtis(dataset, formula) distances <- cbraycurtis_results if (pvalue.method == "chisq") { p_values <- pvaluescbrcu(dataset, formula, pvalue.method = "chisq") } else if (pvalue.method == "permutation") { p_values <- pvaluescbrcu(dataset, formula, pvalue.method = "permutation") } else if (pvalue.method == "bootstrap") { p_values <- pvaluescbrcu(dataset, formula, pvalue.method = "bootstrap") } dir_path <- file.path(getwd(), "reportcbraycurtis_files/figure-docx") if (!dir.exists(dir_path)) { dir.create(dir_path, recursive = TRUE) } template_path <- system.file("rmarkdown", "template_report_cbraycurtis.Rmd", package = "cmahalanobis") if (file.exists(template_path)) { message("Template found at: ", template_path) } else { stop("Template not found!") } output_file <- "reportcbraycurtis.docx" tryCatch({ rmarkdown::render(template_path, params = list(distances = distances, p_values = p_values), output_file = output_file) }, error = function(e) { message("Error during rendering: ", e$message) }) get_desktop_path <- function() { home <- Sys.getenv("HOME") sysname <- Sys.info()["sysname"] if (sysname == "Windows") { return(file.path(Sys.getenv("USERPROFILE"), "Desktop")) } else if (sysname == "Darwin") { return(file.path(home, "Desktop")) } else if (sysname == "Linux") { return(file.path(home, "Desktop")) } else { stop("Unsupported OS") } } desktop_path <- get_desktop_path() file.rename(output_file, file.path(desktop_path, output_file)) unlink("reportcbraycurtis_files", recursive = TRUE) } #' @name csorensendice #' @title Calculate the Sorensen-Dice distance for each species. #' @description #' This function takes a dataframe and a factor in input, and returns a matrix with the Sorensen-Dice distances about it. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Sorensen-Dice distances matrix. #' @param plot Logical, if TRUE, a plot of Sorensen-Dice distances matrix is displayed. #' @param plot_title The title to be used for the plot if plot is TRUE. #' @return A matrix containing Sorensen-Dice distances between each pair of groups and the plot. #' @examples #' # Example with the iris dataset #' #' csorensendice(iris, ~Species, plot = TRUE, plot_title = "Sorensen-Dice Distance Between Groups") #' #' # Example with the mtcars dataset #' csorensendice(mtcars, ~am, plot = TRUE, plot_title = "Sorensen-Dice Distance Between Groups") #' #' @export csorensendice <- function(dataset, formula, plot = TRUE, plot_title = "Sorensen-Dice Distance Between Groups") { # Verify that the input is a data frame if (!is.data.frame(dataset)) { stop("The input must be a dataframe") } # Extract the response and predictor variables from the formula response <- all.vars(formula)[1] predictors <- all.vars(formula)[-1] # Split the data into groups based on the response variable groups <- split(dataset, dataset[[response]]) # Select only numeric variables in each group groups <- lapply(groups, function(df) { df <- df[, sapply(df, is.numeric)] # Select only numeric columns return(df) }) # Replace missing values with the multiple imputation in each dataframe into the list groups <- lapply(groups, impute_with_multiple_imputation) # Obtain the number of groups n <- length(groups) # Create an empty matrix to store distances distances <- matrix(0, nrow = n, ncol = n) # Calculate Sorensen-Dice distance between each couple of groups for (i in 1:n) { for (j in 1:n) { if (i != j) { common_columns <- intersect(colnames(groups[[i]]), colnames(groups[[j]])) group_i <- rowSums(groups[[i]][, common_columns, drop = FALSE]) group_j <- rowSums(groups[[j]][, common_columns, drop = FALSE]) intersection_sum <- sum(pmin(group_i, group_j)) sorensen_dice_distance <- (2 * intersection_sum) / (sum(group_i) + sum(group_j)) distances[i, j] <- sorensen_dice_distance } } } # If plot is TRUE, call the "plot_sorensendice_distances" function and print the plot if (plot) { print(plot_sorensendice_distances(distances, plot_title)) } # Return a list containing distances return(list(distances = distances)) } # Auxiliary function to impute missing values with the multiple imputation impute_with_multiple_imputation <- function(df, m = 5, method = "cart") { # Multiple imputation using mice imp <- mice(df, m = m, method = method) imputedData <- complete(imp, action = 1) } # Auxiliary function to print the Sorensen-Dice distances plot plot_sorensendice_distances <- function(distances, plot_title) { requireNamespace("ggplot2") requireNamespace("reshape2") output_df <- as.data.frame(distances) output_df$Species <- rownames(output_df) output_df_long <- reshape2::melt(output_df, id.vars = "Species") colnames(output_df_long) <- c("Species", "Comparison", "Distance") ggplot2::ggplot(output_df_long, ggplot2::aes(x = Species, y = Distance, fill = Comparison)) + ggplot2::geom_bar(stat = "identity", position = "dodge") + ggplot2::labs(title = plot_title, x = "Species", y = "Distance", fill = "Comparison") + ggplot2::theme_minimal() } #' @name pvaluescsore #' @title Calculate the p_values matrix for each species, using Sorensen-Dice distance as a base. #' @description #' This function takes a dataset, a factor, a p_value method, number of bootstraps and permutation when necessary, and returns a p_values matrix between each pair of species and a plot if the user select TRUE using Sorensen-Dice distance for the distances calculation. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate Sorensen-Dice distance. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @param plot if TRUE, plot the p_values heatmap. The default value is TRUE. #' @return A list containing a matrix of p_values and, optionally, the plot. #' @examples #' # Calculate p_values of "Species" variable in iris dataset #' pvaluescsore(iris,~Species, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' # Calculate p_values of "am" variable in mtcars dataset #' pvaluescsore(mtcars,~am, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' @export pvaluescsore <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10, plot = TRUE) { # Verify that input is a dataframe if (!is.data.frame(dataset)) { stop("dataset must be a dataframe") } # Extract the response variable name by the formula response <- all.vars(formula)[1] # Verify the presence of the response variable if (!response %in% names(dataset)) { stop(paste("The response variable", response, "isn't present")) } # Calculate Sorensen-Dice distances using "csorensendice" function csorensendice_results <- csorensendice(dataset, formula, plot = FALSE) distances <- csorensendice_results$distances # Obtain the groups number n <- nrow(distances) # Initialize p_value matrix p_values <- matrix(NA, nrow = n, ncol = n) # Calculate p_values for (i in 1:n) { df <- ncol(dataset) - 1 # Degrees of freedom (adjusted for matrix covariance estimate) for (j in 1:n) { if (i != j) { # Choose p_values method calculation based user inputs if (pvalue.method == "chisq") { p_values[i, j] <- pchisq(distances[i, j], df, lower.tail = FALSE, log.p = TRUE) } else if (pvalue.method == "permutation") { # Permutation test observed_distance <- distances[i, j] permutation_distances <- replicate(num.permutations, { # Permute labels group and recalculate the distance permuted_data <- dataset permuted_data[[response]] <- sample(dataset[[response]]) permuted_results <- csorensendice(permuted_data, formula, plot = FALSE) permuted_distances <- permuted_results$distances return(permuted_distances[i, j]) }) p_values[i, j] <- mean(permutation_distances >= observed_distance) } else if (pvalue.method == "bootstrap") { # Bootstrap observed_distance <- distances[i, j] bootstrap_distances <- replicate(num.bootstraps, { # Extract a sample with repetition bootstrap_sample <- sample(nrow(dataset), replace = TRUE) bootstrap_data <- dataset[bootstrap_sample, ] bootstrap_results <- csorensendice(bootstrap_data, formula, plot = FALSE) bootstrap_distance <- bootstrap_results$distances[i, j] return(bootstrap_distance) }) p_values[i, j] <- mean(abs(bootstrap_distances) >= abs(observed_distance)) } else { stop("p_values calculation method not supported. Use 'chisq', 'permutation' or 'bootstrap'.") } } } } # Create heatmap if plot is TRUE if (plot) { requireNamespace("reshape2") requireNamespace("ggplot2") p_values_melt <- reshape2::melt(p_values, na.rm = TRUE) colnames(p_values_melt) <- c("Var1", "Var2", "value") print(ggplot2::ggplot(data = p_values_melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) + ggplot2::geom_tile() + ggplot2::scale_fill_gradient(low = "white", high = "red") + ggplot2::labs(title = "p_values heatmap", x = "", y = "", fill = "p_value") + ggplot2::theme_minimal() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))) } return(p_values) } #' @name generate_report_csorensendice #' @title Generate a Microsoft Word document about the Sorensen-Dice distance matrix and the p-values matrix with corresponding plots. #' #' @description #' This function takes a dataframe, a factor and returns a Microsoft Word document about the Sorensen-Dice distance matrix and the p-values matrix with corresponding plots. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Sorensen-Dice distance matrix and the p_values matrix. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @return A Microsoft Word document about the Sorensen-Dice distance matrix and the p_values matrix. #' @examples #' # Generate a report about "Species" factor in iris dataset #' generate_report_csorensendice(iris, ~Species) #' #' # Generate a report about "am" factor in mtcars dataset #' generate_report_csorensendice(mtcars, ~am) #' #' @export generate_report_csorensendice <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 10, num.bootstraps = 10) { requireNamespace("rmarkdown") csorensendice_results <- csorensendice(dataset, formula) distances <- csorensendice_results if (pvalue.method == "chisq") { p_values <- pvaluescsore(dataset, formula, pvalue.method = "chisq") } else if (pvalue.method == "permutation") { p_values <- pvaluescsore(dataset, formula, pvalue.method = "permutation") } else if (pvalue.method == "bootstrap") { p_values <- pvaluescsore(dataset, formula, pvalue.method = "bootstrap") } dir_path <- file.path(getwd(), "reportcsorensendice_files/figure-docx") if (!dir.exists(dir_path)) { dir.create(dir_path, recursive = TRUE) } template_path <- system.file("rmarkdown", "template_report_csorensendice.Rmd", package = "cmahalanobis") if (file.exists(template_path)) { message("Template found at: ", template_path) } else { stop("Template not found!") } output_file <- "reportcsorensendice.docx" tryCatch({ rmarkdown::render(template_path, params = list(distances = distances, p_values = p_values), output_file = output_file) }, error = function(e) { message("Error during rendering: ", e$message) }) get_desktop_path <- function() { home <- Sys.getenv("HOME") sysname <- Sys.info()["sysname"] if (sysname == "Windows") { return(file.path(Sys.getenv("USERPROFILE"), "Desktop")) } else if (sysname == "Darwin") { return(file.path(home, "Desktop")) } else if (sysname == "Linux") { return(file.path(home, "Desktop")) } else { stop("Unsupported OS") } } desktop_path <- get_desktop_path() file.rename(output_file, file.path(desktop_path, output_file)) unlink("reportcsorensendice_files", recursive = TRUE) }