# cmahalanobis.R utils::globalVariables(c("Species", "Distance", "Comparison")) utils::globalVariables(c("Var1", "Var2", "value", "element_text")) utils::globalVariables(c("p_values", "distances")) #' @importFrom stats mahalanobis #' @importFrom mice mice #' @importFrom mice complete #' @importFrom stats cov #' @importFrom stats mahalanobis #' @importFrom stats cov #' @importFrom ggplot2 geom_bar #' @importFrom ggplot2 labs #' @importFrom ggplot2 theme_minimal #' @importFrom reshape2 melt #' @importFrom reshape2 melt #' #' @name cmahalanobis #' @title Calculate the Mahalanobis distance for each species. #' #' @description #' This function takes a dataframe and a factor in input, and returns a matrix with the Mahalanobis distances about it. #' #' #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Mahalanobis distances matrix. #' @param plot Logical, if TRUE, a plot of Mahalanobis distances matrix is displayed. #' @param plot_title The title to be used for the plot if plot is TRUE. #' @return A matrix containing Mahalanobis distances between each pair of groups and the plot. #' #' #' @examples #' # Example with the iris dataset #' #' data(iris) #' #' # Calculate the Mahalanobis distance with the cmahalanobis function #' cmahalanobis(iris, ~Species, plot = TRUE, plot_title = "Mahalanobis Distance Between Groups") #' #' # Example with the mtcars dataset #' data(mtcars) #' #' # Calculate the Mahalanobis distance with the cmahalanobis function #' cmahalanobis(mtcars, ~am, plot = TRUE, plot_title = "Mahalanobis Distance Between Groups") #' #' @export cmahalanobis <- function(dataset, formula, plot = TRUE, plot_title = "Mahalanobis Distance Between Groups") { # Verify that the input is a data frame if (!is.data.frame(dataset)) { stop("The input must be a dataframe") } # Extract the response and predictor variables from the formula response <- all.vars(formula)[1] predictors <- all.vars(formula)[-1] # Split the data into groups based on the response variable groups <- split(dataset, dataset[[response]]) # Select only numeric variables in each group groups <- lapply(groups, function(df) { df <- df[, sapply(df, is.numeric)] # Select only numeric columns return(df) }) # Replace missing values with arithmetic mean in each dataframe into the list groups <- lapply(groups, impute_with_mean) # Obtain the number of groups n <- length(groups) # Precalculate means and covariances means <- lapply(groups, colMeans) covariances <- lapply(groups, function(df) { cov_matrix <- cov(df) n <- nrow(cov_matrix) reg <- 0.01 # Regularization value cov_matrix <- cov_matrix + diag(reg, nrow = n) return(cov_matrix) }) # Create a empty matrix to store distances distances <- matrix(0, nrow = n, ncol = n) # Calculate Mahalanobis distance between each couple of groups for (i in 1:n) { mean_i <- means[[i]] cov_i <- covariances[[i]] for (j in 1:n) { if (i != j) { distances[i, j] <- mean(mahalanobis(x = groups[[j]], center = mean_i, cov = cov_i)) } } } # If plot is TRUE, call the "plot_mahalanobis_distances" function and print the plot if (plot) { print(plot_mahalanobis_distances(distances, plot_title)) } # Return a list containing distances return(list(distances = distances)) } impute_with_mean <- function(df) { # Calculate mean for each columns, ignoring missing values means <- colMeans(df, na.rm = TRUE) # Replace missing values with the corresponding mean for (i in 1:ncol(df)) { df[is.na(df[, i]), i] <- means[i] } return(df) } # Auxiliary function to print the Mahalanobis distances plot plot_mahalanobis_distances <- function(distances, plot_title) { requireNamespace("ggplot2") requireNamespace("reshape2") output_df <- as.data.frame(distances) output_df$Species <- rownames(output_df) output_df_long <- reshape2::melt(output_df, id.vars = "Species") colnames(output_df_long) <- c("Species", "Comparison", "Distance") ggplot2::ggplot(output_df_long, ggplot2::aes(x = Species, y = Distance, fill = Comparison)) + ggplot2::geom_bar(stat = "identity", position = "dodge") + ggplot2::labs(title = plot_title, x = "Species", y = "Distance", fill = "Comparison") + ggplot2::theme_minimal() } #' @name generate_report_cmahalanobis #' @title Generate a Microsoft Word document about Mahalanobis distance matrix and p-values matrix with corresponding plots. #' #' @description #' This function takes a dataframe, a factor and returns a Microsoft Word document about Mahalanobis distance matrix and p-values matrix with corresponding plots. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate Mahalanobis distances matrix and p_values matrix. #' @param pvalue.method A method with which you want to calculate pvalue matrix.The default method is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations A number of permutations to define if you choose "permutation". #' @param num.bootstraps A number of bootstrap to define if you choose "bootstrap". #' @return A Microsoft Word document about Mahalanobis distances matrix and p_values matrix. #' @examples #' # Generate a report about "Species" factor in iris dataset #' generate_report_cmahalanobis(iris, ~Species) #' #' # Generate a report about "am" factor in mtcars dataset #' generate_report_cmahalanobis(mtcars, ~am) #' #' @export generate_report_cmahalanobis <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 10, num.bootstraps = 10) { requireNamespace("rmarkdown") cmahalanobis_results <- cmahalanobis(dataset, formula) distances <- cmahalanobis_results if (pvalue.method == "chisq") { p_values <- pvaluescmaha(dataset, formula, pvalue.method = "chisq") } else if (pvalue.method == "permutation") { p_values <- pvaluescmaha(dataset, formula, pvalue.method = "permutation") } else if (pvalue.method == "bootstrap") { p_values <- pvaluescmaha(dataset, formula, pvalue.method = "bootstrap") } dir_path <- file.path(getwd(), "reportcmahalanobis_files/figure-docx") if (!dir.exists(dir_path)) { dir.create(dir_path, recursive = TRUE) } template_path <- system.file("rmarkdown", "template_report_cmahalanobis.Rmd", package = "cmahalanobis") if (file.exists(template_path)) { message("Template found at: ", template_path) } else { stop("Template not found!") } output_file <- "reportcmahalanobis.docx" tryCatch({ rmarkdown::render(template_path, params = list(distances = distances, p_values = p_values), output_file = output_file) }, error = function(e) { message("Error during rendering: ", e$message) }) get_desktop_path <- function() { home <- Sys.getenv("HOME") sysname <- Sys.info()["sysname"] if (sysname == "Windows") { return(file.path(Sys.getenv("USERPROFILE"), "Desktop")) } else if (sysname == "Darwin") { return(file.path(home, "Desktop")) } else if (sysname == "Linux") { return(file.path(home, "Desktop")) } else { stop("Unsupported OS") } } desktop_path <- get_desktop_path() file.rename(output_file, file.path(desktop_path, output_file)) unlink("reportcmahalanobis_files", recursive = TRUE) } #' @name pvaluescmaha #' @title Calculate p_values matrix for each species, using Mahalanobis distance as a base. #' #' @description #' This function takes a dataset, a factor, a p_value method, number of bootstraps and permutation when necessary, and returns a p_values matrix between each pair of the species and a plot if the user select TRUE using Mahalanobis distance for distances calculation. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Mahalanobis distances matrix. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq".Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @param plot if TRUE, plot a p_values heatmap. The default value is TRUE. #' @return A list containing the p-values matrix and, optionally, the plot. #' @examples #' # Calculate p_values of "Species" variable in iris dataset #' pvaluescmaha(iris,~Species, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' # Calculate p_values of "am" variable in mtcars dataset #' pvaluescmaha(mtcars,~am, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' @export pvaluescmaha <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10, plot = TRUE) { # Verify that the input is a dataframe if (!is.data.frame(dataset)) { stop("The input must be a dataframe") } # Extract the name of response variable by the formula response <- all.vars(formula)[1] # Verify that the response variable exists in the dataset if (!response %in% names(dataset)) { stop(paste("Response variable", response, "is absent in the dataset")) } # Calculate Mahalanobis distances using "cmahalanobis" function mahalanobis_results <- cmahalanobis(dataset, formula, plot = FALSE) distances <- mahalanobis_results$distances # Obtain groups number n <- nrow(distances) # Initialize p_values matrix p_values <- matrix(NA, nrow = n, ncol = n) # Calculate p_values for (i in 1:n) { df <- ncol(dataset) - 1 # Degrees of freedom (adjusted for covariance matrix estimate) for (j in 1:n) { if (i != j) { # Choose p_values method calculation based user input if (pvalue.method == "chisq") { p_values[i, j] <- pchisq(distances[i, j], df, lower.tail = FALSE, log.p = TRUE) } else if (pvalue.method == "permutation") { # Permutation test observed_distance <- distances[i, j] permutation_distances <- replicate(num.permutations, { # Permute labels group and recalculate the distance permuted_data <- dataset permuted_data[[response]] <- sample(dataset[[response]]) permuted_results <- cmahalanobis(permuted_data, formula, plot = FALSE) permuted_distances <- permuted_results$distances return(permuted_distances[i, j]) }) p_values[i, j] <- mean(permutation_distances >= observed_distance) } else if (pvalue.method == "bootstrap") { # Bootstrap observed_distance <- distances[i, j] bootstrap_distances <- replicate(num.bootstraps, { # Extract a repetition sample bootstrap_sample <- sample(nrow(dataset), replace = TRUE) bootstrap_data <- dataset[bootstrap_sample, ] bootstrap_results <- cmahalanobis(bootstrap_data, formula, plot = FALSE) bootstrap_distance <- bootstrap_results$distances[i, j] return(bootstrap_distance) }) p_values[i, j] <- mean(abs(bootstrap_distances) >= abs(observed_distance)) } else { stop("p_values method calculation not supported. Use 'chisq', 'permutation' or 'bootstrap'.") } } } } # Plot the heatmap if plot is TRUE if (plot) { requireNamespace("ggplot2") requireNamespace("reshape2") p_values_melt <- reshape2::melt(p_values, na.rm = TRUE) colnames(p_values_melt) <- c("Var1", "Var2", "value") print(ggplot2::ggplot(data = p_values_melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) + ggplot2::geom_tile() + ggplot2::scale_fill_gradient(low = "white", high = "red") + ggplot2::labs(title = "p_values heatmap", x = "", y = "", fill = "p_value") + ggplot2::theme_minimal() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))) } return(p_values) } #' Calculate Euclidean distance #' #' @name ceuclide #' @title Calculate the Euclidean distance of a factor in a dataframe. #' @description #' This function takes a dataframe and a factor in input, and returns a matrix with the Euclidean distances about it. #' @param dataset A dataframe. #' @param formula The factor which you want to calculate the Euclidean distances matrix. #' @param plot If TRUE, shows a plot of the Euclidean distances matrix. #' @param plot_title The title of the plot. #' @return The matrix containing distances. #' @examples #' #' # Example with iris dataset #' #' ceuclide(iris, ~Species, plot = TRUE, plot_title = "Euclidean Distance Between Groups") #' #' # Example with mtcars dataset #' #' ceuclide(mtcars, ~am, plot = TRUE, plot_title = "Euclidean Distance Between Groups") #' #' @export ceuclide <- function(dataset, formula, plot = TRUE, plot_title = "Euclidean Distance Between Groups") { # Verify that the input is a data frame if (!is.data.frame(dataset)) { stop("The input must be a dataframe") } # Extract the response and predictor variables from the formula response <- all.vars(formula)[1] predictors <- all.vars(formula)[-1] # Split the data into groups based on the response variable groups <- split(dataset, dataset[[response]]) # Select only numeric variables in each group groups <- lapply(groups, function(df) { df <- df[, sapply(df, is.numeric)] # Select only numeric columns return(df) }) # Replace missing values with arithmetic mean in each dataframe into the list groups <- lapply(groups, impute_with_multiple_imputation) # Obtain the number of groups n <- length(groups) # Precalculate means and covariances means <- lapply(groups, colMeans) covariances <- lapply(groups, function(df) { cov_matrix <- cov(df) n <- nrow(cov_matrix) reg <- 0.01 # Regularization value cov_matrix <- cov_matrix + diag(reg, nrow = n) return(cov_matrix) }) # Create a empty matrix to store distances distances <- matrix(0, nrow = n, ncol = n) # Calculate Euclidean distance between each couple of groups for (i in 1:n) { mean_i <- means[[i]] # Precalculated mean of group i for (j in 1:n) { if (i != j) { distances[i, j] <- sqrt(sum((mean_i - means[[j]])^2)) } } } # If plot is TRUE, call the "plot_euclidean_distances" function and print the plot if (plot) { print(plot_euclidean_distances(distances, plot_title)) } # Return a matrix containing distances return(list(distances = distances)) } # Auxiliary function to impute missing values with the mean impute_with_multiple_imputation <- function(df, m = 5, method = "cart") { # Multiple imputation using mice imp <- mice(df, m = m, method = method) imputedData <- complete(imp, action = 1) return(imputedData) } # Auxiliary function to print the Euclide distances plot plot_euclidean_distances <- function(distances, plot_title) { requireNamespace("ggplot2") requireNamespace("reshape2") output_df <- as.data.frame(distances) output_df$Species <- rownames(output_df) output_df_long <- reshape2::melt(output_df, id.vars = "Species") colnames(output_df_long) <- c("Species", "Comparison", "Distance") ggplot2::ggplot(output_df_long, ggplot2::aes(x = Species, y = Distance, fill = Comparison)) + ggplot2::geom_bar(stat = "identity", position = "dodge") + ggplot2::labs(title = plot_title, x = "Species", y = "Distance", fill = "Comparison") + ggplot2::theme_minimal() } #' @name generate_report_ceuclide #' @title Generate a Microsoft Word document about the Euclidean distance matrix and the p-values matrix with relative plots. #' #' @description #' This function takes a dataframe, a factor and returns a Microsoft Word document about the Euclidean distance matrix and the p-values matrix with relative plots. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Euclidean distance matrix and the p_values matrix. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq".Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @return A Microsoft Word document about the Euclidean distance matrix and the p_values matrix. #' @examples #' # Generate a report about "Species" factor in iris dataset #' generate_report_ceuclide(iris, ~Species) #' #' # Generate a report about "am" factor in mtcars dataset #' generate_report_ceuclide(mtcars, ~am) #' #' @export generate_report_ceuclide <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 10, num.bootstraps = 10) { requireNamespace("rmarkdown") ceuclide_results <- ceuclide(dataset, formula) distances <- ceuclide_results if (pvalue.method == "chisq") { p_values <- pvaluesceucl(dataset, formula, pvalue.method = "chisq") } else if (pvalue.method == "permutation") { p_values <- pvaluesceucl(dataset, formula, pvalue.method = "permutation") } else if (pvalue.method == "bootstrap") { p_values <- pvaluesceucl(dataset, formula, pvalue.method = "bootstrap") } dir_path <- file.path(getwd(), "reportceuclide_files/figure-docx") if (!dir.exists(dir_path)) { dir.create(dir_path, recursive = TRUE) } template_path <- system.file("rmarkdown", "template_report_ceuclide.Rmd", package = "cmahalanobis") if (file.exists(template_path)) { message("Template found at: ", template_path) } else { stop("Template not found!") } output_file <- "reportceuclide.docx" tryCatch({ rmarkdown::render(template_path, params = list(distances = distances, p_values = p_values), output_file = output_file) }, error = function(e) { message("Error during rendering: ", e$message) }) get_desktop_path <- function() { home <- Sys.getenv("HOME") sysname <- Sys.info()["sysname"] if (sysname == "Windows") { return(file.path(Sys.getenv("USERPROFILE"), "Desktop")) } else if (sysname == "Darwin") { return(file.path(home, "Desktop")) } else if (sysname == "Linux") { return(file.path(home, "Desktop")) } else { stop("Unsupported OS") } } desktop_path <- get_desktop_path() file.rename(output_file, file.path(desktop_path, output_file)) unlink("reportceuclide_files", recursive = TRUE) } #' @name pvaluesceucl #' @title Calculate the p_values matrix for each species, using the Euclidean distance as a base. #' @description #' This function takes a dataset, a factor, a p_value method, number of bootstraps and permutation when necessary, and returns a p_values matrix between each pair of species and a plot if the user select TRUE using Euclidean distance for the distances calculation. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Euclidean distances. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @param plot if TRUE, plot the p_values heatmap. The default value is TRUE. #' @return A list containing the p_values matrix and, optionally, the plot. #' #' @examples #' # Calculate p_values of "Species" variable in iris dataset #' pvaluesceucl(iris,~Species, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' # Calculate p_values of "am" variable in mtcars dataset #' pvaluesceucl(mtcars,~am, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' #' @export pvaluesceucl <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10, plot = TRUE) { # Verify that input is a dataframe if (!is.data.frame(dataset)) { stop("dataset must be a dataframe") } # Extract the response variable name by the formula response <- all.vars(formula)[1] # Verify the presence of the response variable if (!response %in% names(dataset)) { stop(paste("The response variable", response, "isn't present")) } # Calculate Euclidean distances using "ceuclide" function ceuclide_results <- ceuclide(dataset, formula, plot = FALSE) distances <- ceuclide_results$distances # Obtain the groups number n <- nrow(distances) # Initialize p_value matrix p_values <- matrix(NA, nrow = n, ncol = n) # Calculate p_values for (i in 1:n) { df <- ncol(dataset) - 1 # Degrees of freedom (adjusted for matrix covariance estimate) for (j in 1:n) { if (i != j) { # Choose p_values method calculation based user inputs if (pvalue.method == "chisq") { p_values[i, j] <- pchisq(distances[i, j], df, lower.tail = FALSE, log.p = TRUE) } else if (pvalue.method == "permutation") { # Permutation test observed_distance <- distances[i, j] permutation_distances <- replicate(num.permutations, { # Permute labels group and recalculate the distance permuted_data <- dataset permuted_data[[response]] <- sample(dataset[[response]]) permuted_results <- ceuclide(permuted_data, formula, plot = FALSE) permuted_distances <- permuted_results$distances return(permuted_distances[i, j]) }) p_values[i, j] <- mean(permutation_distances >= observed_distance) } else if (pvalue.method == "bootstrap") { # Bootstrap observed_distance <- distances[i, j] bootstrap_distances <- replicate(num.bootstraps, { # Extract a sample with repetition bootstrap_sample <- sample(nrow(dataset), replace = TRUE) bootstrap_data <- dataset[bootstrap_sample, ] bootstrap_results <- ceuclide(bootstrap_data, formula, plot = FALSE) bootstrap_distance <- bootstrap_results$distances[i, j] return(bootstrap_distance) }) p_values[i, j] <- mean(abs(bootstrap_distances) >= abs(observed_distance)) } else { stop("p_values calculation method not supported. Use 'chisq', 'permutation' or 'bootstrap'.") } } } } # Create heatmap if plot is TRUE if (plot) { requireNamespace("reshape2") requireNamespace("ggplot2") p_values_melt <- reshape2::melt(p_values, na.rm = TRUE) colnames(p_values_melt) <- c("Var1", "Var2", "value") print(ggplot2::ggplot(data = p_values_melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) + ggplot2::geom_tile() + ggplot2::scale_fill_gradient(low = "white", high = "red") + ggplot2::labs(title = "p_values heatmap", x = "", y = "", fill = "p_value") + ggplot2::theme_minimal() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))) } return(p_values) } #' Calculate Manhattan distance #' @name cmanhattan #' @title Calculate a Manhattan distance of a factor in a dataframe. #' @description This function takes a dataframe and a factor in input, and returns a matrix with the Manhattan distances about it. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate Manhattan distance. #' @param plot If TRUE, show a plot of distances. #' @param plot_title The title of plot. #' @return A matrix containing distances. #' @examples #' # Example with iris dataset #' #' cmanhattan(iris, ~Species, plot = TRUE, plot_title = "Manhattan Distance Between Groups") #' #' # Example with mtcars dataset #' #' cmanhattan(mtcars, ~am, plot = TRUE, plot_title = "Manhattan Distance Between Groups") #' #' @export cmanhattan <- function(dataset, formula, plot = TRUE, plot_title = "Manhattan Distance Between Groups") { # Verify that the input is a data frame if (!is.data.frame(dataset)) { stop("The input must be a dataframe") } # Extract the response and predictor variables from the formula response <- all.vars(formula)[1] predictors <- all.vars(formula)[-1] # Split the data into groups based on the response variable groups <- split(dataset, dataset[[response]]) # Select only numeric variables in each group groups <- lapply(groups, function(df) { df <- df[, sapply(df, is.numeric)] # Select only numeric columns return(df) }) # Replace missing values with arithmetic mean in each dataframe into the list groups <- lapply(groups, impute_with_multiple_imputation) # Obtain the number of groups n <- length(groups) # Precalculate means and covariances means <- lapply(groups, colMeans) covariances <- lapply(groups, function(df) { cov_matrix <- cov(df) n <- nrow(cov_matrix) reg <- 0.01 # Regularization value cov_matrix <- cov_matrix + diag(reg, nrow = n) return(cov_matrix) }) # Create a empty matrix to store distances distances <- matrix(0, nrow = n, ncol = n) # Calculate Manhattan distance between each couple of groups for (i in 1:n) { mean_i <- means[[i]] # Precalculated mean in the group i for (j in 1:n) { if (i != j) { distances[i, j] <- sum(abs(mean_i - means[[j]])) } } } # If plot is TRUE, call the "plot_manhattan_distances" function and print the plot if (plot) { print(plot_manhattan_distances(distances, plot_title)) } # Return a matrix containing distances return(list(distances = distances)) } # Auxiliary function to impute missing values with the mean impute_with_multiple_imputation <- function(df, m = 5, method = "cart") { # Imputazione multipla utilizzando mice imp <- mice(df, m = m, method = method) imputedData <- complete(imp, action = 1) return(imputedData) } # Auxiliary function to print the Manhattan distances plot plot_manhattan_distances <- function(distances, plot_title) { requireNamespace("ggplot2") requireNamespace("reshape2") output_df <- as.data.frame(distances) output_df$Species <- rownames(output_df) output_df_long <- reshape2::melt(output_df, id.vars = "Species") colnames(output_df_long) <- c("Species", "Comparison", "Distance") ggplot2::ggplot(output_df_long, ggplot2::aes(x = Species, y = Distance, fill = Comparison)) + ggplot2::geom_bar(stat = "identity", position = "dodge") + ggplot2::labs(title = plot_title, x = "Species", y = "Distance", fill = "Comparison") + ggplot2::theme_minimal() } #' @name generate_report_cmanhattan #' @title Generate a Microsoft Word document about the Manhattan distance and the p-values matrices with corresponding plots. #' #' @description #' This function takes a dataframe, a factor and returns a Microsoft Word document about the Manhattan distance matrix and the p-values matrix with corresponding plots. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Manhattan distance matrix and the p_values matrix. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq".Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @return A Microsoft Word document about the Manhattan distance matrix and the p_values matrix. #' @examples #' # Generate a report about "Species" factor in iris dataset #' generate_report_cmanhattan(iris, ~Species) #' #' # Generate a report about "am" factor in mtcars dataset #' generate_report_cmanhattan(mtcars, ~am) #' #' @export generate_report_cmanhattan <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 10, num.bootstraps = 10) { requireNamespace("rmarkdown") cmanhattan_results <- cmanhattan(dataset, formula) distances <- cmanhattan_results if (pvalue.method == "chisq") { p_values <- pvaluescmanh(dataset, formula, pvalue.method = "chisq") } else if (pvalue.method == "permutation") { p_values <- pvaluescmanh(dataset, formula, pvalue.method = "permutation") } else if (pvalue.method == "bootstrap") { p_values <- pvaluescmanh(dataset, formula, pvalue.method = "bootstrap") } dir_path <- file.path(getwd(), "reportcmanhattan_files/figure-docx") if (!dir.exists(dir_path)) { dir.create(dir_path, recursive = TRUE) } template_path <- system.file("rmarkdown", "template_report_cmanhattan.Rmd", package = "cmahalanobis") if (file.exists(template_path)) { message("Template found at: ", template_path) } else { stop("Template not found!") } output_file <- "reportcmanhattan.docx" tryCatch({ rmarkdown::render(template_path, params = list(distances = distances, p_values = p_values), output_file = output_file) }, error = function(e) { message("Error during rendering: ", e$message) }) get_desktop_path <- function() { home <- Sys.getenv("HOME") sysname <- Sys.info()["sysname"] if (sysname == "Windows") { return(file.path(Sys.getenv("USERPROFILE"), "Desktop")) } else if (sysname == "Darwin") { return(file.path(home, "Desktop")) } else if (sysname == "Linux") { return(file.path(home, "Desktop")) } else { stop("Unsupported OS") } } desktop_path <- get_desktop_path() file.rename(output_file, file.path(desktop_path, output_file)) unlink("reportcmanhattan_files", recursive = TRUE) } #' @name pvaluescmanh #' @title Calculate the p_values matrix for each species, using Manhattan distance as a base. #' @description #' This function takes a dataset, a factor, a p_value method, number of bootstraps and permutation when necessary, and returns a p_values matrix between each pair of species and a plot if the user select TRUE using Manhattan distance for the distances calculation. #' @param dataset A dataframe #' @param formula A factor which you want to calculate Manhattan distances. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @param plot if TRUE, plot the p_values heatmap. The default value is TRUE. #' @return A matrix containing a matrix of p_values and, optionally, the plot. #' @examples #' # Calculate p_values of "Species" variable in iris dataset #' pvaluescmanh(iris,~Species, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' # Calculate p_values of "am" variable in mtcars dataset #' pvaluescmanh(mtcars,~am, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' #' @export pvaluescmanh <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10, plot = TRUE) { # Verify that input is a dataframe if (!is.data.frame(dataset)) { stop("dataset must be a dataframe") } # Extract the response variable name by the formula response <- all.vars(formula)[1] # Verify the presence of the response variable if (!response %in% names(dataset)) { stop(paste("The response variable", response, "isn't present")) } # Calculate Manhattan distances using "cmanhattan" function cmanhattan_results <- cmanhattan(dataset, formula, plot = FALSE) distances <- cmanhattan_results$distances # Obtain the groups number n <- nrow(distances) # Initialize p_value matrix p_values <- matrix(NA, nrow = n, ncol = n) # Calculate p_values for (i in 1:n) { df <- ncol(dataset) - 1 # Degrees of freedom (adjusted for matrix covariance estimate) for (j in 1:n) { if (i != j) { # Choose p_values method calculation based user inputs if (pvalue.method == "chisq") { p_values[i, j] <- pchisq(distances[i, j], df, lower.tail = FALSE, log.p = TRUE) } else if (pvalue.method == "permutation") { # Permutation test observed_distance <- distances[i, j] permutation_distances <- replicate(num.permutations, { # Permute labels group and recalculate the distance permuted_data <- dataset permuted_data[[response]] <- sample(dataset[[response]]) permuted_results <- cmanhattan(permuted_data, formula, plot = FALSE) permuted_distances <- permuted_results$distances return(permuted_distances[i, j]) }) p_values[i, j] <- mean(permutation_distances >= observed_distance) } else if (pvalue.method == "bootstrap") { # Bootstrap observed_distance <- distances[i, j] bootstrap_distances <- replicate(num.bootstraps, { # Extract a sample with repetition bootstrap_sample <- sample(nrow(dataset), replace = TRUE) bootstrap_data <- dataset[bootstrap_sample, ] bootstrap_results <- cmanhattan(bootstrap_data, formula, plot = FALSE) bootstrap_distance <- bootstrap_results$distances[i, j] return(bootstrap_distance) }) p_values[i, j] <- mean(abs(bootstrap_distances) >= abs(observed_distance)) } else { stop("p_values calculation method not supported. Use 'chisq', 'permutation' or 'bootstrap'.") } } } } # Create heatmap if plot is TRUE if (plot) { requireNamespace("reshape2") requireNamespace("ggplot2") p_values_melt <- reshape2::melt(p_values, na.rm = TRUE) colnames(p_values_melt) <- c("Var1", "Var2", "value") print(ggplot2::ggplot(data = p_values_melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) + ggplot2::geom_tile() + ggplot2::scale_fill_gradient(low = "white", high = "red") + ggplot2::labs(title = "p_values heatmap", x = "", y = "", fill = "p_value") + ggplot2::theme_minimal() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))) } return(p_values) } #' @name cchebyshev #' @title Calculate the p_values matrix for each species, using Chebyshev distance as a base. #' @description #' This function takes a dataset, a factor, a p_value method, number of bootstraps and permutation when necessary, and returns a p_values matrix between each pair of species and a plot if the user select TRUE using the Chebyshev distance for the distances calculation. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate Chebyshev distance. #' @param plot If TRUE, displays a plot of distances. #' @param plot_title The title of plot. #' @return A matrix containing distances and, optionally, the plot. #' @examples #' # Example with iris dataset #' #' cchebyshev(iris, ~Species, plot = TRUE, plot_title = "Chebyshev Distance Between Groups") #' #' # Example with mtcars dataset #' #' cchebyshev(mtcars, ~am, plot = TRUE, plot_title = "Chebyshev Distance Between Groups") #' #' #' @export cchebyshev <- function(dataset, formula, plot = TRUE, plot_title = "Chebyshev Distance Between Groups") { # Verify that the input is a data frame if (!is.data.frame(dataset)) { stop("The input must be a dataframe") } # Extract the response and predictor variables from the formula response <- all.vars(formula)[1] predictors <- all.vars(formula)[-1] # Split the data into groups based on the response variable groups <- split(dataset, dataset[[response]]) # Select only numeric variables in each group groups <- lapply(groups, function(df) { df <- df[, sapply(df, is.numeric)] # Select only numeric columns return(df) }) # Replace missing values with arithmetic mean in each dataframe into the list groups <- lapply(groups, impute_with_multiple_imputation) # Obtain the number of groups n <- length(groups) # Precalculate means and covariances means <- lapply(groups, colMeans) covariances <- lapply(groups, function(df) { cov_matrix <- cov(df) n <- nrow(cov_matrix) reg <- 0.01 # Regularization value cov_matrix <- cov_matrix + diag(reg, nrow = n) return(cov_matrix) }) # Create a empty matrix to store distances distances <- matrix(0, nrow = n, ncol = n) # Calculate Chebyshev distance between each couple of groups for (i in 1:n) { mean_i <- means[[i]] # Precalculated mean in the group i for (j in 1:n) { if (i != j) { distances[i, j] <- max(abs(mean_i - means[[j]])) } } } # If plot is TRUE, call the "plot_chebyshev_distances" function and print the plot if (plot) { print(plot_chebyshev_distances(distances, plot_title)) } # Return a list containing distances return(list(distances = distances)) } # Auxiliary function to impute missing values with the mean impute_with_multiple_imputation <- function(df, m = 5, method = "cart") { # Multiple imputation using mice imp <- mice(df, m = m, method = method) imputedData <- complete(imp, action = 1) return(imputedData) } # Auxiliary function to print the Mahalanobis distances plot plot_chebyshev_distances <- function(distances, plot_title) { requireNamespace("ggplot2") requireNamespace("reshape2") output_df <- as.data.frame(distances) output_df$Species <- rownames(output_df) output_df_long <- reshape2::melt(output_df, id.vars = "Species") colnames(output_df_long) <- c("Species", "Comparison", "Distance") ggplot2::ggplot(output_df_long, ggplot2::aes(x = Species, y = Distance, fill = Comparison)) + ggplot2::geom_bar(stat = "identity", position = "dodge") + ggplot2::labs(title = plot_title, x = "Species", y = "Distance", fill = "Comparison") + ggplot2::theme_minimal() } #' @name generate_report_cchebyshev #' @title Generate a Microsoft Word document about the Chebyshev distance matrix and the p-values matrix with corresponding plots. #' #' @description #' This function takes a dataframe, a factor and returns a Microsoft Word document about the Chebyshev distance matrix and the p-values matrix with corresponding plots. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate the Chebyshev distance matrix and the p_values matrix. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @return A Microsoft Word document about the Chebyshev distance matrix and the p_values matrix. #' @examples #' # Generate a report about "Species" factor in iris dataset #' generate_report_cchebyshev(iris, ~Species) #' #' # Generate a report about "am" factor in mtcars dataset #' generate_report_cchebyshev(mtcars, ~am) #' #' @export generate_report_cchebyshev <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 10, num.bootstraps = 10) { requireNamespace("rmarkdown") cchebyshev_results <- cchebyshev(dataset, formula) distances <- cchebyshev_results if (pvalue.method == "chisq") { p_values <- pvaluesccheb(dataset, formula, pvalue.method = "chisq") } else if (pvalue.method == "permutation") { p_values <- pvaluesccheb(dataset, formula, pvalue.method = "permutation") } else if (pvalue.method == "bootstrap") { p_values <- pvaluesccheb(dataset, formula, pvalue.method = "bootstrap") } dir_path <- file.path(getwd(), "reportcchebyshev_files/figure-docx") if (!dir.exists(dir_path)) { dir.create(dir_path, recursive = TRUE) } template_path <- system.file("rmarkdown", "template_report_cchebyshev.Rmd", package = "cmahalanobis") if (file.exists(template_path)) { message("Template found at: ", template_path) } else { stop("Template not found!") } output_file <- "reportcchebyshev.docx" tryCatch({ rmarkdown::render(template_path, params = list(distances = distances, p_values = p_values), output_file = output_file) }, error = function(e) { message("Error during rendering: ", e$message) }) get_desktop_path <- function() { home <- Sys.getenv("HOME") sysname <- Sys.info()["sysname"] if (sysname == "Windows") { return(file.path(Sys.getenv("USERPROFILE"), "Desktop")) } else if (sysname == "Darwin") { return(file.path(home, "Desktop")) } else if (sysname == "Linux") { return(file.path(home, "Desktop")) } else { stop("Unsupported OS") } } desktop_path <- get_desktop_path() file.rename(output_file, file.path(desktop_path, output_file)) unlink("reportcchebyshev_files", recursive = TRUE) } #' @name pvaluesccheb #' @title Calculate the p_values matrix for each species, using Chebyshev distance as a base. #' @description #' This function takes a dataset, a factor, a p_value method, number of bootstraps and permutation when necessary, and returns a p_values matrix between each pair of species and a plot if the user select TRUE using Chebyshev distance for the distances calculation. #' @param dataset A dataframe. #' @param formula A factor which you want to calculate Chebyshev distance. #' @param pvalue.method A p_value method used to calculate the matrix, the default value is "chisq". Other methods are "permutation" and "bootstrap". #' @param num.permutations Number of permutation to specify if you select "permutation" in "pvalue.method". The default value is 100. #' @param num.bootstraps Number of bootstrap to specify if you select "bootstrap" in "p_value method". The default value is 10. #' @param plot if TRUE, plot the p_values heatmap. The default value is TRUE. #' @return A list containing a matrix of p_values and, optionally, the plot. #' @examples #' # Calculate p_values of "Species" variable in iris dataset #' pvaluesccheb(iris,~Species, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' # Calculate p_values of "am" variable in mtcars dataset #' pvaluesccheb(mtcars,~am, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10) #' @export pvaluesccheb <- function(dataset, formula, pvalue.method = "chisq", num.permutations = 100, num.bootstraps = 10, plot = TRUE) { # Verify that input is a dataframe if (!is.data.frame(dataset)) { stop("dataset must be a dataframe") } # Extract the response variable name by the formula response <- all.vars(formula)[1] # Verify the presence of the response variable if (!response %in% names(dataset)) { stop(paste("The response variable", response, "isn't present")) } # Calculate Chebyshev distances using "cchebyshev" function cchebyshev_results <- cchebyshev(dataset, formula, plot = FALSE) distances <- cchebyshev_results$distances # Obtain the groups number n <- nrow(distances) # Initialize p_value matrix p_values <- matrix(NA, nrow = n, ncol = n) # Calculate p_values for (i in 1:n) { df <- ncol(dataset) - 1 # Degrees of freedom (adjusted for matrix covariance estimate) for (j in 1:n) { if (i != j) { # Choose p_values method calculation based user inputs if (pvalue.method == "chisq") { p_values[i, j] <- pchisq(distances[i, j], df, lower.tail = FALSE, log.p = TRUE) } else if (pvalue.method == "permutation") { # Permutation test observed_distance <- distances[i, j] permutation_distances <- replicate(num.permutations, { # Permute labels group and recalculate the distance permuted_data <- dataset permuted_data[[response]] <- sample(dataset[[response]]) permuted_results <- cchebyshev(permuted_data, formula, plot = FALSE) permuted_distances <- permuted_results$distances return(permuted_distances[i, j]) }) p_values[i, j] <- mean(permutation_distances >= observed_distance) } else if (pvalue.method == "bootstrap") { # Bootstrap observed_distance <- distances[i, j] bootstrap_distances <- replicate(num.bootstraps, { # Extract a sample with repetition bootstrap_sample <- sample(nrow(dataset), replace = TRUE) bootstrap_data <- dataset[bootstrap_sample, ] bootstrap_results <- cchebyshev(bootstrap_data, formula, plot = FALSE) bootstrap_distance <- bootstrap_results$distances[i, j] return(bootstrap_distance) }) p_values[i, j] <- mean(abs(bootstrap_distances) >= abs(observed_distance)) } else { stop("p_values calculation method not supported. Use 'chisq', 'permutation' or 'bootstrap'.") } } } } # Create heatmap if plot is TRUE if (plot) { requireNamespace("reshape2") requireNamespace("ggplot2") p_values_melt <- reshape2::melt(p_values, na.rm = TRUE) colnames(p_values_melt) <- c("Var1", "Var2", "value") print(ggplot2::ggplot(data = p_values_melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) + ggplot2::geom_tile() + ggplot2::scale_fill_gradient(low = "white", high = "red") + ggplot2::labs(title = "p_values heatmap", x = "", y = "", fill = "p_value") + ggplot2::theme_minimal() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))) } return(p_values) }