diff --git a/tools/convex_dispersion/.shed.yml b/tools/convex_dispersion/.shed.yml new file mode 100644 index 000000000..0938c66c3 --- /dev/null +++ b/tools/convex_dispersion/.shed.yml @@ -0,0 +1,6 @@ +categories: [Metabolomics] +description: 'Generates convex hull plots of metabolite intensities relative to injection order.' +homepage_url: http://workflow4metabolomics.org +long_description: 'The R script generates convex hull plots of metabolite intensities relative to the injection order of their sample, across different batches. It also provides dispersion indicators of the intensity values based on the convex hull.' +name: convex_dispersion +owner: workflow4metabolomics diff --git a/tools/convex_dispersion/README.md b/tools/convex_dispersion/README.md new file mode 100644 index 000000000..1d1bbc99d --- /dev/null +++ b/tools/convex_dispersion/README.md @@ -0,0 +1,49 @@ +# Convex dispersion + +Metadata +----------- + + * **@name**: Convex dispersion + * **@galaxyID**: convex_dispersion + * **@version**: 0.1+galaxy1 + * **@authors**: Original code: Brice Mulot (PFEM - UNH - INRAE) - Maintainer: Etienne Jules (PFEM - UNH - INRAE - MetaboHUB) + * **@init date**: 2025, July + * **@main usage**: This tool displays convex hulls of metabolite intensities by injection order per batch. + + +Context +----------- + +This tool generates convex hull plots for metabolite intensity data by injection order across different batches. This can be used to assess the dispersion of ion intensity values on similar samples across batches and/or projects (QC, pool, reference materials).
+ +Configuration +----------- + +### Requirement: + * R software: version = 4.1.2 + * r-ggplot2 = 3.3.5 + * r-optparse = 1.6.6 + * r-dplyr = 1.0.10 + +Technical description +----------- + +Main files: + +- plot_convex_hull.R: R function (core script) +- plot_convex_hull.xml: XML wrapper (interface for Galaxy) + + +Services provided +----------- + + * Help and support: https://community.france-bioinformatique.fr/c/workflow4metabolomics/10 + + + +License +----------- + + * Cea Cnrs Inria Logiciel Libre License, version 2.1 (CECILL-2.1) + + diff --git a/tools/convex_dispersion/plot_convex_hull.R b/tools/convex_dispersion/plot_convex_hull.R new file mode 100644 index 000000000..5d934c4ac --- /dev/null +++ b/tools/convex_dispersion/plot_convex_hull.R @@ -0,0 +1,295 @@ +#!/usr/bin/env Rscript + +# Load required libraries +library(ggplot2) +library(optparse) +library(tools) +library(dplyr) + +#### ---- Define command-line options (three input tables, plot switches, output name) ---- +option_list <- list( + make_option(c("-q", "--dataMatrix"), + type = "character", + help = "dataMatrix containing the data", + metavar = "FILE"), + make_option(c("-s", "--sampleMetadata"), + type = "character", + help = "sampleMetadata containing the data", + metavar = "FILE"), + make_option(c("-v", "--variableMetadata"), + type = "character", + help = "variableMetadata containing the data", + metavar = "FILE"), + make_option(c("-g", "--global"), + type = "logical", + default = FALSE, + help = "Injection Order Global used", + metavar = "BOOL"), + make_option(c("-x", "--multiple"), + type = "logical", + default = FALSE, + help = "multiple plot depend of the number of Intensity", + metavar = "BOOL"), + make_option(c("-m", "--metabolite"), + type = "character", + default = "Metabolite X", + help = "Name of the metabolite for the plot title (e.g.
Intensity1)", + metavar = "NAME"), + make_option(c("-p", "--points"), + type = "logical", + default = TRUE, + help = "Display points (TRUE/FALSE)", + metavar = "BOOL"), + make_option(c("-o", "--output"), + type = "character", + default = "plot_convex_hull.pdf", + help = "Name of the output file", + metavar = "OUTPUT") +) + +opt_parser <- OptionParser(option_list = option_list) +opt <- parse_args(opt_parser) + +#### ---- Read Data: load a tab-separated table with a header; stop with a clear message if the file is missing or unreadable ---- +read_data_file <- function(file, description) { + if (!file.exists(file)) { + stop(paste(description, "file does not exist:", file)) + } + df <- tryCatch({ + read.table(file, header = TRUE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE) + }, error = function(e) { + stop(paste("Error reading", description, "file:", conditionMessage(e))) + }) + return(df) +} + +dataMatrix <- read_data_file(opt$dataMatrix, "dataMatrix") +sampleMetadata <- read_data_file(opt$sampleMetadata, "sampleMetadata") +variableMetadata <- read_data_file(opt$variableMetadata, "variableMetadata") + +#### ---- Verification for tests ---- +# Check the colnames: every column after the first must be named S1..Sn (seq_len keeps the loop empty when there is none) +required_cols_data <- character(ncol(dataMatrix) - 1) +for (i in seq_len(ncol(dataMatrix) - 1)) { + required_cols_data[i] <- paste0("S", i) +} +missing_cols <- setdiff(required_cols_data, names(dataMatrix)) +if (length(missing_cols) > 0) { + stop(paste("Error : Missing columns in the dataMatrix File :", paste(missing_cols, collapse = ", "))) +} + +required_cols_sample <- c("sampleMetadata", "InjectionOrder", "Batch") +missing_cols <- setdiff(required_cols_sample, names(sampleMetadata)) +if (length(missing_cols) > 0) { + stop(paste("Error : Missing columns in the sampleMetadata File :", paste(missing_cols, collapse = ", "))) +} + +required_cols_variable <- c("compoundname") +missing_cols <- setdiff(required_cols_variable, names(variableMetadata)) +if (length(missing_cols) > 0) { + stop(paste("Error : Missing columns in the variableMetadata File :", paste(missing_cols, collapse = ", "))) +} + +# Check that every sample column of the dataMatrix is
numeric +numeric_cols <- c(required_cols_data) +non_numeric <- numeric_cols[!vapply(dataMatrix[numeric_cols], is.numeric, logical(1))] +if (length(non_numeric) > 0) { + stop(paste("The following columns must be numeric :", paste(non_numeric, collapse = ", "))) +} + + +#### ---- Create data: transpose the dataMatrix (samples as rows), join it to the sampleMetadata, sort by injection order ---- +dataMatrix_t <- as.data.frame(t(dataMatrix[-1])) +colnames(dataMatrix_t) <- dataMatrix[[1]] +dataMatrix_t$sampleMetadata <- rownames(dataMatrix_t) +dataMatrix_t <- dataMatrix_t[, c("sampleMetadata", setdiff(names(dataMatrix_t), "sampleMetadata"))] +pool_s <- merge(sampleMetadata, dataMatrix_t, by = "sampleMetadata") +pool_s <- pool_s[order(pool_s$InjectionOrder), ] + +# Unless --global is set, renumber the injection order from 1 within each batch +if (!opt$global) { + pool_s$InjectionOrder <- ave( + pool_s$InjectionOrder, + pool_s$Batch, + FUN = function(x) seq_along(x) + ) +} + +#### ---- Function to plot the convex hull of the Intensity1 column per batch, annotate it and save it as a PDF ---- +plot_convex_hull <- function(data, metabolite_name, output_file, show_points) { + + # Base plot + plot <- ggplot(data, aes(x = .data$InjectionOrder, + y = .data$Intensity1, + color = .data$Batch)) + + labs(title = paste("Convex Hull by Batch for", metabolite_name), + x = "Injection Order", y = "Intensity") + + theme_minimal() + + # Add points if requested + if (show_points) { + plot <- plot + geom_point(size = 2) + } + + # Add convex hulls + plot <- plot + geom_polygon( + data = do.call(rbind, by(data, data$Batch, function(batch) { + hull_indices <- chull(batch$InjectionOrder, batch$Intensity1) + hull_indices <- c(hull_indices, hull_indices[1]) + batch[hull_indices, ] + })), + aes(x = .data$InjectionOrder, + y = .data$Intensity1, + fill = .data$Batch), + alpha = 0.2 + ) + + ## ---- Compute intraD ---- + intra_distances <- data %>% + group_by(Batch) %>% +
summarise( + x = mean(InjectionOrder), + y = mean(Intensity1) + ) + + if (nrow(centroids) > 1) { + interD <- mean(dist(cbind(centroids$x, centroids$y))) + } else { + interD <- NA + } + + ## ---- Compute Ratio ---- + ratio <- intraD / (1 + interD) + + # ---- Format indicators for annotation ---- + indicator_text <- paste0( + "intraD: ", round(intraD, 3), "\n", + "interD: ", round(interD, 3), "\n", + "Ratio: ", round(ratio, 3) + ) + + # Add annotation with the indicators + plot <- plot + annotate("text", + x = Inf, y = -Inf, + hjust = 1.1, vjust = -0.2, + label = indicator_text, + size = 3.5, color = "black") + + # ---- Save plot as PDF ---- + ggsave(filename = output_file, plot = plot, device = "pdf") +} + +#### ---- Functions for multiple metabolites: build one annotated convex hull plot per intensity column ---- + +plot_convex_hull_metabolite <- function(data, injection_order_col, intensity_col, metabolite_name) { + + # ---- Compute intraD ---- + intra_distances <- data %>% + group_by(Batch) %>% + summarise( + intraD = if (n() > 1) { + mean(dist(cbind(.data[[injection_order_col]], .data[[intensity_col]]))) + } else { + NA + } + ) + + intraD <- median(intra_distances$intraD, na.rm = TRUE) + + # ---- Compute interD ---- + centroids <- data %>% + group_by(Batch) %>% + summarise( + x = mean(.data[[injection_order_col]]), + y = mean(.data[[intensity_col]]) + ) + + if (nrow(centroids) > 1) { + interD <- mean(dist(cbind(centroids$x, centroids$y))) + } else { + interD <- NA + } + + # ---- Compute Ratio ---- + ratio <- if (!is.na(intraD) && !is.na(interD) && interD != 0) { + intraD / (1 + interD) + } else { + NA + } + + # ---- Format indicator text ---- + indicator_text <- paste0( + "intraD: ", ifelse(!is.na(intraD), round(intraD, 3), "NA"), "\n", + "interD: ", ifelse(!is.na(interD), round(interD, 3), "NA"), "\n", + "Ratio: ", ifelse(!is.na(ratio), round(ratio, 3), "NA") + ) + + # ---- Create base plot ---- + plot <- ggplot(data, aes_string(x = injection_order_col, y = intensity_col, color = "Batch")) + + geom_point(size = 2) + +
geom_polygon( + data = do.call(rbind, by(data, data$Batch, function(batch) { + if (nrow(batch) >= 3) { + hull_indices <- chull(batch[[injection_order_col]], batch[[intensity_col]]) + hull_indices <- c(hull_indices, hull_indices[1]) + batch[hull_indices, ] + } else { + batch[NULL, ] + } + })), + aes_string(x = injection_order_col, y = intensity_col, fill = "Batch"), + alpha = 0.5, inherit.aes = FALSE + ) + + labs(title = paste("Convex Hull for", metabolite_name), + x = "Injection Order", y = "Intensity") + + annotate("text", + x = Inf, y = -Inf, + hjust = 1.1, vjust = -0.2, + label = indicator_text, + size = 3.5, color = "black") + + theme_minimal() + + theme(legend.position = "bottom") + + return(plot) +} + +plot_convex_metabolites <- function(data, injection_order_col, intensity_cols, output_pdf = "convex_hulls.pdf") { + + pdf(output_pdf, width = 8, height = 6) + + for (intensity_col in intensity_cols) { + p <- plot_convex_hull_metabolite(data, injection_order_col, intensity_col, intensity_col) + print(p) + } + + dev.off() + cat("All plots saved in", output_pdf, "\n") +} +#### ---- Call plotting function (NOTE(review): with --multiple, opt$output is not forwarded and the PDF goes to the default "convex_hulls.pdf"; confirm the wrapper expects this) ---- +if (opt$multiple) { + intensity_cols <- grep("^Intensity", colnames(pool_s), value = TRUE) + injectionorder <- "InjectionOrder" + plot_convex_metabolites(pool_s, injectionorder, intensity_cols) +} else { + plot_convex_hull(pool_s, opt$metabolite, opt$output, opt$points) + if (!file.exists(opt$output)) { + stop("Plot file was not created: ", opt$output) + } + + cat("Plot saved as", opt$output, "\n") +} + + diff --git a/tools/convex_dispersion/plot_convex_hull.xml b/tools/convex_dispersion/plot_convex_hull.xml new file mode 100644 index 000000000..438ee66b2 --- /dev/null +++ b/tools/convex_dispersion/plot_convex_hull.xml @@ -0,0 +1,217 @@ + + This script plots a convex polygon around intensities based on injection order to visualize batch variability.
+ + topic_3172 + topic_0092 + + + + + + + r-base + r-ggplot2 + r-optparse + r-dplyr + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + `_ + | + +------------ +OUTPUT FILES +------------ + +The output is a pdf or png file and cannot be used as input to another tool. +The output can be used to assess the dispersion of intensity values of metabolites on similar repeated samples within batches (QC, pool, reference materials...) + +---------------------------- +EXAMPLE OF WORKFLOW POSITION +---------------------------- + +|- After an MS extraction workflow, before batch correction +|- After batch correction + +--------------------------------------------------- + +=============== +TOOL PARAMETERS +=============== +global + | Boolean + | Whether to use the global injection order or the injection order within the batch. + +multiple + | Boolean + | Whether to plot all the metabolites in different plots or on the same one. + +points + | Boolean + | Whether to display individual points for each intensity value on the plot. + +metabolite + | Name of the metabolite shown in the plot title (only used when 'multiple' is off). + +output_name + | Str + | Name of the output file. + + +--------------------------------------------------- + +================== +OUTPUT DESCRIPTION +================== + +List of the different possible outputs generated by the Galaxy module, with a content description. + +If information relevant to the users is displayed in the "standard output" when running the module, do not forget to mention it here.
+ +--------------------------------------------------- + +============== +EXAMPLE OF USE +============== + +Can be either the following: + +- a working with accessible inputs, the list of parameters to use and the display of expected outputs +- a step-by-step procedure to illustrate how to set the different parameters and what result should be expected +- a reference to existing examples or use-cases, for example specific sections of existing GTN materials using the tool + +If not mentioned elsewhere, you can also add a subsection regarding known issues (when relevant). + +Example: + +------------ +KNOWN ISSUES +------------ + +A list of issues users may run into when using the Galaxy module (*e.g.* case-sensitivity issues) + +--------------------------------------------------- + +========== +CHANGE LOG +========== + +------------------------------- +Version 0.1+galaxy1 +------------------------------- + +Initial version + + ]]> + + + + 10.1016/j.chemolab.2024.105148 + + + + diff --git a/tools/convex_dispersion/test-data/convexdispersion_dataMatrix.txt b/tools/convex_dispersion/test-data/convexdispersion_dataMatrix.txt new file mode 100644 index 000000000..d51dc37a3 --- /dev/null +++ b/tools/convex_dispersion/test-data/convexdispersion_dataMatrix.txt @@ -0,0 +1,3 @@ +dataMatrix S1 S2 S3 S4 S5 S6 S7 S8 S9 S10 S11 S12 S13 S14 S15 S16 S17 S18 S19 S20 +Intensity1 102.38 99.64 89.72 113.02 98.23 105.67 95.81 110.53 92.76 108.94 190.45 202.31 197.52 205.62 198.91 193.87 210.38 195.92 202.17 199.34 +Intensity2 96.11 110.52 118.61 103.62 91.98 84.27 119.68 80.52 93.32 97.03 204.10 213.40 197.71 205.32 207.13 190.33 186.92 178.41 185.36 213.71 diff --git a/tools/convex_dispersion/test-data/convexdispersion_sampleMetadata.txt b/tools/convex_dispersion/test-data/convexdispersion_sampleMetadata.txt new file mode 100644 index 000000000..772efe333 --- /dev/null +++ b/tools/convex_dispersion/test-data/convexdispersion_sampleMetadata.txt @@ -0,0 +1,21 @@ +sampleMetadata Batch 
InjectionOrder +S1 Batch1 1 +S2 Batch1 2 +S3 Batch1 3 +S4 Batch1 4 +S5 Batch1 5 +S6 Batch1 6 +S7 Batch1 7 +S8 Batch1 8 +S9 Batch1 9 +S10 Batch1 10 +S11 Batch2 11 +S12 Batch2 12 +S13 Batch2 13 +S14 Batch2 14 +S15 Batch2 15 +S16 Batch2 16 +S17 Batch2 17 +S18 Batch2 18 +S19 Batch2 19 +S20 Batch2 20 \ No newline at end of file diff --git a/tools/convex_dispersion/test-data/convexdispersion_variableMetadata.txt b/tools/convex_dispersion/test-data/convexdispersion_variableMetadata.txt new file mode 100644 index 000000000..cd363a177 --- /dev/null +++ b/tools/convex_dispersion/test-data/convexdispersion_variableMetadata.txt @@ -0,0 +1,4 @@ +dataMatrix compoundname compoundform +Intensity1 Mannitol [M+H-C2H10O5] +Intensity2 Valine [M+H-CH2O2] +