diff --git a/DESCRIPTION b/DESCRIPTION index 6deb90e..9c93b36 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -4,7 +4,8 @@ Version: 1.0.0 Authors@R: c( person("Karin", "Schork", , "karin.schork@rub.de", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-3756-4347")), - person("Katharina", "Neuhaus", , "katharina.neuhaus@rub.de", role = c("aut"))) + person("Katharina", "Neuhaus", , "katharina.neuhaus@rub.de", role = c("aut")), + person("Martin", "Eisenacher", , "martin.eisenacher@rub.de", role=c("aut", "fnd"))) Description: This package offers workflows for the statistical analysis of proteomics data. License: MIT + file LICENSE Encoding: UTF-8 diff --git a/R/WIP_ANOVA.R b/R/ANOVA.R similarity index 95% rename from R/WIP_ANOVA.R rename to R/ANOVA.R index 965ddcb..6208b08 100644 --- a/R/WIP_ANOVA.R +++ b/R/ANOVA.R @@ -24,14 +24,12 @@ #' The minimum number of observations per group. #' @param min_perc_per_group \strong{integer} \cr #' The minimum ratio of observations per group as an alternative to min_obs_per_group. -#' @param filename \strong{character} \cr -#' The name of the output file. #' #' @return A data.frame with p-values and fold changes #' @export #' #' @examples -#' +#' ANOVA <- function(D, id = NULL, @@ -43,8 +41,7 @@ ANOVA <- function(D, delog_for_FC = TRUE, log_base = 2, min_obs_per_group = 3, - min_perc_per_group = NULL, - filename = "results_ANOVA.xlsx") { + min_perc_per_group = NULL) { if (!is.null(min_obs_per_group) & !is.null(min_perc_per_group)) stop("Please specify only one of min_obs_per_group or min_perc_per_group.") @@ -68,6 +65,7 @@ ANOVA <- function(D, } else { ### Welch ANOVA (unequal variances) print("Welch ANOVA") + i <<- 0 RES <- pbapply::pbapply(D, 1, ANOVA_Welch_single_row, group = group, log_before_test = log_before_test, delog_for_FC = delog_for_FC, min_obs_per_group = min_obs_per_group, min_perc_per_group = min_perc_per_group, @@ -84,7 +82,7 @@ ANOVA <- function(D, if (!is.null(id)) { D <- cbind(id, D) } - openxlsx::write.xlsx(cbind(D, RES), filename, keepNA = TRUE, overwrite = TRUE) + #openxlsx::write.xlsx(cbind(D, RES), filename, keepNA = TRUE, overwrite = TRUE) return(cbind(D, RES)) } @@ -201,9 +199,6 @@ ANOVA_standard_single_row <- function(x, x2 <- x } - - ### TODO: fold changes nur berechnen, wenn für die Gruppe auch ein posthoc test berechnet werden konnte - fcs <- NULL name.fcs <- NULL for (j in 1:(nr_groups-1)) { @@ -227,11 +222,6 @@ ANOVA_standard_single_row <- function(x, ################################################################################ -#### TODO: Derzeit müssen Factor-levels der Gruppen alphabetisch sortiert sein, -### damit die Ergebnisse des posthoc tests richtig ausgelesen werden können -### TODO: Option einbauen, dass nur getestet wird, wenn alle Gruppen vorhanden sind - - #' Repeated measures ANOVA (paired samples) #' #' @param x Numeric vector of protein intensities. @@ -344,8 +334,6 @@ ANOVA_repeatedMeasurements_single_row <- function(x, x2 <- x } - - ### TODO: fold changes nur berechnen, wenn für die Gruppe auch ein posthoc test berechnet werden konnte fcs <- NULL name.fcs <- NULL for (j in 1:(nr_groups - 1)) { @@ -441,7 +429,14 @@ ANOVA_Welch_single_row <- function(x, } model <- stats::lm(intensity ~ group, data = D_tmp_naomit) - ANOVA <- suppressMessages(car::Anova(model, white.adjust = TRUE)) + ANOVA <- try({suppressMessages(car::Anova(model, white.adjust = TRUE))}) + + if ("try-error" %in% class(ANOVA)) { + res <- rep(NA, 3 * nr_comparisons + 2) + names(res) <- cnames + return(res) + } + p.anova <- ANOVA$`Pr(>F)`[1] p.anova.fdr <- NA @@ -465,8 +460,6 @@ ANOVA_Welch_single_row <- function(x, x2 <- x } - - ### TODO: fold changes nur berechnen, wenn für die Gruppe auch ein posthoc test berechnet werden konnte fcs <- NULL name.fcs <- NULL for (j in 1:(nr_groups-1)) { diff --git a/R/Boxplots.R b/R/Boxplots.R index 318a751..b05727c 100644 --- a/R/Boxplots.R +++ b/R/Boxplots.R @@ -46,20 +46,20 @@ Boxplots <- function(D_long, mess <- "" - if(is.null(use_groups)){ - if(is.na(D_long$group[1])){ + if (is.null(use_groups)) { + if (is.na(D_long$group[1])) { use_groups <- FALSE } else{use_groups <- TRUE} } else{ - if(use_groups && is.na(D_long$group[1])){ + if (use_groups && is.na(D_long$group[1])) { use_groups <- FALSE } } # log-transform data if necessary - if(do_log_transformation) { + if (do_log_transformation) { D_long$value <- log(D_long$value, base = log_base) } @@ -80,12 +80,12 @@ Boxplots <- function(D_long, pl_boxplot <- pl_boxplot + ggplot2::theme_bw(base_size = base_size) + - ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, vjust = 1, hjust=1)) + + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, vjust = 1, hjust = 1)) + ggplot2::ylab("Log intensity") + ggplot2::xlab("Sample") + ggplot2::scale_x_discrete(limits = x_axis, drop = FALSE, na.translate = TRUE) - if (method == "violinplot"){ + if (method == "violinplot") { pl_boxplot <- pl_boxplot + ggplot2::geom_violin() mess <- paste0("Violin Plot generated ", mess) } diff --git a/R/PCA_Plot.R b/R/PCA_Plot.R index d37ea87..cfa864e 100644 --- a/R/PCA_Plot.R +++ b/R/PCA_Plot.R @@ -143,6 +143,7 @@ PCA_Plot <- function(D, message(mess) return(list("plot" = pl, "D_PCA_plot" = cbind(D_PCA, "Sample" = colnames(D)), + "filtered_D" = filtered_D, "pca" = pca, "message" = mess)) } diff --git a/R/WIP_Heatmap.R b/R/WIP_Heatmap.R index 15becaf..f67ce28 100644 --- a/R/WIP_Heatmap.R +++ b/R/WIP_Heatmap.R @@ -9,11 +9,11 @@ #' @param protein_names_col \strong{integer} \cr #' The column with protein or gene names, if the names should be plotted. #' @param na_method \strong{character} \cr -#' The method with which missing values are handeled. +#' The method with which missing values are handeled. #' Options are "na.omit" (proteins with any missing values will be removed), "impute" (missing values will be imputed) and "keep" (missing values will be kept). #' Note that clustering may not work when too many missing values are present. #' @param filtermissings \strong{integer} \cr -#' The threshold for missing values. +#' The threshold for missing values. #' If a protein has more missing values, it will be filtered out. #' Note that rows with only 1 or 2 valid values may cause problems with clustering. #' @param groups \strong{character factor} \cr @@ -64,10 +64,10 @@ #' @param ... Further arguments to Heatmap #' #' @return A heatmap of the given data. -#' @export #' -#' @examples -#' +#' +#' @examples +#' Heatmap_with_groups <- function(D, id, protein_names_col = NULL, na_method = "na.omit", filtermissings = 2, diff --git a/R/WIP_OnOff_Heatmap.R b/R/WIP_OnOff_Heatmap.R index 45603ff..4dc76c0 100644 --- a/R/WIP_OnOff_Heatmap.R +++ b/R/WIP_OnOff_Heatmap.R @@ -16,12 +16,12 @@ #' The column in id parameter containing the protein IDs used for mapping. #' #' @return A data.frame with the number of valid values per group (absolute and relative) and on/off status -#' @export -#' +#' +#' # @seealso [Onoff_plus_heatmap()] #' -#' @examples -#' +#' @examples +#' calculate_onoff <- function(D, id, group, max_vv_off, min_vv_on, protein_id_col = 1) { @@ -81,14 +81,14 @@ calculate_onoff <- function(D, id, group, max_vv_off, min_vv_on, protein_id_col # The column name of the proteins. # @param relative \strong{logical} \cr # If \code{TRUE}, ? -# +# # @return A data.frame with the number of valid values per group (absolute and relative) and on/off status # @export -# +# # @seealso [calculate_onoff()] # -# @examples -# +# @examples +# Onoff_plus_heatmap <- function(RES_onoff, protein_name_column = "Gene.names", diff --git a/R/helper_ttest.R b/R/helper_ttest.R index 7ebbea5..b19ac53 100644 --- a/R/helper_ttest.R +++ b/R/helper_ttest.R @@ -5,7 +5,7 @@ #' The path to an .xlsx file containing the input data. #' @param intensity_columns \strong{integer vector} \cr #' The intensity columns of the table. -#' +#' #' @return A list containing an intensity data.frame, an IDs data frame and a factor of the sample groups. #' #' @examples @@ -19,19 +19,19 @@ prepareTtestData <- function(data_path, intensity_columns ){ - + D <- openxlsx::read.xlsx(data_path, na.strings = c("NA", "NaN", "Filtered","#NV")) - + id <- D[, -intensity_columns] D <- D[, intensity_columns] - + D[D == 0] <- NA - + group <- factor(limma::strsplit2(colnames(D), "_")[,1]) number_of_groups <- length(levels(group)) - + sample <- factor(limma::strsplit2(colnames(D), "_")[,2]) - + return(list("D" = D, "ID" = id, "group" = group, "number_of_groups" = number_of_groups, "sample" = sample)) } @@ -59,20 +59,20 @@ prepareTtestData <- function(data_path, #' @return A factor with the three significance categories. #' @export #' -#' @examples -#' +#' @examples +#' calculate_significance_categories_ttest <- function(p, p_adj, fc, thres_fc=2, thres_p=0.05) { - + significance <- dplyr::case_when( p_adj <= thres_p & p <= thres_p & (fc >= thres_fc | fc <= 1/thres_fc) & !is.na(p) ~ "significant after FDR correction", p_adj > thres_p & p <= thres_p & (fc >= thres_fc | fc <= 1/thres_fc) & !is.na(p) ~ "significant", (p > thres_p | (fc < thres_fc & fc > 1/thres_fc)) & !is.na(p) ~ "not significant", is.na(p) ~ NA_character_ ) - + significance <- factor(significance, levels = c("not significant", "significant", "significant after FDR correction")) - + return(significance) } @@ -97,18 +97,18 @@ calculate_significance_categories_ttest <- function(p, p_adj, fc, thres_fc=2, th #' @export #' #' @examples -#' +#' calculate_significance_categories_ANOVA <- function(p_posthoc, p_anova_adj, p_anova, fc, thres_fc=2, thres_p=0.05) { - + significance <- dplyr::case_when( p_anova_adj <= thres_p & p_posthoc <= thres_p & (fc >= thres_fc | fc <= 1/thres_fc) & !is.na(p_posthoc) & !is.na(p_anova) ~ "significant after FDR correction", # ANOVA significant after FDR, posthoc also significant, fulfills FC threshold p_anova_adj > thres_p & p_anova <= thres_p & p_posthoc <= thres_p & (fc >= thres_fc | fc <= 1/thres_fc) & !is.na(p_posthoc) & !is.na(p_anova) ~ "significant", # ANOVA significant before FDR, posthoc also significant, fulfills FC threshold (p_anova > thres_p | p_posthoc > thres_p | (fc < thres_fc & fc > 1/thres_fc)) & !is.na(p_posthoc) & !is.na(p_anova) ~ "not significant", # ANOVA not significant or posthoc not significant or FC does not fulfill threshold is.na(p_posthoc) | is.na(p_anova) ~ NA_character_ ) - + significance <- factor(significance, levels = c("not significant", "significant", "significant after FDR correction")) - + return(significance) -} \ No newline at end of file +} diff --git a/R/volcanoPlots.R b/R/volcanoPlots.R index 561d5a5..9334608 100644 --- a/R/volcanoPlots.R +++ b/R/volcanoPlots.R @@ -5,8 +5,8 @@ #' @param FC \strong{numeric factor} \cr #' The values for the fold changes. #' @param significance_category \strong{character factor} \cr -#' The significance categories for the volcano plot. -#' +#' The significance categories for the volcano plot. +#' #' @param log_base_fc \strong{numeric} \cr #' The base for the fold changes log-transformation. #' @param log_base_p \strong{numeric} \cr @@ -24,22 +24,22 @@ #' @param symmetric_x \strong{logical} \cr #' If \code{TRUE}, x-axis limits will be made symmetric (not used if xlim is defined). #' @param legend_position \strong{character} \cr -#' The positioning of the legend. +#' The positioning of the legend. #' Options are "none", "left", "right", "bottom", "top" and "inside". #' @param base_size \strong{numeric} \cr #' The base size of the font. #' @param xlim \strong{numeric} \cr -#' The limits for x-axis. +#' The limits for x-axis. #' @param ylim \strong{numeric} \cr -#' The limits for y-axis. +#' The limits for y-axis. #' #' @return A ggplot showing the volcano plot. #' @export -#' +#' #' @seealso [VolcanoPlot_ttest()], [VolcanoPlot_ANOVA()] #' #' @examples -#' +#' VolcanoPlot <- function(p, FC, @@ -55,7 +55,8 @@ VolcanoPlot <- function(p, legend_position = "bottom", base_size = NULL, xlim = NULL, - ylim = NULL) { + ylim = NULL, alpha = 0.5, + point_size = 3) { ### transform p-values and fold changes and thresholds @@ -66,20 +67,24 @@ VolcanoPlot <- function(p, log_thres_p <- -log(thres_p, base = log_base_p) log_thres_fc <- log(thres_fc, base = log_base_fc) - RES <- data.frame(transformed_FC = transformed_FC, + RES <<- data.frame(transformed_FC = transformed_FC, transformed_p = transformed_p, significance = significance_category) significance <- RES$significance - + plot <- ggplot2::ggplot(data = RES, ggplot2::aes(x = transformed_FC, y = transformed_p, colour = significance)) + - ggplot2::geom_point(alpha = 5/10) + - ggplot2::scale_colour_manual(values = c("not significant" = colour1, "significant" = colour2, "significant after FDR correction" = colour3), drop = FALSE) + - ### TODO: axis labels with expressions - ### TODO: what if I do not have these categories? + ggplot2::geom_point(alpha = alpha, show.legend = TRUE, size = point_size) + + ggplot2::scale_colour_manual(values = c("not significant" = colour1, + "significant" = colour2, + "significant after FDR correction" = colour3), + drop = FALSE, + na.translate = FALSE) + + ggplot2::xlab(paste0("log",log_base_fc,"(FC)")) + - ggplot2::ylab(paste0("-log",log_base_p,"(p)")) + ggplot2::ylab(paste0("-log",log_base_p,"(p)")) + + ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = point_size*1.5))) if (!is.null(base_size)) { @@ -88,7 +93,6 @@ VolcanoPlot <- function(p, plot <- plot + ggplot2::theme_bw() } - # TODO: legend_position einstellbar machen plot <- plot + ggplot2::theme(legend.position = legend_position) ### if xlim is not set, use max/min FC and make it symmetric @@ -161,7 +165,7 @@ VolcanoPlot <- function(p, #' @param symmetric_x \strong{logical} \cr #' If \code{TRUE}, x-axis limits will be made symmetric (not used if xlim is defined). #' @param legend_position \strong{character} \cr -#' The positioning of the legend. +#' The positioning of the legend. #' Options are "none", "left", "right", "bottom", "top" and "inside". #' @param plot_height \strong{numeric} \cr #' The height of plot. @@ -170,7 +174,7 @@ VolcanoPlot <- function(p, #' @param plot_dpi \strong{integer} \cr #' The resolution of plot. #' @param plot_device \strong{character} \cr -#' The plot device that is used for the resulting plot. +#' The plot device that is used for the resulting plot. #' Options are "pdf" and "png". #' @param output_path \strong{character} \cr #' The path for output file. @@ -183,40 +187,33 @@ VolcanoPlot <- function(p, #' #' @return A ggplot object of the volcano plot from a ttest result. #' @export -#' +#' #' @seealso [VolcanoPlot()], [VolcanoPlot_ANOVA()], [add_labels()] #' -#' @examples -#' +#' @examples +#' VolcanoPlot_ttest <- function(RES, columnname_p = "p", columnname_padj = "padj", columnname_FC = "FC", + thres_p = 0.05, + thres_fc = 2, log_base_fc = 2, log_base_p = 10, is_FC_log = FALSE, is_p_log = FALSE, - thres_fc = 2, - thres_p = 0.05, show_thres_line = TRUE, - colour1 = "grey", - colour2 = "black", - colour3 = "orange", groupname1 = "group1", groupname2 = "group2", - xlim = NULL, - ylim = NULL, - symmetric_x = FALSE, - legend_position = "bottom", plot_height = 15, plot_width = 15, plot_dpi = 300, plot_device="pdf", output_path = NULL, suffix = NULL, - base_size = NULL, - add_annotation = TRUE){ + add_annotation = TRUE, + ...) { # make check work @@ -247,18 +244,7 @@ VolcanoPlot_ttest <- function(RES, plot <- VolcanoPlot(p = p, FC = FC, significance_category = RES$significance, - log_base_fc = log_base_fc, - log_base_p = log_base_p, - thres_p = thres_p, - thres_fc = thres_fc, - colour1 = colour1, - colour2 = colour2, - colour3 = colour3, - symmetric_x = symmetric_x, - legend_position = legend_position, - base_size = base_size, - xlim = xlim, - ylim = ylim) + ...) ## TODO: annotation of number of significant proteins @@ -326,7 +312,7 @@ VolcanoPlot_ttest <- function(RES, #' @param symmetric_x \strong{logical} \cr #' If \code{TRUE}, x-axis limits will be made symmetric (not used if xlim is defined). #' @param legend_position \strong{character} \cr -#' The positioning of the legend. +#' The positioning of the legend. #' Options are "none", "left", "right", "bottom", "top" and "inside". #' @param base_size \strong{numeric} \cr #' The base size for theme. @@ -335,15 +321,15 @@ VolcanoPlot_ttest <- function(RES, #' @param ylim \strong{numeric} \cr #' The limits for y-axis. #' @param add_labels \strong{logical} \cr -#' If \code{TRUE}, labels will be added. +#' If \code{TRUE}, labels will be added. #' #' @return A list of ggplots of the volcano plot from an ANOVA result. #' @export -#' +#' #' @seealso [VolcanoPlot()], [VolcanoPlot_ttest()], [add_labels()] #' #' @examples -#' +#' VolcanoPlot_ANOVA <- function(RES, columnname_p_ANOVA = "p.anova", @@ -376,9 +362,12 @@ VolcanoPlot_ANOVA <- function(RES, #nr_comparisons <- choose(n = nr_groups, k = 2) # pairwise comparisons between two groups # names of the comparisons - comp_names <- colnames(RES)[columns_p_posthoc] - comp_names <- stringr::str_replace_all(comp_names, "p.posthoc.", "") - comp_names <- stringr::str_replace_all(comp_names, "_", " ") + comp_names <- colnames(RES)[columns_FC] + comp_names <- substring(comp_names, 4) # remove "FC_" at beginning + comp_names <- stringr::str_replace_all(comp_names, "devided_by_", "vs") + + #comp_names <- stringr::str_replace_all(comp_names, "FC_", "") + #comp_names <- stringr::str_replace_all(comp_names, "_", " ") Volcano_plots <- list() for (i in 1:nr_comparisons) { @@ -460,11 +449,11 @@ VolcanoPlot_ANOVA <- function(RES, #' #' @return A ggplot object with labels. #' @export -#' +#' #' @seealso [VolcanoPlot()], [VolcanoPlot_ttest()], [VolcanoPlot_ANOVA()] #' -#' @examples -#' +#' @examples +#' add_labels <- function(RES_Volcano, label_type = "FDR", @@ -512,10 +501,6 @@ add_labels <- function(RES_Volcano, labels_down[ind_label_down] <- protein_names[ind_label_down] } - ### TODO: what if genenames are too long? - ### TODO: what if genenames are not unique? - ### TODO: what if genenames are not available? - nudge_x = 0.2 nudge_y = 0.2 diff --git a/R/workflow_QC.R b/R/workflow_QC.R index d971ac5..ffdf2d0 100644 --- a/R/workflow_QC.R +++ b/R/workflow_QC.R @@ -1,4 +1,14 @@ -#' The main workflow for quality control of quantitative proteomics data +#' QC Workflow (QCQuant) +#' +#' @description +#' Workflow for quality control of quantitative proteomics data +#' +#' @details +#' This function performs quality control of quantitative proteomics data. +#' As a first step, the data can be normalized. +#' Then the following plots are generated: a valid value plot, boxplots, MA-plots and a PCA plot. +#' +#' #' #' @param data_path \strong{character} \cr #' The path to an .xlsx file containing the input data. @@ -28,13 +38,12 @@ #' The name for the group variable. #' @param group_colours \strong{character vector} \cr #' The hex codes for the group colors, if the data has groups. If \code{NULL}, a default color scale will be used. -### TODO: ideally, group_colours would be a named vector! #' #' @param suffix \strong{character} \cr #' The suffix for the output files. It needs to start with an underscore. #' # general plot parameters -#' @param base_size \strong{numeric} \cr +#' @param base_size \strong{numeric} \cr #' The base size of the font. #' @param plot_device \strong{character} \cr #' The type of the output file. Options are "pdf" or "png". @@ -54,7 +63,7 @@ #' @param generate_MAplots \strong{logical} \cr #' If \code{TRUE}, MA plots will be generated, if \code{FALSE} they will not be generated (mostly for debugging purposes). #' @param MA_maxPlots \strong{integer} \cr -#' The maximum number of MA plots that should be generated. +#' The maximum number of MA plots that should be generated. #' @param MA_alpha \strong{logical} \cr #' If \code{TRUE}, the data points of the MA plots will be transparent. #' @param MA_sampling \strong{numeric} \cr @@ -97,21 +106,21 @@ #' #' @return The workflow saves several plots and excel files and returns a message log of the workflow. #' @export -#' -#' @seealso Functions used in this workflow: -#' [prepareData()], [ValidValuePlot()], [Boxplots()], [MA_Plots()], [PCA_Plot()] +#' +#' @seealso Functions used in this workflow: +#' [prepareData()], [ValidValuePlot()], [Boxplots()], [MA_Plots()], [PCA_Plot()] #' #' @examples #' #' # 1. Set the character of your data path, leading to an .xlsx file. #' in_path <- "C:/Users/thisuser/Documents/dataFolder/data.xlsx" -#' +#' #' # 2. Set the integer vector of the columns, which contain the intensities. #' int_col <- 3:17 -#' +#' #' # 3. Set the character of the output path, leading to a folder for the results. #' out_path <- "C:/Users/thisuser/Documents/resultsFolder/" -#' +#' #' # 4. Run the QC workflow with the parameters you set. #' \dontrun{ #' result <- workflow_QC(data_path = in_path, @@ -123,29 +132,29 @@ workflow_QC <- function(data_path, output_path, - + intensity_columns, normalization_method = "loess", lts_quantile = 0.8, use_groups = TRUE, - + na_strings = c("NA", "NaN", "Filtered","#NV"), zero_to_NA = TRUE, do_log_transformation = TRUE, log_base = 2, - + groupvar_name = "Group", group_colours = NULL, - + base_size = 15, plot_device = "pdf", plot_height = 10, plot_width = 15, plot_dpi = 300, - + suffix = "_", - - + + boxplot_method = "boxplot", generate_MAplots = TRUE, @@ -155,7 +164,7 @@ workflow_QC <- function(data_path, #PCA_groupvar1 = "group", #PCA_groupvar2 = NULL, - + PCA_impute = FALSE, PCA_impute_method = "mean", PCA_propNA = 0, PCA_scale. = TRUE, PCA_PCx = 1, PCA_PCy = 2, @@ -163,25 +172,25 @@ workflow_QC <- function(data_path, #PCA_groupvar2_name = NULL, PCA_alpha = 1, PCA_label = FALSE, PCA_label_seed = NA, PCA_label_size = 4, PCA_xlim = NULL, PCA_ylim = NULL, PCA_point.size = 4 - + ){ - + mess = "" - - + + #### Prepare Data #### - + prepared_data <- prepareData(data_path = data_path, intensity_columns = intensity_columns, na_strings = na_strings, zero_to_NA = zero_to_NA, do_log_transformation = do_log_transformation, log_base = log_base, use_groups = use_groups, group_colours = group_colours, normalization = normalization_method, lts_quantile = lts_quantile) - + mess <- paste0(mess, prepared_data[["message"]]) - + group <- prepared_data$group - + utils::write.csv(x = prepared_data$ID, file = paste0(output_path, "/ID", suffix, ".csv"), row.names = FALSE) utils::write.csv(x = prepared_data$D, file = paste0(output_path, "/D_norm_wide", suffix, ".csv"), row.names = FALSE) utils::write.csv(x = prepared_data$D_long, file = paste0(output_path, "/D_norm_long", suffix, ".csv"), row.names = FALSE) @@ -189,10 +198,10 @@ workflow_QC <- function(data_path, openxlsx::write.xlsx(x = cbind(prepared_data$ID, prepared_data$D), file = paste0(output_path, "/D_norm_ID", suffix, ".xlsx"), rowNames = FALSE, overwrite = TRUE, keepNA = TRUE) - - + + #### Calculate Valid Value Plot #### - + vv_plot_data <- ValidValuePlot(D_long = prepared_data[["D_long"]], use_groups = use_groups, groupvar_name = groupvar_name, group_colours = group_colours, base_size = base_size) @@ -203,29 +212,29 @@ workflow_QC <- function(data_path, vv_plot_data$table <- vv_plot_data$table[order(vv_plot_data$table$name),] mess <- paste0(mess, vv_plot_data[["message"]]) - - + + ggplot2::ggsave(paste0(output_path, "/valid_value_plot", suffix, ".", plot_device), plot = vv_plot_data[["plot"]], device = plot_device, height = plot_height, width = plot_width, dpi = plot_dpi, units = "cm") utils::write.csv(x = vv_plot_data$table, file = paste0(output_path, "/D_validvalues", suffix, ".csv"), row.names = FALSE) - - - + + + #### Calculate Boxlots #### - + boxplot_data <- Boxplots(D_long = prepared_data[["D_long"]], do_log_transformation = FALSE, log_base = log_base, use_groups = use_groups, groupvar_name = groupvar_name, group_colours = group_colours, - + base_size = base_size, method = boxplot_method) - + mess <- paste0(mess, boxplot_data[["message"]]) - + ggplot2::ggsave(paste0(output_path, "/boxplot", suffix, ".", plot_device), plot = boxplot_data[["plot"]], - + device = plot_device, height = plot_height, width = plot_width, dpi = plot_dpi, units = "cm") - - + + #### Calculate MA Plot #### if (generate_MAplots) { @@ -241,16 +250,16 @@ workflow_QC <- function(data_path, #### Calculate PCA Plot #### - - + + ### depending of groups should be used or not, the group variable is used in the PCA function if (use_groups) { PCA_groupvar1 <- group } else { PCA_groupvar1 <- NULL } - - + + pca_data <- PCA_Plot(D = prepared_data[["D"]], groupvar1 = group, groupvar2 = NULL, @@ -263,14 +272,15 @@ workflow_QC <- function(data_path, label = PCA_label, PCA_label_seed = NA, PCA_label_size = 4, xlim = PCA_xlim, ylim = PCA_ylim, point.size = PCA_point.size, base_size = base_size) - + mess <- paste0(mess, pca_data[["message"]]) - - + + ggplot2::ggsave(paste0(output_path, "/PCA_plot", suffix, ".", plot_device), plot = pca_data[["plot"]], device = plot_device, height = plot_height, width = plot_width, dpi = plot_dpi, units = "cm") utils::write.csv(x = pca_data$D_PCA_plot, file = paste0(output_path, "/D_PCA", suffix, ".csv"), row.names = FALSE) - - - return (list("message" = mess)) + utils::write.csv(x = pca_data$filtered_D, file = paste0(output_path, "/PCA_data_after_imputation", suffix, ".csv"), row.names = FALSE) + + + return(list("message" = mess)) } diff --git a/R/workflow_ttest.R b/R/workflow_ttest.R index c0d345c..afb0235 100644 --- a/R/workflow_ttest.R +++ b/R/workflow_ttest.R @@ -1,4 +1,17 @@ -#' The workflow for a t-test of quantitative proteomics data. +#' t-test workflow +#' +#' @description +#' Workflow for t-test analysis of quantitative proteomics data. +#' +#' @details +#' This function performs a t-test to compare two experimental groups in a quantitative proteomics dataset. +#' Ideally, the data should be already normalized before performing the t-test (e.g. by using the [workflow_QC()] function). +#' It is recommended to log-transform the data before the t-test (either use already log-transformed data and \code{log_before_test = FALSE} +#' or log-transform the data on the fly with \code{log_before_test = TRUE}). +#' The function generates a volcano plot, a histogram of p-values and fold changes, +#' boxplots of the significant candidates, and a heatmap of the significant candidates. +#' +#' #' #' @param data_path \strong{character} \cr #' The path to an .xlsx file containing the input data. @@ -6,7 +19,7 @@ #' The path to the output folder. #' @param intensity_columns \strong{integer vector} \cr #' The numbers of the intensity columns in the table. -#' +#' #' @param paired \strong{logical} \cr #' If \code{TRUE}, a paired test will be done, otherwise an unpaired test. #' @param var.equal \strong{logical} \cr @@ -17,14 +30,14 @@ #' If \code{TRUE}, the fold change will be calculated without the log-transformation. #' @param p_value_zeros_to_min \strong{logical} \cr #' If \code{TRUE}, then \code{p_values == 0} will be set to the next smallest value of the p-values. -#' +#' #' @param significant_after_FDR \strong{logical} \cr #' If \code{TRUE}, candidates for the boxplots and heatmap need to be significant after FDR correction, otherwise all significant candidates will be used. #' @param max_valid_values_off \strong{integer} \cr #' The maximum number of valid values to be an off protein. #' @param min_valid_values_on \strong{integer} \cr #' The minimum number of valid values to be an on protein. -#' +#' #' @param suffix \strong{character} \cr #' The suffix of the file names should have one. #' @param plot_device \strong{character} \cr @@ -35,205 +48,210 @@ #' The plot width in cm. #' @param plot_dpi \strong{integer} \cr #' The "dots per inch" of the plot aka. the plot resolution. -#' -#' +#' +#' #' @return Returns a message log of the workflow. The log contains an overview of the settings and gives some information e.g number of significant candidates or on-off-proteins. #' @export -#' +#' #' @seealso [workflow_ANOVA()] in case of more than two groups in the sample.\cr -#' Functions used in this workflow: -#' [prepareTtestData()], [ttest()], [VolcanoPlot_ttest()], [pvalue_foldchange_histogram()], -#' [calculate_significance_categories_ttest()], [Boxplots_candidates()], +#' Functions used in this workflow: +#' [prepareTtestData()], [ttest()], [VolcanoPlot_ttest()], [pvalue_foldchange_histogram()], +#' [calculate_significance_categories_ttest()], [Boxplots_candidates()], #' [Heatmap_with_groups()], [calculate_onoff()] #' #' @examples -#' +#' #' # 1. Set the character of your data path, leading to an .xlsx file. #' in_path <- "C:/Users/thisuser/Documents/dataFolder/data.xlsx" -#' +#' #' # 2. Set the integer vector of the columns, which contain the intensities. #' int_col <- 3:8 -#' +#' #' # 3. Set the character of the output path, leading to a folder for the results. #' out_path <- "C:/Users/thisuser/Documents/resultsFolder/" -#' +#' #' # 4. Run the ttest with the parameters you set. #' \dontrun{ #' result <- workflow_ttest(data_path = in_path, #' output_path = out_path, #' intensity_columns = int_col) } -#' +#' workflow_ttest <- function(data_path, output_path, intensity_columns, - + paired = FALSE, var.equal = FALSE, log_before_test = TRUE, delog_for_FC = TRUE, p_value_zeros_to_min = TRUE, - + + volcano_base_size = 25, + significant_after_FDR = TRUE, max_valid_values_off = 0, min_valid_values_on = NULL, - + suffix = "", plot_device = "pdf", plot_height = 15, plot_width = 15, - plot_dpi = 300 - ){ - + plot_dpi = 300, + + column_name_protein = "Protein" + ) { + mess = "" - - + + #### Prepare Data #### - + data <- prepareTtestData(data_path = data_path , intensity_columns = intensity_columns) - - - + + + #### Calculate ttest #### - - test_results <- ttest(D = data[["D"]], id = data[["ID"]], - group = data[["group"]], sample = data[["sample"]], + + test_results <<- ttest(D = data[["D"]], id = data[["ID"]], + group = data[["group"]], sample = data[["sample"]], paired = paired, var.equal = var.equal, log_before_test = log_before_test, delog_for_FC = delog_for_FC, log_base = 2, min_obs_per_group = 3, min_obs_per_group_ratio = NULL, filename = paste0(output_path, "results_ttest", suffix, ".xlsx")) - - mess <- paste0(mess, - ifelse(paired, "Paired", "Unaired"), - " t-test calculated with the variance assumed to be ", + + mess <- paste0(mess, + ifelse(paired, "Paired", "Unaired"), + " t-test calculated with the variance assumed to be ", ifelse(var.equal, "equal", "unequal"), ". \n", - "Data was ", - ifelse(log_before_test, "", "not"), - "log-transformed before the t-test and ", + "Data was ", + ifelse(log_before_test, "", "not"), + "log-transformed before the t-test and ", ifelse(delog_for_FC, "", "not"), "de-log-transformed for the fold change. \n") - - + + if(p_value_zeros_to_min){ - + p_value_zero <- which(test_results$p == 0) - + if(length(p_value_zero) > 0){ next_smallest_value <- sort(unique(test_results$p))[2] - + test_results$p[test_results$p == 0] <- next_smallest_value - + mess <- paste0(mess, "There were ", length(p_value_zero), " p_values, which were 0. ", "They were set to the next smallest occuring value ", next_smallest_value, ". \n") } } - - + + fc_col_name <- paste0("FC_", levels(data[["group"]])[[1]], "_divided_by_", levels(data[["group"]])[[2]]) - - - + + + #### Create Volcano Plot #### - - volcano_plot <- VolcanoPlot_ttest(RES = test_results, - columnname_p = "p", columnname_padj = "p.fdr", - columnname_FC = fc_col_name) - + + volcano_plot <- VolcanoPlot_ttest(RES = test_results, + columnname_p = "p", columnname_padj = "p.fdr", + columnname_FC = fc_col_name, base_size = volcano_base_size) + ggplot2::ggsave(paste0(output_path, "volcano_plot", suffix, ".", plot_device), plot = volcano_plot, device = plot_device, height = plot_height, width = plot_width, dpi = plot_dpi) - + mess <- paste0(mess, "Volcano plot calculated. \n") - - - + + + #### Create Histogram for p-values and fold changes #### - - histograms <- pvalue_foldchange_histogram(RES = test_results, - columnname_p = "p", columnname_padj = "p.fdr", + + histograms <- ProtStatsWF::pvalue_foldchange_histogram(RES = test_results, + columnname_p = "p", columnname_padj = "p.fdr", columnname_FC = fc_col_name) - + ggplot2::ggsave(paste0(output_path, "histogram_p_value", suffix, ".", plot_device), plot = histograms[["histogram_p_value"]], device = plot_device, height = plot_height, width = plot_width, dpi = plot_dpi) ggplot2::ggsave(paste0(output_path, "histogram_adjusted_p_value", suffix, ".", plot_device), plot = histograms[["histogram_adjusted_p_value"]], device = plot_device, height = plot_height, width = plot_width, dpi = plot_dpi) ggplot2::ggsave(paste0(output_path, "histogram_fold_change", suffix, ".", plot_device), plot = histograms[["histogram_fold_change"]], device = plot_device, height = plot_height, width = plot_width, dpi = plot_dpi) - + mess <- paste0(mess, "p-value, adjusted p-value and fold change histograms calculated. \n") - - + + #### Get significant candidates #### - - significance <- calculate_significance_categories_ttest(p = test_results[["p"]], + + significance <- ProtStatsWF::calculate_significance_categories_ttest(p = test_results[["p"]], p_adj = test_results[["p.fdr"]], fc = test_results[[fc_col_name]]) - + candidates <- as.character(significance) - + if(significant_after_FDR){ candidates <- which(candidates == "significant after FDR correction") }else{ candidates <- which(candidates == "significant" | candidates == "significant after FDR correction") } - - mess <- paste0(mess, "There are ", length(candidates), " candidates, which were significant", + + mess <- paste0(mess, "There are ", length(candidates), " candidates, which were significant", ifelse(significant_after_FDR, " after FDR correction. \n", ". \n")) - - - + + + #### Create Boxplots of Biomarker Candidates #### - Boxplots_candidates(D = data[["D"]][candidates, ], - protein.names = data[["ID"]][candidates, "protein"], + ProtStatsWF::Boxplots_candidates(D = data[["D"]][candidates, ], + protein.names = data[["ID"]][candidates, column_name_protein], group = data[["group"]], suffix = suffix, - output_path = paste0(output_path)) - + output_path = paste0(output_path), + log_data = log_before_test) + mess <- paste0(mess, "Boxplots made for the candidates. \n") - - - + + + #### Create Heatmap #### - - set.seed(14) - - t_heatmap <- Heatmap_with_groups(D = data[["D"]][candidates, ], + + set.seed(14) + + t_heatmap <- Heatmap_with_groups(D = data[["D"]][candidates, ], id = data[["ID"]][candidates, ], groups = data[["group"]]) - + grDevices::pdf(paste0(output_path, "heatmap", suffix, ".pdf"), height = plot_height, width = plot_width) graphics::plot(t_heatmap[["heatmap"]]) grDevices::dev.off() - + openxlsx::write.xlsx(cbind(data[["ID"]][candidates, ], zscore = t_heatmap[["data_as_matrix"]]), paste0(output_path, "heatmap_data", suffix, ".xlsx"), overwrite = TRUE, keepNA = TRUE) - + mess <- paste0(mess, "Heatmap made for the candidates. \n") - - - + + + #### Create On-Off Heatmap #### - + if(is.null(min_valid_values_on)){ min_valid_values_on <- length(intensity_columns) } - - t_on_off_heatmap <- calculate_onoff(D = data[["D"]], + + t_on_off_heatmap <- calculate_onoff(D = data[["D"]], id = data[["ID"]], group = data[["group"]], max_vv_off = max_valid_values_off, min_vv_on = min_valid_values_on, protein_id_col = 1) - + grDevices::pdf(paste0(output_path, "on_off_heatmap", suffix, ".pdf"), height = plot_height, width = plot_width) graphics::plot(t_on_off_heatmap) grDevices::dev.off() - + mess <- paste0(mess, "On-Off-Heatmap made. \n", "There were ", sum(t_on_off_heatmap[["isonoff"]]), " on/off proteins.") - - - + + + #### Save message log #### - + cat(mess, file = paste0(output_path, "message_log_ttest", suffix, ".txt")) return(list("message" = mess)) @@ -254,7 +272,7 @@ workflow_ttest <- function(data_path, #' The path to the output folder. #' @param intensity_columns \strong{integer vector} \cr #' The numbers of the intensity columns in the table. -#' +#' #' @param paired \strong{logical} \cr #' If \code{TRUE}, a paired test will be done, otherwise an unpaired test. #' @param var.equal \strong{logical} \cr @@ -265,14 +283,14 @@ workflow_ttest <- function(data_path, #' If \code{TRUE}, the fold change will be calculated without the log-transformation. #' @param p_value_zeros_to_min \strong{logical} \cr #' If \code{TRUE}, then \code{p_values == 0} will be set to the next smallest value of the p-values. -#' +#' #' @param significant_after_FDR \strong{logical} \cr #' If \code{TRUE}, candidates for the boxplots and heatmap need to be significant after FDR correction, otherwise all significant candidates will be used. #' @param max_valid_values_off \strong{integer} \cr #' The maximum number of valid values to be an off protein. #' @param min_valid_values_on \strong{integer} \cr #' The minimum number of valid values to be an on protein. -#' +#' #' @param suffix \strong{character} \cr #' The suffix of the file names should have one. #' @param plot_device \strong{character} \cr @@ -283,30 +301,30 @@ workflow_ttest <- function(data_path, #' The plot width in cm. #' @param plot_dpi \strong{integer} \cr #' The "dots per inch" of the plot aka. the plot resolution. -#' -#' +#' +#' #' @return Message log of the workflow -#' @export +#' #' #' @seealso [workflow_ttest()] in case of only two groups in the sample.\cr -#' Functions used in this workflow: -#' [prepareTtestData()], [ttest()], [VolcanoPlot_ttest()], [pvalue_foldchange_histogram()], -#' [calculate_significance_categories_ttest()], [Boxplots_candidates()], +#' Functions used in this workflow: +#' [prepareTtestData()], [ttest()], [VolcanoPlot_ttest()], [pvalue_foldchange_histogram()], +#' [calculate_significance_categories_ttest()], [Boxplots_candidates()], #' [Heatmap_with_groups()], [calculate_onoff()] -#' -#' +#' +#' #' @examples -#' -#' +#' +#' #' # 1. Set the character of your data path, leading to an .xlsx file. #' in_path <- "C:/Users/thisuser/Documents/dataFolder/data.xlsx" -#' +#' #' # 2. Set the integer vector of the columns, which contain the intensities. #' int_col <- 3:17 -#' +#' #' # 3. Set the character of the output path, leading to a folder for the results. #' out_path <- "C:/Users/thisuser/Documents/resultsFolder/" -#' +#' #' # 4. Run the ANOVA with the parameters you set. #' \dontrun{ #' result <- workflow_ANOVA(data_path = in_path, @@ -314,202 +332,206 @@ workflow_ttest <- function(data_path, #' intensity_columns = int_col) } #' + + + + workflow_ANOVA <- function(data_path, output_path, intensity_columns, - + paired = FALSE, var.equal = TRUE, log_before_test = TRUE, delog_for_FC = TRUE, p_value_zeros_to_min = TRUE, - + significant_after_FDR = TRUE, max_valid_values_off = 0, min_valid_values_on = NULL, - + suffix = "", plot_device = "pdf", plot_height = 15, plot_width = 15, plot_dpi = 300 ){ - + mess = "" - - + + #### Prepare Data #### - + data <- prepareTtestData(data_path = data_path , intensity_columns = intensity_columns) - + mess <- paste0(mess, "Data file contained ", length(levels(data[["group"]])), " groups with ", length(levels(data[["sample"]])), " samples. \n") - - - + + + #### Calculate ANOVA #### - - ANOVA_results <- ANOVA(D = data[["D"]], id = data[["ID"]], - group = data[["group"]], sample = data[["sample"]], + + ANOVA_results <- ANOVA(D = data[["D"]], id = data[["ID"]], + group = data[["group"]], sample = data[["sample"]], paired = paired, var.equal = var.equal, log_before_test = log_before_test, delog_for_FC = delog_for_FC, log_base = 2, min_obs_per_group = 3, min_perc_per_group = NULL, filename = paste0(output_path, "results_ANOVA", suffix, ".xlsx")) - + mess <- paste0(mess, "ANOVA calculated. \n") - - + + if(p_value_zeros_to_min){ - + p_value_zero <- which(ANOVA_results$p.anova == 0) - + if(length(p_value_zero) > 0){ next_smallest_value <- sort(unique(ANOVA_results$p.anova))[2] - + ANOVA_results$p.anova[ANOVA_results$p.anova == 0] <- next_smallest_value - + mess <- paste0(mess, "There were ", length(p_value_zero), " p_values, which were 0. ", "They were set to the next smallest occuring value ", next_smallest_value, ". \n") } } - - - + + + #### Create Volcano Plot #### - + p_posthoc_columns <- grep("p.posthoc.", colnames(ANOVA_results)) fc_columns <- grep("FC_", colnames(ANOVA_results)) - + # filter FC columns because FC is calculated "twice" e.g. for state1_divided_state2 and state1_divided_state2 if(fc_columns[[1]]%%2 == 0){ fc_columns <- fc_columns[fc_columns %% 2 == 0] }else{ fc_columns <- fc_columns[fc_columns %% 2 != 0] } - - volcano_plots <- VolcanoPlot_ANOVA(RES = ANOVA_results, + + volcano_plots <- VolcanoPlot_ANOVA(RES = ANOVA_results, columnname_p_ANOVA = "p.anova", columnname_p_ANOVA_adj = "p.anova.fdr", columns_FC = fc_columns, columns_p_posthoc = p_posthoc_columns ) - + mess <- paste0(mess, length(volcano_plots), " volcano plots calculated. \n") - + grDevices::pdf(paste0(output_path, "volcano_plots", suffix ,".pdf"), height = plot_height, width = plot_width) for (v_plot in volcano_plots) { graphics::plot(x = v_plot) } grDevices::dev.off() - + rm(volcano_plots, v_plot) - - - + + + #### Create Histogram for p-values and fold changes #### - - histograms <- pvalue_foldchange_histogram(RES = ANOVA_results, - columnname_p = "p.anova", columnname_padj = "p.anova.fdr", + + histograms <- pvalue_foldchange_histogram(RES = ANOVA_results, + columnname_p = "p.anova", columnname_padj = "p.anova.fdr", columnname_FC = colnames(ANOVA_results)[[fc_columns[[1]]]]) - + ggplot2::ggsave(paste0(output_path, "histogram_p_value", suffix, ".", plot_device), plot = histograms[["histogram_p_value"]], device = plot_device, height = plot_height, width = plot_width, dpi = plot_dpi) ggplot2::ggsave(paste0(output_path, "histogram_adjusted_p_value", suffix, ".", plot_device), plot = histograms[["histogram_adjusted_p_value"]], device = plot_device, height = plot_height, width = plot_width, dpi = plot_dpi) - + mess <- paste0(mess, "p-value and adjusted p-value histograms calculated. \n") - + rm(histograms) - - - + + + #### Get significant candidates #### - + candidates <- list() - + for (i in 1:length(p_posthoc_columns)) { significance <- factor(calculate_significance_categories_ANOVA(p_anova = ANOVA_results[["p.anova"]], p_anova_adj = ANOVA_results[["p.anova.fdr"]], p_posthoc = ANOVA_results[[p_posthoc_columns[[i]]]], fc = ANOVA_results[[fc_columns[[i]]]])) - + candidates[[i]] <- as.character(significance) - + if(significant_after_FDR){ candidates[[i]] <- which(candidates[[i]] == "significant after FDR correction") }else{ candidates[[i]] <- which(candidates[[i]] == "significant" | candidates[[i]] == "significant after FDR correction") } - - + + } - + union_candidates <- unique(unlist(candidates)) - - mess <- paste0(mess, "There are ", length(union_candidates), " in the union of candidates, which were significant", + + mess <- paste0(mess, "There are ", length(union_candidates), " in the union of candidates, which were significant", ifelse(significant_after_FDR, " after FDR correction. \n", ". \n")) - - - + + + #### Create Boxplots of Biomarker Candidates #### - - Boxplots_candidates(D = data[["D"]][union_candidates, ], + + Boxplots_candidates(D = data[["D"]][union_candidates, ], protein.names = data[["ID"]][union_candidates, "protein"], group = data[["group"]], suffix = suffix, output_path = paste0(output_path)) - + mess <- paste0(mess, "Boxplots made for the ", length(union_candidates) ," candidates. \n") - - - + + + #### Create Heatmap #### - + set.seed(27) #set seed so colors can be differentiated well: 14, 27, 101 - - t_heatmap <- Heatmap_with_groups(D = data[["D"]][union_candidates, ], + + t_heatmap <- Heatmap_with_groups(D = data[["D"]][union_candidates, ], id = data[["ID"]][union_candidates, ], groups = data[["group"]]) - + grDevices::pdf(paste0(output_path, "heatmap", suffix, ".pdf"), height = plot_height, width = plot_width) graphics::plot(t_heatmap[["heatmap"]]) grDevices::dev.off() - + remaining_candidates <- as.integer(row.names(t_heatmap[["data_as_matrix"]])) openxlsx::write.xlsx(cbind(data[["ID"]][remaining_candidates, ], zscore = t_heatmap[["data_as_matrix"]]), paste0(output_path, "heatmap_data", suffix, ".xlsx"), overwrite = TRUE, keepNA = TRUE) - + mess <- paste0(mess, "Heatmap made for the union of all candidates. \n") - + rm(t_heatmap, remaining_candidates) - - - + + + #### Create On-Off Heatmap #### - + if(is.null(min_valid_values_on)){ min_valid_values_on <- length(intensity_columns) } - - t_on_off_heatmap <- calculate_onoff(D = data[["D"]], + + t_on_off_heatmap <- calculate_onoff(D = data[["D"]], id = data[["ID"]], group = data[["group"]], max_vv_off = max_valid_values_off, min_vv_on = min_valid_values_on, protein_id_col = 1) - + grDevices::pdf(paste0(output_path, "on_off_heatmap", suffix, ".pdf"), height = plot_height, width = plot_width) graphics::plot(t_on_off_heatmap) grDevices::dev.off() - + mess <- paste0(mess, "On-Off-Heatmap made. \n", "There were ", sum(t_on_off_heatmap[["isonoff"]]), " on/off proteins.") - + rm(t_on_off_heatmap) - - - + + + #### Save message log #### - + cat(mess, file = paste0(output_path, "message_log_anova", suffix, ".txt")) - - - + + + return(list("message" = mess)) -} \ No newline at end of file +} diff --git a/README.md b/README.md index ecdee70..5403e7b 100644 --- a/README.md +++ b/README.md @@ -12,3 +12,12 @@ You can install the development version of ProtStatsWF from devtools::install_github("mpc-bioinformatics/ProtStatsWF") library(ProtStatsWF) ``` + +New features will be introduced in github branches, which are merged as soon as the feature has been properly tested. +To install a specific branch of the package please use + +``` r +# install.packages("devtools") +devtools::install_github("mpc-bioinformatics/ProtStatsWF", ref = "") +library(ProtStatsWF) +``` diff --git a/man/ANOVA.Rd b/man/ANOVA.Rd index 55477a9..8c3b985 100644 --- a/man/ANOVA.Rd +++ b/man/ANOVA.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/WIP_ANOVA.R +% Please edit documentation in R/ANOVA.R \name{ANOVA} \alias{ANOVA} \title{ANOVA to compare three or more experimental groups} @@ -15,8 +15,7 @@ ANOVA( delog_for_FC = TRUE, log_base = 2, min_obs_per_group = 3, - min_perc_per_group = NULL, - filename = "results_ANOVA.xlsx" + min_perc_per_group = NULL ) } \arguments{ @@ -52,9 +51,6 @@ The minimum number of observations per group.} \item{min_perc_per_group}{\strong{integer} \cr The minimum ratio of observations per group as an alternative to min_obs_per_group.} - -\item{filename}{\strong{character} \cr -The name of the output file.} } \value{ A data.frame with p-values and fold changes diff --git a/man/ANOVA_Welch_single_row.Rd b/man/ANOVA_Welch_single_row.Rd index 27180fa..6515daa 100644 --- a/man/ANOVA_Welch_single_row.Rd +++ b/man/ANOVA_Welch_single_row.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/WIP_ANOVA.R +% Please edit documentation in R/ANOVA.R \name{ANOVA_Welch_single_row} \alias{ANOVA_Welch_single_row} \title{Welch ANOVA (unequal variances)} diff --git a/man/ANOVA_repeatedMeasurements_single_row.Rd b/man/ANOVA_repeatedMeasurements_single_row.Rd index a6d8b6a..a3fd951 100644 --- a/man/ANOVA_repeatedMeasurements_single_row.Rd +++ b/man/ANOVA_repeatedMeasurements_single_row.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/WIP_ANOVA.R +% Please edit documentation in R/ANOVA.R \name{ANOVA_repeatedMeasurements_single_row} \alias{ANOVA_repeatedMeasurements_single_row} \title{Repeated measures ANOVA (paired samples)} diff --git a/man/ANOVA_standard_single_row.Rd b/man/ANOVA_standard_single_row.Rd index 9b5dc92..54c6ee3 100644 --- a/man/ANOVA_standard_single_row.Rd +++ b/man/ANOVA_standard_single_row.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/WIP_ANOVA.R +% Please edit documentation in R/ANOVA.R \name{ANOVA_standard_single_row} \alias{ANOVA_standard_single_row} \title{Standard ANOVA (equal variances)} diff --git a/man/workflow_QC.Rd b/man/workflow_QC.Rd index 1a3542b..af2bc42 100644 --- a/man/workflow_QC.Rd +++ b/man/workflow_QC.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/workflow_QC.R \name{workflow_QC} \alias{workflow_QC} -\title{The main workflow for quality control of quantitative proteomics data} +\title{QC Workflow (QCQuant)} \usage{ workflow_QC( data_path, @@ -160,7 +160,12 @@ The size of the data points.} The workflow saves several plots and excel files and returns a message log of the workflow. } \description{ -The main workflow for quality control of quantitative proteomics data +Workflow for quality control of quantitative proteomics data +} +\details{ +This function performs quality control of quantitative proteomics data. +As a first step, the data can be normalized. +Then the following plots are generated: a valid value plot, boxplots, MA-plots and a PCA plot. } \examples{ diff --git a/man/workflow_ttest.Rd b/man/workflow_ttest.Rd index 66214c4..da47d70 100644 --- a/man/workflow_ttest.Rd +++ b/man/workflow_ttest.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/workflow_ttest.R \name{workflow_ttest} \alias{workflow_ttest} -\title{The workflow for a t-test of quantitative proteomics data.} +\title{t-test workflow} \usage{ workflow_ttest( data_path, @@ -20,7 +20,8 @@ workflow_ttest( plot_device = "pdf", plot_height = 15, plot_width = 15, - plot_dpi = 300 + plot_dpi = 300, + column_name_protein = "Protein" ) } \arguments{ @@ -76,7 +77,12 @@ The "dots per inch" of the plot aka. the plot resolution.} Returns a message log of the workflow. The log contains an overview of the settings and gives some information e.g number of significant candidates or on-off-proteins. } \description{ -The workflow for a t-test of quantitative proteomics data. +#' @description +Workflow for t-test analysis of quantitative proteomics data. +} +\details{ +This function performs a t-test to compare two experimental groups in a quantitative proteomics dataset. +Ideally, the data should be already normalized before performing the t-test (e.g. by using the \code{\link[=workflow_QC]{workflow_QC()}} function). } \examples{ @@ -94,7 +100,7 @@ out_path <- "C:/Users/thisuser/Documents/resultsFolder/" result <- workflow_ttest(data_path = in_path, output_path = out_path, intensity_columns = int_col) } - + } \seealso{ \code{\link[=workflow_ANOVA]{workflow_ANOVA()}} in case of more than two groups in the sample.\cr diff --git a/tests/testthat/_snaps/VolcanoPlot/volcano-plot-anova-file-2.svg b/tests/testthat/_snaps/VolcanoPlot/volcano-plot-anova-file-2.svg index ba324c1..fb2c195 100644 --- a/tests/testthat/_snaps/VolcanoPlot/volcano-plot-anova-file-2.svg +++ b/tests/testthat/_snaps/VolcanoPlot/volcano-plot-anova-file-2.svg @@ -43,16 +43,16 @@ - - - - - - - - - - + + + + + + + + + + @@ -77,19 +77,17 @@ 2.5 log2(FC) -log10(p) - -significance - - - - - - - -not significant -significant -significant after FDR correction -NA -state1 vs state3 + +significance + + + + + + +not significant +significant +significant after FDR correction +state1_divided_by_state3 diff --git a/tests/testthat/_snaps/VolcanoPlot/volcano-plot-ttest-file-2.svg b/tests/testthat/_snaps/VolcanoPlot/volcano-plot-ttest-file-2.svg index c03f31e..33596d3 100644 --- a/tests/testthat/_snaps/VolcanoPlot/volcano-plot-ttest-file-2.svg +++ b/tests/testthat/_snaps/VolcanoPlot/volcano-plot-ttest-file-2.svg @@ -40,18 +40,18 @@ - - - - - - - - - - - - + + + + + + + + + + + + @@ -74,19 +74,17 @@ 2 log2(FC) -log10(p) - -significance - - - - - - - -not significant -significant -significant after FDR correction -NA + +significance + + + + + + +not significant +significant +significant after FDR correction volcano_plot_ttest_file_2 diff --git a/tests/testthat/_snaps/candidate-boxplots/candidate_boxplots_1 b/tests/testthat/_snaps/candidate-boxplots/candidate_boxplots_1 index 63a044e..2c6d1fb 100644 Binary files a/tests/testthat/_snaps/candidate-boxplots/candidate_boxplots_1 and b/tests/testthat/_snaps/candidate-boxplots/candidate_boxplots_1 differ diff --git a/tests/testthat/_snaps/candidate-boxplots/candidate_boxplots_2 b/tests/testthat/_snaps/candidate-boxplots/candidate_boxplots_2 index b2348eb..30894d3 100644 Binary files a/tests/testthat/_snaps/candidate-boxplots/candidate_boxplots_2 and b/tests/testthat/_snaps/candidate-boxplots/candidate_boxplots_2 differ diff --git a/tests/testthat/_snaps/candidate-boxplots/candidate_boxplots_3 b/tests/testthat/_snaps/candidate-boxplots/candidate_boxplots_3 index de5543b..a4bc89d 100644 Binary files a/tests/testthat/_snaps/candidate-boxplots/candidate_boxplots_3 and b/tests/testthat/_snaps/candidate-boxplots/candidate_boxplots_3 differ diff --git a/tests/testthat/_snaps/candidate-boxplots/candidate_boxplots_4 b/tests/testthat/_snaps/candidate-boxplots/candidate_boxplots_4 index d9fc6a6..0123f7a 100644 Binary files a/tests/testthat/_snaps/candidate-boxplots/candidate_boxplots_4 and b/tests/testthat/_snaps/candidate-boxplots/candidate_boxplots_4 differ diff --git a/tests/testthat/test-ANOVA.R b/tests/testthat/test-ANOVA.R index 1389deb..7d89cff 100644 --- a/tests/testthat/test-ANOVA.R +++ b/tests/testthat/test-ANOVA.R @@ -1,22 +1,21 @@ test_that("Calculate ANOVA ", { - + # Create a temporary directory so no permanent files are put on a package users directory temp_dir <- tempfile(pattern = "test_dir") dir.create(temp_dir) - on.exit(unlink(temp_dir, recursive = TRUE)) - + on.exit(unlink(temp_dir, recursive = TRUE)) + data <- prepareTtestData(data_path = test_path("testdata", "test_file_2.xlsx"), intensity_columns = 3:11) - + expect_snapshot(data[["D"]]) expect_snapshot(data[["ID"]]) - - pData <- ANOVA(D = data[["D"]], id = data[["ID"]], - group = data[["group"]], sample = data[["sample"]], + + pData <- ANOVA(D = data[["D"]], id = data[["ID"]], + group = data[["group"]], sample = data[["sample"]], paired = FALSE, var.equal = TRUE, log_before_test = TRUE, delog_for_FC = TRUE, log_base = 2, - min_obs_per_group = 3, min_perc_per_group = NULL, - filename = paste0(file.path(temp_dir, "result_ANOVA.xlsx"))) - + min_obs_per_group = 3, min_perc_per_group = NULL) + expect_snapshot(pData) - -}) \ No newline at end of file + +})