Multiscale-Genomics · markmcdowall · May 11, 2018 · May 11, 2018 · May 11, 2018 · May 11, 2018
diff --git a/.travis.yml b/.travis.yml
@@ -25,6 +25,7 @@ cache:
     directories:
         - $HOME/.cache/pip
         - ${HOME}/lib
+        - ${HOME}/R
 
 before_cache:
     - rm -f $HOME/.cache/pip/log/debug.log
@@ -33,6 +34,7 @@ env:
     matrix:
         - TESTENV=docs
         - TESTENV=code
+        - TESTENV=rcode
         - TESTENV=wgbs_code_1
         - TESTENV=wgbs_code_2
         - TESTENV=pylint
@@ -93,10 +95,12 @@ before_install:
     - cd ${HOME}/build/Multiscale-Genomics/mg-process-fastq
     - sudo chmod +x scripts/travis/includeMAC2.sh
     - sudo chmod +x scripts/travis/includeTADbit.sh
+    - sudo chmod +x scripts/travis/install_r_code.sh
     - sudo chmod +x scripts/travis/install_wgbs_code.sh
 
-    - if [[ "$TESTENV" == "code" ]]; then sudo apt-get install r-base-core; fi
-    - if [[ "$TESTENV" == "code" ]]; then sudo apt-get install python-rpy2; fi
+    - if [[ "$TESTENV" == "rcode" ]]; then sudo apt-get install r-base-core; fi
+    - if [[ "$TESTENV" == "rcode" ]]; then sudo apt-get install python-rpy2; fi
+    - if [[ "$TESTENV" == "rcode" ]]; then sudo ./scripts/travis/install_r_code.sh; fi
     - if [[ "$TESTENV" == "code" ]]; then sudo ./scripts/travis/install_code_test_dependencies.sh; fi
     - if [[ "$TESTENV" == "code" ]]; then ./scripts/travis/includeMAC2.sh; fi
     - if [[ "$TESTENV" == "code" ]]; then ./scripts/travis/includeTADbit.sh; fi
@@ -204,7 +208,13 @@ before_script:
     # - echo "options(repos = c(CRAN = 'http://mirrors.ebi.ac.uk/CRAN/'))" > ${HOME}/.Rprofile
     # - echo ".libPaths('~/R')" >> ${HOME}/.Rprofile
     # - echo 'message("Using library:", .libPaths()[1])' >> ${HOME}/.Rprofile
-    # - Rscript scripts/install_packages.R
+    # - if [[ "$TESTENV" == "code" ]]; then Rscript scripts/install_packages.R; fi
+
+    - echo "R_LIB=${HOME}/R" > ${HOME}/.Renviron
+    - echo "options(repos = c(CRAN = 'http://mirrors.ebi.ac.uk/CRAN/'))" > ${HOME}/.Rprofile
+    - echo ".libPaths('~/R')" >> ${HOME}/.Rprofile
+    - echo 'message("Using library:", .libPaths()[1])' >> ${HOME}/.Rprofile
+    - if [[ "$TESTENV" == "rcode" ]]; then Rscript scripts/install_sleuth.R; fi
 
 
     - cd ${HOME}/build/Multiscale-Genomics/mg-process-fastq
@@ -218,6 +228,7 @@ before_script:
 
     - cd ${HOME}/build/Multiscale-Genomics/mg-process-fastq
     - chmod +x scripts/travis/harness.sh
+    - chmod +x scripts/travis/r_harness.sh
     - chmod +x scripts/travis/wgbs_harness.sh
     - chmod +x scripts/travis/docs_harness.sh
     - chmod +x scripts/travis/pylint_harness.sh
@@ -227,6 +238,7 @@ before_script:
     - export PATH="${HOME}/bin:$PATH"
 
 script:
+    - if [[ "$TESTENV" == "rcode" ]]; then ./scripts/travis/r_harness.sh; fi
     - if [[ "$TESTENV" == "code" ]]; then ./scripts/travis/harness.sh; fi
     - if [[ "$TESTENV" == "wgbs_code_1" ]]; then ./scripts/travis/wgbs_harness.sh; fi
     - if [[ "$TESTENV" == "wgbs_code_2" ]]; then ./scripts/travis/wgbs_harness.sh; fi

diff --git a/docs/adr.rst b/docs/adr.rst
@@ -58,6 +58,14 @@ Added compression of the split FASTQ files to reduce the amount of space require
 The code has been modified so that there is a single decompression of the BWA and Bowtie2 common indexes. The index files are then explicitly handed to the alignment task rather than handing over the compressed index. The decompression is performed as a @task so that the index files are already in the COMPSs system. This means that handing the index files to the alignment tasks creates a single symlink in the sandbox temporary file directory rather than duplicating the whole of the index structure for each job.
 
 
+2018-05-11 - Sleuth gene differential analysis pipeline
+-------------------------------------------------------
+
+This allows for the comparison of multiple RNA-seq experiments to determine if there are any genes that are differentially expressed. This has required changes to the output of the kallisto_quant tool so that it generates only a single tar file containing the abundance and run_info files. There is also the introduction of the bootstrap-sample parameter as part of the quantification to determine the accuracy of the counts.
+
+The first tool uses Sleuth to generate an R object of all the processed tracks. Separate tools are written for each visualisation to allow for a certain amount of parallelisation with the results being saved to an archive file.
+
+
 2018-05-22 - GEM Naming
 -----------------------
 
@@ -69,7 +77,7 @@ Update so that the gem files are name <genome-file>.gem.gz inline with requests
 
 To try and improve the quality of the reads that are used for numerous pipelines, TrimGalore has been included as a pipeline to aid in the clipping and removal of low quality regions of reads. The pipeline can be run on single or paired end FASTQ files. A report of the trimmed data is also returned for the user to identify what changes were made.
 
-2018-06-01 - Separated WGBS Vode Testing
+2018-06-01 - Separated WGBS Code Testing
 ----------------------------------------
 
 To bring down the run time for the TravisCI, the WGBS has been moved to a separate track. This has the benefit of getting the testing started earlier and allowing the other tests to finish sooner.

diff --git a/docs/full_installation.rst b/docs/full_installation.rst
@@ -409,12 +409,18 @@ Install iDEAR
 .. code-block:: none
    :linenos:
 
-   cd ${HOME}/lib
-   source("https://bioconductor.org/biocLite.R")
-   biocLite("BSgenome")
-   biocLite("DESeq2")
-   if(!require("devtools")) install.packages("devtools")
-   devtools::install_bitbucket("juanlmateo/idear")
+   cd ${HOME}/code/mg-process-fastq
+   Rscript scripts/install_packages.R
+
+
+Install Sleuth
+^^^^^^^^^^^^^^
+
+.. code-block:: none
+   :linenos:
+
+   cd ${HOME}/code/mg-process-fastq
+   Rscript scripts/install_sleuth.R
 
 Install TADbit
 ^^^^^^^^^^^^^^

diff --git a/docs/install.rst b/docs/install.rst
@@ -35,12 +35,15 @@ Software
 - HDF5
 - iNPS
 - Kallisto
+- Sleuth
 - libmaus2
 - pyenv
 - R 2.9.1+
 - SAMtools
 - MCL
 - pigz
+- iDEAR
+
 
 Python Modules
 ^^^^^^^^^^^^^^

diff --git a/docs/pipelines.rst b/docs/pipelines.rst
@@ -1012,7 +1012,7 @@ RNA-Seq Analysis
 
    Example
    -------
-   When running the pipeline on a local machinewithout COMPSs:
+   When running the pipeline on a local machine without COMPSs:
 
    .. code-block:: none
       :linenos:
@@ -1045,12 +1045,69 @@ RNA-Seq Analysis
       :members:
 
 
+.. automodule:: process_sleuth
+
+   This pipeline can process multiple outputs from the process_rnaseq Kallisto
+   pipeline to identify genes that are differentially expressed between datasets.
+
+   Running from the command line
+   =============================
+
+   Parameters
+   ----------
+   config : str
+      Configuration JSON file
+   in_metadata : str
+      Location of input JSON metadata for files
+   out_metadata : str
+      Location of output JSON metadata for files
+
+   Returns
+   -------
+   R data object : file
+      Sleuth R object
+
+   Example
+   -------
+   When running the pipeline on a local machine without COMPSs:
+
+   .. code-block:: none
+      :linenos:
+
+      python process_sleuth.py                                       \
+         --config tests/json/config_sleuth.json \
+         --in_metadata tests/json/input_sleuth.json \
+         --out_metadata tests/json/output_sleuth.json \
+         --local
+
+   When using a local version of the `COMPSs virtual machine <https://www.bsc.es/research-and-development/software-and-apps/software-list/comp-superscalar/>`_:
+
+   .. code-block:: none
+      :linenos:
+
+      runcompss                                        \
+         --lang=python                                 \
+         --library_path=${HOME}/bin                    \
+         --pythonpath=/<pyenv_virtenv_dir>/lib/python2.7/site-packages/ \
+         --log_level=debug                             \
+         process_sleuth.py                             \
+            --config tests/json/config_sleuth.json \
+            --in_metadata tests/json/input_sleuth.json \
+            --out_metadata tests/json/output_sleuth.json
+
+   Methods
+   =======
+   .. autoclass:: process_sleuth.process_sleuth
+      :members:
+
+
 TrimGalore
 ----------
 .. automodule:: process_trim_galore
 
    This pipeline can process FASTQ to trim poor base quality or adapter contamination.
 
+
    Running from the command line
    =============================
 

diff --git a/docs/test_data/index.rst b/docs/test_data/index.rst
@@ -34,6 +34,7 @@ Sample Data
    testData_iDamIDSeq
    testData_MNaseSeq
    testData_RNASeq
+   testData_Sleuth
    testData_WGBS
    tests_hic
 
@@ -155,6 +156,24 @@ There is a test for each of the tools. This uses the "process" scripts to run ea
    -----------
    :doc:`testData_RNASeq`
 
+   Sleuth
+   =======
+   To run the pipeline test:
+
+   .. code-block:: none
+
+      pytest tests/test_pipeline_sleuth.py
+
+
+   Methods
+   -------
+   .. automodule:: tests.test_pipeline_sleuth
+      :members:
+
+   Sample Data
+   -----------
+   :doc:`testData_Sleuth`
+
    Whole Genome Bisulfate Sequencing (WGBS)
    ========================================
    To run the pipeline test:

diff --git a/docs/test_data/testData_Sleuth.rst b/docs/test_data/testData_Sleuth.rst
@@ -0,0 +1,112 @@
+.. See the NOTICE file distributed with this work for additional information
+   regarding copyright ownership.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+Sleuth Test Data
+================
+
+Test Data
+---------
+
+Dataset
+^^^^^^^
+
++------------+--------------------------------------------------------------+
+| Stable IDs | ERR030856, ERR030857, ERR030858, ERR030872, ERR030903        |
++------------+--------------------------------------------------------------+
+| Project    | `PRJEB2445 <https://www.ebi.ac.uk/ena/data/view/PRJEB2445>`_ |
++------------+--------------------------------------------------------------+
+
+Genome
+^^^^^^
+
+cDNA was downloaded from `Ensembl 92 <http://ftp.ensembl.org/pub/release-92/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz>`_.
+
++-------------+--------+
+| Assembly    | GRCh38 |
++-------------+--------+
+| Transcripts | 1000   |
++-------------+--------+
+
+Method
+------
+The full dataset was downloaded from ENA and aligned to the cDNA using kallisto, producing a pseudo alignment bam file. Sleuth was used to calculate the most significant hits, of which the top 1000 were picked. These were used to select the matching FASTQ reads from the pseudo alignment files.
+
+
+hiseq_info.txt file:
+
+.. code-block:: none
+   :linenos:
+
+   ERR030856\tcontrol
+   ERR030857\tcontrol
+   ERR030858\tcontrol
+   ERR030872\tthyroid
+   ERR030903\tthyroid
+
+.. code-block:: R
+   :linenos:
+
+   library("sleuth")
+   sample_id <- dir(file.path("data", "results"))
+   kal_dirs <- file.path("data", "results", sample_id, "kallisto")
+
+   s2c <- read.table(file.path("data", "hiseq_info.txt"), header = TRUE, stringsAsFactors=FALSE)
+   s2c <- dplyr::select(s2c, sample, condition)
+   s2c <- dplyr::mutate(s2c, path = kal_dirs)
+
+   so <- sleuth_prep(s2c, extra_bootstrap_summary = TRUE, num_cores = 1)
+   so <- sleuth_fit(so, ~condition, 'full')
+   so <- sleuth_fit(so, ~1, 'reduced')
+   so <- sleuth_lrt(so, 'reduced', 'full')
+
+   sleuth_table <- sleuth_results(so, 'reduced:full', 'lrt', show_all = FALSE)
+   sleuth_significant <- dplyr::filter(sleuth_table, qval <= 0.05)
+
+   # Generate a set of transcripts to use for code testing
+   sample(sleuth_significant$target_id, 1000)
+
+
+.. code-block:: none
+   :linenos:
+
+   # Kallisto Quantification
+   kallisto quant -i GRCh38.cdna.fasta.idx -o ERR030856 --pseudobam --single -l 100 -s 0.01 ERR030856/ERR030856.fastq > ERR030856/ERR030856.sam
+   kallisto quant -i GRCh38.cdna.fasta.idx -o ERR030857 --pseudobam --single -l 100 -s 0.01 ERR030857/ERR030857.fastq > ERR030857/ERR030857.sam
+   kallisto quant -i GRCh38.cdna.fasta.idx -o ERR030858 --pseudobam --single -l 100 -s 0.01 ERR030858/ERR030858.fastq > ERR030858/ERR030858.sam
+   kallisto quant -i GRCh38.cdna.fasta.idx -o ERR030903 --pseudobam --single -l 75 -s 0.0133333 ERR030903/ERR030903.fastq > ERR030903/ERR030903.sam
+
+
+.. code-block:: none
+   :linenos:
+
+   # Extract the FASTQ read IDs for the selected transcripts
+   grep -f sleuth_sample_transcripts.txt ERR030872/ERR030872.sam | tr "\t" "~" | cut -d"~" -f1 | grep -v @ > ERR030872/ERR030872.reads
+   grep -f sleuth_sample_transcripts.txt ERR030903/ERR030903.sam | tr "\t" "~" | cut -d"~" -f1 | grep -v @ > ERR030903/ERR030903.reads
+   grep -f sleuth_sample_transcripts.txt ERR030856/ERR030856.sam | tr "\t" "~" | cut -d"~" -f1 | grep -v @ > ERR030856/ERR030856.reads
+   grep -f sleuth_sample_transcripts.txt ERR030857/ERR030857.sam | tr "\t" "~" | cut -d"~" -f1 | grep -v @ > ERR030857/ERR030857.reads
+   grep -f sleuth_sample_transcripts.txt ERR030858/ERR030858.sam | tr "\t" "~" | cut -d"~" -f1 | grep -v @ > ERR030858/ERR030858.reads
+
+
+.. code-block:: none
+   :linenos:
+
+   # Extract the original reads from the FASTQ files
+   python scripts/ExtractRowsFromFASTQs.py --input_1 ERR030856/ERR030856.fastq --rows ERR030856/ERR030856.reads --prop 0.1 --output_tag subset
+   python scripts/ExtractRowsFromFASTQs.py --input_1 ERR030857/ERR030857.fastq --rows ERR030857/ERR030857.reads --prop 0.1 --output_tag subset
+   python scripts/ExtractRowsFromFASTQs.py --input_1 ERR030858/ERR030858.fastq --rows ERR030858/ERR030858.reads --prop 0.1 --output_tag subset
+   python scripts/ExtractRowsFromFASTQs.py --input_1 ERR030872/ERR030872_1.fastq --input_2 ERR030872/ERR030872_2.fastq --rows ERR030872/ERR030872.reads --prop 0.1 --output_tag subset
+   python scripts/ExtractRowsFromFASTQs.py --input_1 ERR030903/ERR030903.fastq --rows ERR030903/ERR030903.reads --prop 0.1 --output_tag subset
+
+Due to the number of reads that match the transcripts, only 1% have been kept for code testing.
diff --git a/docs/tools.rst b/docs/tools.rst
@@ -139,6 +139,15 @@ Tools for processing FastQ files
       :members:
 
 
+   Analysis
+   ========
+
+   Sleuth
+   ------
+   .. autoclass:: tool.sleuth.sleuthTool
+      :members:
+
+
    Hi-C Parsing
    ============
 

diff --git a/process_idear.py b/process_idear.py
@@ -19,9 +19,6 @@
 
 from __future__ import print_function
 
-# Required for ReadTheDocs
-from functools import wraps  # pylint: disable=unused-import
-
 import argparse
 
 from basic_modules.workflow import Workflow