diff --git a/cap2/api.py b/cap2/api.py index 76be1e3d..925ae695 100644 --- a/cap2/api.py +++ b/cap2/api.py @@ -1,8 +1,11 @@ import luigi +from .sample import Sample from .pipeline.databases import MODULES as DB_MODULES - +from .pipeline.full_pipeline import FullPipeline +from .pipeline.utils.cap_task import BaseCapTask +from .pipeline.utils.conda import CondaPackage, PyPiPackage from .constants import ( STAGES, STAGES_GROUP, @@ -22,6 +25,23 @@ def run_db_stage(config_path='', cores=1, **kwargs): luigi.build(instances, local_scheduler=True, **kwargs) +def install_software(config_path='', cores=1, workers=1, **kwargs): + sample = Sample('not_a_real_sample', '/this/will/never/be/run') + modules, queue = [], [FullPipeline.from_sample(sample, config_path, cores=cores)] + while queue: + obj = queue.pop() + if isinstance(obj, BaseCapTask): + try: + for dependency in obj.requires(): + queue.append(dependency) + except TypeError: + queue.append(obj.requires()) + elif isinstance(obj, CondaPackage) or isinstance(obj, PyPiPackage): + modules.append(obj) + luigi.build(modules, local_scheduler=True, workers=workers, **kwargs) + + + def run_stage(samples, stage_name, config_path='', cores=1, workers=1, **kwargs): """Run a subpipeline on a list of samples. stage_name can be one of `qc`, `pre`, `reads`.""" modules = STAGES[stage_name] diff --git a/cap2/cli.py b/cap2/cli.py index cd9b2024..b4aa3134 100644 --- a/cap2/cli.py +++ b/cap2/cli.py @@ -6,6 +6,7 @@ from .api import ( run_db_stage, run_stage, + install_software, ) from .sample import Sample from .constants import ( @@ -73,5 +74,17 @@ def cap_pipeline(workers, threads, config, stage, manifest): run_stage(samples, stage, config_path=config, cores=threads, workers=workers) +@run.command('install') +@click.option('-w', '--workers', default=1) +@click.option('-c', '--config', type=click.Path(), default='') +def cap_pipeline(workers, config): + """Install the software needed by the MCAP. 
+ + This command will make a series of calls to conda in + order to install the requisite software for the MCAP. + """ + install_software(config_path=config, workers=workers) + + if __name__ == '__main__': main() diff --git a/cap2/pipeline/config.py b/cap2/pipeline/config.py index da627f19..29d1e8a9 100644 --- a/cap2/pipeline/config.py +++ b/cap2/pipeline/config.py @@ -10,6 +10,8 @@ class PipelineConfig: def __init__(self, filename): if filename: self.blob = load(open(filename).read()) + elif 'CAP2_CONFIG' in environ: + self.blob = load(open(environ['CAP2_CONFIG']).read()) else: self.blob = {} self.out_dir = self.blob.get('out_dir', environ.get('CAP2_OUT_DIR', 'results')) diff --git a/cap2/pipeline/utils/conda.py b/cap2/pipeline/utils/conda.py index 7d954a12..7af77da7 100644 --- a/cap2/pipeline/utils/conda.py +++ b/cap2/pipeline/utils/conda.py @@ -59,7 +59,7 @@ def get_path(self, tool): def save_spec(self): proc = subprocess.Popen( - ' '.join(['conda', 'env', 'export', '--name', self.name]), + ' '.join(['conda', 'env', 'export']), stdout=subprocess.PIPE, shell=True ) @@ -111,7 +111,7 @@ def install(self, package, channel="anaconda"): except: print(f'Subprocess failed from {os.getcwd()}: {cmd}', file=sys.stderr) raise - self.save_spec() + # self.save_spec() self.add_to_path() def pypi_install(self, package): diff --git a/containers/README.md b/containers/README.md new file mode 100644 index 00000000..15812dec --- /dev/null +++ b/containers/README.md @@ -0,0 +1,41 @@ +# Containerization + + +## Running the MCAP with Docker + + +To run the MCAP with docker you will need to create two directories: 1) one where databases should be stored on your machine's filesystem and 2) one where the output from the mcap should go. For simplicity we will call these 1) `/my/databases` and `/my/outputs`. 
+ +To run the MCAP, use this command: + +``` +docker run \ + --mount source=/my/databases,target=/mcap/dbs,type=bind,readonly \ + --mount source=/my/outputs,target=/mcap/out,type=bind \ + -it mcap \ + cap2 --help +``` + +This command will print a help message and then exit. To run the MCAP pipeline, replace `cap2 --help` with a more complete command. + + +### Using external databases + +Typically you won't want to download new databases every time you use the MCAP. Unfortunately, the databases used by the MCAP are too large to fit comfortably into a Docker image. As a workaround, the MCAP Docker image can be set to look for already existing databases on your machine's filesystem. To do this, you need to 1) load databases onto your local filesystem and 2) instruct Docker to connect to these databases when it runs. + +#### Loading databases to your local filesystem + + +#### Letting docker connect to your databases + +Suppose you have the MCAP databases downloaded into a directory on your local filesystem called `/path/to/mcap/databases`. You can instruct Docker to connect to this folder at runtime using the following command: + +``` +docker run \ + --mount source=/path/to/mcap/databases,target=/mcap/dbs,type=bind,readonly \ + -it mcap \ + /bin/bash + +``` + +This command will make it possible for commands run in the Docker container to read data from `/path/to/mcap/databases`. The mount is read-only, meaning commands run in the container will not be able to edit files in that directory. 
\ No newline at end of file diff --git a/containers/docker/latest/Dockerfile b/containers/docker/latest/Dockerfile new file mode 100644 index 00000000..bb2c45a7 --- /dev/null +++ b/containers/docker/latest/Dockerfile @@ -0,0 +1,100 @@ +FROM --platform=linux/amd64 ubuntu:20.04 + +WORKDIR /mcap +RUN mkdir -p /mcap/dbs /mcap/demo + +# Necessary for R +ENV TZ=America/New_York +ENV DEBIAN_FRONTEND=noninteractive +RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone + + +RUN apt-get update -y && apt-get install -y \ + bzip2 \ + build-essential \ + zlib1g-dev \ + libc6 \ + libncurses5-dev \ + libncursesw5-dev \ + libnss-sss \ + libbz2-dev \ + liblzma-dev \ + less \ + libcurl4-openssl-dev \ + wget \ + unzip \ + zip \ + r-base \ + r-base-core \ + r-recommended \ + default-jre \ + default-jdk \ + python \ + python3 \ + python3-pip \ + mafft \ + curl \ + rsync \ + git && \ + rm -rf /var/lib/apt/lists/* + +# Install Miniconda +ENV PATH="/root/miniconda3/bin:${PATH}" +ARG PATH="/root/miniconda3/bin:${PATH}" +RUN wget \ + https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linux-x86_64.sh \ + && mkdir /root/.conda \ + && bash Miniconda3-py38_4.10.3-Linux-x86_64.sh -b \ + && rm -f Miniconda3-py38_4.10.3-Linux-x86_64.sh +RUN conda --version + +# Install software with conda +# Based on https://jcristharif.com/conda-docker-tips.html +RUN conda install --yes -c bioconda \ + numpy \ + pandas \ + scipy + +RUN pip install \ + pangea_api \ + Jinja2==3.0.0a1 \ + biopython==1.76 \ + click==6.7 \ + pysam \ + python-louvain + +RUN pip install \ + gimmebio.seqs \ + bloom_filter \ + luigi==3.0.0b2 \ + PyYaml==5.3.1 + +RUN conda install --yes -c conda-forge matplotlib +RUN conda clean -afy + +WORKDIR /mcap +RUN mkdir -p /mcap/out /mcap/conda/config/envs /mcap/conda/vendor/conda +COPY mcap_config.yaml /mcap/config.yaml +ENV CAP2_CONFIG=/mcap/config.yaml + + +# install most of the software here +# so future updates will be faster +RUN pip install cap2==0.5.3 +RUN 
cap2 --help +RUN cap2 run install + +RUN pip install cap2==0.5.3 + +RUN cap2 --help + +ADD demo /mcap/demo/ +WORKDIR /mcap/demo +# RUN cap2 run pipeline --stage qc manifest.txt + +WORKDIR /mcap +RUN cap2 run install + + + +CMD ["bash"] \ No newline at end of file diff --git a/containers/docker/latest/Makefile b/containers/docker/latest/Makefile new file mode 100644 index 00000000..d98a66e5 --- /dev/null +++ b/containers/docker/latest/Makefile @@ -0,0 +1,17 @@ + +build: + docker buildx build --platform linux/amd64 -t mcap . + +shell: build + docker run \ + --mount source=`pwd`/dbs,target=/mcap/dbs,type=bind,readonly \ + --mount source=`pwd`/out,target=/mcap/out,type=bind \ + -it mcap \ + /bin/bash + +mcap: build + docker run \ + --mount source=`pwd`/dbs,target=/mcap/dbs,type=bind,readonly \ + --mount source=`pwd`/out,target=/mcap/out,type=bind \ + -it mcap \ + cap2 --help \ No newline at end of file diff --git a/containers/docker/latest/dbs/README.txt b/containers/docker/latest/dbs/README.txt new file mode 100644 index 00000000..6afecb59 --- /dev/null +++ b/containers/docker/latest/dbs/README.txt @@ -0,0 +1 @@ +this file should only be visible in the docker container if dbs was mounted properly \ No newline at end of file diff --git a/containers/docker/latest/demo/Makefile b/containers/docker/latest/demo/Makefile new file mode 100644 index 00000000..6574984d --- /dev/null +++ b/containers/docker/latest/demo/Makefile @@ -0,0 +1,10 @@ + +qc: + cap2 run pipeline -s qc -c config.yaml manifest.txt + + +clean: + -rm -r config + -rm -r demo_db + -rm -r demo_out + -rm -r vendor diff --git a/containers/docker/latest/demo/README.md b/containers/docker/latest/demo/README.md new file mode 100644 index 00000000..c74eb482 --- /dev/null +++ b/containers/docker/latest/demo/README.md @@ -0,0 +1,14 @@ +# CAP2 Demo + +This directory is a demo for running the CAP from the command line. + +## Installation + +See `README.md` in main directory. 
+ +## Running + +Run the demo by entering the command `cap2 run pipeline -s qc -c config.yaml manifest.txt` or `make qc` + + +## Notes diff --git a/containers/docker/latest/demo/config.yaml b/containers/docker/latest/demo/config.yaml new file mode 100644 index 00000000..4151df1e --- /dev/null +++ b/containers/docker/latest/demo/config.yaml @@ -0,0 +1,2 @@ +out_dir: demo_out +db_dir: demo_db diff --git a/containers/docker/latest/demo/manifest.txt b/containers/docker/latest/demo/manifest.txt new file mode 100644 index 00000000..fa0eaee0 --- /dev/null +++ b/containers/docker/latest/demo/manifest.txt @@ -0,0 +1 @@ +zymo_pos_cntrl zymo_pos_cntrl.r1.fq.gz zymo_pos_cntrl.r2.fq.gz \ No newline at end of file diff --git a/containers/docker/latest/demo/zymo_pos_cntrl.r1.fq.gz b/containers/docker/latest/demo/zymo_pos_cntrl.r1.fq.gz new file mode 100644 index 00000000..45e5eed0 Binary files /dev/null and b/containers/docker/latest/demo/zymo_pos_cntrl.r1.fq.gz differ diff --git a/containers/docker/latest/demo/zymo_pos_cntrl.r2.fq.gz b/containers/docker/latest/demo/zymo_pos_cntrl.r2.fq.gz new file mode 100644 index 00000000..ee0911ad Binary files /dev/null and b/containers/docker/latest/demo/zymo_pos_cntrl.r2.fq.gz differ diff --git a/containers/docker/latest/mcap_config.yaml b/containers/docker/latest/mcap_config.yaml new file mode 100644 index 00000000..6ed2b1d6 --- /dev/null +++ b/containers/docker/latest/mcap_config.yaml @@ -0,0 +1,6 @@ +# This is the config file that is used in the MCAP docker container + +out_dir: out +db_dir: dbs +conda_spec_dir: /mcap/conda/config/envs +conda_base_path: /mcap/conda/vendor/conda \ No newline at end of file diff --git a/containers/docker/latest/out/README.txt b/containers/docker/latest/out/README.txt new file mode 100644 index 00000000..56c27e70 --- /dev/null +++ b/containers/docker/latest/out/README.txt @@ -0,0 +1 @@ +this file should only be visible in the docker container if out was mounted properly \ No newline at end of file diff --git a/setup.py 
b/setup.py index c9e6db4f..4379e27c 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ setuptools.setup( name='cap2', - version='0.4.1', + version='0.5.3', description="CAP2", author="David C. Danko", author_email='dcdanko@gmail.com',