Refactor exome pipeline (resolves #270, resolves #238, resolves #276, r…

…esolves #282, resolves #279, resolves #284, resolves #274)
BD2KGenomics · May 28, 2016 · 22b80fe · 22b80fe
1 parent 9e28c2f
commit 22b80fe
Show file tree

Hide file tree

Showing 21 changed files with 868 additions and 909 deletions.
diff --git a/jenkins.sh b/jenkins.sh
@@ -13,6 +13,7 @@ virtualenv venv
 . venv/bin/activate
 make develop
 make test
+make clean
 rm -rf bin s3am
 make pypi
 rm -rf venv
diff --git a/setup.py b/setup.py
@@ -32,7 +32,8 @@
     install_requires=[
         'toil==' + toil_version,
         'boto==2.38.0', # FIXME: Make an extra
-        'tqdm==3.8.0'], # FIXME: Remove once ADAM stops using it (superfluous import)
+        'tqdm==3.8.0', # FIXME: Remove once ADAM stops using it (superfluous import)
+        'pyyaml==3.11'],
     tests_require=[
         'pytest==2.8.3'],
     package_dir={'': 'src'},
@@ -43,7 +44,7 @@
             'toil-rnaseq = toil_scripts.rnaseq_cgl.rnaseq_cgl_pipeline:main',
             'toil-rnaseq-unc = toil_scripts.rnaseq_unc.rnaseq_unc_pipeline:main',
             'toil-spladder = toil_scripts.spladder_pipeline.spladder_pipeline:main',
-            'toil-variant = toil_scripts.exome_variant_pipeline.exome_variant_pipeline:main']})
+            'toil-exome = toil_scripts.exome_variant_pipeline.exome_variant_pipeline:main']})
 
 
 class PyTest(TestCommand):

diff --git a/src/toil_scripts/adam_pipeline/adam_preprocessing.py b/src/toil_scripts/adam_pipeline/adam_preprocessing.py
@@ -97,7 +97,6 @@ def call_conductor(masterIP, inputs, src, dst):
                  "--conf", "spark.driver.memory=%sg" % inputs["driverMemory"],
                  "--conf", "spark.executor.memory=%sg" % inputs["executorMemory"],
                  "--", "-C", src, dst],
-                sudo = inputs['sudo'],
                 mock=False)
 
 
@@ -117,7 +116,6 @@ def call_adam(masterIP, inputs, arguments):
                 tool = "quay.io/ucsc_cgl/adam:962-ehf--6e7085f8cac4b9a927dc9fb06b48007957256b80",
                 docker_parameters = masterIP.docker_parameters(["--net=host"]),
                 parameters = default_params + arguments,
-                sudo = inputs['sudo'],
                 mock=False)
 
 

diff --git a/src/toil_scripts/batch_alignment/README.md b/src/toil_scripts/batch_alignment/README.md
@@ -38,7 +38,7 @@ It is likely that the job store positional argument, `--workDir`, and `--output-
 To run a pipeline after dependencies have been installed, simply:
 
 * `git clone https://github.com/BD2KGenomics/toil-scripts`
-* `/toil-scripts/src/toil_scripts/spladder_pipeline/launch_bwa_hg38_no_alt.sh`
+* `/toil-scripts/src/toil_scripts/batch_alignment/launch_bwa_hg38_no_alt.sh`
 
 Due to PYTHONPATH issues, help can be found by typing:
 

diff --git a/src/toil_scripts/batch_alignment/bwa_alignment.py b/src/toil_scripts/batch_alignment/bwa_alignment.py
@@ -37,10 +37,11 @@ def download_shared_files(job, inputs):
     """
     Downloads shared files that are used by all samples for alignment
 
+    :param JobFunctionWrappingJob job: passed automatically by Toil
     :param JobFunctionWrappingJob job: Passed by Toil automatically
     :param Namespace inputs: Input arguments (see main)
     """
-    job.fileStore.logToMaster('Downloading shared files for aligment.')
+    job.fileStore.logToMaster('Downloading shared files for alignment.')
 
     shared_files = [inputs.ref, inputs.amb, inputs.ann, inputs.bwt, inputs.pac, inputs.sa, inputs.fai]
     if inputs.alt:
@@ -133,7 +134,7 @@ def run_bwa(job, inputs, ids):
     outputs = {'aligned.aln.bam': inputs.mock_bam}
 
     docker_call(tool='quay.io/ucsc_cgl/bwakit:0.7.12--528bb9bf73099a31e74a7f5e6e3f2e0a41da486e',
-                parameters=parameters, inputs=file_names, outputs=outputs, work_dir=work_dir, sudo=inputs.sudo)
+                parameters=parameters, inputs=file_names, outputs=outputs, work_dir=work_dir)
 
     # BWA insists on adding an `*.aln.sam` suffix, so rename the output file
     output_file = os.path.join(work_dir, '{}.bam'.format(inputs.uuid))
@@ -165,9 +166,6 @@ def build_parser():
                         help='Alternate file for reference build (alt). Necessary for alt aware alignment')
     parser.add_argument('--ssec', default=None, help='Path to Key File for SSE-C Encryption')
     parser.add_argument('--output-dir', default=None, help='full path where final results will be output')
-    parser.add_argument('--sudo', dest='sudo', default=False, action='store_true',
-                        help='Docker usually needs sudo to execute locally, but not when running Mesos '
-                             'or when a member of a Docker group.')
     parser.add_argument('--s3-dir', default=None, help='S3 Directory, starting with bucket name. e.g.: '
                                                        'cgl-driver-projects/ckcc/rna-seq-samples/')
     parser.add_argument('--file-size', default='50G', help='Approximate input file size. Should be given as %d[TGMK], '

diff --git a/src/toil_scripts/batch_alignment/test/test_bwa_alignment.py b/src/toil_scripts/batch_alignment/test/test_bwa_alignment.py
@@ -4,7 +4,7 @@
 from boto.s3.connection import S3Connection, Bucket, Key
 
 
-def test_rnaseq_cgl(tmpdir):
+def test_bwa(tmpdir):
     work_dir = str(tmpdir)
     create_config(work_dir)
     subdir = '/mnt/ephemeral/toil-scripts/bwa'

diff --git a/src/toil_scripts/exome_variant_pipeline/README.md b/src/toil_scripts/exome_variant_pipeline/README.md
@@ -1,92 +1,137 @@
 ## University of California, Santa Cruz Genomics Institute
-### Guide: Running Exome Variant Pipeline (GATK pre-processing, MuTect, MuSe, Pindel)
+### Guide: Running the CGL Exome Pipeline using Toil
 
 This guide attempts to walk the user through running this pipeline from start to finish. If there are any questions
 please contact John Vivian (jtvivian@gmail.com). If you find any errors or corrections please feel free to make a 
 pull request.  Feedback of any kind is appreciated.
 
 ## Overview
-This pipeline accepts a tumor/normal (T/N) pair of exome BAMFILES and produces a tarball (tar.gz) file for a
-given sample that contains:
 
-    MuTect output (.vcf, .cov, .out)
-    Pindel output (including .vcf conversions)
-    MuSe output
-
-The output tarball is *stamped* with the UUID for the sample (e.g. UUID.tar.gz).
+A pair of Tumor/Normal exome BAMs are preprocessed (GATK), indels are found (Pindel), and variants are
+called with two mutation callers (MuTect and MuSe).  This pipeline is modular — any part of the 
+pipeline can be run on its own. If preprocessing is selected, it will always occur before any of the other tools.
+
+This pipeline produces a tarball (tar.gz) file for a given sample that contains:
+
+    MuTect: Mutect.vcf, Mutect.cov, Mutect.out
+    Pindel: 
+    MuSe: Muse.vcf
+
+The output tarball is *stamped* with the UUID for the sample (e.g. UUID.tar.gz). 
+
+## Installation
+
+Toil-scripts is now pip installable! Use `pip install toil-scripts` for the stable version
+or `pip install --pre toil-scripts` for the cutting-edge development version.
+
+Type `toil-exome` to get a basic help menu and instructions.
 
 ## Dependencies
+
 This pipeline has been tested on Ubuntu 14.04, but should also run on other unix based systems.  `apt-get` and `pip`
-often require `sudo` privilege, so if the below commands fail, try prepending `sudo`.  If you do not have sudo 
-privileges you will need to build these tools from source, or bug a sysadmin (they don't mind). 
+often require `sudo` privilege, so if the below commands fail, try prepending `sudo`.  If you do not have `sudo` 
+privileges you will need to build these tools from source, or bug a sysadmin about how to get them (they don't mind). 
 
 #### General Dependencies
+
     1. Python 2.7
     2. Curl         apt-get install curl
     3. Docker       http://docs.docker.com/engine/installation/
 
 #### Python Dependencies
+
     1. Toil         pip install toil
-    2. Boto         pip install boto (optional, only needed if uploading results to S3)
-
-## Getting Started
-#### Running a single sample locally
-From the BD2KGenomics toil-scripts Github repository, download the following files to the same directory.
-
-    1. toil-scripts/exome_variant_pipeline/exome_variant_pipeline.py
-    2. toil-scripts/exome_variant_pipeline/launch_variant_hg19.sh
-
-The bash script `launch_variant_hg19.sh` contains all of the parameters required to run this pipeline, although you 
-will likely want to modify a couple lines as it assumes everything will be staged from your home directory.
-
-| Parameter                 | Function                                                                                                                              |
-|---------------------------|---------------------------------------------------------------------------------------------------------------------------------------|
-| 1st argument (unlabelled) | Path to where the jobStore will exist.  The jobStore hosts intermediate files during runtime                                          |
-| `--config`                | Path to the config file. With the format:  UUID,Normal_URL,Tumor_URL                                                                  |
-| `--retryCount`            | OPTIONAL: Number of times a failed job will retried. Useful for non-systemic failures (HTTP requests, etc)                            |
-| `--ssec`                  | OPTIONAL: Path to a master key if input files are encrypted in S3                                                                     |
-| `--output_dir`            | OPTIONAL: Directory where final output of pipeline will be placed                                                                     |
-| `--s3_dir`                | OPTIONAL: S3 "Directory" (bucket + directories)                                                                                       |
-| `--workDir`               | OPTIONAL: Location where tmp files will be placed during pipeline run.,If not used, defaults to TMPDIR environment variable.          |
-| `--sudo`                  | OPTIONAL: Prepends "sudo" to all docker commands. Necessary if user is not a member of a docker group or does not have root privilege |
-| `--restart`               | OPTIONAL: Restarts pipeline after failure, requires presence of an existing jobStore.                                                 |
-
-The first argument (location of the jobStore) and the directory set in `--workDir`, need *plenty* of space to store 
-intermediate files during pipeline execution.  Change those parameters to point to the appropriate scratch space or
-wherever there exists sufficient storage. The servers I have tested on have 700GB of disk space, which is plenty,
-but ultimately this is contingent upon sample size.
-
-#### Running a sample on a batch system (gridEngine, Parasol, etc).
-To run your pipeline using the gridEngine batch system, simply add the argument `--batchSystem=gridEngine` to the launch
-script.  We currently support Grid Engine, Parasol, and Mesos. 
-
-#### Running batches of samples
-`--config` accepts a path to a CSV file that **must** follow the format of one sample 
-per line: UUID,url_to_fastq1,url_to_fastq2
-
-# Advanced: Running the Pipeline on a Distributed Cloud Cluster (using Mesos)
-From the BD2KGenomics toil-scripts Github repository, download the following files which will run on the head node.
-
-    1. toil-scripts/exome_variant_pipeline/exome_variant_pipeline.py
-    2. toil-scripts/exome_variant_pipeline/launch_variant_hg19_mesos.sh
-    3. toil-scripts/exome_variant_pipeline/exome_variant_config.csv
-
-It is outside the scope of this guide to explain how to setup a distributed cloud cluster.  I recommend taking a 
-look at the BD2KGenomics tool: [CGCloud](https://github.com/BD2KGenomics/cgcloud), which can setup a distributed 
-cloud cluster using the Mesos batch system in AWS.  Please do not direct questions related to CGCloud or 
-setting up a distributed cluster to the author of this pipeline. 
-
-A launch script (`launch_variant_hg19_mesos.sh`) has been prepared that will run on the head node of the Mesos cluster, scheduling jobs to the worker
-nodes that exist within the cluster.
-
-Explanation of additional parameters
-
-| Parameter     | Function                                                                                                                 |
-|---------------|--------------------------------------------------------------------------------------------------------------------------|
-| 1st argument  | This now points to an AWS jobStore                                                                                       |
-| `--batchSystem` | Path to the config csv file OR the sample.tar.  UUID for the sample is based off the filename before the .tar extension. |
-| `--masterIP`    | A boilerplate argument that indicates what port to use                                                                   |
-| `--sseKey`      | OPTIONAL: Encrypts intermediate files when using cloud jobStore.
+    2. S3AM         pip install --pre s3am (optional, needed for uploading output to S3)
+
+## Inputs
+
+The CGL exome pipeline requires input files in order to run. These files are hosted on Synapse and can 
+be downloaded after creating an account which takes about 1 minute and is free. 
+
+* Register for a [Synapse account](https://www.synapse.org/#!RegisterAccount:0)
+* Either download the samples from the [website GUI](https://www.synapse.org/#!Synapse:syn5886029) or use the Python API
+* `pip install synapseclient`
+* `python`
+    * `import synapseclient`
+    * `syn = synapseclient.Synapse()`
+    * `syn.login('foo@bar.com', 'password')`
+    * Get the Reference Genome (3 G)
+        * `syn.get('syn6128232', downloadLocation='.')`
+    * Get the Phase VCF (0.3 G)
+        * `syn.get('syn6128233', downloadLocation='.')`
+    * Get the Mills VCF (0.1 G)
+        * `syn.get('syn6128236', downloadLocation='.')`
+    * Get the DBSNP VCF (10 G)
+        * `syn.get('syn6128237', downloadLocation='.')`
+    * Get the Cosmic VCF (0.01 G)
+        * `syn.get('syn6128235', downloadLocation='.')`
+
+
+## General Usage
 
-## Additional Information
-Launch scripts are provided for bams that have been aligned to b37, hg19, and hg38.
+1. Type `toil-exome generate` to create an editable manifest and config in the current working directory.
+2. Parameterize the pipeline by editing the config.
+3. Fill in the manifest with information pertaining to your samples.
+4. Type `toil-exome run [jobStore]` to execute the pipeline.
+
+## Example Commands
+
+Run sample(s) locally using the manifest
+1. `toil-exome generate`
+2. Fill in config and manifest
+3. `toil-exome run ./example-jobstore`
+
+Toil options can be appended to `toil-exome run`, for example:
+`toil-exome run ./example-jobstore --retryCount=1 --workDir=/data`
+
+For a complete list of Toil options, just type `toil-exome run -h`
+
+Run a variety of samples locally
+1. `toil-exome generate-config`
+2. Fill in config
+3. `toil-exome run ./example-jobstore --retryCount=1 --workDir=/data --samples \
+    s3://example-bucket/sample_1.tar file:///full/path/to/sample_2.tar https://sample-depot.com/sample_3.tar`
+
+## Example Config
+
+HG19
+```
+reference: s3://cgl-pipeline-inputs/variant_hg19/hg19.fa                     
+phase: s3://cgl-pipeline-inputs/variant_hg19/1000G_phase1.indels.hg19.sites.vcf                  
+mills: s3://cgl-pipeline-inputs/variant_hg19/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf
+dbsnp: s3://cgl-pipeline-inputs/variant_hg19/dbsnp_138.hg19.vcf
+cosmic: s3://cgl-pipeline-inputs/variant_hg19/cosmic.hg19.vcf                 
+run-mutect: true        
+run-pindel: true        
+run-muse: true          
+preprocessing: true     
+output-dir: /data/my-toil-run          
+s3-dir: s3://my-bucket/test/exome
+ssec:                   
+gtkey:                  
+ci-test:
+```
+
+B37
+```
+reference: https://s3-us-west-2.amazonaws.com/cgl-pipeline-inputs/variant_b37/Homo_sapiens_assembly19.fasta
+phase: https://s3-us-west-2.amazonaws.com/cgl-pipeline-inputs/variant_b37/1000G_phase1.indels.hg19.sites.fixed.vcf
+mills: https://s3-us-west-2.amazonaws.com/cgl-pipeline-inputs/variant_b37/Mills_and_1000G_gold_standard.indels.hg19.sites.fixed.vcf
+dbsnp: https://s3-us-west-2.amazonaws.com/cgl-pipeline-inputs/variant_b37/dbsnp_132_b37.leftAligned.vcf
+cosmic: https://s3-us-west-2.amazonaws.com/cgl-pipeline-inputs/variant_b37/b37_cosmic_v54_120711.vcf
+run-mutect: true        
+run-pindel: true        
+run-muse: true          
+preprocessing: true     
+output-dir:          
+s3-dir:                 
+ssec:                   
+gtkey:                  
+ci-test:
+```
+
+## Distributed Run
+
+To run on a distributed AWS cluster, see [CGCloud](https://github.com/BD2KGenomics/cgcloud) for instance provisioning, 
+then run `toil-exome run aws:us-west-2:example-jobstore-bucket --batchSystem=mesos --mesosMaster mesos-master:5050`
+to use the AWS job store and the Mesos batch system.
diff --git a/src/toil_scripts/exome_variant_pipeline/exome_variant_config.csv b/src/toil_scripts/exome_variant_pipeline/exome_variant_config.csv