Commit

Refactor exome pipeline (resolves #270, resolves #238, resolves #276, resolves #282, resolves #279, resolves #284, resolves #274)
jvivian committed May 28, 2016
1 parent 9e28c2f commit 22b80fe
Showing 21 changed files with 868 additions and 909 deletions.
1 change: 1 addition & 0 deletions jenkins.sh
@@ -13,6 +13,7 @@ virtualenv venv
. venv/bin/activate
make develop
make test
make clean
rm -rf bin s3am
make pypi
rm -rf venv
5 changes: 3 additions & 2 deletions setup.py
@@ -32,7 +32,8 @@
install_requires=[
'toil==' + toil_version,
'boto==2.38.0', # FIXME: Make an extra
'tqdm==3.8.0'], # FIXME: Remove once ADAM stops using it (superfluous import)
'tqdm==3.8.0', # FIXME: Remove once ADAM stops using it (superfluous import)
'pyyaml==3.11'],
tests_require=[
'pytest==2.8.3'],
package_dir={'': 'src'},
@@ -43,7 +44,7 @@
'toil-rnaseq = toil_scripts.rnaseq_cgl.rnaseq_cgl_pipeline:main',
'toil-rnaseq-unc = toil_scripts.rnaseq_unc.rnaseq_unc_pipeline:main',
'toil-spladder = toil_scripts.spladder_pipeline.spladder_pipeline:main',
'toil-variant = toil_scripts.exome_variant_pipeline.exome_variant_pipeline:main']})
'toil-exome = toil_scripts.exome_variant_pipeline.exome_variant_pipeline:main']})


class PyTest(TestCommand):
2 changes: 0 additions & 2 deletions src/toil_scripts/adam_pipeline/adam_preprocessing.py
@@ -97,7 +97,6 @@ def call_conductor(masterIP, inputs, src, dst):
"--conf", "spark.driver.memory=%sg" % inputs["driverMemory"],
"--conf", "spark.executor.memory=%sg" % inputs["executorMemory"],
"--", "-C", src, dst],
sudo = inputs['sudo'],
mock=False)


@@ -117,7 +116,6 @@ def call_adam(masterIP, inputs, arguments):
tool = "quay.io/ucsc_cgl/adam:962-ehf--6e7085f8cac4b9a927dc9fb06b48007957256b80",
docker_parameters = masterIP.docker_parameters(["--net=host"]),
parameters = default_params + arguments,
sudo = inputs['sudo'],
mock=False)


2 changes: 1 addition & 1 deletion src/toil_scripts/batch_alignment/README.md
@@ -38,7 +38,7 @@ It is likely that the job store positional argument, `--workDir`, and `--output-
To run a pipeline after dependencies have been installed, simply:

* `git clone https://github.com/BD2KGenomics/toil-scripts`
* `/toil-scripts/src/toil_scripts/spladder_pipeline/launch_bwa_hg38_no_alt.sh`
* `/toil-scripts/src/toil_scripts/batch_alignment/launch_bwa_hg38_no_alt.sh`

Due to PYTHONPATH issues, help can be found by typing:

8 changes: 3 additions & 5 deletions src/toil_scripts/batch_alignment/bwa_alignment.py
@@ -37,10 +37,11 @@ def download_shared_files(job, inputs):
"""
Downloads shared files that are used by all samples for alignment
:param JobFunctionWrappingJob job: passed automatically by Toil
:param JobFunctionWrappingJob job: Passed by Toil automatically
:param Namespace inputs: Input arguments (see main)
"""
job.fileStore.logToMaster('Downloading shared files for aligment.')
job.fileStore.logToMaster('Downloading shared files for alignment.')

shared_files = [inputs.ref, inputs.amb, inputs.ann, inputs.bwt, inputs.pac, inputs.sa, inputs.fai]
if inputs.alt:
@@ -133,7 +134,7 @@ def run_bwa(job, inputs, ids):
outputs = {'aligned.aln.bam': inputs.mock_bam}

docker_call(tool='quay.io/ucsc_cgl/bwakit:0.7.12--528bb9bf73099a31e74a7f5e6e3f2e0a41da486e',
parameters=parameters, inputs=file_names, outputs=outputs, work_dir=work_dir, sudo=inputs.sudo)
parameters=parameters, inputs=file_names, outputs=outputs, work_dir=work_dir)

# BWA insists on adding an `*.aln.sam` suffix, so rename the output file
output_file = os.path.join(work_dir, '{}.bam'.format(inputs.uuid))
@@ -165,9 +166,6 @@ def build_parser():
help='Alternate file for reference build (alt). Necessary for alt aware alignment')
parser.add_argument('--ssec', default=None, help='Path to Key File for SSE-C Encryption')
parser.add_argument('--output-dir', default=None, help='full path where final results will be output')
parser.add_argument('--sudo', dest='sudo', default=False, action='store_true',
help='Docker usually needs sudo to execute locally, but not when running Mesos '
'or when a member of a Docker group.')
parser.add_argument('--s3-dir', default=None, help='S3 Directory, starting with bucket name. e.g.: '
'cgl-driver-projects/ckcc/rna-seq-samples/')
parser.add_argument('--file-size', default='50G', help='Approximate input file size. Should be given as %d[TGMK], '
@@ -4,7 +4,7 @@
from boto.s3.connection import S3Connection, Bucket, Key


def test_rnaseq_cgl(tmpdir):
def test_bwa(tmpdir):
work_dir = str(tmpdir)
create_config(work_dir)
subdir = '/mnt/ephemeral/toil-scripts/bwa'
189 changes: 117 additions & 72 deletions src/toil_scripts/exome_variant_pipeline/README.md
@@ -1,92 +1,137 @@
## University of California, Santa Cruz Genomics Institute
### Guide: Running Exome Variant Pipeline (GATK pre-processing, MuTect, MuSe, Pindel)
### Guide: Running the CGL Exome Pipeline using Toil

This guide attempts to walk the user through running this pipeline from start to finish. If there are any questions,
please contact John Vivian (jtvivian@gmail.com). If you find any errors, or have corrections, please feel free to make
a pull request. Feedback of any kind is appreciated.

## Overview
This pipeline accepts a tumor/normal (T/N) pair of exome BAM files and produces a tarball (tar.gz) file for a
given sample that contains:

MuTect output (.vcf, .cov, .out)
Pindel output (including .vcf conversions)
MuSe output

The output tarball is *stamped* with the UUID for the sample (e.g. UUID.tar.gz).
A pair of Tumor/Normal exome BAMs is preprocessed (GATK), indels are found (Pindel), and variants are
called with two mutation callers (MuTect and MuSe). This pipeline is modular: any part of the
pipeline can be run on its own. If preprocessing is selected, it will always occur before any of the other tools.

This pipeline produces a tarball (tar.gz) file for a given sample that contains:

MuTect: Mutect.vcf, Mutect.cov, Mutect.out
Pindel:
MuSe: Muse.vcf

The output tarball is *stamped* with the UUID for the sample (e.g. UUID.tar.gz).
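
A quick way to inspect what a run produced is to list the tarball with Python's standard library. A minimal sketch, assuming a finished tarball named `UUID.tar.gz` (the name is a placeholder for the real sample UUID):

```python
import tarfile

# List the contents of a finished sample tarball.
# 'UUID.tar.gz' is a placeholder for the real sample UUID.
with tarfile.open('UUID.tar.gz', 'r:gz') as tar:
    for member in tar.getmembers():
        print('{}\t{} bytes'.format(member.name, member.size))
```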

## Installation

Toil-scripts is now pip installable! `pip install toil-scripts` for the stable version
or `pip install --pre toil-scripts` for the cutting-edge development version.

Type `toil-exome` to get a basic help menu and instructions.

## Dependencies

This pipeline has been tested on Ubuntu 14.04, but should also run on other unix based systems. `apt-get` and `pip`
often require `sudo` privilege, so if the below commands fail, try prepending `sudo`. If you do not have sudo
privileges you will need to build these tools from source, or bug a sysadmin (they don't mind).
often require `sudo` privilege, so if the below commands fail, try prepending `sudo`. If you do not have `sudo`
privileges you will need to build these tools from source, or bug a sysadmin about how to get them (they don't mind).

#### General Dependencies

1. Python 2.7
2. Curl: `apt-get install curl`
3. Docker: http://docs.docker.com/engine/installation/

#### Python Dependencies

1. Toil: `pip install toil`
2. Boto: `pip install boto` (optional, only needed if uploading results to S3)
2. S3AM: `pip install --pre s3am` (optional, needed for uploading output to S3)

## Getting Started
#### Running a single sample locally
From the BD2KGenomics toil-scripts GitHub repository, download the following files to the same directory.

1. toil-scripts/exome_variant_pipeline/exome_variant_pipeline.py
2. toil-scripts/exome_variant_pipeline/launch_variant_hg19.sh

The bash script `launch_variant_hg19.sh` contains all of the parameters required to run this pipeline, although you
will likely want to modify a couple of lines, as it assumes everything will be staged from your home directory.

| Parameter | Function |
|---------------------------|---------------------------------------------------------------------------------------------------------------------------------------|
| 1st argument (unlabelled) | Path to where the jobStore will exist. The jobStore hosts intermediate files during runtime |
| `--config` | Path to the config file, with the format: UUID,Normal_URL,Tumor_URL |
| `--retryCount` | OPTIONAL: Number of times a failed job will be retried. Useful for non-systemic failures (HTTP requests, etc) |
| `--ssec` | OPTIONAL: Path to a master key if input files are encrypted in S3 |
| `--output_dir` | OPTIONAL: Directory where final output of pipeline will be placed |
| `--s3_dir` | OPTIONAL: S3 "Directory" (bucket + directories) |
| `--workDir` | OPTIONAL: Location where tmp files will be placed during the pipeline run. If not used, defaults to the TMPDIR environment variable. |
| `--sudo` | OPTIONAL: Prepends "sudo" to all docker commands. Necessary if user is not a member of a docker group or does not have root privilege |
| `--restart` | OPTIONAL: Restarts pipeline after failure, requires presence of an existing jobStore. |

The first argument (location of the jobStore) and the directory set in `--workDir`, need *plenty* of space to store
intermediate files during pipeline execution. Change those parameters to point to the appropriate scratch space or
wherever there exists sufficient storage. The servers I have tested on have 700GB of disk space, which is plenty,
but ultimately this is contingent upon sample size.

#### Running a sample on a batch system (gridEngine, Parasol, etc).
To run your pipeline using the gridEngine batch system, simply add the argument `--batchSystem=gridEngine` to the launch
script. We currently support Grid Engine, Parasol, and Mesos.

#### Running batches of samples
`--config` accepts a path to a CSV file that **must** follow the format of one sample
per line: UUID,Normal_URL,Tumor_URL

# Advanced: Running the Pipeline on a Distributed Cloud Cluster (using Mesos)
From the BD2KGenomics toil-scripts GitHub repository, download the following files, which will run on the head node.

1. toil-scripts/exome_variant_pipeline/exome_variant_pipeline.py
2. toil-scripts/exome_variant_pipeline/launch_variant_hg19_mesos.sh
3. toil-scripts/exome_variant_pipeline/exome_variant_config.csv

It is outside the scope of this guide to explain how to setup a distributed cloud cluster. I recommend taking a
look at the BD2KGenomics tool: [CGCloud](https://github.com/BD2KGenomics/cgcloud), which can setup a distributed
cloud cluster using the Mesos batch system in AWS. Please do not direct questions related to CGCloud or
setting up a distributed cluster to the author of this pipeline.

A launch script (`launch_variant_hg19_mesos.sh`) has been prepared that will run on the head node of the Mesos cluster, scheduling jobs to the worker
nodes that exist within the cluster.

Explanation of additional parameters

| Parameter | Function |
|---------------|--------------------------------------------------------------------------------------------------------------------------|
| 1st argument | This now points to an AWS jobStore |
| `--batchSystem` | Specifies the batch system to use (e.g. `mesos`) |
| `--masterIP` | IP address (and port) of the Mesos master; boilerplate for Mesos runs |
| `--sseKey` | OPTIONAL: Encrypts intermediate files when using cloud jobStore. |

## Inputs

The CGL exome pipeline requires input files in order to run. These files are hosted on Synapse and can
be downloaded after creating an account, which takes about a minute and is free.

* Register for a [Synapse account](https://www.synapse.org/#!RegisterAccount:0)
* Either download the samples from the [website GUI](https://www.synapse.org/#!Synapse:syn5886029) or use the Python API (consolidated in the sketch after this list)
* `pip install synapseclient`
* `python`
* `import synapseclient`
* `syn = synapseclient.Synapse()`
* `syn.login('foo@bar.com', 'password')`
* Get the Reference Genome (3 G)
* `syn.get('syn6128232', downloadLocation='.')`
* Get the Phase VCF (0.3 G)
* `syn.get('syn6128233', downloadLocation='.')`
* Get the Mills VCF (0.1 G)
* `syn.get('syn6128236', downloadLocation='.')`
* Get the DBSNP VCF (10 G)
* `syn.get('syn6128237', downloadLocation='.')`
* Get the Cosmic VCF (0.01 G)
* `syn.get('syn6128235', downloadLocation='.')`
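
The individual `syn.get` calls above can be consolidated into one short script. A sketch using the same Synapse IDs; the credentials are placeholders for your own account:

```python
import synapseclient

# Consolidates the individual download steps above.
# Credentials are placeholders; use your own Synapse account.
syn = synapseclient.Synapse()
syn.login('foo@bar.com', 'password')

inputs = [('syn6128232', 'Reference Genome (3 G)'),
          ('syn6128233', 'Phase VCF (0.3 G)'),
          ('syn6128236', 'Mills VCF (0.1 G)'),
          ('syn6128237', 'DBSNP VCF (10 G)'),
          ('syn6128235', 'Cosmic VCF (0.01 G)')]

for synapse_id, description in inputs:
    print('Downloading ' + description)
    syn.get(synapse_id, downloadLocation='.')
```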


## General Usage

## Additional Information
Launch scripts are provided for BAMs that have been aligned to b37, hg19, and hg38.
1. Type `toil-exome generate` to create an editable manifest and config in the current working directory.
2. Parameterize the pipeline by editing the config.
3. Fill in the manifest with information pertaining to your samples.
4. Type `toil-exome run [jobStore]` to execute the pipeline.

## Example Commands

Run sample(s) locally using the manifest
1. `toil-exome generate`
2. Fill in config and manifest
3. `toil-exome run ./example-jobstore`

Toil options can be appended to `toil-exome run`, for example:
`toil-exome run ./example-jobstore --retryCount=1 --workDir=/data`

For a complete list of Toil options, just type `toil-exome run -h`

Run a variety of samples locally
1. `toil-exome generate-config`
2. Fill in config
3. `toil-exome run ./example-jobstore --retryCount=1 --workDir=/data --samples \
s3://example-bucket/sample_1.tar file:///full/path/to/sample_2.tar https://sample-depot.com/sample_3.tar`

## Example Config

HG19
```
reference: s3://cgl-pipeline-inputs/variant_hg19/hg19.fa
phase: s3://cgl-pipeline-inputs/variant_hg19/1000G_phase1.indels.hg19.sites.vcf
mills: s3://cgl-pipeline-inputs/variant_hg19/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf
dbsnp: s3://cgl-pipeline-inputs/variant_hg19/dbsnp_138.hg19.vcf
cosmic: s3://cgl-pipeline-inputs/variant_hg19/cosmic.hg19.vcf
run-mutect: true
run-pindel: true
run-muse: true
preprocessing: true
output-dir: /data/my-toil-run
s3-dir: s3://my-bucket/test/exome
ssec:
gtkey:
ci-test:
```

B37
```
reference: https://s3-us-west-2.amazonaws.com/cgl-pipeline-inputs/variant_b37/Homo_sapiens_assembly19.fasta
phase: https://s3-us-west-2.amazonaws.com/cgl-pipeline-inputs/variant_b37/1000G_phase1.indels.hg19.sites.fixed.vcf
mills: https://s3-us-west-2.amazonaws.com/cgl-pipeline-inputs/variant_b37/Mills_and_1000G_gold_standard.indels.hg19.sites.fixed.vcf
dbsnp: https://s3-us-west-2.amazonaws.com/cgl-pipeline-inputs/variant_b37/dbsnp_132_b37.leftAligned.vcf
cosmic: https://s3-us-west-2.amazonaws.com/cgl-pipeline-inputs/variant_b37/b37_cosmic_v54_120711.vcf
run-mutect: true
run-pindel: true
run-muse: true
preprocessing: true
output-dir:
s3-dir:
ssec:
gtkey:
ci-test:
```
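
This commit adds `pyyaml==3.11` to `install_requires` (see the setup.py hunk above), so a config like the examples can be loaded and sanity-checked with PyYAML. A minimal sketch, not the pipeline's actual validation logic; the filename and required-key list are assumptions:

```python
import yaml

# Illustrative only: load a config like the examples above and check
# that the required reference inputs are present.
REQUIRED = ['reference', 'phase', 'mills', 'dbsnp', 'cosmic']

with open('config-toil-exome.yaml') as f:  # hypothetical filename
    config = yaml.safe_load(f)

missing = [key for key in REQUIRED if not config.get(key)]
if missing:
    raise ValueError('Config missing required entries: ' + ', '.join(missing))
```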

## Distributed Run

To run on a distributed AWS cluster, see [CGCloud](https://github.com/BD2KGenomics/cgcloud) for instance provisioning,
then run `toil-exome run aws:us-west-2:example-jobstore-bucket --batchSystem=mesos --mesosMaster mesos-master:5050`
to use the AWS job store and Mesos batch system.
