diff --git a/Dockerfile b/Dockerfile
index 29626ec..bcc4606 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -69,11 +69,13 @@ ENV PATH /opt/gradle/bin:${PATH}
COPY bin /tmp/xenocp/bin
COPY src /tmp/xenocp/src
COPY dependencies /tmp/xenocp/dependencies
+COPY gradle /tmp/xenocp/gradle
+COPY gradlew /tmp/xenocp/gradlew
COPY build.gradle /tmp/xenocp/build.gradle
COPY settings.gradle /tmp/xenocp/settings.gradle
RUN cd /tmp/xenocp \
- && gradle installDist \
+ && ./gradlew installDist \
&& cp -r build/install/xenocp /opt
FROM ubuntu:20.04
diff --git a/README.md b/README.md
index 94b3c11..6c28d83 100755
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ XenoCP can be easily incorporated into any workflow, as it takes a BAM file
as input and efficiently cleans up the mouse contamination. The output is a clean
human BAM file that could be used for downstream genomic analysis.
-## Getting started
+## Quick Start
XenoCP can be run in the cloud on DNAnexus at
https://platform.dnanexus.com/app/stjude_xenocp
@@ -39,7 +39,38 @@ XenoCP workflow:
-## Prerequisites
+## Reference Files
+
+XenoCP performs mapping against the host genome, so it requires indexes for the
+host reference genome and mapper being used.
+
+A common use case is cleansing DNA reads with a mouse host. For this use case,
+you can download a BWA index for MGSCv37 from
+http://ftp.stjude.org/pub/software/xenocp/reference/MGSCv37
+
+To build your own reference files, first download the FASTA file for your genome
+assembly. Then, create the index for your mapper:
+
+### BWA for DNA Reads
+
+```
+$ bwa index -p $FASTA $FASTA
+```
+
+### STAR for RNA Reads
+
+Download an annotation file such as gencode, and then run:
+
+```
+$ STAR --runMode genomeGenerate --genomeDir STAR --genomeFastaFiles $FASTA --sjdbGTFfile $ANNOTATION --sjdbOverhang 125
+```
+
+## Local Usage without Docker
+
+### Prerequisites
+
+First, install the following prerequisites. Note that if you are only using one
+of the two mappers, bwa and STAR, you can omit the other.
* [bwa] =0.7.13
* [STAR] =2.7.1a
@@ -73,28 +104,25 @@ disabled.
[zlib]: https://www.zlib.net/
[sambamba]: http://lomereiter.github.io/sambamba/
+### Obtain and Build XenoCP
-
-## Local usage
-
-
-### Obtain XenoCP
-
-Clone XenoCP from GitHub:
+Clone XenoCP from GitHub:
```
git clone https://github.com/stjude/XenoCP.git
```
-### Build XenoCP
-
-Once the prerequisites are satisfied, build XenoCP using Gradle.
+Build XenoCP using Gradle:
```
$ gradle installDist
```
-Add the artifacts under `build/install/xenocp/lib` to your Java `CLASSPATH`.
-Add the artifacts under `build/install/xenocp/bin` to your `PATH`.
+Add the artifacts under `build/install/xenocp` to your `PATH` and your Java `CLASSPATH`:
+
+```
+export PATH=$PATH:`pwd`/build/install/xenocp/bin
+export CLASSPATH=$CLASSPATH:`pwd`/build/install/xenocp/lib/*
+```
### Inputs
@@ -134,25 +162,8 @@ output_prefix: xenocp-
output_extension: bam
```
-### Create Reference Files
-
-Download the FASTA file for your genome assembly and run the following commands to create other files:
-#### BWA reference files
-```
-$ bwa index -p $FASTA $FASTA
-```
-#### STAR reference files
-In addition the genomic FASTA, STAR reference should use an annotation file (e.g. gencode).
-```
-$ STAR --runMode genomeGenerate --genomeDir STAR --genomeFastaFiles $FASTA --sjdbGTFfile $ANNOTATION --sjdbOverhang 125
-```
-
[CWL inputs]: https://www.commonwl.org/user_guide/02-1st-example/index.html
-### Download MGSCv37 reference files
-
-Reference files are provided for version MGSCv37 of mouse and are available from http://ftp.stjude.org/pub/software/xenocp/reference/MGSCv37
-
### Run
XenoCP uses [CWL] to describe its workflow.
@@ -162,12 +173,12 @@ Then run the following.
```
$ mkdir results
-$ cwltool --outdir results cwl/xenocp.cwl sample_data/input_data/inputs_local.yml
+$ cwltool --preserve-environment CLASSPATH --no-container --outdir results cwl/xenocp.cwl sample_data/input_data/inputs_local.yml
```
[CWL]: https://www.commonwl.org/
-## Docker
+## Local Usage with Docker
XenoCP provides a [Dockerfile] that builds an image with all the included
dependencies. To use this image, install [Docker] for your platform.
@@ -216,6 +227,11 @@ $ docker run \
/data/inputs.yml
```
+Note: when running using Singularity on an HPC, problems can arise if the
+default temporary file location, /tmp, is small. To solve this, include
+`-W <dir>` when executing via Singularity to redirect temp files to a
+larger directory `<dir>`.
+
[Dockerfile]: ./Dockerfile
## Evaluate test data results
diff --git a/bin/java.sh b/bin/java.sh
index 769e542..095d2b0 100755
--- a/bin/java.sh
+++ b/bin/java.sh
@@ -1,5 +1,10 @@
#!/usr/bin/env bash
+# If the classpath is already set, then delegate directly to java
+if [ "$CLASSPATH" != "" ]; then exec java "$@"; fi
+
+# Otherwise, build an appropriate classpath
+# This section assumes you are running inside the container
for arg in "$@"; do
case $arg in
org.stjude.compbio.*)
diff --git a/bin/picard b/bin/picard
index fea6036..460346a 100755
--- a/bin/picard
+++ b/bin/picard
@@ -1,2 +1,8 @@
#!/usr/bin/env bash
+
+# If the classpath is already set, then invoke PicardCommandLine, allowing
+# java to use the classpath.
+if [ "$CLASSPATH" != "" ]; then exec java picard.cmdline.PicardCommandLine "$@"; fi
+
+# Otherwise, use the full picard jar; this assumes you're running in the container.
java -jar /opt/picard/lib/picard*.jar "$@"
diff --git a/build.gradle b/build.gradle
index 2339503..d3a2359 100644
--- a/build.gradle
+++ b/build.gradle
@@ -20,24 +20,24 @@ distributions {
// Puts the compiled xenocp jar into lib
from jar
// Puts all of the compile dependencies into lib
- from configurations.compile
+ from configurations.compileClasspath
}
}
}
}
repositories {
- jcenter()
+ mavenCentral()
}
dependencies {
- compile 'com.github.broadinstitute:picard:2.6.0'
- compile 'commons-cli:commons-cli:1.2'
- compile 'com.github.samtools:htsjdk:2.19.0'
+ implementation 'com.github.broadinstitute:picard:2.19.2'
+ implementation 'commons-cli:commons-cli:1.2'
+ implementation 'com.github.samtools:htsjdk:2.19.0'
- compile files('dependencies/lib/java/xenocp-dependencies.jar')
+ implementation files('dependencies/lib/java/xenocp-dependencies.jar')
- testCompile 'junit:junit:4.10'
+ testImplementation 'junit:junit:4.10'
}
group = 'org.stjude.compbio.xenocp'
diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar
new file mode 100644
index 0000000..7454180
Binary files /dev/null and b/gradle/wrapper/gradle-wrapper.jar differ
diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties
new file mode 100644
index 0000000..ffed3a2
--- /dev/null
+++ b/gradle/wrapper/gradle-wrapper.properties
@@ -0,0 +1,5 @@
+distributionBase=GRADLE_USER_HOME
+distributionPath=wrapper/dists
+distributionUrl=https\://services.gradle.org/distributions/gradle-7.2-bin.zip
+zipStoreBase=GRADLE_USER_HOME
+zipStorePath=wrapper/dists
diff --git a/gradlew b/gradlew
new file mode 100755
index 0000000..1b6c787
--- /dev/null
+++ b/gradlew
@@ -0,0 +1,234 @@
+#!/bin/sh
+
+#
+# Copyright © 2015-2021 the original authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+##############################################################################
+#
+# Gradle start up script for POSIX generated by Gradle.
+#
+# Important for running:
+#
+# (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is
+# noncompliant, but you have some other compliant shell such as ksh or
+# bash, then to run this script, type that shell name before the whole
+# command line, like:
+#
+# ksh Gradle
+#
+# Busybox and similar reduced shells will NOT work, because this script
+# requires all of these POSIX shell features:
+# * functions;
+# * expansions «$var», «${var}», «${var:-default}», «${var+SET}»,
+# «${var#prefix}», «${var%suffix}», and «$( cmd )»;
+# * compound commands having a testable exit status, especially «case»;
+# * various built-in commands including «command», «set», and «ulimit».
+#
+# Important for patching:
+#
+# (2) This script targets any POSIX shell, so it avoids extensions provided
+# by Bash, Ksh, etc; in particular arrays are avoided.
+#
+# The "traditional" practice of packing multiple parameters into a
+# space-separated string is a well documented source of bugs and security
+# problems, so this is (mostly) avoided, by progressively accumulating
+# options in "$@", and eventually passing that to Java.
+#
+# Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS,
+# and GRADLE_OPTS) rely on word-splitting, this is performed explicitly;
+# see the in-line comments for details.
+#
+# There are tweaks for specific operating systems such as AIX, CygWin,
+# Darwin, MinGW, and NonStop.
+#
+# (3) This script is generated from the Groovy template
+# https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
+# within the Gradle project.
+#
+# You can find Gradle at https://github.com/gradle/gradle/.
+#
+##############################################################################
+
+# Attempt to set APP_HOME
+
+# Resolve links: $0 may be a link
+app_path=$0
+
+# Need this for daisy-chained symlinks.
+while
+ APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path
+ [ -h "$app_path" ]
+do
+ ls=$( ls -ld "$app_path" )
+ link=${ls#*' -> '}
+ case $link in #(
+ /*) app_path=$link ;; #(
+ *) app_path=$APP_HOME$link ;;
+ esac
+done
+
+APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit
+
+APP_NAME="Gradle"
+APP_BASE_NAME=${0##*/}
+
+# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
+
+# Use the maximum available, or set MAX_FD != -1 to use that value.
+MAX_FD=maximum
+
+warn () {
+ echo "$*"
+} >&2
+
+die () {
+ echo
+ echo "$*"
+ echo
+ exit 1
+} >&2
+
+# OS specific support (must be 'true' or 'false').
+cygwin=false
+msys=false
+darwin=false
+nonstop=false
+case "$( uname )" in #(
+ CYGWIN* ) cygwin=true ;; #(
+ Darwin* ) darwin=true ;; #(
+ MSYS* | MINGW* ) msys=true ;; #(
+ NONSTOP* ) nonstop=true ;;
+esac
+
+CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
+
+
+# Determine the Java command to use to start the JVM.
+if [ -n "$JAVA_HOME" ] ; then
+ if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
+ # IBM's JDK on AIX uses strange locations for the executables
+ JAVACMD=$JAVA_HOME/jre/sh/java
+ else
+ JAVACMD=$JAVA_HOME/bin/java
+ fi
+ if [ ! -x "$JAVACMD" ] ; then
+ die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+ fi
+else
+ JAVACMD=java
+ which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+fi
+
+# Increase the maximum file descriptors if we can.
+if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then
+ case $MAX_FD in #(
+ max*)
+ MAX_FD=$( ulimit -H -n ) ||
+ warn "Could not query maximum file descriptor limit"
+ esac
+ case $MAX_FD in #(
+ '' | soft) :;; #(
+ *)
+ ulimit -n "$MAX_FD" ||
+ warn "Could not set maximum file descriptor limit to $MAX_FD"
+ esac
+fi
+
+# Collect all arguments for the java command, stacking in reverse order:
+# * args from the command line
+# * the main class name
+# * -classpath
+# * -D...appname settings
+# * --module-path (only if needed)
+# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables.
+
+# For Cygwin or MSYS, switch paths to Windows format before running java
+if "$cygwin" || "$msys" ; then
+ APP_HOME=$( cygpath --path --mixed "$APP_HOME" )
+ CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" )
+
+ JAVACMD=$( cygpath --unix "$JAVACMD" )
+
+ # Now convert the arguments - kludge to limit ourselves to /bin/sh
+ for arg do
+ if
+ case $arg in #(
+ -*) false ;; # don't mess with options #(
+ /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath
+ [ -e "$t" ] ;; #(
+ *) false ;;
+ esac
+ then
+ arg=$( cygpath --path --ignore --mixed "$arg" )
+ fi
+ # Roll the args list around exactly as many times as the number of
+ # args, so each arg winds up back in the position where it started, but
+ # possibly modified.
+ #
+ # NB: a `for` loop captures its iteration list before it begins, so
+ # changing the positional parameters here affects neither the number of
+ # iterations, nor the values presented in `arg`.
+ shift # remove old arg
+ set -- "$@" "$arg" # push replacement arg
+ done
+fi
+
+# Collect all arguments for the java command;
+# * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of
+# shell script including quotes and variable substitutions, so put them in
+# double quotes to make sure that they get re-expanded; and
+# * put everything else in single quotes, so that it's not re-expanded.
+
+set -- \
+ "-Dorg.gradle.appname=$APP_BASE_NAME" \
+ -classpath "$CLASSPATH" \
+ org.gradle.wrapper.GradleWrapperMain \
+ "$@"
+
+# Use "xargs" to parse quoted args.
+#
+# With -n1 it outputs one arg per line, with the quotes and backslashes removed.
+#
+# In Bash we could simply go:
+#
+# readarray ARGS < <( xargs -n1 <<<"$var" ) &&
+# set -- "${ARGS[@]}" "$@"
+#
+# but POSIX shell has neither arrays nor command substitution, so instead we
+# post-process each arg (as a line of input to sed) to backslash-escape any
+# character that might be a shell metacharacter, then use eval to reverse
+# that process (while maintaining the separation between arguments), and wrap
+# the whole thing up as a single "set" statement.
+#
+# This will of course break if any of these variables contains a newline or
+# an unmatched quote.
+#
+
+eval "set -- $(
+ printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" |
+ xargs -n1 |
+ sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' |
+ tr '\n' ' '
+ )" '"$@"'
+
+exec "$JAVACMD" "$@"
diff --git a/gradlew.bat b/gradlew.bat
new file mode 100644
index 0000000..ac1b06f
--- /dev/null
+++ b/gradlew.bat
@@ -0,0 +1,89 @@
+@rem
+@rem Copyright 2015 the original author or authors.
+@rem
+@rem Licensed under the Apache License, Version 2.0 (the "License");
+@rem you may not use this file except in compliance with the License.
+@rem You may obtain a copy of the License at
+@rem
+@rem https://www.apache.org/licenses/LICENSE-2.0
+@rem
+@rem Unless required by applicable law or agreed to in writing, software
+@rem distributed under the License is distributed on an "AS IS" BASIS,
+@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@rem See the License for the specific language governing permissions and
+@rem limitations under the License.
+@rem
+
+@if "%DEBUG%" == "" @echo off
+@rem ##########################################################################
+@rem
+@rem Gradle startup script for Windows
+@rem
+@rem ##########################################################################
+
+@rem Set local scope for the variables with windows NT shell
+if "%OS%"=="Windows_NT" setlocal
+
+set DIRNAME=%~dp0
+if "%DIRNAME%" == "" set DIRNAME=.
+set APP_BASE_NAME=%~n0
+set APP_HOME=%DIRNAME%
+
+@rem Resolve any "." and ".." in APP_HOME to make it shorter.
+for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
+
+@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
+
+@rem Find java.exe
+if defined JAVA_HOME goto findJavaFromJavaHome
+
+set JAVA_EXE=java.exe
+%JAVA_EXE% -version >NUL 2>&1
+if "%ERRORLEVEL%" == "0" goto execute
+
+echo.
+echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:findJavaFromJavaHome
+set JAVA_HOME=%JAVA_HOME:"=%
+set JAVA_EXE=%JAVA_HOME%/bin/java.exe
+
+if exist "%JAVA_EXE%" goto execute
+
+echo.
+echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:execute
+@rem Setup the command line
+
+set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
+
+
+@rem Execute Gradle
+"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
+
+:end
+@rem End local scope for the variables with windows NT shell
+if "%ERRORLEVEL%"=="0" goto mainEnd
+
+:fail
+rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
+rem the _cmd.exe /c_ return code!
+if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
+exit /b 1
+
+:mainEnd
+if "%OS%"=="Windows_NT" endlocal
+
+:omega