snakemake-workflows · johanneskoester · May 12, 2023 · Jun 2, 2022 · Jun 22, 2022 · Jun 22, 2022
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -1,3 +1,33 @@
+#name: My build action requiring more space
+#on: push
+#
+#jobs:
+#  build:
+#    name: Build my artifact
+#    runs-on: ubuntu-latest
+#    steps:
+#      - name: Maximize build space
+#        uses: easimon/maximize-build-space@master
+#        with:
+#          root-reserve-mb: 512
+#          swap-size-mb: 1024
+#          remove-dotnet: 'true'
+#          remove-android: 'true'
+#          remove-haskell: 'true'
+#      - name: Checkout
+#        uses: actions/checkout@v2
+#
+#      - name: Build
+#        run: |
+#          echo "Free space:"
+#          df -h
+#          echo "free space in ${{ github.workspace }}"
+#          du -hs $(ls -A) ${{ github.workspace }}/*
+#          rm -rf ${{ github.workspace }}/*
+#          echo "free space in .test"
+#          du -hs $(ls -A) .test/*
+
+
 name: Tests
 
 on:
@@ -30,7 +60,7 @@ jobs:
         snakefile: workflow/Snakefile
         args: "--lint"
 
-  run-workflow:
+  run-rna-workflow:
     runs-on: ubuntu-latest
     needs:
       - linting
@@ -47,8 +77,26 @@ jobs:
       with:
         directory: .test
         snakefile: workflow/Snakefile
-        args: "--use-conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache --all-temp"
+        args: "--use-conda --show-failed-logs --cores 1 --conda-cleanup-pkgs cache --all-temp"
+
+  run-3prime-rna-workflow:
+    runs-on: ubuntu-latest
+    needs:
+      - linting
+      - formatting
+    steps:
 
+    - name: Checkout repository
+      uses: actions/checkout@v2
+      with:
+        submodules: recursive
+
+    - name: Test 3-prime-workflow
+      uses: snakemake/snakemake-github-action@v1.23.0
+      with:
+        directory: .test/3-prime-config
+        snakefile: workflow/Snakefile
+        args: "--use-conda --show-failed-logs --cores 1 --conda-cleanup-pkgs cache --all-temp"
     # Disable report testing for now since we mark all output files as temporary above.
     # TODO: add some kind of test mode to report generation which does not really try to include
     # results.

diff --git a/.test/3-prime-config/config/config.yaml b/.test/3-prime-config/config/config.yaml
@@ -0,0 +1,138 @@
+samples: config/samples.tsv
+units: config/units.tsv
+
+experiment:
+  # If set to `true`, this option allows the workflow to analyse 3-prime RNA-seq data obtained with the QuantSeq protocol by Lexogen.
+  # For more information, see https://www.lexogen.com/quantseq-3mrna-sequencing/
+  3-prime-rna-seq:
+    activate: true
+    # plot-qc: plot QC of aligned read positions for specific transcripts (or 'all' transcripts).
+    # vendor: specify the vendor of the protocol used. Currently, only lexogen is supported.
+    vendor: lexogen
+    plot-qc: all
+
+
+
+resources:
+  ref:
+    # ensembl species name
+    species: homo_sapiens
+    # ensembl release version
+    release: "104"
+    # genome build
+    build: GRCh38
+    # pfam release to use for annotation of domains in differential splicing analysis
+    pfam: "33.0"
+    representative_transcripts: canonical
+  ontology:
+    # gene ontology to download, used e.g. in goatools
+    gene_ontology: "http://current.geneontology.org/ontology/go-basic.obo"
+
+pca:
+  labels:
+    # columns of sample sheet to use for PCA
+    - condition
+
+scatter:
+  # for use as diagnostic plots
+  # all samples are compared in pairs to assess their correlation
+  # scatter plots are only created if parameter 'activate' is set to 'true'
+  activate: true
+
+diffexp:
+  # samples to exclude (e.g. outliers due to technical problems)
+  exclude:
+  # model for sleuth differential expression analysis
+  models:
+    model_X:
+      full: ~condition + batch_effect
+      reduced: ~batch_effect
+      # Binary valued covariate that shall be used for fold change/effect size
+      # based downstream analyses.
+      primary_variable: condition
+      base_level: untreated
+  # significance level to use for volcano, ma- and qq-plots
+  sig-level:
+    volcano-plot: 0.05
+    ma-plot: 0.05
+    qq-plot: 0.05
+  # Optional (set `activate: true` to use): provide a list of genes that shall be shown in a heatmap
+  # and for which bootstrap plots (see below) shall be created.
+  genes_of_interest:
+    activate: false
+    genelist: "resources/gene_list.tsv"
+
+diffsplice:
+  activate: true
+  # codingCutoff parameter of isoformSwitchAnalyzer, see
+  # https://rdrr.io/bioc/IsoformSwitchAnalyzeR/man/analyzeCPAT.html
+  coding_cutoff: 0.725
+  # Should be set to true when using de-novo assembled transcripts.
+  remove_noncoding_orfs: false
+  # False discovery rate to control for.
+  fdr: 1.0
+  # Minimum size of differential isoform usage effect
+  # (see dIFcutoff, https://rdrr.io/github/kvittingseerup/IsoformSwitchAnalyzeR/man/IsoformSwitchTestDEXSeq.html)
+  min_effect_size: 0.0
+
+enrichment:
+  goatools:
+    # tool is only run if set to `true`
+    activate: true
+    fdr_genes: 0.05
+    fdr_go_terms: 0.05
+  fgsea:
+    gene_sets_file: "../ngs-test-data/ref/dummy.gmt"
+    # tool is only run if set to `true`
+    activate: true
+    # if activated, `gene_sets_file` (above) needs to point to a GMT file with gene sets of interest
+    fdr_gene_set: 0.05
+    eps: 0.0001
+  spia:
+    # tool is only run if set to `true`
+    activate: true
+    # pathway database to use in SPIA, needs to be available for
+    # the species specified by resources -> ref -> species above
+    pathway_database: "panther"
+
+bootstrap_plots:
+  # desired false discovery rate for bootstrap plots, i.e. a lower FDR will result in fewer boxplots generated
+  FDR: 0.01
+  # maximum number of bootstrap plots to generate, i.e. top n discoveries to plot
+  top_n: 3
+  color_by: condition
+  # for now, this will plot the sleuth-normalised kallisto count estimations with kallisto
+  # for all the transcripts of the respective genes
+
+plot_vars:
+  # significance level used for plot_vars() plots
+  sig_level: 0.1
+
+params:
+  kallisto: "-b 100"
+  # these cutadapt parameters need to contain the required flag(s) for
+  # the type of adapter(s) to trim, i.e.:
+  # * https://cutadapt.readthedocs.io/en/stable/guide.html#adapter-types
+  #   * `-a` for 3' adapter in the forward reads
+  #   * `-g` for 5' adapter in the forward reads
+  #   * `-b` for adapters anywhere in the forward reads
+  # also, separate capitalised letter flags are required for adapters in
+  # the reverse reads of paired end sequencing:
+  # * https://cutadapt.readthedocs.io/en/stable/guide.html#trimming-paired-end-reads
+  cutadapt-se:
+    adapters: "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
+    extra: "-q 20"
+  # reasoning behind parameters:
+  #   For reads that are produced by 3’-end sequencing, depending on the protocol, it might be recommended to remove some leading bases (e.g. see https://www.nature.com/articles/s41598-019-55434-x#Sec10)
+  #   * `--minimum-length 33`:
+  #     * kallisto needs non-empty reads in current versions (fixed for future releases:
+  #       https://github.com/pachterlab/kallisto/commit/64fe837ca86f3664496483bcd2787c9376584fed)
+  #     * kallisto default k-mer length is 31 and 33 should give at least 3 k-mers for a read
+  #   * `-e 0.005`: the default cutadapt maximum error rate of `0.2` is far too high, for Illumina
+  #     data the error rate is more in the range of `0.005` and setting it accordingly should avoid
+  #     false positive adapter matches
+  #   * `--minimum-overlap 7`: the cutadapt default minimum overlap of `5` did trimming on the level
+  #     of expected adapter matches by chance
+  cutadapt-pe: 
+    adapters: "-a ACGGATCGATCGATCGATCGAT -g GGATCGATCGATCGATCGAT -A ACGGATCGATCGATCGATCGAT -G GGATCGATCGATCGATCGAT"
+    extra: "--minimum-length 33 -e 0.005 --overlap 7"
diff --git a/.test/3-prime-config/config/samples.tsv b/.test/3-prime-config/config/samples.tsv
@@ -0,0 +1,5 @@
+sample	condition	batch_effect
+A	treated	batch1
+B	untreated	batch1
+C	treated	batch2
+D	untreated	batch2
diff --git a/.test/3-prime-config/config/units.tsv b/.test/3-prime-config/config/units.tsv
@@ -0,0 +1,6 @@
+sample	unit	fragment_len_mean	fragment_len_sd	fq1	fq2
+A	1	300	14	../ngs-test-data/reads/a.chr21.2.fq
+B	1	300	14	../ngs-test-data/reads/b.chr21.1.fq
+B	2	300	14	../ngs-test-data/reads/b.chr21.2.fq
+C	1	300	14	../ngs-test-data/reads/a.chr21.2.fq
+D	1	300	14	../ngs-test-data/reads/b.chr21.2.fq
diff --git a/.test/3-prime-config/workflow/Snakefile b/.test/3-prime-config/workflow/Snakefile
@@ -0,0 +1,33 @@
+from snakemake.utils import min_version
+
+min_version("7.17.0")
+
+
+configfile: "config/config.yaml"
+
+
+report: "report/workflow.rst"
+
+
+# this container defines the underlying OS for each job when using the workflow
+# with --use-conda --use-singularity
+container: "docker://continuumio/miniconda3"
+
+
+include: "rules/common.smk"
+include: "rules/trim.smk"
+include: "rules/trim_3prime.smk"
+include: "rules/qc_3prime.smk"
+include: "rules/ref.smk"
+include: "rules/ref_3prime.smk"
+include: "rules/quant.smk"
+include: "rules/quant_3prime.smk"
+include: "rules/diffexp.smk"
+include: "rules/diffsplice.smk"
+include: "rules/enrichment.smk"
+include: "rules/datavzrd.smk"
+
+
+rule all:
+    input:
+        all_input,
diff --git a/.test/config/config.yaml b/.test/config/config.yaml
@@ -1,6 +1,16 @@
 samples: config/samples.tsv
 units: config/units.tsv
 
+experiment:
+  # If set to `true`, this option allows the workflow to analyse 3-prime RNA-seq data obtained with the QuantSeq protocol by Lexogen.
+  # For more information, see https://www.lexogen.com/quantseq-3mrna-sequencing/
+  3-prime-rna-seq:
+    activate: false
+    # Specify the vendor of the protocol used. Currently, only lexogen is supported.
+    vendor: lexogen
+    # This allows plotting QC of aligned read positions for specific transcripts (or 'all' transcripts).
+    plot-qc: all
+
 resources:
   ref:
     # ensembl species name
@@ -44,6 +54,11 @@ diffexp:
     volcano-plot: 0.05
     ma-plot: 0.05
     qq-plot: 0.05
+  # Optional (set `activate: true` to use): provide a list of genes that shall be shown in a heatmap
+  # and for which bootstrap plots (see below) shall be created.
+  genes_of_interest:
+    activate: true
+    genelist: "resources/gene_list.tsv"
 
 diffsplice:
   activate: true
@@ -86,8 +101,6 @@ bootstrap_plots:
   color_by: condition
   # for now, this will plot the sleuth-normalised kallisto count estimations with kallisto
   # for all the transcripts of the respective genes
-  genes_of_interest:
-    - A4galt
 
 plot_vars:
   # significance level used for plot_vars() plots
@@ -108,6 +121,7 @@ params:
     adapters: "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
     extra: "-q 20"
   # reasoning behind parameters:
+  #   For reads that are produced by 3’-end sequencing, depending on the protocol, it might be recommended to remove some leading bases (e.g. see https://www.nature.com/articles/s41598-019-55434-x#Sec10)
   #   * `--minimum-length 33`:
   #     * kallisto needs non-empty reads in current versions (fixed for future releases:
   #       https://github.com/pachterlab/kallisto/commit/64fe837ca86f3664496483bcd2787c9376584fed)
@@ -119,4 +133,4 @@ params:
   #     of expected adapter matches by chance
   cutadapt-pe: 
     adapters: "-a ACGGATCGATCGATCGATCGAT -g GGATCGATCGATCGATCGAT -A ACGGATCGATCGATCGATCGAT -G GGATCGATCGATCGATCGAT"
-    extra: "--minimum-length 33 -e 0.005 --overlap 7"
+    extra: "--minimum-length 33 -e 0.005 --overlap 7"
diff --git a/.test/resources/gene_list.tsv b/.test/resources/gene_list.tsv
@@ -0,0 +1,18 @@
+STAT1
+IRF1
+HLA-A
+HLA-DRB1
+TYR
+PMEL
+DCT
+MLANA
+MITF
+CDK2
+SOX10
+ERBB3
+LEF1
+CTNNB1
+CDH1
+FN1
+NGFR
+AXL
diff --git a/config/config.yaml b/config/config.yaml
@@ -1,6 +1,18 @@
 samples: config/samples.tsv
 units: config/units.tsv
 
+experiment:
+  # If set to `true`, this option allows the workflow to analyse 3-prime RNA-seq data obtained with the QuantSeq protocol by Lexogen.
+  # For more information, see https://www.lexogen.com/quantseq-3mrna-sequencing/
+  3-prime-rna-seq:
+    activate: false
+    # Specify vendor of the used protocol. Currently, only lexogen is supported.
+    vendor: lexogen
+    # This allows plotting QC of aligned read positions for specific transcripts (or 'all' transcripts).
+    plot-qc: all
+
+
+
 resources:
   ref:
     # ensembl species name
@@ -52,6 +64,11 @@ diffexp:
     volcano-plot: 0.05
     ma-plot: 0.05
     qq-plot: 0.05
+  # Optional (set `activate: true` to use): provide a list of genes that shall be shown in a heatmap
+  # and for which bootstrap plots (see below) shall be created.
+  genes_of_interest:
+    activate: false
+    genelist: "resources/gene_list.tsv"
 
 diffsplice:
   activate: true
@@ -95,14 +112,14 @@ bootstrap_plots:
   color_by: condition
   # for now, this will plot the sleuth-normalised kallisto count estimations with kallisto
   # for all the transcripts of the respective genes
-  genes_of_interest:
-    - A4galt
 
 plot_vars:
   # significance level used for plot_vars() plots
   sig_level: 0.1
 
 params:
+  # For reads that are produced by 3’-end sequencing, kallisto's `--single-overhang` option keeps (i.e. does not
+  # discard) reads where the expected fragment size goes beyond the transcript start.
   kallisto: "-b 100"
   # these cutadapt parameters need to contain the required flag(s) for
   # the type of adapter(s) to trim, i.e.:
@@ -113,10 +130,12 @@ params:
   # also, separate capitalised letter flags are required for adapters in
   # the reverse reads of paired end sequencing:
   # * https://cutadapt.readthedocs.io/en/stable/guide.html#trimming-paired-end-reads
+
   cutadapt-se:
     adapters: "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
     extra: "-q 20"
   # reasoning behind parameters:
+  #   For reads that are produced by 3’-end sequencing, depending on the protocol, it might be recommended to remove some leading bases (e.g. see https://www.nature.com/articles/s41598-019-55434-x#Sec10)
   #   * `--minimum-length 33`:
   #     * kallisto needs non-empty reads in current versions (fixed for future releases:
   #       https://github.com/pachterlab/kallisto/commit/64fe837ca86f3664496483bcd2787c9376584fed)