Skip to content

Commit

Permalink
Merge pull request #23 from NBISweden/dev
Browse files Browse the repository at this point in the history
Merge changes from dev into main for version 0.4.2
  • Loading branch information
verku authored Sep 5, 2022
2 parents 6a4639a + 1fe7ac1 commit 485b645
Show file tree
Hide file tree
Showing 24 changed files with 151 additions and 63 deletions.
25 changes: 19 additions & 6 deletions .github/workflows/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,46 +23,59 @@ jobs:
steps:
- uses: actions/checkout@v2

- uses: eWaterCycle/setup-singularity@v5
- uses: eWaterCycle/setup-singularity@v7
with:
singularity-version: 3.6.4
singularity-version: 3.8.3

- uses: s-weigand/setup-conda@v1
- uses: conda-incubator/setup-miniconda@v2
with:
update-conda: true
activate-conda: true
mamba-version: "*"
activate-environment: generode
environment-file: environment.yml
auto-activate-base: false

- name: conda_environment
shell: bash -l {0}
run: |
conda env update -n base -f environment.yml
conda info
conda list
- name: mitogenome_mapping_dry
shell: bash -l {0}
run: |
snakemake -npr --configfile .test/config/config_mitogenomes.yaml -j 4 --cores 1 --use-singularity
- name: mitogenome_mapping
shell: bash -l {0}
run: |
snakemake --configfile .test/config/config_mitogenomes.yaml -j 4 --cores 1 --use-singularity
- name: mlRho_options_dry
shell: bash -l {0}
run: |
snakemake -npr --configfile .test/config/config_mlRho_options.yaml -j 4 --cores 1 --use-singularity
- name: mlRho_options
shell: bash -l {0}
run: |
snakemake --configfile .test/config/config_mlRho_options.yaml -j 4 --cores 1 --use-singularity
- name: pca_roh_dry
shell: bash -l {0}
run: |
snakemake -npr --configfile .test/config/config_pca_roh.yaml -j 4 --cores 1 --use-singularity
- name: pca_roh
shell: bash -l {0}
run: |
snakemake --configfile .test/config/config_pca_roh.yaml -j 4 --cores 1 --use-singularity
- name: snpeff_gerp_dry
shell: bash -l {0}
run: |
snakemake -npr --configfile .test/config/config_snpeff_gerp.yaml -j 4 --cores 1 --use-singularity
- name: snpeff_gerp
shell: bash -l {0}
run: |
snakemake --configfile .test/config/config_snpeff_gerp.yaml -j 4 --cores 1 --use-singularity
21 changes: 17 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,12 +1,25 @@
results/
data/logs/
data/raw_reads_symlinks/
data/mitogenomes/*.fasta.amb
data/mitogenomes/*.fasta.ann
data/mitogenomes/*.fasta.bwt
data/mitogenomes/*.fasta.pac
data/mitogenomes/*.fasta.sa

data/raw_reads_symlinks/
resources/
results/
tmpConsensi.fa
*.html

.cache/
.java/
.snakemake/
.test/data/references/repeat*
.test/data/references/*bed
.test/data/references/*dict
.test/data/references/*amb
.test/data/references/*ann
.test/data/references/*bwt
.test/data/references/*fai
.test/data/references/*pac
.test/data/references/*sa
.test/data/references/*genome
.test/data/references/*upper.fasta
2 changes: 1 addition & 1 deletion .test/config/config_mitogenomes.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#################################################################
#################################################################
# Configuration settings for the GenErode pipeline 0.4.1 #
# Configuration settings for the GenErode pipeline 0.4.2 #
# for ancient or historical samples, and modern samples #
#################################################################
#################################################################
Expand Down
2 changes: 1 addition & 1 deletion .test/config/config_mlRho_options.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#################################################################
#################################################################
# Configuration settings for the GenErode pipeline 0.4.1 #
# Configuration settings for the GenErode pipeline 0.4.2 #
# for ancient or historical samples, and modern samples #
#################################################################
#################################################################
Expand Down
2 changes: 1 addition & 1 deletion .test/config/config_pca_roh.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#################################################################
#################################################################
# Configuration settings for the GenErode pipeline 0.4.1 #
# Configuration settings for the GenErode pipeline 0.4.2 #
# for ancient or historical samples, and modern samples #
#################################################################
#################################################################
Expand Down
2 changes: 1 addition & 1 deletion .test/config/config_snpeff_gerp.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#################################################################
#################################################################
# Configuration settings for the GenErode pipeline 0.4.1 #
# Configuration settings for the GenErode pipeline 0.4.2 #
# for ancient or historical samples, and modern samples #
#################################################################
#################################################################
Expand Down
2 changes: 1 addition & 1 deletion Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# This is the Snakefile of the GenErode pipeline for historical or #
# ancient and modern samples to study patterns of genome erosion #
# #
# Pipeline version 0.4.1 #
# Pipeline version 0.4.2 #
# #
# Written by Verena Kutschera, Marcin Kierczak and Tom van der Valk #
# Email: generode@nbis.se #
Expand Down
6 changes: 4 additions & 2 deletions config/cluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,8 @@ vcf2plink_hwe:
repmasked_bcf2vcf_snpEff:
time: 05:00:00
cpus-per-task: 2
build_snpEff_db:
time: 05:00:00
filter_biallelic_missing_vcf_snpEff:
time: 1-00:00:00
cpus-per-task: 6
Expand Down Expand Up @@ -280,6 +282,6 @@ merge_gerp_alleles_gz:
time: 1-00:00:00
cpus-per-task: 4
relative_mutational_load_per_sample:
time: 10-00:00:00
cpus-per-task: 4
time: 1-00:00:00
cpus-per-task: 2
###
2 changes: 1 addition & 1 deletion config/config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#################################################################
#################################################################
# Configuration settings for the GenErode pipeline 0.4.1 #
# Configuration settings for the GenErode pipeline 0.4.2 #
# for ancient or historical samples, and modern samples #
#################################################################
#################################################################
Expand Down
30 changes: 30 additions & 0 deletions workflow/docker/bedtools-2.29.2/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Multi-stage build of a small bedtools 2.29.2 image that also ships the
# bgzip and tabix executables copied from the GenErode htslib 1.15.1 image.

# Stage 1 (builder): download the prebuilt static bedtools binary.
FROM ubuntu:18.04 AS builder
# Nothing is compiled in this stage - the upstream release asset is a static
# binary - so only wget and the CA certificates needed for HTTPS are installed.
# apt-get is used instead of apt because apt's CLI is not meant for scripts.
RUN apt-get update -y && apt-get install -y \
    ca-certificates \
    wget

WORKDIR /bedtools

RUN wget https://github.com/arq5x/bedtools2/releases/download/v2.29.2/bedtools.static.binary && \
    mv bedtools.static.binary /bedtools/bedtools && \
    chmod a+x /bedtools/bedtools

# Stage 2 (production): the image that is actually published.
FROM ubuntu:18.04 AS production
# MAINTAINER is deprecated; LABEL is its replacement. The label is set here
# because metadata declared in the builder stage is not carried over into the
# final image.
LABEL maintainer="NBIS Sweden <generode@nbis.se>"
# Presumably libcurl is needed at runtime by the copied bgzip/tabix binaries
# - TODO confirm. The apt package lists are removed in the same layer to keep
# the image small.
RUN apt-get update -y && apt-get install -y \
    libcurl4-gnutls-dev && \
    rm -rf /var/lib/apt/lists/*
COPY --from=nbisweden/generode-htslib-1.15.1:latest /htslib/bgzip /usr/bin/
COPY --from=nbisweden/generode-htslib-1.15.1:latest /htslib/tabix /usr/bin/
# COPY --from=verku/htslib-1.15.1:latest /htslib/bgzip /usr/bin # container used for development
# COPY --from=verku/htslib-1.15.1:latest /htslib/tabix /usr/bin # container used for development
COPY --from=builder /bedtools/bedtools /usr/bin/
26 changes: 26 additions & 0 deletions workflow/docker/htslib-1.15.1/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Builds htslib 1.15.1 from the upstream release tarball. The bgzip and tabix
# executables are additionally copied to /htslib/ so that other images can
# pick them up with `COPY --from=...`.
FROM ubuntu:18.04 AS builder
# MAINTAINER is deprecated; LABEL is its replacement.
LABEL maintainer="NBIS Sweden <generode@nbis.se>"
# apt-get is used instead of apt because apt's CLI is not meant for scripts.
# bzip2 (to unpack the .tar.bz2) and ca-certificates (for the HTTPS download)
# are listed explicitly instead of relying on them being pulled in indirectly.
# The apt package lists are removed in the same layer to keep the image small.
RUN apt-get update -y && apt-get install -y \
    autoconf \
    automake \
    bzip2 \
    ca-certificates \
    gcc \
    libbz2-dev \
    libcurl4-gnutls-dev \
    liblzma-dev \
    libncurses5-dev \
    libssl-dev \
    make \
    perl \
    wget \
    zlib1g-dev && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /htslib
# Download, unpack (bzip2 decompression and extraction in a single tar call),
# build, export bgzip/tabix to /htslib/, then install system-wide.
RUN wget https://github.com/samtools/htslib/releases/download/1.15.1/htslib-1.15.1.tar.bz2 && \
    tar -xjvf htslib-1.15.1.tar.bz2 && \
    cd htslib-1.15.1 && \
    ./configure && \
    make && \
    cp bgzip /htslib/ && \
    cp tabix /htslib/ && \
    make install
2 changes: 1 addition & 1 deletion workflow/report/workflow.rst
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.. image:: docs/source/img/logga_viridis2.png
.. image:: https://github.com/NBISweden/GenErode/blob/main/docs/source/img/logga_viridis2.png?raw=true
:width: 124.0px
:height: 175.4px
:alt: GenErode pipeline logo
Expand Down
4 changes: 2 additions & 2 deletions workflow/rules/0.1_reference_genome_preps.smk
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ all_outputs.append(expand(REF_DIR + "/" + REF_NAME + ".{ext}",
ext=["dict", "genome", "bed"]))

# snakemake rules
localrules: make_reference_bed

rule bwa_index_reference:
"""Index the reference genome using bwa"""
input:
Expand Down Expand Up @@ -99,8 +101,6 @@ rule make_reference_bed:
"results/logs/0.1_reference_genome_preps/"
+ REF_NAME
+ "_make_reference_bed.log",
group:
"reference_prep_group"
shell:
"""
awk -v OFS='\t' '{{print $1, "0", $2}}' {input.fai} > {output.ref_bed} 2> {log}
Expand Down
6 changes: 3 additions & 3 deletions workflow/rules/0.2_repeat_identification.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
### 0.2 Repeat prediction and repeat masking of the reference genome

# Code collecting output files from this part of the pipeline
all_outputs.append(REF_DIR + "/" + REF_NAME + ".repeats.bed")
all_outputs.append(REF_DIR + "/" + REF_NAME + ".repeats.sorted.bed")
all_outputs.append(REF_DIR + "/" + REF_NAME + ".repma.bed")


Expand Down Expand Up @@ -214,7 +214,7 @@ rule sort_repeats_bed:
log:
"results/logs/0.2_repeat_identification/" + REF_NAME + "_sort_repeats_bed.log",
singularity:
"docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0"
"docker://nbisweden/generode-bedtools-2.29.2"
shell:
"""
bedtools sort -g {input.genomefile} -i {input.rep_bed} > {output.sorted_rep_bed} 2> {log}
Expand All @@ -233,7 +233,7 @@ rule make_no_repeats_bed:
log:
"results/logs/0.2_repeat_identification/" + REF_NAME + "_make_no_repeats_bed.log",
singularity:
"docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0"
"docker://nbisweden/generode-bedtools-2.29.2"
shell:
"""
bedtools subtract -a {input.ref_bed} -b {input.sorted_rep_bed} > {output.no_rep_bed} 2> {log} &&
Expand Down
16 changes: 10 additions & 6 deletions workflow/rules/12_snpEff.smk
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ rule build_snpEff_db:
config=rules.update_snpEff_config.output.config,
output:
db=GTF_DIR + "/snpEff/data/" + REF_NAME + "/snpEffectPredictor.bin",
threads: 1
params:
ref_name=REF_NAME,
abs_gtf=lambda wildcards, input: os.path.abspath(input.gtf),
Expand All @@ -212,9 +213,10 @@ rule build_snpEff_db:
"docker://quay.io/biocontainers/snpeff:4.3.1t--3"
shell:
"""
mem=$(((6 * {threads}) - 2))
cd {params.abs_db_dir}
snpEff build -gtf22 -c {params.abs_config} -dataDir {params.abs_data_dir} -treatAllAsProteinCoding \
-v {params.ref_name} 2> {log}
java -jar -Xmx${{mem}}g /usr/local/share/snpeff-4.3.1t-3/snpEff.jar build -gtf22 -c {params.abs_config} \
-dataDir {params.abs_data_dir} -treatAllAsProteinCoding -v {params.ref_name} 2> {log}
"""


Expand Down Expand Up @@ -242,15 +244,15 @@ rule filter_biallelic_missing_vcf_snpEff:
bed=rules.filtered_vcf2bed.output.bed,
genomefile=rules.genome_file.output.genomefile,
output:
filtered=temp("results/{dataset}/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf"),
filtered=temp("results/{dataset}/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.gz"),
threads: 6
log:
"results/logs/12_snpEff/{dataset}/" + REF_NAME + "/{sample}.{processed}_fmissing{fmiss}_filter_biallelic_missing_vcf.log",
singularity:
"docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0"
"docker://nbisweden/generode-bedtools-2.29.2"
shell:
"""
bedtools intersect -a {input.vcf} -b {input.bed} -header -sorted -g {input.genomefile} > {output.filtered} 2> {log}
bedtools intersect -a {input.vcf} -b {input.bed} -header -sorted -g {input.genomefile} | bgzip -c > {output.filtered} 2> {log}
"""


Expand All @@ -264,6 +266,7 @@ rule annotate_vcf:
ann="results/{dataset}/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.ann.vcf",
csv="results/{dataset}/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv",
html="results/{dataset}/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.html",
threads: 1
params:
ref_name=REF_NAME,
abs_config=lambda wildcards, input: os.path.abspath(input.config),
Expand All @@ -274,7 +277,8 @@ rule annotate_vcf:
"docker://quay.io/biocontainers/snpeff:4.3.1t--3"
shell:
"""
snpEff -c {params.abs_config} -dataDir {params.abs_data_dir} -s {output.html} -csvStats {output.csv} \
mem=$(((6 * {threads}) - 2))
java -jar -Xmx${{mem}}g /usr/local/share/snpeff-4.3.1t-3/snpEff.jar -c {params.abs_config} -dataDir {params.abs_data_dir} -s {output.html} -csvStats {output.csv} \
-treatAllAsProteinCoding -v -d -lof {params.ref_name} {input.vcf} > {output.ann} 2> {log}
"""

Expand Down
16 changes: 8 additions & 8 deletions workflow/rules/13_GERP.smk
Original file line number Diff line number Diff line change
Expand Up @@ -810,22 +810,22 @@ rule filter_biallelic_missing_vcf_gerp:
bed=rules.filtered_vcf2bed.output.bed,
genomefile=rules.genome_file.output.genomefile,
output:
filtered=temp("results/gerp/{dataset}/" + REF_NAME + "/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf"),
filtered=temp("results/gerp/{dataset}/" + REF_NAME + "/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.gz"),
threads: 6
log:
"results/logs/13_GERP/{dataset}/" + REF_NAME + "/vcf/{sample}.{processed}_fmissing{fmiss}_filter_biallelic_missing_vcf.log",
singularity:
"docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0"
"docker://nbisweden/generode-bedtools-2.29.2"
shell:
"""
bedtools intersect -a {input.vcf} -b {input.bed} -header -sorted -g {input.genomefile} > {output.filtered} 2> {log}
bedtools intersect -a {input.vcf} -b {input.bed} -header -sorted -g {input.genomefile} | bgzip -c > {output.filtered} 2> {log}
"""


rule biallelic_missing_filtered_vcf_gerp_stats:
"""Obtain summary stats of filtered vcf file"""
input:
filtered="results/gerp/{dataset}/" + REF_NAME + "/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf",
filtered="results/gerp/{dataset}/" + REF_NAME + "/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.gz",
output:
stats="results/gerp/{dataset}/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt",
log:
Expand Down Expand Up @@ -879,15 +879,15 @@ rule modern_biallelic_missing_filtered_vcf_gerp_multiqc:
rule split_vcf_files:
"""Split the VCF files into chunks for more resource-efficient merging with GERP results"""
input:
vcf="results/gerp/{dataset}/" + REF_NAME + "/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf",
vcf="results/gerp/{dataset}/" + REF_NAME + "/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.gz",
chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files/{chunk}.bed",
genomefile=REF_DIR + "/" + REF_NAME + ".genome",
output:
vcf_chunk=temp("results/gerp/chunks/" + REF_NAME + "/{dataset}/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chunk}.vcf.gz"),
log:
"results/logs/13_GERP/chunks/" + REF_NAME + "/{dataset}/vcf/{sample}.{processed}_fmissing{fmiss}.{chunk}_split_vcf_chunks.log",
singularity:
"docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0"
"docker://nbisweden/generode-bedtools-2.29.2"
shell:
"""
bedtools intersect -a {input.vcf} -b {input.chunk_bed} -g {input.genomefile} -header | gzip - > {output.vcf_chunk} 2> {log}
Expand All @@ -903,7 +903,7 @@ rule split_chunk_bed_files:
log:
"results/logs/13_GERP/" + REF_NAME + ".{chunk}_split_chunk_bed_files.log",
singularity:
"docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0"
"docker://nbisweden/generode-bedtools-2.29.2"
shell:
"""
bedtools makewindows -b {input.chunk_bed} -w 10000000 > {output.chunk_win_bed} 2> {log}
Expand Down Expand Up @@ -994,7 +994,7 @@ rule relative_mutational_load_per_sample:
max_gerp=config["max_gerp"],
log:
"results/logs/13_GERP/{dataset}/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load_table.gerp_{minGERP}_{maxGERP}.log",
threads: 4
threads: 2
shell:
"""
python3 workflow/scripts/gerp_rel_mut_load_sample.py {input.gerp_out} {params.min_gerp} {params.max_gerp} {output.mut_load} 2> {log}
Expand Down
Loading

0 comments on commit 485b645

Please sign in to comment.