From 0a573e05362bdaee015a1a7b2a9d167d77fdcbf3 Mon Sep 17 00:00:00 2001 From: verku Date: Tue, 14 Jun 2022 15:44:28 +0200 Subject: [PATCH 01/19] Update conda in github actions with setup-miniconda from conda-incubator --- .github/workflows/main.yaml | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 113f18c..093c0a1 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -23,17 +23,20 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: eWaterCycle/setup-singularity@v5 + - uses: eWaterCycle/setup-singularity@v7 with: - singularity-version: 3.6.4 + singularity-version: 3.8.3 - - uses: s-weigand/setup-conda@v1 + - uses: conda-incubator/setup-miniconda@v2 with: - update-conda: true - activate-conda: true + activate-environment: generode + environment-file: environment.yml + auto-activate-base: false + - name: conda_environment run: | - conda env update -n base -f environment.yml + conda info + conda list - name: mitogenome_mapping_dry run: | From 1846d20596e02231c992e0de00e4b08c6ce5a03e Mon Sep 17 00:00:00 2001 From: verku Date: Tue, 14 Jun 2022 16:11:30 +0200 Subject: [PATCH 02/19] Add mamba to github actions --- .github/workflows/main.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 093c0a1..86c2f33 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -29,6 +29,7 @@ jobs: - uses: conda-incubator/setup-miniconda@v2 with: + mamba-version: "*" activate-environment: generode environment-file: environment.yml auto-activate-base: false From 85129dc123069c6adbff469cd7df650980ceac82 Mon Sep 17 00:00:00 2001 From: verku Date: Tue, 14 Jun 2022 16:19:33 +0200 Subject: [PATCH 03/19] Ensure login bash with correct environment for github actions --- .github/workflows/main.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 86c2f33..23438ff 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -35,38 +35,47 @@ jobs: auto-activate-base: false - name: conda_environment + shell: bash -l {0} run: | conda info conda list - name: mitogenome_mapping_dry + shell: bash -l {0} run: | snakemake -npr --configfile .test/config/config_mitogenomes.yaml -j 4 --cores 1 --use-singularity - name: mitogenome_mapping + shell: bash -l {0} run: | snakemake --configfile .test/config/config_mitogenomes.yaml -j 4 --cores 1 --use-singularity - name: mlRho_options_dry + shell: bash -l {0} run: | snakemake -npr --configfile .test/config/config_mlRho_options.yaml -j 4 --cores 1 --use-singularity - name: mlRho_options + shell: bash -l {0} run: | snakemake --configfile .test/config/config_mlRho_options.yaml -j 4 --cores 1 --use-singularity - name: pca_roh_dry + shell: bash -l {0} run: | snakemake -npr --configfile .test/config/config_pca_roh.yaml -j 4 --cores 1 --use-singularity - name: pca_roh + shell: bash -l {0} run: | snakemake --configfile .test/config/config_pca_roh.yaml -j 4 --cores 1 --use-singularity - name: snpeff_gerp_dry + shell: bash -l {0} run: | snakemake -npr --configfile .test/config/config_snpeff_gerp.yaml -j 4 --cores 1 --use-singularity - name: snpeff_gerp + shell: bash -l {0} run: | snakemake --configfile .test/config/config_snpeff_gerp.yaml -j 4 --cores 1 --use-singularity \ No newline at end of file From 22870fbd507b8b3c760b1d241f2761b9fc75c437 Mon Sep 17 00:00:00 2001 From: verku Date: Wed, 13 Jul 2022 10:50:29 +0200 Subject: [PATCH 04/19] Reduce computational requirements --- config/cluster.yaml | 4 ++-- workflow/rules/13_GERP.smk | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/config/cluster.yaml b/config/cluster.yaml index 8cd96e1..bbfd57f 100644 --- a/config/cluster.yaml +++ b/config/cluster.yaml @@ -280,6 +280,6 @@ merge_gerp_alleles_gz: time: 1-00:00:00 cpus-per-task: 4 relative_mutational_load_per_sample: - time: 10-00:00:00 - cpus-per-task: 4 + time: 1-00:00:00 + cpus-per-task: 2 ### diff --git a/workflow/rules/13_GERP.smk b/workflow/rules/13_GERP.smk index 031737d..9ab7c31 100644 --- a/workflow/rules/13_GERP.smk +++ b/workflow/rules/13_GERP.smk @@ -994,7 +994,7 @@ rule relative_mutational_load_per_sample: max_gerp=config["max_gerp"], log: "results/logs/13_GERP/{dataset}/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.relative_mutational_load_table.gerp_{minGERP}_{maxGERP}.log", - threads: 4 + threads: 2 shell: """ python3 workflow/scripts/gerp_rel_mut_load_sample.py {input.gerp_out} {params.min_gerp} {params.max_gerp} {output.mut_load} 2> {log} From 8de3946aca9ec4f3ef90953e30514f9a345e560f Mon Sep 17 00:00:00 2001 From: verku Date: Wed, 13 Jul 2022 10:51:12 +0200 Subject: [PATCH 05/19] Remove temp flag from rescaled bam index file --- workflow/rules/3.2_historical_bam_mapDamage.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/3.2_historical_bam_mapDamage.smk b/workflow/rules/3.2_historical_bam_mapDamage.smk index 9059b95..6eccf10 100644 --- a/workflow/rules/3.2_historical_bam_mapDamage.smk +++ b/workflow/rules/3.2_historical_bam_mapDamage.smk @@ -49,7 +49,7 @@ rule index_rescaled_bams: input: bam="results/{dataset}/mapping/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.bam", output: - index=temp("results/{dataset}/mapping/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.bam.bai"), + index="results/{dataset}/mapping/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.rescaled.bam.bai", log: "results/logs/3.2_historical_bam_mapDamage/" + REF_NAME + "/{dataset}/{sample}_index_rescaled_bams.log", group: From 4a6f6ec37a76e87a89dd6412fc7b350e13d1d4b4 Mon Sep 17 00:00:00 2001 From: verku Date: Wed, 13 Jul 2022 12:53:51 +0200 Subject: [PATCH 06/19] Run snpEff with option to specify -Xmx for large genomes --- workflow/rules/12_snpEff.smk | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/workflow/rules/12_snpEff.smk b/workflow/rules/12_snpEff.smk index e9dacfe..b285fb1 100644 --- a/workflow/rules/12_snpEff.smk +++ b/workflow/rules/12_snpEff.smk @@ -199,6 +199,7 @@ rule build_snpEff_db: config=rules.update_snpEff_config.output.config, output: db=GTF_DIR + "/snpEff/data/" + REF_NAME + "/snpEffectPredictor.bin", + threads: 1 params: ref_name=REF_NAME, abs_gtf=lambda wildcards, input: os.path.abspath(input.gtf), @@ -212,9 +213,10 @@ rule build_snpEff_db: "docker://quay.io/biocontainers/snpeff:4.3.1t--3" shell: """ + mem=$(((6 * {threads}) - 2)) cd {params.abs_db_dir} - snpEff build -gtf22 -c {params.abs_config} -dataDir {params.abs_data_dir} -treatAllAsProteinCoding \ - -v {params.ref_name} 2> {log} + java -jar -Xmx${{mem}}g /usr/local/share/snpeff-4.3.1t-3/snpEff.jar build -gtf22 -c {params.abs_config} \ + -dataDir {params.abs_data_dir} -treatAllAsProteinCoding -v {params.ref_name} 2> {log} """ @@ -264,6 +266,7 @@ rule annotate_vcf: ann="results/{dataset}/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.ann.vcf", csv="results/{dataset}/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.csv", html="results/{dataset}/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}_stats.html", + threads: 1 params: ref_name=REF_NAME, abs_config=lambda wildcards, input: os.path.abspath(input.config), @@ -274,7 +277,8 @@ rule annotate_vcf: "docker://quay.io/biocontainers/snpeff:4.3.1t--3" shell: """ - snpEff -c {params.abs_config} -dataDir {params.abs_data_dir} -s {output.html} -csvStats {output.csv} \ + mem=$(((6 * {threads}) - 2)) + java -jar -Xmx${{mem}}g /usr/local/share/snpeff-4.3.1t-3/snpEff.jar -c {params.abs_config} -dataDir {params.abs_data_dir} -s {output.html} -csvStats {output.csv} \ -treatAllAsProteinCoding -v -d -lof {params.ref_name} {input.vcf} > {output.ann} 2> {log} """ From 390f4003ccfc6ff41f910a4ef8a51ce722293022 Mon Sep 17 00:00:00 2001 From: verku Date: Wed, 13 Jul 2022 12:54:12 +0200 Subject: [PATCH 07/19] Add all rules that run snpEff with -Xmx to cluster.yaml for manual adjustment --- config/cluster.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/config/cluster.yaml b/config/cluster.yaml index 8cd96e1..f1188b0 100644 --- a/config/cluster.yaml +++ b/config/cluster.yaml @@ -223,6 +223,8 @@ vcf2plink_hwe: repmasked_bcf2vcf_snpEff: time: 05:00:00 cpus-per-task: 2 +build_snpEff_db: + time: 05:00:00 filter_biallelic_missing_vcf_snpEff: time: 1-00:00:00 cpus-per-task: 6 From 435c0de601c19c11fc8fd206f84306627e09168c Mon Sep 17 00:00:00 2001 From: verku Date: Tue, 9 Aug 2022 17:50:52 +0200 Subject: [PATCH 08/19] Add Dockerfile for htslib containing bgzip --- workflow/docker/htslib-1.15.1/Dockerfile | 26 ++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 workflow/docker/htslib-1.15.1/Dockerfile diff --git a/workflow/docker/htslib-1.15.1/Dockerfile b/workflow/docker/htslib-1.15.1/Dockerfile new file mode 100644 index 0000000..3dd25cf --- /dev/null +++ b/workflow/docker/htslib-1.15.1/Dockerfile @@ -0,0 +1,26 @@ +FROM ubuntu:18.04 AS builder +MAINTAINER NBIS Sweden +RUN apt update -y && apt install -y \ + autoconf \ + automake \ + gcc \ + libbz2-dev \ + libcurl4-gnutls-dev \ + liblzma-dev \ + libncurses5-dev \ + libssl-dev \ + make \ + perl \ + wget \ + zlib1g-dev + +WORKDIR /htslib +RUN wget https://github.com/samtools/htslib/releases/download/1.15.1/htslib-1.15.1.tar.bz2 && \ + bunzip2 htslib-1.15.1.tar.bz2 && \ + tar xfv htslib-1.15.1.tar && \ + cd htslib-1.15.1 && \ + ./configure && \ + make && \ + cp bgzip /htslib/ && \ + cp tabix /htslib/ && \ + make install From 8e040e0572a8dbcc7be42c5e385510a9fb4d45a0 Mon Sep 17 00:00:00 2001 From: verku Date: Tue, 9 Aug 2022 17:51:50 +0200 Subject: [PATCH 09/19] Add Dockerfile for bedtools with bgzip and tabix --- workflow/docker/bedtools-2.29.2/Dockerfile | 30 ++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 workflow/docker/bedtools-2.29.2/Dockerfile diff --git a/workflow/docker/bedtools-2.29.2/Dockerfile b/workflow/docker/bedtools-2.29.2/Dockerfile new file mode 100644 index 0000000..2eba8e1 --- /dev/null +++ b/workflow/docker/bedtools-2.29.2/Dockerfile @@ -0,0 +1,30 @@ +FROM ubuntu:18.04 AS builder +MAINTAINER NBIS Sweden +RUN apt update -y && apt install -y \ + autoconf \ + automake \ + gcc \ + libbz2-dev \ + libcurl4-gnutls-dev \ + liblzma-dev \ + libncurses5-dev \ + libssl-dev \ + make \ + perl \ + wget \ + zlib1g-dev + +WORKDIR /bedtools + +RUN wget https://github.com/arq5x/bedtools2/releases/download/v2.29.2/bedtools.static.binary && \ + mv bedtools.static.binary /bedtools/bedtools && \ + chmod a+x /bedtools/bedtools + +FROM ubuntu:18.04 AS production +RUN apt update -y && apt install -y \ + libcurl4-gnutls-dev +# COPY --from=nbisweden/generode-htslib-1.15.1:latest /htslib/bgzip /usr/bin +# COPY --from=nbisweden/generode-htslib-1.15.1:latest /htslib/tabix /usr/bin +COPY --from=verku/htslib-1.15.1:latest /htslib/bgzip /usr/bin +COPY --from=verku/htslib-1.15.1:latest /htslib/tabix /usr/bin +COPY --from=builder /bedtools/bedtools /usr/bin/ From f248c69155f26fa85d5a7112b350d271cc918333 Mon Sep 17 00:00:00 2001 From: verku Date: Tue, 9 Aug 2022 17:58:34 +0200 Subject: [PATCH 10/19] Update docker image for bedtools to include bgzip with temporary image --- workflow/rules/0.2_repeat_identification.smk | 4 ++-- workflow/rules/12_snpEff.smk | 2 +- workflow/rules/13_GERP.smk | 6 +++--- workflow/rules/5_CpG_identification.smk | 16 ++++++++-------- .../rules/6_autosome_sexchromosome_bed_files.smk | 10 +++++----- workflow/rules/8.1_vcf_CpG_filtering.smk | 2 +- workflow/rules/8.2_vcf_qual_repeat_filtering.smk | 2 +- workflow/rules/9_merge_vcfs.smk | 2 +- 8 files changed, 22 insertions(+), 22 deletions(-) diff --git a/workflow/rules/0.2_repeat_identification.smk b/workflow/rules/0.2_repeat_identification.smk index cc3959b..7253fb7 100644 --- a/workflow/rules/0.2_repeat_identification.smk +++ b/workflow/rules/0.2_repeat_identification.smk @@ -214,7 +214,7 @@ rule sort_repeats_bed: log: "results/logs/0.2_repeat_identification/" + REF_NAME + "_sort_repeats_bed.log", singularity: - "docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0" + "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ bedtools sort -g {input.genomefile} -i {input.rep_bed} > {output.sorted_rep_bed} 2> {log} @@ -233,7 +233,7 @@ rule make_no_repeats_bed: log: "results/logs/0.2_repeat_identification/" + REF_NAME + "_make_no_repeats_bed.log", singularity: - "docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0" + "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ bedtools subtract -a {input.ref_bed} -b {input.sorted_rep_bed} > {output.no_rep_bed} 2> {log} && diff --git a/workflow/rules/12_snpEff.smk b/workflow/rules/12_snpEff.smk index b285fb1..00cdbc3 100644 --- a/workflow/rules/12_snpEff.smk +++ b/workflow/rules/12_snpEff.smk @@ -249,7 +249,7 @@ rule filter_biallelic_missing_vcf_snpEff: log: "results/logs/12_snpEff/{dataset}/" + REF_NAME + "/{sample}.{processed}_fmissing{fmiss}_filter_biallelic_missing_vcf.log", singularity: - "docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0" + "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ bedtools intersect -a {input.vcf} -b {input.bed} -header -sorted -g {input.genomefile} > {output.filtered} 2> {log} diff --git a/workflow/rules/13_GERP.smk b/workflow/rules/13_GERP.smk index 9ab7c31..0858b29 100644 --- a/workflow/rules/13_GERP.smk +++ b/workflow/rules/13_GERP.smk @@ -815,7 +815,7 @@ rule filter_biallelic_missing_vcf_gerp: log: "results/logs/13_GERP/{dataset}/" + REF_NAME + "/vcf/{sample}.{processed}_fmissing{fmiss}_filter_biallelic_missing_vcf.log", singularity: - "docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0" + "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ bedtools intersect -a {input.vcf} -b {input.bed} -header -sorted -g {input.genomefile} > {output.filtered} 2> {log} @@ -887,7 +887,7 @@ rule split_vcf_files: log: "results/logs/13_GERP/chunks/" + REF_NAME + "/{dataset}/vcf/{sample}.{processed}_fmissing{fmiss}.{chunk}_split_vcf_chunks.log", singularity: - "docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0" + "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ bedtools intersect -a {input.vcf} -b {input.chunk_bed} -g {input.genomefile} -header | gzip - > {output.vcf_chunk} 2> {log} @@ -903,7 +903,7 @@ rule split_chunk_bed_files: log: "results/logs/13_GERP/" + REF_NAME + ".{chunk}_split_chunk_bed_files.log", singularity: - "docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0" + "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ bedtools makewindows -b {input.chunk_bed} -w 10000000 > {output.chunk_win_bed} 2> {log} diff --git a/workflow/rules/5_CpG_identification.smk b/workflow/rules/5_CpG_identification.smk index 517ec9a..069b9de 100644 --- a/workflow/rules/5_CpG_identification.smk +++ b/workflow/rules/5_CpG_identification.smk @@ -128,7 +128,7 @@ rule merge_CpG_genotype_beds: log: "results/logs/5_CpG_identification/" + REF_NAME + "_merge_CpG_genotype_beds.log", singularity: - "docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0" + "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ files=`echo {input} | awk '{{print NF}}'` @@ -154,7 +154,7 @@ rule sort_CpG_genotype_beds: log: "results/logs/5_CpG_identification/" + REF_NAME + "_sort_CpG_genotype_beds.log", singularity: - "docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0" + "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ bedtools sort -g {input.genomefile} -i {input.merged_bed} > {output.sorted_bed} 2> {log} @@ -174,7 +174,7 @@ rule merge_all_CpG_beds: log: "results/logs/5_CpG_identification/" + REF_NAME + "_merge_all_CpG_beds.log", singularity: - "docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0" + "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ files=`echo {input} | awk '{{print NF}}'` @@ -200,7 +200,7 @@ rule sort_all_CpG_beds: log: "results/logs/5_CpG_identification/" + REF_NAME + "_sort_all_CpG_beds.log", singularity: - "docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0" + "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ bedtools sort -g {input.genomefile} -i {input.merged_bed} > {output.sorted_bed} 2> {log} @@ -217,7 +217,7 @@ rule make_noCpG_bed: log: "results/logs/5_CpG_identification/" + REF_NAME + ".no{CpG_method}_make_no_CpG_bed.log", singularity: - "docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0" + "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ bedtools subtract -a {input.ref_bed} -b {input.CpG_bed} > {output.no_CpG_bed} 2> {log} @@ -238,7 +238,7 @@ rule merge_CpG_repeats_beds: log: "results/logs/5_CpG_identification/" + REF_NAME + ".{CpG_method}_merge_CpG_repeats_beds.log", singularity: - "docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0" + "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ cat {input[0]} {input[1]} | sort -k1,1 -k2,2n > {output.tmp} 2> {log} && @@ -257,7 +257,7 @@ rule sort_CpG_repeats_beds: log: "results/logs/5_CpG_identification/" + REF_NAME + ".{CpG_method}_sort_CpG_repeats_beds.log", singularity: - "docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0" + "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ bedtools sort -g {input.genomefile} -i {input.merged_bed} > {output.sorted_bed} 2> {log} @@ -274,7 +274,7 @@ rule make_noCpG_repma_bed: log: "results/logs/5_CpG_identification/" + REF_NAME + ".no{CpG_method}_make_noCpG_repma_bed.log", singularity: - "docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0" + "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ bedtools subtract -a {input.ref_bed} -b {input.merged_bed} > {output.no_CpG_repma_bed} 2> {log} diff --git a/workflow/rules/6_autosome_sexchromosome_bed_files.smk b/workflow/rules/6_autosome_sexchromosome_bed_files.smk index 8472ae1..6a977c5 100644 --- a/workflow/rules/6_autosome_sexchromosome_bed_files.smk +++ b/workflow/rules/6_autosome_sexchromosome_bed_files.smk @@ -42,7 +42,7 @@ rule make_autosomes_bed: log: "results/logs/6_autosome_sexchromosome_bed_files/" + REF_NAME + "_make_autosomes_bed.log", singularity: - "docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0" + "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ bedtools subtract -a {input.ref_bed} -b {input.sexchr_bed} > {output.autosome_bed} 2> {log} @@ -61,7 +61,7 @@ rule intersect_sexchr_repma_beds: log: "results/logs/6_autosome_sexchromosome_bed_files/" + REF_NAME + "_intersect_sexchr_repma_beds.log", singularity: - "docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0" + "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ bedtools intersect -a {input.no_rep_bed_dir} -b {input.sexchr_bed} > {output.repma_sex_chr} 2> {log} @@ -80,7 +80,7 @@ rule intersect_autos_repma_beds: log: "results/logs/6_autosome_sexchromosome_bed_files/" + REF_NAME + "_intersect_autos_repma_beds.log", singularity: - "docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0" + "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ bedtools intersect -a {input.no_rep_bed_dir} -b {input.autosome_bed} > {output.repma_autos} 2> {log} @@ -99,7 +99,7 @@ rule intersect_sexchr_noCpG_repma_beds: log: "results/logs/6_autosome_sexchromosome_bed_files/" + REF_NAME + ".no{CpG_method}_intersect_sexchr_noCpG_repma_beds.log", singularity: - "docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0" + "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ bedtools intersect -a {input.no_CpG_repma_bed} -b {input.sexchr_bed} > {output.no_CpG_repma_sexchr} 2> {log} @@ -118,7 +118,7 @@ rule intersect_autos_noCpG_repma_beds: log: "results/logs/6_autosome_sexchromosome_bed_files/" + REF_NAME + ".no{CpG_method}_intersect_autos_noCpG_repma_beds.log", singularity: - "docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0" + "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ bedtools intersect -a {input.no_CpG_repma_bed} -b {input.autosome_bed} > {output.no_CpG_repma_autos} 2> {log} diff --git a/workflow/rules/8.1_vcf_CpG_filtering.smk b/workflow/rules/8.1_vcf_CpG_filtering.smk index 3a39d32..da91be9 100644 --- a/workflow/rules/8.1_vcf_CpG_filtering.smk +++ b/workflow/rules/8.1_vcf_CpG_filtering.smk @@ -107,7 +107,7 @@ rule remove_CpG_vcf: log: "results/logs/8.1_vcf_CpG_filtering/{dataset}/" + REF_NAME + "/{sample}.{processed}.no{CpG_method}_remove_CpG_vcf.log", singularity: - "docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0" + "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ bedtools intersect -a {input.vcf} -b {input.bed} -header -sorted -g {input.genomefile} > {output.filtered} 2> {log} diff --git a/workflow/rules/8.2_vcf_qual_repeat_filtering.smk b/workflow/rules/8.2_vcf_qual_repeat_filtering.smk index c7edd7f..53ca474 100644 --- a/workflow/rules/8.2_vcf_qual_repeat_filtering.smk +++ b/workflow/rules/8.2_vcf_qual_repeat_filtering.smk @@ -367,7 +367,7 @@ rule remove_repeats_vcf: log: "results/logs/8.2_vcf_qual_repeat_filtering/{dataset}/" + REF_NAME + "/{sample}.{processed}_remove_repeats_vcf.log", singularity: - "docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0" + "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ bedtools intersect -a {input.vcf} -b {input.bed} -header -sorted -g {input.genomefile} > {output.filtered} 2> {log} diff --git a/workflow/rules/9_merge_vcfs.smk b/workflow/rules/9_merge_vcfs.smk index 43faaa0..39c9f1c 100644 --- a/workflow/rules/9_merge_vcfs.smk +++ b/workflow/rules/9_merge_vcfs.smk @@ -340,7 +340,7 @@ rule filtered_vcf2bed: log: "results/logs/9_merge_vcfs/" + REF_NAME + ".all_fmissing{fmiss}_filtered_vcf2bed.log", singularity: - "docker://quay.io/biocontainers/bedtools:2.29.2--hc088bd4_0" + "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ gzip -cd {input.vcf} | grep -v "^#" | awk -F'\t' '{{print $1, $2-1, $2}}' OFS='\t' > {output.bed} 2> {log} From a341270712b33ffa182cac67e537765323e0184f Mon Sep 17 00:00:00 2001 From: verku Date: Wed, 10 Aug 2022 10:22:53 +0200 Subject: [PATCH 11/19] Compress bedtools intersect VCF output with bgzip whenever possible --- workflow/rules/12_snpEff.smk | 4 ++-- workflow/rules/13_GERP.smk | 8 ++++---- workflow/rules/8.1_vcf_CpG_filtering.smk | 4 ++-- workflow/rules/8.2_vcf_qual_repeat_filtering.smk | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/workflow/rules/12_snpEff.smk b/workflow/rules/12_snpEff.smk index 00cdbc3..3cc91c3 100644 --- a/workflow/rules/12_snpEff.smk +++ b/workflow/rules/12_snpEff.smk @@ -244,7 +244,7 @@ rule filter_biallelic_missing_vcf_snpEff: bed=rules.filtered_vcf2bed.output.bed, genomefile=rules.genome_file.output.genomefile, output: - filtered=temp("results/{dataset}/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf"), + filtered=temp("results/{dataset}/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.gz"), threads: 6 log: "results/logs/12_snpEff/{dataset}/" + REF_NAME + "/{sample}.{processed}_fmissing{fmiss}_filter_biallelic_missing_vcf.log", @@ -252,7 +252,7 @@ rule filter_biallelic_missing_vcf_snpEff: "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ - bedtools intersect -a {input.vcf} -b {input.bed} -header -sorted -g {input.genomefile} > {output.filtered} 2> {log} + bedtools intersect -a {input.vcf} -b {input.bed} -header -sorted -g {input.genomefile} | bgzip -c > {output.filtered} 2> {log} """ diff --git a/workflow/rules/13_GERP.smk b/workflow/rules/13_GERP.smk index 0858b29..8c3c57a 100644 --- a/workflow/rules/13_GERP.smk +++ b/workflow/rules/13_GERP.smk @@ -810,7 +810,7 @@ rule filter_biallelic_missing_vcf_gerp: bed=rules.filtered_vcf2bed.output.bed, genomefile=rules.genome_file.output.genomefile, output: - filtered=temp("results/gerp/{dataset}/" + REF_NAME + "/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf"), + filtered=temp("results/gerp/{dataset}/" + REF_NAME + "/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.gz"), threads: 6 log: "results/logs/13_GERP/{dataset}/" + REF_NAME + "/vcf/{sample}.{processed}_fmissing{fmiss}_filter_biallelic_missing_vcf.log", @@ -818,14 +818,14 @@ rule filter_biallelic_missing_vcf_gerp: "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ - bedtools intersect -a {input.vcf} -b {input.bed} -header -sorted -g {input.genomefile} > {output.filtered} 2> {log} + bedtools intersect -a {input.vcf} -b {input.bed} -header -sorted -g {input.genomefile} | bgzip -c > {output.filtered} 2> {log} """ rule biallelic_missing_filtered_vcf_gerp_stats: """Obtain summary stats of filtered vcf file""" input: - filtered="results/gerp/{dataset}/" + REF_NAME + "/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf", + filtered="results/gerp/{dataset}/" + REF_NAME + "/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.gz", output: stats="results/gerp/{dataset}/" + REF_NAME + "/vcf/stats/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.stats.txt", log: @@ -879,7 +879,7 @@ rule modern_biallelic_missing_filtered_vcf_gerp_multiqc: rule split_vcf_files: """Split the VCF files into chunks for more resource-efficient merging with GERP results""" input: - vcf="results/gerp/{dataset}/" + REF_NAME + "/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf", + vcf="results/gerp/{dataset}/" + REF_NAME + "/vcf/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.vcf.gz", chunk_bed=REF_DIR + "/gerp/" + REF_NAME + "/split_bed_files/{chunk}.bed", genomefile=REF_DIR + "/" + REF_NAME + ".genome", output: diff --git a/workflow/rules/8.1_vcf_CpG_filtering.smk b/workflow/rules/8.1_vcf_CpG_filtering.smk index da91be9..0ccea1d 100644 --- a/workflow/rules/8.1_vcf_CpG_filtering.smk +++ b/workflow/rules/8.1_vcf_CpG_filtering.smk @@ -102,7 +102,7 @@ rule remove_CpG_vcf: bed=rules.make_noCpG_bed.output.no_CpG_bed, genomefile=rules.genome_file.output.genomefile, output: - filtered=temp("results/{dataset}/vcf/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.Q30.sorted.no{CpG_method}.vcf"), + filtered=temp("results/{dataset}/vcf/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.Q30.sorted.no{CpG_method}.vcf.gz"), threads: 6 log: "results/logs/8.1_vcf_CpG_filtering/{dataset}/" + REF_NAME + "/{sample}.{processed}.no{CpG_method}_remove_CpG_vcf.log", @@ -110,7 +110,7 @@ rule remove_CpG_vcf: "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ - bedtools intersect -a {input.vcf} -b {input.bed} -header -sorted -g {input.genomefile} > {output.filtered} 2> {log} + bedtools intersect -a {input.vcf} -b {input.bed} -header -sorted -g {input.genomefile} | bgzip -c > {output.filtered} 2> {log} """ diff --git a/workflow/rules/8.2_vcf_qual_repeat_filtering.smk b/workflow/rules/8.2_vcf_qual_repeat_filtering.smk index 53ca474..9b67d1b 100644 --- a/workflow/rules/8.2_vcf_qual_repeat_filtering.smk +++ b/workflow/rules/8.2_vcf_qual_repeat_filtering.smk @@ -362,7 +362,7 @@ rule remove_repeats_vcf: bed=rules.make_no_repeats_bed.output.no_rep_bed_dir, genomefile=rules.genome_file.output.genomefile, output: - filtered=temp("results/{dataset}/vcf/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.vcf"), + filtered=temp("results/{dataset}/vcf/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.vcf.gz"), threads: 6 log: "results/logs/8.2_vcf_qual_repeat_filtering/{dataset}/" + REF_NAME + "/{sample}.{processed}_remove_repeats_vcf.log", @@ -370,7 +370,7 @@ rule remove_repeats_vcf: "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo shell: """ - bedtools intersect -a {input.vcf} -b {input.bed} -header -sorted -g {input.genomefile} > {output.filtered} 2> {log} + bedtools intersect -a {input.vcf} -b {input.bed} -header -sorted -g {input.genomefile} | bgzip -c > {output.filtered} 2> {log} """ From f731893e6273cb2be3665d7c27de783cc8a2141b Mon Sep 17 00:00:00 2001 From: verku Date: Thu, 11 Aug 2022 14:48:13 +0200 Subject: [PATCH 12/19] Allow for more space between supylabel and ylabel --- workflow/scripts/gerp_rel_mut_load_plot.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/scripts/gerp_rel_mut_load_plot.py b/workflow/scripts/gerp_rel_mut_load_plot.py index 948675e..3cd57de 100644 --- a/workflow/scripts/gerp_rel_mut_load_plot.py +++ b/workflow/scripts/gerp_rel_mut_load_plot.py @@ -66,9 +66,9 @@ def max_load(dataframe): widthscale = len(data_df['sample'].unique()) * 1.5 fig.set_figwidth(widthscale) -fig.supylabel('relative mutational load') # y axis label +fig.supylabel('relative mutational load', fontsize=12) # y axis label fig.align_labels() # align axis labels -plt.tight_layout() # fix figure layout +plt.tight_layout(rect=[0.01, 0, 1, 1]) # fix figure layout, allowing for more space between supylabel and ylabel fig.savefig(outplot, bbox_inches='tight', format='pdf') # save the figure \ No newline at end of file From 2d1384056c39fae7a8f3640cd9874ef4c6aa62ba Mon Sep 17 00:00:00 2001 From: verku Date: Thu, 11 Aug 2022 14:48:39 +0200 Subject: [PATCH 13/19] Update script description --- workflow/scripts/gerp_rel_mut_load_plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/scripts/gerp_rel_mut_load_plot.py b/workflow/scripts/gerp_rel_mut_load_plot.py index 3cd57de..2b153e1 100644 --- a/workflow/scripts/gerp_rel_mut_load_plot.py +++ b/workflow/scripts/gerp_rel_mut_load_plot.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 """ -Script to plot numbers of variants of different effects from snpEff analysis, for historical and modern samples. +Script to plot relative mutational load as obtained from GERP analyses, for historical and modern samples. Input and output files refer to Snakemake directives. From 79f6870a6d63ef4f92fc9af706e6bb58fb4cfc2c Mon Sep 17 00:00:00 2001 From: verku Date: Fri, 12 Aug 2022 10:53:21 +0200 Subject: [PATCH 14/19] Update gitignore --- .gitignore | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index d36b6d3..d53faab 100644 --- a/.gitignore +++ b/.gitignore @@ -1,12 +1,25 @@ -results/ data/logs/ -data/raw_reads_symlinks/ data/mitogenomes/*.fasta.amb data/mitogenomes/*.fasta.ann data/mitogenomes/*.fasta.bwt data/mitogenomes/*.fasta.pac data/mitogenomes/*.fasta.sa - +data/raw_reads_symlinks/ +resources/ +results/ +tmpConsensi.fa *.html - +.cache/ +.java/ .snakemake/ +.test/data/references/repeat* +.test/data/references/*bed +.test/data/references/*dict +.test/data/references/*amb +.test/data/references/*ann +.test/data/references/*bwt +.test/data/references/*fai +.test/data/references/*pac +.test/data/references/*sa +.test/data/references/*genome +.test/data/references/*upper.fasta \ No newline at end of file From 1c215c13de57b9bea6499262ac32d9dcebd06deb Mon Sep 17 00:00:00 2001 From: verku Date: Fri, 12 Aug 2022 10:54:24 +0200 Subject: [PATCH 15/19] Use link to pipeline logo so that the image is embedded when moving the report html file elsewhere --- workflow/report/workflow.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/report/workflow.rst b/workflow/report/workflow.rst index c73e94a..473b8ed 100644 --- a/workflow/report/workflow.rst +++ b/workflow/report/workflow.rst @@ -1,4 +1,4 @@ -.. image:: docs/source/img/logga_viridis2.png +.. image:: https://github.com/NBISweden/GenErode/blob/main/docs/source/img/logga_viridis2.png?raw=true :width: 124.0px :height: 175.4px :alt: GenErode pipeline logo From 20f61494b88a70354450b31095ab3db24169a00f Mon Sep 17 00:00:00 2001 From: verku Date: Thu, 18 Aug 2022 14:05:26 +0200 Subject: [PATCH 16/19] Update link to bedtools container to NBIS Dockerhub repository --- workflow/docker/bedtools-2.29.2/Dockerfile | 12 ++++++------ workflow/rules/0.2_repeat_identification.smk | 4 ++-- workflow/rules/12_snpEff.smk | 2 +- workflow/rules/13_GERP.smk | 6 +++--- workflow/rules/5_CpG_identification.smk | 16 ++++++++-------- .../rules/6_autosome_sexchromosome_bed_files.smk | 10 +++++----- workflow/rules/8.1_vcf_CpG_filtering.smk | 2 +- workflow/rules/8.2_vcf_qual_repeat_filtering.smk | 2 +- workflow/rules/9_merge_vcfs.smk | 2 +- 9 files changed, 28 insertions(+), 28 deletions(-) diff --git a/workflow/docker/bedtools-2.29.2/Dockerfile b/workflow/docker/bedtools-2.29.2/Dockerfile index 2eba8e1..a8ffa6c 100644 --- a/workflow/docker/bedtools-2.29.2/Dockerfile +++ b/workflow/docker/bedtools-2.29.2/Dockerfile @@ -17,14 +17,14 @@ RUN apt update -y && apt install -y \ WORKDIR /bedtools RUN wget https://github.com/arq5x/bedtools2/releases/download/v2.29.2/bedtools.static.binary && \ - mv bedtools.static.binary /bedtools/bedtools && \ - chmod a+x /bedtools/bedtools + mv bedtools.static.binary /bedtools/bedtools && \ + chmod a+x /bedtools/bedtools FROM ubuntu:18.04 AS production RUN apt update -y && apt install -y \ libcurl4-gnutls-dev -# COPY --from=nbisweden/generode-htslib-1.15.1:latest /htslib/bgzip /usr/bin -# COPY --from=nbisweden/generode-htslib-1.15.1:latest /htslib/tabix /usr/bin -COPY --from=verku/htslib-1.15.1:latest /htslib/bgzip /usr/bin -COPY --from=verku/htslib-1.15.1:latest /htslib/tabix /usr/bin +COPY --from=nbisweden/generode-htslib-1.15.1:latest /htslib/bgzip /usr/bin +COPY --from=nbisweden/generode-htslib-1.15.1:latest /htslib/tabix /usr/bin +# COPY --from=verku/htslib-1.15.1:latest /htslib/bgzip /usr/bin # container used for development +# COPY --from=verku/htslib-1.15.1:latest /htslib/tabix /usr/bin # container used for development COPY --from=builder /bedtools/bedtools /usr/bin/ diff --git a/workflow/rules/0.2_repeat_identification.smk b/workflow/rules/0.2_repeat_identification.smk index 7253fb7..8305151 100644 --- a/workflow/rules/0.2_repeat_identification.smk +++ b/workflow/rules/0.2_repeat_identification.smk @@ -214,7 +214,7 @@ rule sort_repeats_bed: log: "results/logs/0.2_repeat_identification/" + REF_NAME + "_sort_repeats_bed.log", singularity: - "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo + "docker://nbisweden/generode-bedtools-2.29.2" shell: """ bedtools sort -g {input.genomefile} -i {input.rep_bed} > {output.sorted_rep_bed} 2> {log} @@ -233,7 +233,7 @@ rule make_no_repeats_bed: log: "results/logs/0.2_repeat_identification/" + REF_NAME + "_make_no_repeats_bed.log", singularity: - "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo + "docker://nbisweden/generode-bedtools-2.29.2" shell: """ bedtools subtract -a {input.ref_bed} -b {input.sorted_rep_bed} > {output.no_rep_bed} 2> {log} && diff --git a/workflow/rules/12_snpEff.smk b/workflow/rules/12_snpEff.smk index 3cc91c3..0f60a96 100644 --- a/workflow/rules/12_snpEff.smk +++ b/workflow/rules/12_snpEff.smk @@ -249,7 +249,7 @@ rule filter_biallelic_missing_vcf_snpEff: log: "results/logs/12_snpEff/{dataset}/" + REF_NAME + "/{sample}.{processed}_fmissing{fmiss}_filter_biallelic_missing_vcf.log", singularity: - "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo + "docker://nbisweden/generode-bedtools-2.29.2" shell: """ bedtools intersect -a {input.vcf} -b {input.bed} -header -sorted -g {input.genomefile} | bgzip -c > {output.filtered} 2> {log} diff --git a/workflow/rules/13_GERP.smk b/workflow/rules/13_GERP.smk index 8c3c57a..cdf9c17 100644 --- a/workflow/rules/13_GERP.smk +++ b/workflow/rules/13_GERP.smk @@ -815,7 +815,7 @@ rule filter_biallelic_missing_vcf_gerp: log: "results/logs/13_GERP/{dataset}/" + REF_NAME + "/vcf/{sample}.{processed}_fmissing{fmiss}_filter_biallelic_missing_vcf.log", singularity: - "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo + "docker://nbisweden/generode-bedtools-2.29.2" shell: """ bedtools intersect -a {input.vcf} -b {input.bed} -header -sorted -g {input.genomefile} | bgzip -c > {output.filtered} 2> {log} @@ -887,7 +887,7 @@ rule split_vcf_files: log: "results/logs/13_GERP/chunks/" + REF_NAME + "/{dataset}/vcf/{sample}.{processed}_fmissing{fmiss}.{chunk}_split_vcf_chunks.log", singularity: - "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo + "docker://nbisweden/generode-bedtools-2.29.2" shell: """ bedtools intersect -a {input.vcf} -b {input.chunk_bed} -g {input.genomefile} -header | gzip - > {output.vcf_chunk} 2> {log} @@ -903,7 +903,7 @@ rule split_chunk_bed_files: log: "results/logs/13_GERP/" + REF_NAME + ".{chunk}_split_chunk_bed_files.log", singularity: - "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo + "docker://nbisweden/generode-bedtools-2.29.2" shell: """ bedtools makewindows -b {input.chunk_bed} -w 10000000 > {output.chunk_win_bed} 2> {log} diff --git a/workflow/rules/5_CpG_identification.smk b/workflow/rules/5_CpG_identification.smk index 069b9de..ad55b48 100644 --- a/workflow/rules/5_CpG_identification.smk +++ b/workflow/rules/5_CpG_identification.smk @@ -128,7 +128,7 @@ rule merge_CpG_genotype_beds: log: "results/logs/5_CpG_identification/" + REF_NAME + "_merge_CpG_genotype_beds.log", singularity: - "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo + "docker://nbisweden/generode-bedtools-2.29.2" shell: """ files=`echo {input} | awk '{{print NF}}'` @@ -154,7 +154,7 @@ rule sort_CpG_genotype_beds: log: "results/logs/5_CpG_identification/" + REF_NAME + "_sort_CpG_genotype_beds.log", singularity: - "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo + "docker://nbisweden/generode-bedtools-2.29.2" shell: """ bedtools sort -g {input.genomefile} -i {input.merged_bed} > {output.sorted_bed} 2> {log} @@ -174,7 +174,7 @@ rule merge_all_CpG_beds: log: "results/logs/5_CpG_identification/" + REF_NAME + "_merge_all_CpG_beds.log", singularity: - "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo + "docker://nbisweden/generode-bedtools-2.29.2" shell: """ files=`echo {input} | awk '{{print NF}}'` @@ -200,7 +200,7 @@ rule sort_all_CpG_beds: log: "results/logs/5_CpG_identification/" + REF_NAME + "_sort_all_CpG_beds.log", singularity: - "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo + "docker://nbisweden/generode-bedtools-2.29.2" shell: """ bedtools sort -g {input.genomefile} -i {input.merged_bed} > {output.sorted_bed} 2> {log} @@ -217,7 +217,7 @@ rule make_noCpG_bed: log: "results/logs/5_CpG_identification/" + REF_NAME + ".no{CpG_method}_make_no_CpG_bed.log", singularity: - "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo + "docker://nbisweden/generode-bedtools-2.29.2" shell: """ bedtools subtract -a {input.ref_bed} -b {input.CpG_bed} > {output.no_CpG_bed} 2> {log} @@ -238,7 +238,7 @@ rule merge_CpG_repeats_beds: log: "results/logs/5_CpG_identification/" + REF_NAME + ".{CpG_method}_merge_CpG_repeats_beds.log", singularity: - "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo + "docker://nbisweden/generode-bedtools-2.29.2" shell: """ cat {input[0]} {input[1]} | sort -k1,1 -k2,2n > {output.tmp} 2> {log} && @@ -257,7 +257,7 @@ rule sort_CpG_repeats_beds: log: "results/logs/5_CpG_identification/" + REF_NAME + ".{CpG_method}_sort_CpG_repeats_beds.log", singularity: - "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo + "docker://nbisweden/generode-bedtools-2.29.2" shell: """ bedtools sort -g {input.genomefile} -i {input.merged_bed} > {output.sorted_bed} 2> {log} @@ -274,7 +274,7 @@ rule make_noCpG_repma_bed: log: "results/logs/5_CpG_identification/" + REF_NAME + ".no{CpG_method}_make_noCpG_repma_bed.log", singularity: - "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo + "docker://nbisweden/generode-bedtools-2.29.2" shell: """ bedtools subtract -a {input.ref_bed} -b {input.merged_bed} > {output.no_CpG_repma_bed} 2> {log} diff --git a/workflow/rules/6_autosome_sexchromosome_bed_files.smk b/workflow/rules/6_autosome_sexchromosome_bed_files.smk index 6a977c5..36777ef 100644 --- a/workflow/rules/6_autosome_sexchromosome_bed_files.smk +++ b/workflow/rules/6_autosome_sexchromosome_bed_files.smk @@ -42,7 +42,7 @@ rule make_autosomes_bed: log: "results/logs/6_autosome_sexchromosome_bed_files/" + REF_NAME + "_make_autosomes_bed.log", singularity: - "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo + "docker://nbisweden/generode-bedtools-2.29.2" shell: """ bedtools subtract -a {input.ref_bed} -b {input.sexchr_bed} > {output.autosome_bed} 2> {log} @@ -61,7 +61,7 @@ rule intersect_sexchr_repma_beds: log: "results/logs/6_autosome_sexchromosome_bed_files/" + REF_NAME + "_intersect_sexchr_repma_beds.log", singularity: - "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo + "docker://nbisweden/generode-bedtools-2.29.2" shell: """ bedtools intersect -a {input.no_rep_bed_dir} -b {input.sexchr_bed} > {output.repma_sex_chr} 2> {log} @@ -80,7 +80,7 @@ rule intersect_autos_repma_beds: log: "results/logs/6_autosome_sexchromosome_bed_files/" + REF_NAME + "_intersect_autos_repma_beds.log", singularity: - "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo + "docker://nbisweden/generode-bedtools-2.29.2" shell: """ bedtools intersect -a {input.no_rep_bed_dir} -b {input.autosome_bed} > {output.repma_autos} 2> {log} @@ -99,7 +99,7 @@ rule intersect_sexchr_noCpG_repma_beds: log: "results/logs/6_autosome_sexchromosome_bed_files/" + REF_NAME + ".no{CpG_method}_intersect_sexchr_noCpG_repma_beds.log", singularity: - "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo + "docker://nbisweden/generode-bedtools-2.29.2" shell: """ bedtools intersect -a {input.no_CpG_repma_bed} -b {input.sexchr_bed} > {output.no_CpG_repma_sexchr} 2> {log} @@ -118,7 +118,7 @@ rule intersect_autos_noCpG_repma_beds: log: "results/logs/6_autosome_sexchromosome_bed_files/" + REF_NAME + ".no{CpG_method}_intersect_autos_noCpG_repma_beds.log", singularity: - "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo + "docker://nbisweden/generode-bedtools-2.29.2" shell: """ bedtools intersect -a {input.no_CpG_repma_bed} -b {input.autosome_bed} > {output.no_CpG_repma_autos} 2> {log} diff --git a/workflow/rules/8.1_vcf_CpG_filtering.smk b/workflow/rules/8.1_vcf_CpG_filtering.smk index 0ccea1d..28170e6 100644 --- a/workflow/rules/8.1_vcf_CpG_filtering.smk +++ b/workflow/rules/8.1_vcf_CpG_filtering.smk @@ -107,7 +107,7 @@ rule remove_CpG_vcf: log: "results/logs/8.1_vcf_CpG_filtering/{dataset}/" + REF_NAME + "/{sample}.{processed}.no{CpG_method}_remove_CpG_vcf.log", singularity: - "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo + "docker://nbisweden/generode-bedtools-2.29.2" shell: """ bedtools intersect -a {input.vcf} -b {input.bed} -header -sorted -g {input.genomefile} | bgzip -c > {output.filtered} 2> {log} diff --git a/workflow/rules/8.2_vcf_qual_repeat_filtering.smk b/workflow/rules/8.2_vcf_qual_repeat_filtering.smk index 9b67d1b..805d76e 100644 --- a/workflow/rules/8.2_vcf_qual_repeat_filtering.smk +++ b/workflow/rules/8.2_vcf_qual_repeat_filtering.smk @@ -367,7 +367,7 @@ rule remove_repeats_vcf: log: "results/logs/8.2_vcf_qual_repeat_filtering/{dataset}/" + REF_NAME + "/{sample}.{processed}_remove_repeats_vcf.log", singularity: - "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo + "docker://nbisweden/generode-bedtools-2.29.2" shell: """ bedtools intersect -a {input.vcf} -b {input.bed} -header -sorted -g {input.genomefile} | bgzip -c > {output.filtered} 2> {log} diff --git a/workflow/rules/9_merge_vcfs.smk b/workflow/rules/9_merge_vcfs.smk index 39c9f1c..ecf381a 100644 --- a/workflow/rules/9_merge_vcfs.smk +++ b/workflow/rules/9_merge_vcfs.smk @@ -340,7 +340,7 @@ rule filtered_vcf2bed: log: "results/logs/9_merge_vcfs/" + REF_NAME + ".all_fmissing{fmiss}_filtered_vcf2bed.log", singularity: - "docker://verku/bedtools-2.29.2" # replace with link to NBIS Dockerhub repo + "docker://nbisweden/generode-bedtools-2.29.2" shell: """ gzip -cd {input.vcf} | grep -v "^#" | awk -F'\t' '{{print $1, $2-1, $2}}' OFS='\t' > {output.bed} 2> {log} From 28f1da77d5146237e614ced287ffee428a185ef4 Mon Sep 17 00:00:00 2001 From: verku Date: Fri, 19 Aug 2022 16:20:19 +0200 Subject: [PATCH 17/19] Update pipeline version number --- .test/config/config_mitogenomes.yaml | 2 +- .test/config/config_mlRho_options.yaml | 2 +- .test/config/config_pca_roh.yaml | 2 +- .test/config/config_snpeff_gerp.yaml | 2 +- Snakefile | 2 +- config/config.yaml | 2 +- workflow/rules/common.smk | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.test/config/config_mitogenomes.yaml b/.test/config/config_mitogenomes.yaml index e513592..662d2f0 100644 --- a/.test/config/config_mitogenomes.yaml +++ b/.test/config/config_mitogenomes.yaml @@ -1,6 +1,6 @@ ################################################################# ################################################################# -# Configuration settings for the GenErode pipeline 0.4.1 # +# Configuration settings for the GenErode pipeline 0.4.2 # # for ancient or historical samples, and modern samples # ################################################################# ################################################################# diff --git a/.test/config/config_mlRho_options.yaml b/.test/config/config_mlRho_options.yaml index c2efaf7..a1bf807 100644 --- a/.test/config/config_mlRho_options.yaml +++ b/.test/config/config_mlRho_options.yaml @@ -1,6 +1,6 @@ ################################################################# ################################################################# -# Configuration settings for the GenErode pipeline 0.4.1 # +# Configuration settings for the GenErode pipeline 0.4.2 # # for ancient or historical samples, and modern samples # ################################################################# ################################################################# diff --git a/.test/config/config_pca_roh.yaml b/.test/config/config_pca_roh.yaml index 7903806..d670b68 100644 --- a/.test/config/config_pca_roh.yaml +++ b/.test/config/config_pca_roh.yaml @@ -1,6 +1,6 @@ ################################################################# ################################################################# -# Configuration settings for the GenErode pipeline 0.4.1 # +# Configuration settings for the GenErode pipeline 0.4.2 # # for ancient or historical samples, and modern samples # ################################################################# ################################################################# diff --git a/.test/config/config_snpeff_gerp.yaml b/.test/config/config_snpeff_gerp.yaml index bfd7b8c..38e461b 100644 --- a/.test/config/config_snpeff_gerp.yaml +++ b/.test/config/config_snpeff_gerp.yaml @@ -1,6 +1,6 @@ ################################################################# ################################################################# -# Configuration settings for the GenErode pipeline 0.4.1 # +# Configuration settings for the GenErode pipeline 0.4.2 # # for ancient or historical samples, and modern samples # ################################################################# ################################################################# diff --git a/Snakefile b/Snakefile index 336d090..1f360e1 100644 --- a/Snakefile +++ b/Snakefile @@ -2,7 +2,7 @@ # This is the Snakefile of the GenErode pipeline for historical or # # ancient and modern samples to study patterns of genome erosion # # # -# Pipeline version 0.4.1 # +# Pipeline version 0.4.2 # # # # Written by Verena Kutschera, Marcin Kierczak and Tom van der Valk # # Email: generode@nbis.se # diff --git a/config/config.yaml b/config/config.yaml index 7e997ac..15b6714 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,6 +1,6 @@ ################################################################# ################################################################# -# Configuration settings for the GenErode pipeline 0.4.1 # +# Configuration settings for the GenErode pipeline 0.4.2 # # for ancient or historical samples, and modern samples # ################################################################# ################################################################# diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index d9716a9..2a04cc9 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -10,7 +10,7 @@ import pandas as pd min_version("5.19.0") -generode_version = "0.4.1" +generode_version = "0.4.2" configfile: "config/config.yaml" From 47dee18104202460c4dd8231aab0cff39dc3d801 Mon Sep 17 00:00:00 2001 From: verku Date: Wed, 24 Aug 2022 11:04:07 +0200 Subject: [PATCH 18/19] Separate make_reference_bed from group job for easier debugging --- workflow/rules/0.1_reference_genome_preps.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/0.1_reference_genome_preps.smk b/workflow/rules/0.1_reference_genome_preps.smk index 9e4b5b5..006af7b 100644 --- a/workflow/rules/0.1_reference_genome_preps.smk +++ b/workflow/rules/0.1_reference_genome_preps.smk @@ -9,6 +9,8 @@ all_outputs.append(expand(REF_DIR + "/" + REF_NAME + ".{ext}", ext=["dict", "genome", "bed"])) # snakemake rules +localrules: make_reference_bed + rule bwa_index_reference: """Index the reference genome using bwa""" input: @@ -99,8 +101,6 @@ rule make_reference_bed: "results/logs/0.1_reference_genome_preps/" + REF_NAME + "_make_reference_bed.log", - group: - "reference_prep_group" shell: """ awk -v OFS='\t' '{{print $1, "0", $2}}' {input.fai} > {output.ref_bed} 2> {log} From 1fe7ac1a2d96339a95b717273f4459824582c75b Mon Sep 17 00:00:00 2001 From: verku Date: Thu, 1 Sep 2022 13:50:07 +0200 Subject: [PATCH 19/19] Update path to final output file of subworkflow --- workflow/rules/0.2_repeat_identification.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/0.2_repeat_identification.smk b/workflow/rules/0.2_repeat_identification.smk index 8305151..c9dddf9 100644 --- a/workflow/rules/0.2_repeat_identification.smk +++ b/workflow/rules/0.2_repeat_identification.smk @@ -2,7 +2,7 @@ ### 0.2 Repeat prediction and repeat masking of the reference genome # Code collecting output files from this part of the pipeline -all_outputs.append(REF_DIR + "/" + REF_NAME + ".repeats.bed") +all_outputs.append(REF_DIR + "/" + REF_NAME + ".repeats.sorted.bed") all_outputs.append(REF_DIR + "/" + REF_NAME + ".repma.bed")