diff --git a/.github/workflows/gerp.yaml b/.github/workflows/gerp.yaml index 2051d7b..686e53c 100644 --- a/.github/workflows/gerp.yaml +++ b/.github/workflows/gerp.yaml @@ -67,9 +67,9 @@ jobs: - name: gerp_dry shell: bash -l {0} run: | - snakemake -npr --configfile .test/config/config_gerp.yaml -j 4 --cores 1 --use-singularity + snakemake -np -k --configfile .test/config/config_gerp.yaml -j 4 --cores 1 --use-singularity - name: gerp shell: bash -l {0} run: | - snakemake --configfile .test/config/config_gerp.yaml -j 4 --cores 1 --use-singularity \ No newline at end of file + snakemake -k --configfile .test/config/config_gerp.yaml -j 4 --cores 1 --use-singularity \ No newline at end of file diff --git a/.github/workflows/mitogenome_mapping.yaml b/.github/workflows/mitogenome_mapping.yaml index b5f7992..9381fe7 100644 --- a/.github/workflows/mitogenome_mapping.yaml +++ b/.github/workflows/mitogenome_mapping.yaml @@ -91,10 +91,10 @@ jobs: - name: mitogenome_mapping_dry shell: bash -l {0} run: | - snakemake -npr --configfile .test/config/config_mitogenomes.yaml -j 4 --cores 1 --use-singularity + snakemake -np -k --configfile .test/config/config_mitogenomes.yaml -j 4 --cores 1 --use-singularity - name: mitogenome_mapping shell: bash -l {0} run: | - snakemake --configfile .test/config/config_mitogenomes.yaml -j 4 --cores 1 --use-singularity + snakemake -k --configfile .test/config/config_mitogenomes.yaml -j 4 --cores 1 --use-singularity diff --git a/.github/workflows/mlRho_options.yaml b/.github/workflows/mlRho_options.yaml index f245c48..41a939a 100644 --- a/.github/workflows/mlRho_options.yaml +++ b/.github/workflows/mlRho_options.yaml @@ -73,9 +73,9 @@ jobs: - name: mlRho_options_dry shell: bash -l {0} run: | - snakemake -npr --configfile .test/config/config_mlRho_options.yaml -j 4 --cores 1 --use-singularity + snakemake -np -k --configfile .test/config/config_mlRho_options.yaml -j 4 --cores 1 --use-singularity - name: mlRho_options shell: bash -l {0} run: | - snakemake --configfile .test/config/config_mlRho_options.yaml -j 4 --cores 1 --use-singularity + snakemake -k --configfile .test/config/config_mlRho_options.yaml -j 4 --cores 1 --use-singularity diff --git a/.github/workflows/pca_roh.yaml b/.github/workflows/pca_roh.yaml index a27d99b..6aba2f0 100644 --- a/.github/workflows/pca_roh.yaml +++ b/.github/workflows/pca_roh.yaml @@ -65,9 +65,9 @@ jobs: - name: pca_roh_dry shell: bash -l {0} run: | - snakemake -npr --configfile .test/config/config_pca_roh.yaml -j 4 --cores 1 --use-singularity + snakemake -np -k --configfile .test/config/config_pca_roh.yaml -j 4 --cores 1 --use-singularity - name: pca_roh shell: bash -l {0} run: | - snakemake --configfile .test/config/config_pca_roh.yaml -j 4 --cores 1 --use-singularity + snakemake -k --configfile .test/config/config_pca_roh.yaml -j 4 --cores 1 --use-singularity diff --git a/.github/workflows/snpeff.yaml b/.github/workflows/snpeff.yaml index 5e656c5..bff3fe1 100644 --- a/.github/workflows/snpeff.yaml +++ b/.github/workflows/snpeff.yaml @@ -67,9 +67,9 @@ jobs: - name: snpeff_dry shell: bash -l {0} run: | - snakemake -npr --configfile .test/config/config_snpeff.yaml -j 4 --cores 1 --use-singularity + snakemake -np -k --configfile .test/config/config_snpeff.yaml -j 4 --cores 1 --use-singularity - name: snpeff shell: bash -l {0} run: | - snakemake --configfile .test/config/config_snpeff.yaml -j 4 --cores 1 --use-singularity \ No newline at end of file + snakemake -k --configfile .test/config/config_snpeff.yaml -j 4 --cores 1 --use-singularity \ No newline at end of file diff --git a/.test/config/config_pca_roh.yaml b/.test/config/config_pca_roh.yaml index fc66955..71936cb 100644 --- a/.test/config/config_pca_roh.yaml +++ b/.test/config/config_pca_roh.yaml @@ -153,13 +153,13 @@ zerocoverage: False # to set a minimum depth threshold. For ultra low coverage samples, a # minimum hard threshold of 3X is applied that overrides this parameter. # A minimum depth of 6X should be aimed for. -minDP: 0.33 +minDP: 0.1 # Maximum depth threshold calculation per sample. # Will be applied to mlRho analysis and in VCF file filtering. # Factor by which the average genome-wide depth should be multiplied # to set a maximum depth threshold. -maxDP: 10 +maxDP: 100 ##### diff --git a/config/slurm/README.md b/config/slurm/README.md new file mode 100644 index 0000000..a2337ae --- /dev/null +++ b/config/slurm/README.md @@ -0,0 +1,71 @@ +# GenErode execution on Dardel (PDC/KTH) + +1) Load the following modules: + +``` +module load PDC UPPMAX bioinfo-tools conda singularity tmux +``` + +> Note that tmux is only available as a module on Dardel +but the equivalent tool screen is pre-installed and does +not need to be loaded. + +2) After cloning the repository, change permissions for the +Snakefile: + +``` +chmod 755 Snakefile +``` + +3) Create the GenErode conda environment or update an earlier +version. The latest conda environment contains the Snakemake +executor plugin for slurm: + +``` +conda create -f environment.yaml -n generode +``` + +4) Copy the configuration file `config/slurm/profile/config_plugin_dardel.yaml` +to `slurm/config.yaml`. This file specifies compute resources +for each rule or group jobs to be run on Dardel. Any rule or +group job that is not listed under `set-threads` or `set-resources` +uses default resources specified under `default-resources`. If +any rule or group job fails due to too little memory or run +time, their compute resources can be updated in this file. + +> Note that memory requirements are specified three times in +the configuration file: 1) under `set-threads` (used by Snakemake +to specify threads in rules), 2) under `set-resources` and therein +under `mem_mb`, specifying the memory in Megabytes (multiplying +the number of threads with the available memory per thread), +and 3) under `set-resources` and therein under `cpus-per-task` +(the same number as specified under `set-threads`, required for +correct memory assignment on Dardel). + +5) Start GenErode the following: + +- Open a tmux session (alternatively, you can use screen) + +- Activate the GenErode conda environment (create or update +from `environment.yaml`), replacing the path to the location +of the conda environment: + +``` +export CONDA_ENVS_PATH=/cfs/klemming/home/.../ +conda activate generode +``` + +- Start the dry run: + +``` +snakemake --profile slurm -n &> YYMMDD_dry.out +``` + +- Start the main run: + +``` +snakemake --profile slurm &> YYMMDD_main.out +``` + +> Useful flags for running the pipeline: `--ri` to re-run +incomplete jobs and `-k` to keep going in case a job fails. diff --git a/config/slurm/cluster.yaml b/config/slurm/cluster.yaml deleted file mode 100644 index 4ef96f5..0000000 --- a/config/slurm/cluster.yaml +++ /dev/null @@ -1,281 +0,0 @@ -# cluster.yaml - cluster configuration file for GenErode pipeline -__default__: - account: # fill in your compute account on your HPC system (if applicable) - partition: core - time: 02:00:00 - ntasks: 1 - cpus-per-task: 1 -### repeat identification -repeatmodeler: - time: 06-00:00:00 - cpus-per-task: 16 -repeatmasker: - time: 10-00:00:00 - cpus-per-task: 16 -### fastq processing -historical_fastq_before_group: - time: 09:00:00 - cpus-per-task: 2 -fastqc_historical_raw: - cpus-per-task: 2 -fastqc_modern_raw: - cpus-per-task: 2 -fastp_historical: - time: 09:00:00 - cpus-per-task: 4 -fastp_modern: - time: 09:00:00 - cpus-per-task: 4 -fastqc_historical_merged: - cpus-per-task: 2 -fastqc_historical_unmerged: - cpus-per-task: 2 -fastqc_modern_trimmed: - cpus-per-task: 2 -### map to mitochondrial genomes -map_historical_merged_to_mito: - time: 06:00:00 -map_historical_unmerged_to_mito: - time: 06:00:00 -historical_mito_bams_group: - time: 2-00:00:00 -merge_historical_mitogenome_bams_per_sample: - time: 2-00:00:00 -historical_merged_mito_bams_group: - time: 2-00:00:00 -### map to reference genome -map_historical: - time: 10-00:00:00 - cpus-per-task: 8 -sai2bam: - time: 10-00:00:00 - cpus-per-task: 8 -map_modern: - time: 10-00:00:00 - cpus-per-task: 8 -sorted_bam_qualimap: - time: 4-00:00:00 - cpus-per-task: 8 -### process bam files -merge_historical_bams_per_index: - time: 2-00:00:00 - cpus-per-task: 2 -merge_modern_bams_per_index: - time: 2-00:00:00 - cpus-per-task: 2 -merged_index_bam_qualimap: - time: 4-00:00:00 - cpus-per-task: 8 -rmdup_historical_bams: - time: 3-00:00:00 - cpus-per-task: 6 -rmdup_modern_bams: - time: 3-00:00:00 - cpus-per-task: 2 -rmdup_bam_qualimap: - time: 4-00:00:00 - cpus-per-task: 8 -merge_historical_bams_per_sample: - time: 2-00:00:00 - cpus-per-task: 2 -merge_modern_bams_per_sample: - time: 2-00:00:00 - cpus-per-task: 2 -merged_sample_bam_qualimap: - time: 4-00:00:00 - cpus-per-task: 8 -indel_realigner_targets: - time: 5-00:00:00 - cpus-per-task: 8 -indel_realigner: - time: 5-00:00:00 - cpus-per-task: 8 -realigned_bam_qualimap: - time: 4-00:00:00 - cpus-per-task: 8 -realigned_bam_group: - time: 2-00:00:00 - cpus-per-task: 2 -### rescale bam files -rescale_historical: - time: 3-00:00:00 - cpus-per-task: 4 -rescaled_bam_qualimap: - time: 4-00:00:00 - cpus-per-task: 8 -rescaled_bam_group: - time: 09:00:00 - cpus-per-task: 2 -### subsample bam files -filter_bam_mapped_mq: - time: 1-00:00:00 -subsample_bams: - time: 4-00:00:00 - cpus-per-task: 2 -subsampled_bam_qualimap: - time: 4-00:00:00 - cpus-per-task: 6 -subsampled_bam_group: - time: 2-00:00:00 -### genotyping -variant_calling: - time: 2-00:00:00 - cpus-per-task: 3 -sort_vcfs: - time: 1-00:00:00 -### identify CpG sites -sorted_bcf2vcf: - time: 05:00:00 -make_CpG_genotype_bed: - time: 1-00:00:00 -CpG_genotype_bed_formatting_group: - time: 1-00:00:00 - cpus-per-task: 4 -all_CpG_bed_formatting_group: - time: 1-00:00:00 - cpus-per-task: 2 -make_noCpG_bed: - time: 05:00:00 - cpus-per-task: 2 -CpG_repeats_bed_formatting_group: - time: 1-00:00:00 - cpus-per-task: 2 -merge_noCpG_noRepeats_beds: - time: 05:00:00 - cpus-per-task: 2 -make_noCpG_repma_bed: - time: 05:00:00 - cpus-per-task: 2 -### create chromosome bed files -make_autosomes_bed: - time: 05:00:00 -intersect_sexchr_repma_beds: - time: 05:00:00 - cpus-per-task: 2 -intersect_autos_repma_beds: - time: 05:00:00 - cpus-per-task: 2 -intersect_sexchr_CpG_repma_beds: - time: 05:00:00 - cpus-per-task: 2 -intersect_autos_CpG_repma_beds: - time: 05:00:00 - cpus-per-task: 2 -### mlRho -bam2pro_autos: - time: 1-00:00:00 -bam2pro_sexchr: - time: 1-00:00:00 -bam2pro_all: - time: 1-00:00:00 -### CpG filter VCF files -remove_CpG_vcf: - time: 05:00:00 - cpus-per-task: 6 -CpG_vcf2bcf: - time: 05:00:00 - cpus-per-task: 2 -### process VCF files -remove_snps_near_indels: - time: 05:00:00 - cpus-per-task: 2 -filter_vcfs_qual_dp: - time: 05:00:00 - cpus-per-task: 2 -filter_vcfs_allelic_balance: - time: 1-00:00:00 - cpus-per-task: 2 -remove_repeats_vcf: - time: 05:00:00 - cpus-per-task: 6 -filtered_vcf2bcf: - time: 05:00:00 - cpus-per-task: 2 -### merge and process VCF files -merge_all_vcfs: - time: 3-00:00:00 - cpus-per-task: 6 -filter_vcf_biallelic: - time: 1-00:00:00 - cpus-per-task: 2 -filter_vcf_missing: - time: 1-00:00:00 - cpus-per-task: 2 -remove_chromosomes: - time: 1-00:00:00 - cpus-per-task: 2 -extract_historical_samples: - time: 05:00:00 -extract_modern_samples: - time: 05:00:00 -repmasked_bcf2vcf: - time: 05:00:00 - cpus-per-task: 2 -filter_biallelic_missing_vcf: - time: 1-00:00:00 - cpus-per-task: 6 -### PCA -vcf2plink_pca: - time: 05:00:00 - cpus-per-task: 2 -### runs of homozygosity -filter_vcf_hwe: - time: 05:00:00 - cpus-per-task: 2 -vcf2plink_hwe: - time: 05:00:00 - cpus-per-task: 2 -### snpEff -build_snpEff_db: - time: 05:00:00 -annotate_vcf: - time: 05:00:00 -### gerp -outgroups2fastq: - time: 1-00:00:00 -outgroup_fastqc: - cpus-per-task: 2 -align2target: - time: 3-00:00:00 - cpus-per-task: 8 -bam2fasta: - time: 1-00:00:00 - cpus-per-task: 2 -split_ref_contigs: - time: 1-00:00:00 -concatenate_fasta_per_contig: - time: 1-00:00:00 - cpus-per-task: 2 -compute_gerp: - time: 1-00:00:00 - cpus-per-task: 4 -gerp2coords: - time: 1-00:00:00 - cpus-per-task: 2 -get_ancestral_state: - time: 1-00:00:00 - cpus-per-task: 2 -produce_contig_out: - time: 1-00:00:00 - cpus-per-task: 2 -merge_per_chunk: - time: 1-00:00:00 - cpus-per-task: 2 -merge_gerp_gz: - time: 1-00:00:00 - cpus-per-task: 2 -split_vcf_files: - time: 05:00:00 -gerp_derived_alleles: - time: 10-00:00:00 - cpus-per-task: 2 -merge_gerp_alleles_per_chunk: - time: 1-00:00:00 - cpus-per-task: 4 -merge_gerp_alleles_gz: - time: 1-00:00:00 - cpus-per-task: 4 -relative_mutational_load_per_sample: - time: 1-00:00:00 - cpus-per-task: 2 -### diff --git a/config/slurm/profile/config_plugin_dardel.yaml b/config/slurm/profile/config_plugin_dardel.yaml new file mode 100644 index 0000000..50504df --- /dev/null +++ b/config/slurm/profile/config_plugin_dardel.yaml @@ -0,0 +1,407 @@ +# Configuration file for slurm plugin (Snakemake >8.0.0) for Dardel cluster at PDC/KTH +# snakemake CLI flags +executor: slurm +jobs: 100 +printshellcmds: true +software-deployment-method: apptainer + +# slurm resources +## default-resources: applied to all jobs, overruled by resources defined below for jobs +default-resources: + slurm_account: XXX-XX-XXX # update this to your slurm account + slurm_partition: shared # use Dardel’s shared partition + runtime: 120 # default runtime in minutes + mem_mb: 8000 + nodes: 1 # one node on Dardel from the shared partition + ntasks: 1 # number of concurrent tasks / ranks + cpus_per_task: 8 # number of hyperthreads per task, corresponds to 1 GB RAM + +## set-threads: map rule names to threads +set-threads: + repeatmodeler: 128 + repeatmasker: 128 + historical_fastq_before_group: 16 + fastqc_historical_raw: 16 + fastqc_modern_raw: 16 + fastp_historical: 32 + fastp_modern: 32 + fastqc_historical_merged: 16 + fastqc_historical_unmerged: 16 + fastqc_modern_trimmed: 16 + map_historical: 64 + sai2bam: 64 + map_modern: 64 + sorted_bam_qualimap: 64 + merge_historical_bams_per_index: 16 + merge_modern_bams_per_index: 16 + merged_index_bam_qualimap: 64 + rmdup_historical_bams: 48 + rmdup_modern_bams: 16 + rmdup_bam_qualimap: 64 + merge_historical_bams_per_sample: 16 + merge_modern_bams_per_sample: 16 + merged_sample_bam_qualimap: 64 + indel_realigner_targets: 64 + indel_realigner: 64 + realigned_bam_qualimap: 64 + realigned_bam_group: 16 + rescale_historical: 32 + rescaled_bam_qualimap: 64 + rescaled_bam_group: 16 + subsample_bams: 16 + subsampled_bam_qualimap: 48 + variant_calling: 32 + sort_vcfs: 16 + CpG_genotype_bed_formatting_group: 32 + all_CpG_bed_formatting_group: 16 + make_noCpG_bed: 16 + CpG_repeats_bed_formatting_group: 16 + merge_noCpG_noRepeats_beds: 16 + make_noCpG_repma_bed: 16 + intersect_sexchr_repma_beds: 16 + intersect_autos_repma_beds: 16 + intersect_sexchr_CpG_repma_beds: 16 + intersect_autos_CpG_repma_beds: 16 + remove_CpG_vcf: 48 + CpG_vcf2bcf: 16 + remove_snps_near_indels: 16 + filter_vcfs_qual_dp: 16 + filter_vcfs_allelic_balance: 16 + remove_repeats_vcf: 48 + filtered_vcf2bcf: 16 + merge_all_vcfs: 48 + filter_vcf_biallelic: 16 + filter_vcf_missing: 16 + remove_chromosomes: 16 + repmasked_bcf2vcf: 16 + filter_biallelic_missing_vcf: 48 + vcf2plink_pca: 16 + plink_eigenvec: 8 + filter_vcf_hwe: 16 + vcf2plink_hwe: 16 + outgroup_fastqc: 16 + align2target: 64 + bam2fasta: 16 + concatenate_fasta_per_contig: 16 + compute_gerp: 32 + gerp2coords: 16 + get_ancestral_state: 16 + produce_contig_out: 16 + merge_per_chunk: 16 + merge_gerp_gz: 16 + gerp_derived_alleles: 16 + merge_gerp_alleles_per_chunk: 32 + merge_gerp_alleles_gz: 32 + relative_mutational_load_per_sample: 16 + +## set-resources: map rule names to resources in general +set-resources: + repeatmodeler: + slurm_partition: long # switch to Dardel's long partition for shorter wait time in queue + runtime: 10080 + mem_mb: 128000 + cpus_per_task: 128 + repeatmasker: + slurm_partition: long # switch to Dardel's long partition for shorter wait time in queue + runtime: 10080 + mem_mb: 128000 + cpus_per_task: 128 + historical_fastq_before_group: + runtime: 600 + mem_mb: 16000 + cpus_per_task: 16 + fastqc_historical_raw: + mem_mb: 16000 + cpus_per_task: 16 + fastqc_modern_raw: + mem_mb: 16000 + cpus_per_task: 16 + fastp_historical: + runtime: 600 + mem_mb: 32000 + cpus_per_task: 32 + fastp_modern: + runtime: 600 + mem_mb: 32000 + cpus_per_task: 32 + fastqc_historical_merged: + mem_mb: 16000 + cpus_per_task: 16 + fastqc_historical_unmerged: + mem_mb: 16000 + cpus_per_task: 16 + fastqc_modern_trimmed: + mem_mb: 16000 + cpus_per_task: 16 + map_historical: + runtime: 10080 + mem_mb: 64000 + cpus_per_task: 64 + sai2bam: + runtime: 10080 + mem_mb: 64000 + cpus_per_task: 64 + map_modern: + runtime: 10080 + mem_mb: 64000 + cpus_per_task: 64 + sorted_bam_qualimap: + runtime: 5760 + mem_mb: 64000 + cpus_per_task: 64 + merge_historical_bams_per_index: + runtime: 2880 + mem_mb: 16000 + cpus_per_task: 16 + merge_modern_bams_per_index: + runtime: 2880 + mem_mb: 16000 + cpus_per_task: 16 + merged_index_bam_qualimap: + runtime: 5760 + mem_mb: 64000 + cpus_per_task: 64 + rmdup_historical_bams: + runtime: 4320 + mem_mb: 48000 + cpus_per_task: 48 + rmdup_modern_bams: + runtime: 4320 + mem_mb: 16000 + cpus_per_task: 16 + rmdup_bam_qualimap: + runtime: 5760 + mem_mb: 64000 + cpus_per_task: 64 + merge_historical_bams_per_sample: + runtime: 2880 + mem_mb: 16000 + cpus_per_task: 16 + merge_modern_bams_per_sample: + runtime: 2880 + mem_mb: 16000 + cpus_per_task: 16 + merged_sample_bam_qualimap: + runtime: 5760 + mem_mb: 64000 + cpus_per_task: 64 + indel_realigner_targets: + runtime: 7200 + mem_mb: 64000 + cpus_per_task: 64 + indel_realigner: + runtime: 7200 + mem_mb: 64000 + cpus_per_task: 64 + realigned_bam_qualimap: + runtime: 5760 + mem_mb: 64000 + cpus_per_task: 64 + realigned_bam_group: + runtime: 2880 + mem_mb: 16000 + cpus_per_task: 16 + rescale_historical: + runtime: 4320 + mem_mb: 32000 + cpus_per_task: 32 + rescaled_bam_qualimap: + runtime: 5760 + rescaled_bam_group: + runtime: 600 + filter_bam_mapped_mq: + runtime: 1440 + subsample_bams: + runtime: 5760 + mem_mb: 16000 + cpus_per_task: 16 + subsampled_bam_qualimap: + runtime: 5760 + mem_mb: 48000 + cpus_per_task: 48 + subsampled_bam_group: + runtime: 2880 + variant_calling: + runtime: 2880 + mem_mb: 32000 + cpus_per_task: 32 + sort_vcfs: + runtime: 1440 + mem_mb: 16000 + cpus_per_task: 16 + sorted_bcf2vcf: + runtime: 300 + make_CpG_genotype_bed: + runtime: 1440 + CpG_genotype_bed_formatting_group: + runtime: 1440 + mem_mb: 32000 + cpus_per_task: 32 + all_CpG_bed_formatting_group: + runtime: 1440 + mem_mb: 16000 + cpus_per_task: 16 + make_noCpG_bed: + runtime: 300 + mem_mb: 16000 + cpus_per_task: 16 + CpG_repeats_bed_formatting_group: + runtime: 1440 + mem_mb: 16000 + cpus_per_task: 16 + merge_noCpG_noRepeats_beds: + runtime: 300 + mem_mb: 16000 + cpus_per_task: 16 + make_noCpG_repma_bed: + runtime: 300 + mem_mb: 16000 + cpus_per_task: 16 + make_autosomes_bed: + runtime: 300 + intersect_sexchr_repma_beds: + runtime: 300 + mem_mb: 16000 + cpus_per_task: 16 + intersect_autos_repma_beds: + runtime: 300 + mem_mb: 16000 + cpus_per_task: 16 + intersect_sexchr_CpG_repma_beds: + runtime: 300 + mem_mb: 16000 + cpus_per_task: 16 + intersect_autos_CpG_repma_beds: + runtime: 300 + mem_mb: 16000 + cpus_per_task: 16 + bam2pro_autos: + runtime: 1440 + bam2pro_sexchr: + runtime: 1440 + bam2pro_all: + runtime: 1440 + remove_CpG_vcf: + runtime: 300 + mem_mb: 48000 + cpus_per_task: 48 + CpG_vcf2bcf: + runtime: 300 + mem_mb: 16000 + cpus_per_task: 16 + remove_snps_near_indels: + runtime: 300 + mem_mb: 16000 + cpus_per_task: 16 + filter_vcfs_qual_dp: + runtime: 300 + mem_mb: 16000 + cpus_per_task: 16 + filter_vcfs_allelic_balance: + runtime: 300 + mem_mb: 16000 + cpus_per_task: 16 + remove_repeats_vcf: + runtime: 300 + mem_mb: 48000 + cpus_per_task: 48 + filtered_vcf2bcf: + runtime: 300 + mem_mb: 16000 + cpus_per_task: 16 + merge_all_vcfs: + runtime: 4320 + mem_mb: 48000 + cpus_per_task: 48 + filter_vcf_biallelic: + runtime: 1440 + mem_mb: 16000 + cpus_per_task: 16 + filter_vcf_missing: + runtime: 1440 + mem_mb: 16000 + cpus_per_task: 16 + remove_chromosomes: + runtime: 1440 + mem_mb: 16000 + cpus_per_task: 16 + extract_historical_samples: + runtime: 300 + extract_modern_samples: + runtime: 300 + repmasked_bcf2vcf: + runtime: 300 + mem_mb: 16000 + cpus_per_task: 16 + filter_biallelic_missing_vcf: + runtime: 1440 + mem_mb: 48000 + cpus_per_task: 48 + vcf2plink_pca: + runtime: 300 + mem_mb: 16000 + cpus_per_task: 16 + filter_vcf_hwe: + runtime: 300 + mem_mb: 16000 + cpus_per_task: 16 + vcf2plink_hwe: + runtime: 300 + mem_mb: 16000 + cpus_per_task: 16 + outgroup_fastqc: + runtime: 16 + mem_mb: 16000 + cpus_per_task: 16 + align2target: + runtime: 4320 + mem_mb: 64000 + cpus_per_task: 64 + bam2fasta: + runtime: 1440 + mem_mb: 16000 + cpus_per_task: 16 + concatenate_fasta_per_contig: + runtime: 1440 + mem_mb: 16000 + cpus_per_task: 16 + compute_gerp: + runtime: 1440 + mem_mb: 32000 + cpus_per_task: 32 + gerp2coords: + runtime: 1440 + mem_mb: 16000 + cpus_per_task: 16 + get_ancestral_state: + runtime: 1440 + mem_mb: 16000 + cpus_per_task: 16 + produce_contig_out: + runtime: 1440 + mem_mb: 16000 + cpus_per_task: 16 + merge_per_chunk: + runtime: 1440 + mem_mb: 16000 + cpus_per_task: 16 + merge_gerp_gz: + runtime: 1440 + mem_mb: 16000 + cpus_per_task: 16 + gerp_derived_alleles: + runtime: 10080 + mem_mb: 16000 + cpus_per_task: 16 + merge_gerp_alleles_per_chunk: + runtime: 1440 + mem_mb: 32000 + cpus_per_task: 32 + merge_gerp_alleles_gz: + runtime: 1440 + mem_mb: 32000 + cpus_per_task: 32 + relative_mutational_load_per_sample: + runtime: 1440 + mem_mb: 16000 + cpus_per_task: 16 diff --git a/config/slurm/profile/config_plugin_rackham.yaml b/config/slurm/profile/config_plugin_rackham.yaml new file mode 100644 index 0000000..2fae685 --- /dev/null +++ b/config/slurm/profile/config_plugin_rackham.yaml @@ -0,0 +1,395 @@ +# Configuration file for slurm plugin (Snakemake >8.0.0) for Rackham cluster at UPPMAX +# snakemake CLI flags +executor: slurm +jobs: 100 +printshellcmds: true +software-deployment-method: apptainer + +# slurm resources +## default-resources: applied to all jobs, overruled by resources defined below for jobs +default-resources: + slurm_account: XXX-XX-XXX # update this to your slurm account + runtime: 120 # default runtime in minutes + mem_mb: 6400 + cpus_per_task: 1 # number of threads per task, corresponds to 6.4 GB RAM + +## set-threads: map rule names to threads +set-threads: + repeatmodeler: 20 + repeatmasker: 20 + historical_fastq_before_group: 2 + fastqc_historical_raw: 2 + fastqc_modern_raw: 2 + fastp_historical: 5 + fastp_modern: 5 + fastqc_historical_merged: 2 + fastqc_historical_unmerged: 2 + fastqc_modern_trimmed: 2 + map_historical: 10 + sai2bam: 10 + map_modern: 10 + sorted_bam_qualimap: 10 + merge_historical_bams_per_index: 2 + merge_modern_bams_per_index: 2 + merged_index_bam_qualimap: 10 + rmdup_historical_bams: 6 + rmdup_modern_bams: 2 + rmdup_bam_qualimap: 10 + merge_historical_bams_per_sample: 2 + merge_modern_bams_per_sample: 2 + merged_sample_bam_qualimap: 10 + indel_realigner_targets: 10 + indel_realigner: 10 + realigned_bam_qualimap: 10 + realigned_bam_group: 2 + rescale_historical: 5 + rescaled_bam_qualimap: 10 + rescaled_bam_group: 2 + subsample_bams: 2 + subsampled_bam_qualimap: 6 + variant_calling: 3 + CpG_genotype_bed_formatting_group: 5 + all_CpG_bed_formatting_group: 2 + make_noCpG_bed: 2 + CpG_repeats_bed_formatting_group: 2 + merge_noCpG_noRepeats_beds: 2 + make_noCpG_repma_bed: 2 + intersect_sexchr_repma_beds: 2 + intersect_autos_repma_beds: 2 + intersect_sexchr_CpG_repma_beds: 2 + intersect_autos_CpG_repma_beds: 2 + remove_CpG_vcf: 6 + CpG_vcf2bcf: 2 + remove_snps_near_indels: 2 + filter_vcfs_qual_dp: 2 + filter_vcfs_allelic_balance: 2 + remove_repeats_vcf: 6 + filtered_vcf2bcf: 2 + merge_all_vcfs: 6 + filter_vcf_biallelic: 2 + filter_vcf_missing: 2 + remove_chromosomes: 2 + repmasked_bcf2vcf: 2 + filter_biallelic_missing_vcf: 6 + vcf2plink_pca: 2 + plink_eigenvec: 1 + filter_vcf_hwe: 2 + vcf2plink_hwe: 2 + outgroup_fastqc: 2 + align2target: 10 + bam2fasta: 2 + concatenate_fasta_per_contig: 2 + compute_gerp: 5 + gerp2coords: 2 + get_ancestral_state: 2 + produce_contig_out: 2 + merge_per_chunk: 2 + merge_gerp_gz: 2 + gerp_derived_alleles: 2 + merge_gerp_alleles_per_chunk: 5 + merge_gerp_alleles_gz: 5 + relative_mutational_load_per_sample: 2 + +## set-resources: map rule names to resources in general +set-resources: + repeatmodeler: + runtime: 10080 + mem_mb: 128000 + cpus_per_task: 20 + repeatmasker: + runtime: 10080 + mem_mb: 128000 + cpus_per_task: 20 + historical_fastq_before_group: + runtime: 600 + mem_mb: 12800 + cpus_per_task: 2 + fastqc_historical_raw: + mem_mb: 12800 + cpus_per_task: 2 + fastqc_modern_raw: + mem_mb: 12800 + cpus_per_task: 2 + fastp_historical: + runtime: 600 + mem_mb: 32000 + cpus_per_task: 5 + fastp_modern: + runtime: 600 + mem_mb: 32000 + cpus_per_task: 5 + fastqc_historical_merged: + mem_mb: 12800 + cpus_per_task: 2 + fastqc_historical_unmerged: + mem_mb: 12800 + cpus_per_task: 2 + fastqc_modern_trimmed: + mem_mb: 12800 + cpus_per_task: 2 + map_historical: + runtime: 10080 + mem_mb: 64000 + cpus_per_task: 10 + sai2bam: + runtime: 10080 + mem_mb: 64000 + cpus_per_task: 10 + map_modern: + runtime: 10080 + mem_mb: 64000 + cpus_per_task: 10 + sorted_bam_qualimap: + runtime: 5760 + mem_mb: 64000 + cpus_per_task: 10 + merge_historical_bams_per_index: + runtime: 2880 + mem_mb: 12800 + cpus_per_task: 2 + merge_modern_bams_per_index: + runtime: 2880 + mem_mb: 12800 + cpus_per_task: 2 + merged_index_bam_qualimap: + runtime: 5760 + mem_mb: 64000 + cpus_per_task: 10 + rmdup_historical_bams: + runtime: 5320 + mem_mb: 64000 + cpus_per_task: 10 + rmdup_modern_bams: + runtime: 5320 + mem_mb: 12800 + cpus_per_task: 2 + rmdup_bam_qualimap: + runtime: 5760 + mem_mb: 64000 + cpus_per_task: 10 + merge_historical_bams_per_sample: + runtime: 2880 + mem_mb: 12800 + cpus_per_task: 2 + merge_modern_bams_per_sample: + runtime: 2880 + mem_mb: 12800 + cpus_per_task: 2 + merged_sample_bam_qualimap: + runtime: 5760 + mem_mb: 64000 + cpus_per_task: 10 + indel_realigner_targets: + runtime: 7200 + mem_mb: 64000 + cpus_per_task: 10 + indel_realigner: + runtime: 7200 + mem_mb: 64000 + cpus_per_task: 10 + realigned_bam_qualimap: + runtime: 5760 + mem_mb: 64000 + cpus_per_task: 10 + realigned_bam_group: + runtime: 2880 + mem_mb: 12800 + cpus_per_task: 2 + rescale_historical: + runtime: 5320 + mem_mb: 32000 + cpus_per_task: 5 + rescaled_bam_qualimap: + runtime: 5760 + rescaled_bam_group: + runtime: 600 + filter_bam_mapped_mq: + runtime: 1440 + subsample_bams: + runtime: 5760 + mem_mb: 12800 + cpus_per_task: 2 + subsampled_bam_qualimap: + runtime: 5760 + mem_mb: 64000 + cpus_per_task: 10 + subsampled_bam_group: + runtime: 2880 + variant_calling: + runtime: 2880 + mem_mb: 32000 + cpus_per_task: 5 + sort_vcfs: + runtime: 1440 + mem_mb: 12800 + cpus_per_task: 2 + make_CpG_genotype_bed: + runtime: 1440 + CpG_genotype_bed_formatting_group: + runtime: 1440 + mem_mb: 32000 + cpus_per_task: 5 + all_CpG_bed_formatting_group: + runtime: 1440 + mem_mb: 12800 + cpus_per_task: 2 + make_noCpG_bed: + runtime: 300 + mem_mb: 12800 + cpus_per_task: 2 + CpG_repeats_bed_formatting_group: + runtime: 1440 + mem_mb: 12800 + cpus_per_task: 2 + merge_noCpG_noRepeats_beds: + runtime: 300 + mem_mb: 12800 + cpus_per_task: 2 + make_noCpG_repma_bed: + runtime: 300 + mem_mb: 12800 + cpus_per_task: 2 + make_autosomes_bed: + runtime: 300 + intersect_sexchr_repma_beds: + runtime: 300 + mem_mb: 12800 + cpus_per_task: 2 + intersect_autos_repma_beds: + runtime: 300 + mem_mb: 12800 + cpus_per_task: 2 + intersect_sexchr_CpG_repma_beds: + runtime: 300 + mem_mb: 12800 + cpus_per_task: 2 + intersect_autos_CpG_repma_beds: + runtime: 300 + mem_mb: 12800 + cpus_per_task: 2 + bam2pro_autos: + runtime: 1440 + bam2pro_sexchr: + runtime: 1440 + bam2pro_all: + runtime: 1440 + remove_CpG_vcf: + runtime: 300 + mem_mb: 64000 + cpus_per_task: 10 + CpG_vcf2bcf: + runtime: 300 + mem_mb: 12800 + cpus_per_task: 2 + remove_snps_near_indels: + runtime: 300 + mem_mb: 12800 + cpus_per_task: 2 + filter_vcfs_qual_dp: + runtime: 300 + mem_mb: 12800 + cpus_per_task: 2 + filter_vcfs_allelic_balance: + runtime: 300 + mem_mb: 12800 + cpus_per_task: 2 + remove_repeats_vcf: + runtime: 300 + mem_mb: 64000 + cpus_per_task: 10 + filtered_vcf2bcf: + runtime: 300 + mem_mb: 12800 + cpus_per_task: 2 + merge_all_vcfs: + runtime: 5320 + mem_mb: 64000 + cpus_per_task: 10 + filter_vcf_biallelic: + runtime: 1440 + mem_mb: 12800 + cpus_per_task: 2 + filter_vcf_missing: + runtime: 1440 + mem_mb: 12800 + cpus_per_task: 2 + remove_chromosomes: + runtime: 1440 + mem_mb: 12800 + cpus_per_task: 2 + extract_historical_samples: + runtime: 300 + extract_modern_samples: + runtime: 300 + repmasked_bcf2vcf: + runtime: 300 + mem_mb: 12800 + cpus_per_task: 2 + filter_biallelic_missing_vcf: + runtime: 1440 + mem_mb: 64000 + cpus_per_task: 10 + vcf2plink_pca: + runtime: 300 + mem_mb: 12800 + cpus_per_task: 2 + vcf2plink_hwe: + runtime: 300 + mem_mb: 12800 + cpus_per_task: 2 + outgroup_fastqc: + runtime: 1440 + mem_mb: 12800 + cpus_per_task: 2 + align2target: + runtime: 5320 + mem_mb: 64000 + cpus_per_task: 10 + bam2fasta: + runtime: 1440 + mem_mb: 12800 + cpus_per_task: 2 + concatenate_fasta_per_contig: + runtime: 1440 + mem_mb: 12800 + cpus_per_task: 2 + compute_gerp: + runtime: 1440 + mem_mb: 32000 + cpus_per_task: 5 + gerp2coords: + runtime: 1440 + mem_mb: 12800 + cpus_per_task: 2 + get_ancestral_state: + runtime: 1440 + mem_mb: 12800 + cpus_per_task: 2 + produce_contig_out: + runtime: 1440 + mem_mb: 12800 + cpus_per_task: 2 + merge_per_chunk: + runtime: 1440 + mem_mb: 12800 + cpus_per_task: 2 + merge_gerp_gz: + runtime: 1440 + mem_mb: 12800 + cpus_per_task: 2 + gerp_derived_alleles: + runtime: 10080 + mem_mb: 12800 + cpus_per_task: 2 + merge_gerp_alleles_per_chunk: + runtime: 1440 + mem_mb: 32000 + cpus_per_task: 5 + merge_gerp_alleles_gz: + runtime: 1440 + mem_mb: 32000 + cpus_per_task: 5 + relative_mutational_load_per_sample: + runtime: 1440 + mem_mb: 12800 + cpus_per_task: 2 diff --git a/config/slurm/profile/config.yaml b/config/slurm/profile/config_v7.yaml similarity index 100% rename from config/slurm/profile/config.yaml rename to config/slurm/profile/config_v7.yaml diff --git a/environment.yml b/environment.yml index bf46c32..accc962 100644 --- a/environment.yml +++ b/environment.yml @@ -2,9 +2,10 @@ channels: - bioconda - conda-forge dependencies: - - python=3.8.15 - - snakemake=7.20.0 - - biopython=1.76 - - matplotlib=3.4.2 - - pandas=1.0.3 - - numpy=1.18.4 + - python=3.12.3 + - snakemake=8.14.0 + - biopython=1.83 + - matplotlib=3.8.4 + - pandas=2.2.2 + - numpy=1.26.4 + - snakemake-executor-plugin-slurm=0.6.0 diff --git a/utilities/mutational_load_snpeff/README.md b/utilities/mutational_load_snpeff/README.md new file mode 100644 index 0000000..060b82b --- /dev/null +++ b/utilities/mutational_load_snpeff/README.md @@ -0,0 +1,76 @@ +# Mutational load pipeline + +Snakemake pipeline to process and analyse VCF files that +were annotated with snpEff to calculate potential and +realised mutational load. + +Requirements: + +- Conda/mamba. To run the pipeline, the GenErode conda +environment needs to be activated. + +- Singularity/apptainer. The pipeline executes each rule +in a container. + +- A slurm profile is provided with the pipeline for runs +on HPC clusters with the slurm workload manager. Please +add your compute project to the file +`utilities/mutational_load_snpeff/slurm/settings.json` +by modifying the following line: + +``` + "SBATCH_DEFAULTS": "account=XXXX-XX-XXXX", +``` + +For job- and cluster-specific adjustments, please edit the +file `utilities/mutational_load_snpeff/slurm/config.yaml` +accordingly. + +The pipeline performs the following filtering and data +processing steps: + +- Identify sites that are are fixed for the ALT allele across +all samples (1/1) from a merged and filtered VCF file +containing all samples. When the reference genome of an +outgroup species/population is used, these sites are not +informative for population-level analyses. This filter +currently only handles VCF files without any missing data. +The pipeline produces additionally a merged VCF file from +which these sites are removed. + +- From SNPeff-annotated VCF files (one per individual), +intergenic and intron variants are removed, along with +the sites that were identified as being fixed for the ALT +allele across all samples. + +- Variants allocated to the following categories are extracted: +a) Low impact: mostly harmless or unlikely to change protein +behaviour; b) Moderate impact: non-disruptive variants that +might change protein effectiveness; c) High impact: variants +assumed to have high (disruptive) impact on protein, probably +causing protein truncation, loss of function or triggering +nonsense-mediated decay and including stop gained codons, +splice donor variant and splice acceptor, start codon lost; +d) Synonymous variants. + +- Each individual's potential (total) load is calculated by +taking the sum of the number of variants of each variant impact +category i, correcting for potential mapping biases by dividing +the number of each category by the total number of synonymous +SNPs (Xue et al. 2015). + +- Realised load is calculated by dividing the total number +of homozygous variants of category i by twice the total +number of variants for category i per individual (Mathur & +DeWoody 2021). + + +References: + +- Mathur, S. & DeWoody, J. A. Genetic load has potential in +large populations but is realized in small inbred populations. +Evol. Appl. 14, 1540–1557 (2021). + +- Xue, Y. et al. Mountain gorilla genomes reveal the impact of +long-term population decline and inbreeding. Science 348, +242–245 (2015). diff --git a/utilities/mutational_load_snpeff/Snakefile b/utilities/mutational_load_snpeff/Snakefile new file mode 100644 index 0000000..132dc69 --- /dev/null +++ b/utilities/mutational_load_snpeff/Snakefile @@ -0,0 +1,345 @@ +############################################################ +################ Mutational load pipeline ################## +# Snakemake pipeline to filter snpEff annotated VCF files # +# and to estimate individual potential and realised # +# mutational load # +############################################################ + + +############################################################ +################ WORKFLOW VARIABLES AND CODE ############### +############################################################ + +from snakemake.exceptions import WorkflowError +from snakemake.utils import min_version +from snakemake.utils import validate +from snakemake.io import glob_wildcards +import pandas as pd +import os + +min_version("7.0.0") +snpeff_filtering_version = "0.0.1" +configfile: "config.yaml" + +# Input and output directories and paths +# Get paths and file names for merged VCF file +vcf_dir = os.path.dirname(config["vcf_path"]) +vcf_name = ".".join(os.path.basename(config["vcf_path"]).split('.')[:-2]) + +# Clean directory paths +snpEff_dir = config["snpEff_dir"].rstrip("/") +out_dir = config["out_dir"].rstrip("/") + +# Get a list of snpEff annotated VCF files from the snpEff directory +ann_vcfs, = glob_wildcards(snpEff_dir + "/{ann_vcf}.vcf") + +############################################################ +#################### WORKFLOW RULES ######################## +############################################################ + +rule all: + input: + merged_filtered_vcf=out_dir + "/" + vcf_name + ".no_fixed_hom_alt.vcf", + ann_category_vcfs=expand(out_dir + "/{ann_vcf}.no_fixed_hom_alt.no_intron.no_intergenic.{ann}.vcf", + ann_vcf=ann_vcfs, + ann=["ann_high", "ann_moderate", "ann_low", "ann_syn"]), + genetic_load=expand(out_dir + "/{ann_vcf}.no_fixed_hom_alt.no_intron.no_intergenic.{ann}.{load}_load.txt", + ann_vcf=ann_vcfs, + ann=["ann_high", "ann_moderate", "ann_low"], + load=["total", "realised"]), + +rule extract_number_of_samples: + """ + Count the number of samples in the merged VCF file. + Required for rule "find_fixed_homozygote_alt_sites". + """ + input: + vcf=config["vcf_path"], + output: + n_samples=temp(out_dir + "/" + vcf_name + ".n_samples.txt"), + log: + os.path.abspath("logs/rule_logs/extract_number_of_samples.log"), + singularity: + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" + threads: 2 + shell: + """ + bcftools query -l {input.vcf} | wc -l > {output.n_samples} 2> {log} + """ + +rule find_fixed_homozygote_alt_sites: + """ + Identify sites in the merged VCF that are fixed for the + ALT allele across all samples (1/1). When the reference + genome of an outgroup species/population is used, these + sites are not informative for population-level analyses. + This filter currently only handles sites without missing + data. + """ + input: + vcf=config["vcf_path"], + n_samples=out_dir + "/" + vcf_name + ".n_samples.txt", + output: + bed=out_dir + "/" + vcf_name + ".fixed_hom_alt.bed", + log: + os.path.abspath("logs/rule_logs/find_fixed_homozygote_alt_sites.log"), + singularity: + "docker://quay.io/biocontainers/snpsift:4.3.1t--hdfd78af_3" + threads: 4 + resources: + mem_mb=32000, + shell: + """ + mem=$((({resources.mem_mb} - 2000)/1000)) + n_samples=`head -n 1 {input.n_samples}` + zcat {input.vcf} | java -jar -Xmx${{mem}}g /usr/local/share/snpsift-4.3.1t-3/SnpSift.jar \ + filter "((countHom() = $n_samples) && (countVariant() = $n_samples))" | \ + grep -v "^#" | awk -F'\t' '{{print $1, $2-1, $2}}' OFS='\t' > {output.bed} 2> {log} + """ + +rule remove_fixed_homozygote_alt_sites_merged_vcf: + """ + Remove sites in the merged VCF that are fixed for the + ALT allele across all samples (1/1) from the merged + VCF file for other downstream analyses and variant + calling statistics. + """ + input: + vcf=config["vcf_path"], + bed=out_dir + "/" + vcf_name + ".fixed_hom_alt.bed", + output: + vcf=out_dir + "/" + vcf_name + ".no_fixed_hom_alt.vcf", + log: + os.path.abspath("logs/rule_logs/remove_fixed_homozygote_alt_sites_merged_vcf.log"), + singularity: + "docker://quay.io/biocontainers/snpsift:4.3.1t--hdfd78af_3" + threads: 4 + resources: + mem_mb=32000, + shell: + """ + mem=$((({resources.mem_mb} - 2000)/1000)) + zcat {input.vcf} | java -jar -Xmx${{mem}}g /usr/local/share/snpsift-4.3.1t-3/SnpSift.jar \ + intervals -x {input.bed} > {output.vcf} + """ + +rule find_intron_intergenic_variants: + """ + Identify sites that were annotated as being located in + an intron or an intergenic region in each sample. + Merge the BED files with the BED file of sites that are + fixed ALT across all samples. + """ + input: + vcf=snpEff_dir + "/{ann_vcf}.vcf", + bed=out_dir + "/" + vcf_name + ".fixed_hom_alt.bed", + output: + intron_bed=temp(out_dir + "/{ann_vcf}.intron.bed"), + intergenic_bed=temp(out_dir + "/{ann_vcf}.intergenic.bed"), + merged_bed=out_dir + "/{ann_vcf}.fixed_hom_alt.intron.intergenic.bed", + log: + os.path.abspath("logs/rule_logs/find_intron_intergenic_variants/{ann_vcf}.log"), + singularity: + "docker://quay.io/biocontainers/snpsift:4.3.1t--hdfd78af_3" + threads: 2 + resources: + mem_mb=16000, + shell: + """ + mem=$((({resources.mem_mb} - 2000)/1000)) + # Extract SNPs from introns and intergenic regions, convert to BED format + cat {input.vcf} | java -jar -Xmx${{mem}}g /usr/local/share/snpsift-4.3.1t-3/SnpSift.jar \ + filter "ANN[*].EFFECT has 'intron_variant'" | grep -v "^#" | \ + awk -F'\t' '{{print $1, $2-1, $2}}' OFS='\t' > {output.intron_bed} 2> {log} && + cat {input.vcf} | java -jar -Xmx${{mem}}g /usr/local/share/snpsift-4.3.1t-3/SnpSift.jar \ + filter "ANN[*].EFFECT has 'intergenic_region'" | grep -v "^#" | \ + awk -F'\t' '{{print $1, $2-1, $2}}' OFS='\t' > {output.intergenic_bed} 2>> {log} && + + # Merge and sort the BED files + cat {input.bed} {output.intron_bed} {output.intergenic_bed} | sort -k1,1 -k2,2n > {output.merged_bed} 2>> {log} + """ + +rule remove_sites_snpEff_vcf: + """ + Remove sites that are fixed for the ALT allele across + all samples (1/1), that are located in an intron or in + an intergenic region. + """ + input: + vcf=snpEff_dir + "/{ann_vcf}.vcf", + merged_bed=out_dir + "/{ann_vcf}.fixed_hom_alt.intron.intergenic.bed", + output: + vcf=out_dir + "/{ann_vcf}.no_fixed_hom_alt.no_intron.no_intergenic.vcf", + log: + os.path.abspath("logs/rule_logs/remove_sites_snpEff_vcfs/{ann_vcf}.log"), + singularity: + "docker://quay.io/biocontainers/snpsift:4.3.1t--hdfd78af_3" + threads: 4 + resources: + mem_mb=32000, + shell: + """ + mem=$((({resources.mem_mb} - 2000)/1000)) + cat {input.vcf} | java -jar -Xmx${{mem}}g /usr/local/share/snpsift-4.3.1t-3/SnpSift.jar \ + intervals -x {input.merged_bed} > {output.vcf} 2> {log} + """ + +rule extract_high_impact_snps: + """ + Extract sites that were annotated in snpEff as having + a potentially high impact on protein function. + """ + input: + vcf=out_dir + "/{ann_vcf}.no_fixed_hom_alt.no_intron.no_intergenic.vcf", + output: + vcf=out_dir + "/{ann_vcf}.no_fixed_hom_alt.no_intron.no_intergenic.ann_high.vcf", + log: + os.path.abspath("logs/rule_logs/extract_high_impact_snps/{ann_vcf}.log"), + singularity: + "docker://quay.io/biocontainers/snpsift:4.3.1t--hdfd78af_3" + threads: 2 + resources: + mem_mb=16000, + shell: + """ + mem=$((({resources.mem_mb} - 2000)/1000)) + cat {input.vcf} | java -jar -Xmx${{mem}}g /usr/local/share/snpsift-4.3.1t-3/SnpSift.jar \ + filter "ANN[0].IMPACT has 'HIGH'" > {output.vcf} 2> {log} + """ + +rule extract_moderate_impact_snps: + """ + Extract sites that were annotated in snpEff as having + a potentially moderate impact on protein function. + """ + input: + vcf=out_dir + "/{ann_vcf}.no_fixed_hom_alt.no_intron.no_intergenic.vcf", + output: + vcf=out_dir + "/{ann_vcf}.no_fixed_hom_alt.no_intron.no_intergenic.ann_moderate.vcf", + log: + os.path.abspath("logs/rule_logs/extract_moderate_impact_snps/{ann_vcf}.log"), + singularity: + "docker://quay.io/biocontainers/snpsift:4.3.1t--hdfd78af_3" + threads: 2 + resources: + mem_mb=16000, + shell: + """ + mem=$((({resources.mem_mb} - 2000)/1000)) + cat {input.vcf} | java -jar -Xmx${{mem}}g /usr/local/share/snpsift-4.3.1t-3/SnpSift.jar \ + filter "ANN[0].IMPACT has 'MODERATE'" > {output.vcf} 2> {log} + """ + +rule extract_low_impact_snps: + """ + Extract sites that were annotated in snpEff as having + a potentially low impact on protein function. + """ + input: + vcf=out_dir + "/{ann_vcf}.no_fixed_hom_alt.no_intron.no_intergenic.vcf", + output: + vcf=out_dir + "/{ann_vcf}.no_fixed_hom_alt.no_intron.no_intergenic.ann_low.vcf", + log: + os.path.abspath("logs/rule_logs/extract_low_impact_snps/{ann_vcf}.log"), + singularity: + "docker://quay.io/biocontainers/snpsift:4.3.1t--hdfd78af_3" + threads: 2 + resources: + mem_mb=16000, + shell: + """ + mem=$((({resources.mem_mb} - 2000)/1000)) + cat {input.vcf} | java -jar -Xmx${{mem}}g /usr/local/share/snpsift-4.3.1t-3/SnpSift.jar \ + filter "ANN[0].IMPACT has 'LOW'" > {output.vcf} 2> {log} + """ + +rule extract_synonymous_variant_snps: + """ + Extract sites that were annotated in snpEff as synonymous + variants, i.e. without changing the amino acid in the + resulting protein sequence. + """ + input: + vcf=out_dir + "/{ann_vcf}.no_fixed_hom_alt.no_intron.no_intergenic.vcf", + output: + vcf=out_dir + "/{ann_vcf}.no_fixed_hom_alt.no_intron.no_intergenic.ann_syn.vcf", + log: + os.path.abspath("logs/rule_logs/extract_synonymous_variant_snps/{ann_vcf}.log"), + singularity: + "docker://quay.io/biocontainers/snpsift:4.3.1t--hdfd78af_3" + threads: 2 + resources: + mem_mb=16000, + shell: + """ + mem=$((({resources.mem_mb} - 2000)/1000)) + cat {input.vcf} | java -jar -Xmx${{mem}}g /usr/local/share/snpsift-4.3.1t-3/SnpSift.jar \ + filter "ANN[0].EFFECT has 'synonymous_variant'" > {output.vcf} 2> {log} + """ + +rule total_load: + """ + Sum of the number of variants of each category i, correcting + for potential mapping biases by dividing the number of each + category by the total number of synonymous SNPs + """ + input: + vcf=out_dir + "/{ann_vcf}.no_fixed_hom_alt.no_intron.no_intergenic.{ann}.vcf", + syn=out_dir + "/{ann_vcf}.no_fixed_hom_alt.no_intron.no_intergenic.ann_syn.vcf", + output: + load=out_dir + "/{ann_vcf}.no_fixed_hom_alt.no_intron.no_intergenic.{ann}.total_load.txt", + log: + os.path.abspath("logs/rule_logs/total_load/{ann_vcf}_{ann}.log"), + threads: 1 + shell: + """ + ann=`grep -v "#" {input.vcf} | wc -l` + syn=`grep -v "#" {input.syn} | wc -l` + load=`awk "BEGIN {{print $ann/$syn}}"` + echo "n_variants n_synonymous_variants total_load" > {output.load} + echo $ann $syn $load >> {output.load} + """ + +rule extract_homozygous_variants: + """ + Extract homozygous variants for each category + """ + input: + vcf=out_dir + "/{ann_vcf}.no_fixed_hom_alt.no_intron.no_intergenic.{ann}.vcf", + output: + hom_vcf=temp(out_dir + "/{ann_vcf}.no_fixed_hom_alt.no_intron.no_intergenic.{ann}.hom.vcf"), + log: + os.path.abspath("logs/rule_logs/extract_homozygous_variants/{ann_vcf}_{ann}.log"), + singularity: + "docker://quay.io/biocontainers/snpsift:4.3.1t--hdfd78af_3" + threads: 2 + resources: + mem_mb=16000, + shell: + """ + mem=$((({resources.mem_mb} - 2000)/1000)) + cat {input.vcf} | java -jar -Xmx${{mem}}g /usr/local/share/snpsift-4.3.1t-3/SnpSift.jar \ + filter "(countHom() = 1)" > {output.hom_vcf} 2> {log} + """ + +rule realised_load: + """ + Total number of homozygous variants of category i divided by + twice the total number of variants for category i per individual + """ + input: + hom_vcf=out_dir + "/{ann_vcf}.no_fixed_hom_alt.no_intron.no_intergenic.{ann}.hom.vcf", + vcf=out_dir + "/{ann_vcf}.no_fixed_hom_alt.no_intron.no_intergenic.{ann}.vcf", + output: + load=out_dir + "/{ann_vcf}.no_fixed_hom_alt.no_intron.no_intergenic.{ann}.realised_load.txt", + log: + os.path.abspath("logs/rule_logs/realised_load/{ann_vcf}_{ann}.log"), + threads: 1 + shell: + """ + hom=`grep -v "#" {input.hom_vcf} | wc -l` + tot=`grep -v "#" {input.vcf} | wc -l` + load=`awk "BEGIN {{print $hom / (2 * $tot)}}"` + echo "n_homozygous_variants n_total_variants realised_load" > {output.load} + echo $hom $tot $load >> {output.load} + """ \ No newline at end of file diff --git a/utilities/mutational_load_snpeff/config.yaml b/utilities/mutational_load_snpeff/config.yaml new file mode 100644 index 0000000..d45238e --- /dev/null +++ b/utilities/mutational_load_snpeff/config.yaml @@ -0,0 +1,32 @@ +################################################################# +################################################################# +# Configuration settings for the mutational load pipeline # +################################################################# +################################################################# + + +################################################################# +# 1) Full path to merged VCF file. Required for removal of sites +# that are fixed for the ALT allele across all samples (1/1). When +# the reference genome of an outgroup species/population is used, +# these sites are not informative for population-level analyses. +# This filter currently only handles VCF files from which sites +# with any missing data were removed. +vcf_path: "" +################################################################# + +################################################################# +# 2) Full path to directory containing snpEff annotated VCF files. +# Required for removal of sites that were annotated as being +# located in an intron or an intergenic region, for the extraction +# of sites that were annotated in snpEff according to different +# impact categories, and for the calculation of total and relative +# load for each sample. +snpEff_dir: "" +################################################################# + +################################################################# +# 3) Full path to an output directory for results from this +# pipeline. +out_dir: "" +################################################################# diff --git a/utilities/mutational_load_snpeff/slurm/profile/README.md b/utilities/mutational_load_snpeff/slurm/profile/README.md new file mode 100644 index 0000000..46f4da3 --- /dev/null +++ b/utilities/mutational_load_snpeff/slurm/profile/README.md @@ -0,0 +1,71 @@ +# GenErode execution on Dardel + +1) Load the following modules: + +``` +module load PDC UPPMAX bioinfo-tools conda singularity tmux +``` + +> Note that tmux is only available as a module on Dardel +but the equivalent tool screen is pre-installed and does +not need to be loaded. + +2) After cloning the repository, change permissions for the +Snakefile: + +``` +chmod 755 Snakefile +``` + +3) Create the GenErode conda environment or update an earlier +version. The latest conda environment contains the Snakemake +executor plugin for slurm: + +``` +conda create -f environment.yaml -n generode +``` + +4) Copy the configuration file `config/slurm/profile/config_plugin_dardel.yaml` +to `slurm/config.yaml`. This file specifies compute resources +for each rule or group jobs to be run on Dardel. Any rule or +group job that is not listed under `set-threads` or `set-resources` +uses default resources specified under `default-resources`. If +any rule or group job fails due to too little memory or run +time, their compute resources can be updated in this file. + +> Note that memory requirements are specified three times in +the configuration file: 1) under `set-threads` (used by Snakemake +to specify threads in rules), 2) under `set-resources` and therein +under `mem_mb`, specifying the memory in Megabytes (multiplying +the number of threads with the available memory per thread), +and 3) under `set-resources` and therein under `cpus-per-task` +(the same number as specified under `set-threads`, required for +correct memory assignment on Dardel). + +5) Start GenErode the following: + +- Open a tmux session (alternatively, you can use screen) + +- Activate the GenErode conda environment (create or update +from `environment.yaml`), replacing the path to the location +of the conda environment: + +``` +export CONDA_ENVS_PATH=/cfs/klemming/home/.../ +conda activate generode +``` + +- Start the dry run: + +``` +snakemake --profile slurm -n &> YYMMDD_dry.out +``` + +- Start the main run: + +``` +snakemake --profile slurm &> YYMMDD_main.out +``` + +> Useful flags for running the pipeline: `--ri` to re-run +incomplete jobs and `-k` to keep going in case a job fails. diff --git a/utilities/mutational_load_snpeff/slurm/profile/config_plugin_dardel.yaml b/utilities/mutational_load_snpeff/slurm/profile/config_plugin_dardel.yaml new file mode 100644 index 0000000..32cb4b9 --- /dev/null +++ b/utilities/mutational_load_snpeff/slurm/profile/config_plugin_dardel.yaml @@ -0,0 +1,74 @@ +# Configuration file for slurm plugin (Snakemake >8.0.0) for Dardel cluster at PDC/KTH +# snakemake CLI flags +executor: slurm +jobs: 100 +printshellcmds: true +software-deployment-method: apptainer + +# slurm resources +## default-resources: applied to all jobs, overruled by resources defined below for jobs +default-resources: + slurm_account: XXX-XX-XXX # update this to your slurm account + slurm_partition: shared # use Dardel’s shared partition + runtime: 120 # default runtime in minutes + mem_mb: 8000 + nodes: 1 # one node on Dardel from the shared partition + ntasks: 1 # number of concurrent tasks / ranks + cpus_per_task: 8 # number of hyperthreads per task, corresponds to 1 GB RAM + +## map rule names to threads +set-threads: + extract_number_of_samples: 16 + find_fixed_homozygote_alt_sites: 32 + remove_fixed_homozygote_alt_sites_merged_vcf: 32 + find_intron_intergenic_variants: 16 + remove_sites_snpEff_vcf: 32 + extract_high_impact_snps: 16 + extract_moderate_impact_snps: 16 + extract_low_impact_snps: 16 + extract_synonymous_variant_snps: 16 + total_load: 8 + realised_load: 8 + +## set-resources: map rule names to resources in general +set-resources: + extract_number_of_samples: + mem_mb: 16000 + runtime: 30 + cpus_per_task: 16 + find_fixed_homozygote_alt_sites: + mem_mb: 32000 + runtime: 300 + cpus_per_task: 32 + remove_fixed_homozygote_alt_sites_merged_vcf: + mem_mb: 32000 + runtime: 300 + cpus_per_task: 32 + find_intron_intergenic_variants: + mem_mb: 16000 + runtime: 300 + cpus_per_task: 16 + remove_sites_snpEff_vcf: + mem_mb: 32000 + runtime: 300 + cpus_per_task: 32 + extract_high_impact_snps: + mem_mb: 16000 + cpus_per_task: 16 + extract_moderate_impact_snps: + mem_mb: 16000 + cpus_per_task: 16 + extract_low_impact_snps: + mem_mb: 16000 + cpus_per_task: 16 + extract_synonymous_variant_snps: + mem_mb: 16000 + cpus_per_task: 16 + total_load: + mem_mb: 8000 + runtime: 30 + cpus_per_task: 8 + realised_load: + mem_mb: 8000 + runtime: 30 + cpus_per_task: 8 diff --git a/workflow/rules/1.2_map_to_mitogenomes.smk b/workflow/rules/1.2_map_to_mitogenomes.smk index 1c92d7c..7b7122c 100644 --- a/workflow/rules/1.2_map_to_mitogenomes.smk +++ b/workflow/rules/1.2_map_to_mitogenomes.smk @@ -250,12 +250,14 @@ rule historical_mito_bams_qualimap: group: "historical_mito_bams_group" threads: 1 + resources: + mem_mb=8000, singularity: - "docker://quay.io/biocontainers/qualimap:2.2.2d--1" + "oras://community.wave.seqera.io/library/qualimap:2.3--95d781b369b835f2" shell: """ reads=`head -n1 {input.stats} | cut -d' ' -f 1` - mem=$(((6 * {threads}) - 2)) + mem=$((({resources.mem_mb} - 2000)/1000)) if [ "$reads" -gt 100 ] # check if bam file contains enough reads then unset DISPLAY @@ -366,12 +368,14 @@ rule historical_merged_mito_bams_qualimap: group: "historical_merged_mito_bams_group" threads: 1 + resources: + mem_mb=8000, singularity: - "docker://quay.io/biocontainers/qualimap:2.2.2d--1" + "oras://community.wave.seqera.io/library/qualimap:2.3--95d781b369b835f2" shell: """ reads=`head -n1 {input.stats} | cut -d' ' -f 1` - mem=$(((6 * {threads}) - 2)) + mem=$((({resources.mem_mb} - 2000)/1000)) if [ "$reads" -gt 100 ] # check if bam file contains enough reads then unset DISPLAY diff --git a/workflow/rules/10_pca.smk b/workflow/rules/10_pca.smk index 3bd5193..c9831e3 100644 --- a/workflow/rules/10_pca.smk +++ b/workflow/rules/10_pca.smk @@ -49,7 +49,7 @@ rule vcf2plink_pca: log: "results/logs/10_pca/{dataset}/" + REF_NAME + ".{dataset}_fmissing{fmiss}.{chr}_vcf2plink_pca.log", singularity: - "docker://quay.io/biocontainers/plink:1.90b6.12--heea4ae3_0" + "https://depot.galaxyproject.org/singularity/plink:1.90b6.12--heea4ae3_0" shell: """ plink --vcf {input.vcf} --make-bed --allow-extra-chr --out {params.bfile} 2> {log} @@ -66,18 +66,19 @@ rule plink_eigenvec: output: eigenvec="results/{dataset}/pca/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.eigenvec", eigenval="results/{dataset}/pca/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}.eigenval", + threads: 1 params: bfile="results/{dataset}/pca/" + REF_NAME + ".{dataset}.merged.biallelic.fmissing{fmiss}.{chr}", log: "results/logs/10_pca/{dataset}/" + REF_NAME + ".{dataset}_fmissing{fmiss}.{chr}_plink_eigenvec.log", singularity: - "docker://quay.io/biocontainers/plink:1.90b6.12--heea4ae3_0" + "https://depot.galaxyproject.org/singularity/plink:1.90b6.12--heea4ae3_0" shell: """ samples=`cat {input.fam} | wc -l` if [ "$samples" -gt 1 ] then - plink --bfile {params.bfile} --allow-extra-chr --pca --out {params.bfile} 2> {log} + plink --bfile {params.bfile} --threads {threads} --allow-extra-chr --pca --out {params.bfile} 2> {log} else touch {output.eigenvec} && touch {output.eigenval} 2> {log} echo "Not enough samples to calculate a PCA." >> {log} diff --git a/workflow/rules/11_ROH.smk b/workflow/rules/11_ROH.smk index a32f6aa..79f47d7 100644 --- a/workflow/rules/11_ROH.smk +++ b/workflow/rules/11_ROH.smk @@ -72,7 +72,7 @@ rule compress_roh_vcf: log: "results/logs/11_ROH/{dataset}/" + REF_NAME + ".{dataset}_fmissing{fmiss}.{chr}_compress_roh_vcf.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools view -Oz -o {output.compressed} {input.vcf} 2> {log} && @@ -98,7 +98,7 @@ rule vcf2plink_hwe: log: "results/logs/11_ROH/{dataset}/" + REF_NAME + ".{dataset}_fmissing{fmiss}.{chr}_vcf2plink_hwe.log", singularity: - "docker://quay.io/biocontainers/plink:1.90b6.12--heea4ae3_0" + "https://depot.galaxyproject.org/singularity/plink:1.90b6.12--heea4ae3_0" shell: """ plink --vcf {input.vcf} --make-bed --allow-extra-chr --out {params.bfile} 2> {log} @@ -127,7 +127,7 @@ rule ROHs: log: "results/logs/11_ROH/{dataset}/" + REF_NAME + ".{dataset}_fmissing{fmiss}.{chr}.homsnp{homsnp}.homkb{homkb}.homwinsnp{homwinsnp}.homwinhet{homwinhet}.homwinmis{homwinmis}.homhet{homhet}_ROHs.log", singularity: - "docker://quay.io/biocontainers/plink:1.90b6.12--heea4ae3_0" + "https://depot.galaxyproject.org/singularity/plink:1.90b6.12--heea4ae3_0" shell: """ plink --bfile {params.bfile} --homozyg --homozyg-window-threshold 0.05 --allow-extra-chr \ diff --git a/workflow/rules/12_snpEff.smk b/workflow/rules/12_snpEff.smk index 6fd74b5..deeb28a 100644 --- a/workflow/rules/12_snpEff.smk +++ b/workflow/rules/12_snpEff.smk @@ -227,6 +227,8 @@ rule build_snpEff_db: output: db=GTF_DIR + "/snpEff/data/" + REF_NAME + "/snpEffectPredictor.bin", threads: 1 + resources: + mem_mb=8000, params: ref_name=REF_NAME, abs_gtf=lambda wildcards, input: os.path.abspath(input.gtf), @@ -240,7 +242,7 @@ rule build_snpEff_db: "docker://quay.io/biocontainers/snpeff:4.3.1t--3" shell: """ - mem=$(((6 * {threads}) - 2)) + mem=$((({resources.mem_mb} - 2000)/1000)) cd {params.abs_db_dir} java -jar -Xmx${{mem}}g /usr/local/share/snpeff-4.3.1t-3/snpEff.jar build -gtf22 -c {params.abs_config} \ -dataDir {params.abs_data_dir} -treatAllAsProteinCoding -v {params.ref_name} 2> {log} @@ -258,6 +260,8 @@ rule annotate_vcf: csv="results/{dataset}/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.csv", html="results/{dataset}/snpEff/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.snps5.noIndel.QUAL30.dp.AB.repma.biallelic.fmissing{fmiss}.{chr}_stats.html", threads: 1 + resources: + mem_mb=8000, params: ref_name=REF_NAME, abs_config=lambda wildcards, input: os.path.abspath(input.config), @@ -268,7 +272,7 @@ rule annotate_vcf: "docker://quay.io/biocontainers/snpeff:4.3.1t--3" shell: """ - mem=$(((6 * {threads}) - 2)) + mem=$((({resources.mem_mb} - 2000)/1000)) java -jar -Xmx${{mem}}g /usr/local/share/snpeff-4.3.1t-3/snpEff.jar -c {params.abs_config} -dataDir {params.abs_data_dir} -s {output.html} -csvStats {output.csv} \ -treatAllAsProteinCoding -v -d -lof {params.ref_name} {input.vcf} > {output.ann} 2> {log} """ diff --git a/workflow/rules/13_GERP.smk b/workflow/rules/13_GERP.smk index 78d2afc..ea1d1a7 100644 --- a/workflow/rules/13_GERP.smk +++ b/workflow/rules/13_GERP.smk @@ -445,7 +445,7 @@ rule split_ref_contigs: "results/logs/13_GERP/{chr}_chunks/" + REF_NAME + "/fasta/" + REF_NAME + "_{chunk}_split_ref_contigs.log", threads: 1 singularity: - "docker://quay.io/biocontainers/seqtk:1.3--hed695b0_2" + "oras://community.wave.seqera.io/library/seqtk:1.4--e75a8dec899d1be8" shell: """ if [ ! -d {output.fasta_dir} ]; then @@ -525,7 +525,7 @@ rule compute_gerp: "results/logs/13_GERP/{chr}_chunks/" + REF_NAME + "/gerp/{chunk}_compute_gerp.log", threads: 4 singularity: - "docker://quay.io/biocontainers/gerp:2.1--hfc679d8_0" + "https://depot.galaxyproject.org/singularity/gerp:2.1--h1b792b2_2" shell: """ if [ ! -d {output.gerp_dir} ]; then diff --git a/workflow/rules/2_mapping.smk b/workflow/rules/2_mapping.smk index 35e14e6..f92790f 100644 --- a/workflow/rules/2_mapping.smk +++ b/workflow/rules/2_mapping.smk @@ -158,15 +158,17 @@ rule sorted_bam_qualimap: results="results/{dataset}/mapping/" + REF_NAME + "/stats/bams_sorted/{sample}_{index}_{lane}.sorted.bam.qualimap/genome_results.txt", outdir=directory("results/{dataset}/mapping/" + REF_NAME + "/stats/bams_sorted/{sample}_{index}_{lane}.sorted.bam.qualimap/"), threads: 8 + resources: + mem_mb=64000, params: outdir="results/{dataset}/mapping/" + REF_NAME + "/stats/bams_sorted/{sample}_{index}_{lane}.sorted.bam.qualimap", log: "results/logs/2_mapping/{dataset}/" + REF_NAME + "/{sample}_{index}_{lane}_sorted_bam_qualimap.log", singularity: - "docker://quay.io/biocontainers/qualimap:2.2.2d--1" + "oras://community.wave.seqera.io/library/qualimap:2.3--95d781b369b835f2" shell: """ - mem=$(((6 * {threads}) - 2)) + mem=$((({resources.mem_mb} - 2000)/1000)) unset DISPLAY qualimap bamqc -bam {input.bam} --java-mem-size=${{mem}}G -nt {threads} -outdir {params.outdir} -outformat html 2> {log} """ diff --git a/workflow/rules/3.1_bam_rmdup_realign_indels.smk b/workflow/rules/3.1_bam_rmdup_realign_indels.smk index cbeaee6..e5accd6 100644 --- a/workflow/rules/3.1_bam_rmdup_realign_indels.smk +++ b/workflow/rules/3.1_bam_rmdup_realign_indels.smk @@ -142,15 +142,17 @@ rule merged_index_bam_qualimap: results="results/{dataset}/mapping/" + REF_NAME + "/stats/bams_merged_index/{sample}_{index}.merged.bam.qualimap/genome_results.txt", outdir=directory("results/{dataset}/mapping/" + REF_NAME + "/stats/bams_merged_index/{sample}_{index}.merged.bam.qualimap"), threads: 8 + resources: + mem_mb=64000, params: outdir="results/{dataset}/mapping/" + REF_NAME + "/stats/bams_merged_index/{sample}_{index}.merged.bam.qualimap", log: "results/logs/3.1_bam_rmdup_realign_indels/{dataset}/" + REF_NAME + "/{sample}_{index}_merged_index_bam_qualimap.log", singularity: - "docker://quay.io/biocontainers/qualimap:2.2.2d--1" + "oras://community.wave.seqera.io/library/qualimap:2.3--95d781b369b835f2" shell: """ - mem=$(((6 * {threads}) - 2)) + mem=$((({resources.mem_mb} - 2000)/1000)) unset DISPLAY qualimap bamqc -bam {input.bam} --java-mem-size=${{mem}}G -nt {threads} -outdir {params.outdir} -outformat html 2> {log} """ @@ -228,13 +230,15 @@ rule rmdup_modern_bams: rmdup=temp("results/modern/mapping/" + REF_NAME + "/{sample}_{index}.merged.rmdup.bam"), metrix=temp("results/modern/mapping/" + REF_NAME + "/{sample}_{index}.merged.rmdup_metrics.txt"), threads: 2 + resources: + mem_mb=16000, log: "results/logs/3.1_bam_rmdup_realign_indels/modern/" + REF_NAME + "/{sample}_{index}_rmdup_modern_bams.log", singularity: "docker://quay.io/biocontainers/picard:2.26.6--hdfd78af_0" shell: """ - mem=$(((6 * {threads}) - 2)) + mem=$((({resources.mem_mb} - 2000)/1000)) picard MarkDuplicates -Xmx${{mem}}g INPUT={input.merged} OUTPUT={output.rmdup} METRICS_FILE={output.metrix} 2> {log} """ @@ -285,15 +289,17 @@ rule rmdup_bam_qualimap: results="results/{dataset}/mapping/" + REF_NAME + "/stats/bams_rmdup/{sample}_{index}.merged.rmdup.bam.qualimap/genome_results.txt", outdir=directory("results/{dataset}/mapping/" + REF_NAME + "/stats/bams_rmdup/{sample}_{index}.merged.rmdup.bam.qualimap"), threads: 8 + resources: + mem_mb=64000, params: outdir="results/{dataset}/mapping/" + REF_NAME + "/stats/bams_rmdup/{sample}_{index}.merged.rmdup.bam.qualimap", log: "results/logs/3.1_bam_rmdup_realign_indels/{dataset}/" + REF_NAME + "/{sample}_{index}_rmdup_bam_qualimap.log", singularity: - "docker://quay.io/biocontainers/qualimap:2.2.2d--1" + "oras://community.wave.seqera.io/library/qualimap:2.3--95d781b369b835f2" shell: """ - mem=$(((6 * {threads}) - 2)) + mem=$((({resources.mem_mb} - 2000)/1000)) unset DISPLAY qualimap bamqc -bam {input.bam} --java-mem-size=${{mem}}G -nt {threads} -outdir {params.outdir} -outformat html 2> {log} """ @@ -441,15 +447,17 @@ rule merged_sample_bam_qualimap: results="results/{dataset}/mapping/" + REF_NAME + "/stats/bams_merged_sample/{sample}.merged.rmdup.merged.bam.qualimap/genome_results.txt", outdir=directory("results/{dataset}/mapping/" + REF_NAME + "/stats/bams_merged_sample/{sample}.merged.rmdup.merged.bam.qualimap"), threads: 8 + resources: + mem_mb=64000, params: outdir="results/{dataset}/mapping/" + REF_NAME + "/stats/bams_merged_sample/{sample}.merged.rmdup.merged.bam.qualimap", log: "results/logs/3.1_bam_rmdup_realign_indels/{dataset}/" + REF_NAME + "/{sample}_merged_sample_bam_qualimap.log", singularity: - "docker://quay.io/biocontainers/qualimap:2.2.2d--1" + "oras://community.wave.seqera.io/library/qualimap:2.3--95d781b369b835f2" shell: """ - mem=$(((6 * {threads}) - 2)) + mem=$((({resources.mem_mb} - 2000)/1000)) unset DISPLAY qualimap bamqc -bam {input.bam} --java-mem-size=${{mem}}G -nt {threads} -outdir {params.outdir} -outformat html 2> {log} """ @@ -510,13 +518,15 @@ rule indel_realigner_targets: output: target_list=temp("results/{dataset}/mapping/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn_targets.list"), threads: 8 + resources: + mem_mb=64000, log: "results/logs/3.1_bam_rmdup_realign_indels/{dataset}/" + REF_NAME + "/{sample}_indel_realigner_targets.log", singularity: "docker://broadinstitute/gatk3:3.7-0" shell: """ - mem=$(((6 * {threads}) - 2)) + mem=$((({resources.mem_mb} - 2000)/1000)) java -jar -Xmx${{mem}}g /usr/GenomeAnalysisTK.jar -T RealignerTargetCreator -R {input.ref} -I {input.bam} -o {output.target_list} -nt {threads} 2> {log} """ @@ -533,13 +543,15 @@ rule indel_realigner: output: realigned="results/{dataset}/mapping/" + REF_NAME + "/{sample}.merged.rmdup.merged.realn.bam", threads: 8 + resources: + mem_mb=64000, log: "results/logs/3.1_bam_rmdup_realign_indels/{dataset}/" + REF_NAME + "/{sample}_indel_realigner.log", singularity: "docker://broadinstitute/gatk3:3.7-0" shell: """ - mem=$(((6 * {threads}) - 2)) + mem=$((({resources.mem_mb} - 2000)/1000)) java -jar -Xmx${{mem}}g /usr/GenomeAnalysisTK.jar -T IndelRealigner -R {input.ref} -I {input.bam} -targetIntervals {input.target_list} -o {output.realigned} 2> {log} """ @@ -613,15 +625,17 @@ rule realigned_bam_qualimap: results="results/{dataset}/mapping/" + REF_NAME + "/stats/bams_indels_realigned/{sample}.merged.rmdup.merged.realn.bam.qualimap/genome_results.txt", outdir=directory("results/{dataset}/mapping/" + REF_NAME + "/stats/bams_indels_realigned/{sample}.merged.rmdup.merged.realn.bam.qualimap"), threads: 8 + resources: + mem_mb=64000, params: outdir="results/{dataset}/mapping/" + REF_NAME + "/stats/bams_indels_realigned/{sample}.merged.rmdup.merged.realn.bam.qualimap", log: "results/logs/3.1_bam_rmdup_realign_indels/{dataset}/" + REF_NAME + "/{sample}_realigned_bam_qualimap.log", singularity: - "docker://quay.io/biocontainers/qualimap:2.2.2d--1" + "oras://community.wave.seqera.io/library/qualimap:2.3--95d781b369b835f2" shell: """ - mem=$(((6 * {threads}) - 2)) + mem=$((({resources.mem_mb} - 2000)/1000)) unset DISPLAY qualimap bamqc -bam {input.bam} --java-mem-size=${{mem}}G -nt {threads} -outdir {params.outdir} -outformat html 2> {log} """ diff --git a/workflow/rules/3.2_historical_bam_mapDamage.smk b/workflow/rules/3.2_historical_bam_mapDamage.smk index 3d4ad83..faf2e3f 100644 --- a/workflow/rules/3.2_historical_bam_mapDamage.smk +++ b/workflow/rules/3.2_historical_bam_mapDamage.smk @@ -117,15 +117,17 @@ rule rescaled_bam_qualimap: results="results/historical/mapping/" + REF_NAME + "/stats/bams_rescaled/{sample}.merged.rmdup.merged.realn.rescaled.bam.qualimap/genome_results.txt", outdir=directory("results/historical/mapping/" + REF_NAME + "/stats/bams_rescaled/{sample}.merged.rmdup.merged.realn.rescaled.bam.qualimap"), threads: 8 + resources: + mem_mb=64000, params: outdir="results/historical/mapping/" + REF_NAME + "/stats/bams_rescaled/{sample}.merged.rmdup.merged.realn.rescaled.bam.qualimap", log: "results/logs/3.2_historical_bam_mapDamage/" + REF_NAME + "/{sample}_rescaled_bam_qualimap.log", singularity: - "docker://quay.io/biocontainers/qualimap:2.2.2d--1" + "oras://community.wave.seqera.io/library/qualimap:2.3--95d781b369b835f2" shell: """ - mem=$(((6 * {threads}) - 2)) + mem=$((({resources.mem_mb} - 2000)/1000)) unset DISPLAY qualimap bamqc -bam {input.bam} --java-mem-size=${{mem}}G -nt {threads} -outdir {params.outdir} -outformat html 2> {log} """ diff --git a/workflow/rules/3.3_bam_subsampling.smk b/workflow/rules/3.3_bam_subsampling.smk index ec0dc13..0454f87 100644 --- a/workflow/rules/3.3_bam_subsampling.smk +++ b/workflow/rules/3.3_bam_subsampling.smk @@ -141,15 +141,17 @@ rule subsampled_bam_qualimap: results="results/{dataset}/mapping/" + REF_NAME + "/stats/bams_subsampled/{sample}.merged.rmdup.merged.{processed}.mapped_q30.subs_dp{DP}.bam.qualimap/genome_results.txt", outdir=directory("results/{dataset}/mapping/" + REF_NAME + "/stats/bams_subsampled/{sample}.merged.rmdup.merged.{processed}.mapped_q30.subs_dp{DP}.bam.qualimap"), threads: 8 + resources: + mem_mb=64000, params: outdir="results/{dataset}/mapping/" + REF_NAME + "/stats/bams_subsampled/{sample}.merged.rmdup.merged.{processed}.mapped_q30.subs_dp{DP}.bam.qualimap", log: "results/logs/3.3_bam_subsampling/{dataset}/" + REF_NAME + "/{sample}.{processed}.subs_dp{DP}_subsampled_bam_qualimap.log", singularity: - "docker://quay.io/biocontainers/qualimap:2.2.2d--1" + "oras://community.wave.seqera.io/library/qualimap:2.3--95d781b369b835f2" shell: """ - mem=$(((6 * {threads}) - 2)) + mem=$((({resources.mem_mb} - 2000)/1000)) unset DISPLAY qualimap bamqc -bam {input.bam} --java-mem-size=${{mem}}G -nt {threads} -outdir {params.outdir} -outformat html 2> {log} """ diff --git a/workflow/rules/4_genotyping.smk b/workflow/rules/4_genotyping.smk index 0e4918a..cbc0edc 100644 --- a/workflow/rules/4_genotyping.smk +++ b/workflow/rules/4_genotyping.smk @@ -25,7 +25,7 @@ rule variant_calling: log: "results/logs/4_genotyping/{dataset}/" + REF_NAME + "/{sample}.{processed}_variant_calling.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools mpileup -Ou -Q 30 -q 30 -B -f {input.ref} {input.bam} | bcftools call -c -M -O b --threads {threads} -o {output.bcf} 2> {log} @@ -38,15 +38,16 @@ rule sort_vcfs: bcf=rules.variant_calling.output.bcf, output: sort="results/{dataset}/vcf/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.Q30.sorted.bcf", - params: - tmpdir="results/{dataset}/vcf/" + REF_NAME + "/{sample}.merged.rmdup.merged.{processed}.Q30.sorted_tmp/", + threads: 2 + resources: + mem_mb=16000, log: "results/logs/4_genotyping/{dataset}/" + REF_NAME + "/{sample}.{processed}_sort_vcfs.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ - bcftools sort -O b -T {params.tmpdir} -o {output.sort} {input.bcf} 2> {log} + bcftools sort -O b -o {output.sort} {input.bcf} 2> {log} """ @@ -61,7 +62,7 @@ rule index_sorted_vcfs: log: "results/logs/4_genotyping/{dataset}/" + REF_NAME + "/{sample}.{processed}_index_sorted_vcfs.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools index -o {output.index} {input.sort} 2> {log} @@ -80,7 +81,7 @@ rule sorted_vcf_stats: log: "results/logs/4_genotyping/{dataset}/" + REF_NAME + "/{sample}.{processed}_sorted_vcf_stats.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools stats {input.sort} > {output.stats} 2> {log} diff --git a/workflow/rules/5_CpG_identification.smk b/workflow/rules/5_CpG_identification.smk index ad55b48..acc967c 100644 --- a/workflow/rules/5_CpG_identification.smk +++ b/workflow/rules/5_CpG_identification.smk @@ -76,7 +76,7 @@ rule sorted_bcf2vcf_CpG_id: log: "results/logs/5_CpG_identification/{dataset}/" + REF_NAME + "/{sample}.{processed}_sorted_bcf2vcf_CpG_id.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools convert -O z -o {output.vcf} {input.bcf} 2> {log} diff --git a/workflow/rules/8.1_vcf_CpG_filtering.smk b/workflow/rules/8.1_vcf_CpG_filtering.smk index 28170e6..6e71e70 100644 --- a/workflow/rules/8.1_vcf_CpG_filtering.smk +++ b/workflow/rules/8.1_vcf_CpG_filtering.smk @@ -88,7 +88,7 @@ rule sorted_bcf2vcf_CpG_removal: log: "results/logs/8.1_vcf_CpG_filtering/{dataset}/" + REF_NAME + "/{sample}.{processed}_sorted_bcf2vcf_CpG_removal.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools convert -O z -o {output.vcf} {input.bcf} 2> {log} @@ -124,7 +124,7 @@ rule CpG_vcf2bcf: log: "results/logs/8.1_vcf_CpG_filtering/{dataset}/" + REF_NAME + "/{sample}.{processed}.no{CpG_method}_CpG_vcf2bcf.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools convert -O b -o {output.bcf} {input.filtered} 2> {log} @@ -142,7 +142,7 @@ rule index_CpG_bcf: log: "results/logs/8.1_vcf_CpG_filtering/{dataset}/" + REF_NAME + "/{sample}.{processed}.no{CpG_method}_index_CpG_bcf.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools index -o {output.index} {input.bcf} 2> {log} @@ -161,7 +161,7 @@ rule CpG_filtered_vcf_stats: log: "results/logs/8.1_vcf_CpG_filtering/{dataset}/" + REF_NAME + "/{sample}.{processed}.no{CpG_method}_CpG_filtered_vcf_stats.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools stats {input.bcf} > {output.stats} 2> {log} diff --git a/workflow/rules/8.2_vcf_qual_repeat_filtering.smk b/workflow/rules/8.2_vcf_qual_repeat_filtering.smk index d0304e5..6d6ed7d 100644 --- a/workflow/rules/8.2_vcf_qual_repeat_filtering.smk +++ b/workflow/rules/8.2_vcf_qual_repeat_filtering.smk @@ -208,7 +208,7 @@ rule remove_snps_near_indels: log: "results/logs/8.2_vcf_qual_repeat_filtering/{dataset}/" + REF_NAME + "/{sample}.{processed}_remove_snps_near_indels.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools filter -g 5 -O b --threads {threads} -o {output.snps} {input.bcf} 2> {log} @@ -228,7 +228,7 @@ rule filter_vcfs_qual_dp: log: "results/logs/8.2_vcf_qual_repeat_filtering/{dataset}/" + REF_NAME + "/{sample}.{processed}_filter_vcfs_qual_dp.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ minDP=`head -n 1 {input.dp} | cut -d' ' -f 2` @@ -256,7 +256,7 @@ rule filter_vcfs_allelic_balance: log: "results/logs/8.2_vcf_qual_repeat_filtering/{dataset}/" + REF_NAME + "/{sample}.{processed}_filter_vcfs_allelic_balance.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools view -e 'GT="0/1" & (DP4[2]+DP4[3])/(DP4[0]+DP4[1]+DP4[2]+DP4[3]) < 0.2' {input.bcf} | \ @@ -275,7 +275,7 @@ rule index_filtered_vcfs: log: "results/logs/8.2_vcf_qual_repeat_filtering/{dataset}/" + REF_NAME + "/{sample}.{processed}_index_filtered_vcfs.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools index -o {output.index} {input.bcf} 2> {log} @@ -294,7 +294,7 @@ rule filtered_vcf_stats: log: "results/logs/8.2_vcf_qual_repeat_filtering/{dataset}/" + REF_NAME + "/{sample}.{processed}_filtered_vcf_stats.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools stats {input.bcf} > {output.stats} 2> {log} @@ -348,7 +348,7 @@ rule filtered_bcf2vcf: log: "results/logs/8.2_vcf_qual_repeat_filtering/{dataset}/" + REF_NAME + "/{sample}.{processed}_filtered_bcf2vcf.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools convert -O z -o {output.vcf} {input.bcf} 2> {log} @@ -384,7 +384,7 @@ rule filtered_vcf2bcf: log: "results/logs/8.2_vcf_qual_repeat_filtering/{dataset}/" + REF_NAME + "/{sample}.{processed}_filtered_vcf2bcf.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools convert -O b -o {output.bcf} {input.filtered} 2> {log} @@ -402,7 +402,7 @@ rule index_repmasked_vcfs: log: "results/logs/8.2_vcf_qual_repeat_filtering/{dataset}/" + REF_NAME + "/{sample}.{processed}_index_repmasked_vcfs.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools index -o {output.index} {input.bcf} 2> {log} @@ -421,7 +421,7 @@ rule repmasked_vcf_stats: log: "results/logs/8.2_vcf_qual_repeat_filtering/{dataset}/" + REF_NAME + "/{sample}.{processed}_repmasked_vcf_stats.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools stats {input.bcf} > {output.stats} 2> {log} diff --git a/workflow/rules/9_merge_vcfs.smk b/workflow/rules/9_merge_vcfs.smk index 0514803..f9c04a5 100644 --- a/workflow/rules/9_merge_vcfs.smk +++ b/workflow/rules/9_merge_vcfs.smk @@ -315,7 +315,7 @@ rule merge_all_vcfs: log: "results/logs/9_merge_vcfs/" + REF_NAME + "_merge_all_vcfs.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ files=`echo {input.bcf} | awk '{{print NF}}'` @@ -340,7 +340,7 @@ rule index_merged_vcf: log: "results/logs/9_merge_vcfs/" + REF_NAME + ".all_index_merged_vcfs.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools index -o {output.index} {input.bcf} 2> {log} @@ -359,7 +359,7 @@ rule merged_vcf_stats: log: "results/logs/9_merge_vcfs/" + REF_NAME + ".all_merged_vcf_stats.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools stats {input.merged} > {output.stats} 2> {log} @@ -400,7 +400,7 @@ rule filter_vcf_biallelic: log: "results/logs/9_merge_vcfs/" + REF_NAME + ".all_filter_vcf_biallelic.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools view -m2 -M2 -v snps -Ob -o {output.bcf} {input.bcf} 2> {log} && @@ -420,7 +420,7 @@ rule biallelic_filtered_vcf_stats: log: "results/logs/9_merge_vcfs/" + REF_NAME + ".all_biallelic_filtered_vcf_stats.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools stats {input.bcf} > {output.stats} 2> {log} @@ -462,7 +462,7 @@ rule filter_vcf_missing: log: "results/logs/9_merge_vcfs/" + REF_NAME + ".all_fmissing{fmiss}_filter_vcf_missing.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ # only include sites with zero missing data @@ -496,7 +496,7 @@ rule remove_chromosomes: log: "results/logs/9_merge_vcfs/" + REF_NAME + ".all_fmissing{fmiss}.autos_remove_chromosomes.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools view {input.bcf} \ @@ -535,7 +535,7 @@ rule extract_historical_samples: log: "results/logs/9_merge_vcfs/" + REF_NAME + ".historical_fmissing{fmiss}.{chr}_extract_historical_samples.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" params: samples=hist_sm, all_samples=ALL_SAMPLES, @@ -568,7 +568,7 @@ rule extract_modern_samples: log: "results/logs/9_merge_vcfs/" + REF_NAME + ".modern_fmissing{fmiss}.{chr}_extract_modern_samples.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" params: samples=mod_sm, all_samples=ALL_SAMPLES, @@ -600,7 +600,7 @@ rule missingness_filtered_vcf_stats: log: "results/logs/9_merge_vcfs/" + REF_NAME + ".{dataset}_fmissing{fmiss}.{chr}_missingness_filtered_vcf_stats.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools stats {input.merged} > {output.stats} 2> {log} @@ -642,7 +642,7 @@ rule repmasked_bcf2vcf: log: "results/logs/9_merge_vcfs/{dataset}/" + REF_NAME + "/{sample}.{processed}_repmasked_bcf2vcf.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools convert -O z -o {output.vcf} {input.bcf} 2> {log} @@ -677,7 +677,7 @@ rule biallelic_missing_filtered_vcf_stats: log: "results/logs/9_merge_vcfs/{dataset}/" + REF_NAME + "/{sample}.{processed}_fmissing{fmiss}.{chr}_biallelic_missing_filtered_vcf_stats.log", singularity: - "docker://quay.io/biocontainers/bcftools:1.9--h68d8f2e_9" + "https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0" shell: """ bcftools stats {input.filtered} > {output.stats} 2> {log} diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 4867348..b0d252b 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -24,11 +24,11 @@ REF_NAME, REF_EXT = os.path.splitext(REF_FASTA) ### Global wildcard contraints that apply to all rules wildcard_constraints: - sample="[A-Za-z0-9]+", - DP="[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?", # avoid extension with "rm.autos" when running mlRho - fmiss="[0-1].[0-9]+", # avoid extension with ".autos" when filtering the merged BCF file - CpG_method="CpG_[vcfre]{3,6}", - chunk="chunk[0-9]+", + sample=r"[A-Za-z0-9]+", + DP=r"[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?", # avoid extension with "rm.autos" when running mlRho + fmiss=r"[0-1].[0-9]+", # avoid extension with ".autos" when filtering the merged BCF file + CpG_method=r"CpG_[vcfre]{3,6}", + chunk=r"chunk[0-9]+", ### Code to generate lists and dictionaries from the metadata tables