Skip to content

Commit

Permalink
Merge pull request #35 from NBISweden/dev
Browse files Browse the repository at this point in the history
Bug fixes and updates for GenErode version 0.5.0
  • Loading branch information
verku authored Feb 15, 2023
2 parents 485b645 + 3970634 commit 1e6dd36
Show file tree
Hide file tree
Showing 13 changed files with 356 additions and 71 deletions.
9 changes: 5 additions & 4 deletions .github/workflows/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,16 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3

- uses: eWaterCycle/setup-singularity@v7
- uses: eWaterCycle/setup-apptainer@v2
with:
singularity-version: 3.8.3
apptainer-version: 1.1.4

- uses: conda-incubator/setup-miniconda@v2
with:
mamba-version: "*"
miniforge-variant: Mambaforge
channels: conda-forge,bioconda
activate-environment: generode
environment-file: environment.yml
auto-activate-base: false
Expand Down
2 changes: 1 addition & 1 deletion .test/config/config_mitogenomes.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#################################################################
#################################################################
# Configuration settings for the GenErode pipeline 0.4.2 #
# Configuration settings for the GenErode pipeline 0.5.0 #
# for ancient or historical samples, and modern samples #
#################################################################
#################################################################
Expand Down
2 changes: 1 addition & 1 deletion .test/config/config_mlRho_options.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#################################################################
#################################################################
# Configuration settings for the GenErode pipeline 0.4.2 #
# Configuration settings for the GenErode pipeline 0.5.0 #
# for ancient or historical samples, and modern samples #
#################################################################
#################################################################
Expand Down
2 changes: 1 addition & 1 deletion .test/config/config_pca_roh.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#################################################################
#################################################################
# Configuration settings for the GenErode pipeline 0.4.2 #
# Configuration settings for the GenErode pipeline 0.5.0 #
# for ancient or historical samples, and modern samples #
#################################################################
#################################################################
Expand Down
2 changes: 1 addition & 1 deletion .test/config/config_snpeff_gerp.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#################################################################
#################################################################
# Configuration settings for the GenErode pipeline 0.4.2 #
# Configuration settings for the GenErode pipeline 0.5.0 #
# for ancient or historical samples, and modern samples #
#################################################################
#################################################################
Expand Down
10 changes: 2 additions & 8 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# This is the Snakefile of the GenErode pipeline for historical or #
# ancient and modern samples to study patterns of genome erosion #
# #
# Pipeline version 0.4.2 #
# Pipeline version 0.5.0 #
# #
# Written by Verena Kutschera, Marcin Kierczak and Tom van der Valk #
# Email: generode@nbis.se #
Expand Down Expand Up @@ -321,12 +321,6 @@ if (config["bam_rmdup_realign_indels"]
shell(
r"""
snakemake --unlock --cores 1 &&
snakemake --report report_raw.html --cores 1 &&
echo "Editing the report HTML file..." &&
grep -v "Click the nodes" report_raw.html | sed 's/max-width: 100vw;/max-width: 50%; border: 1px solid #ccc;/g' \
| sed 's/margin-left: auto;/margin-left: 1;/g' | sed 's/Workflow/GenErode pipeline report/g' | sed 's/"workflow"/"pipeline"/g' \
| sed 's/.panel#workflow/.panel#pipeline/g' > GenErode_pipeline_report.html &&
rm report_raw.html
echo "Report HTML file successfully edited"
snakemake --report GenErode_pipeline_report.html --cores 1
""")
all_outputs.append("GenErode_pipeline_report.html")
2 changes: 1 addition & 1 deletion config/config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#################################################################
#################################################################
# Configuration settings for the GenErode pipeline 0.4.2 #
# Configuration settings for the GenErode pipeline 0.5.0 #
# for ancient or historical samples, and modern samples #
#################################################################
#################################################################
Expand Down
4 changes: 2 additions & 2 deletions config/cluster.yaml → config/slurm/cluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,13 @@ historical_merged_mito_bams_group:
map_historical:
time: 10-00:00:00
cpus-per-task: 8
convert_historical_sam:
sai2bam:
time: 10-00:00:00
cpus-per-task: 8
map_modern:
time: 10-00:00:00
cpus-per-task: 8
sorted_bam_stats_group:
sorted_bam_qualimap:
time: 4-00:00:00
cpus-per-task: 8
### process bam files
Expand Down
287 changes: 287 additions & 0 deletions config/slurm/profile/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,287 @@
cluster-cancel: "scancel"
restart-times: "3"
jobscript: "slurm-jobscript.sh"
cluster: "slurm-submit.py"
cluster-status: "slurm-status.py"
max-jobs-per-second: "10"
max-status-checks-per-second: "10"
local-cores: 1
latency-wait: "5"
use-conda: "False"
use-singularity: "True"
jobs: "100"
printshellcmds: "True"

# Resource configuration for GenErode pipeline jobs, fine-tune if necessary
default-resources:
- runtime=120
- mem_mb=6400
- disk_mb=1000000
## set-threads: map rule names to threads
set-threads:
### repeat identification
- repeatmodeler=16
- repeatclassifier=2
- repeatmasker=16
### fastq processing
- historical_fastq_before_group=2
- fastqc_historical_raw=2
- fastqc_modern_raw=2
- fastp_historical=4
- fastp_modern=4
- fastqc_historical_merged=2
- fastqc_historical_unmerged=2
- fastqc_modern_trimmed=2
### map to reference genome
- map_historical=8
- sai2bam=8
- map_modern=8
- sorted_bam_qualimap=8
### process bam files
- merge_historical_bams_per_index=2
- merge_modern_bams_per_index=2
- merged_index_bam_qualimap=8
- rmdup_historical_bams=6
- rmdup_modern_bams=2
- rmdup_bam_qualimap=8
- merge_historical_bams_per_sample=2
- merge_modern_bams_per_sample=2
- merged_sample_bam_qualimap=8
- indel_realigner_targets=8
- indel_realigner=8
- realigned_bam_qualimap=8
- realigned_bam_group=2
### rescale bam files
- rescale_historical=4
- rescaled_bam_qualimap=8
- rescaled_bam_group=2
### subsample bam files
- subsample_bams=2
- subsampled_bam_qualimap=6
### genotyping
- variant_calling=3
### identify CpG sites
- CpG_genotype_bed_formatting_group=4
- all_CpG_bed_formatting_group=2
- make_noCpG_bed=2
- CpG_repeats_bed_formatting_group=2
- merge_noCpG_noRepeats_beds=2
- make_noCpG_repma_bed=2
### create chromosome bed files
- intersect_sexchr_repma_beds=2
- intersect_autos_repma_beds=2
- intersect_sexchr_CpG_repma_beds=2
- intersect_autos_CpG_repma_beds=2
### CpG filter VCF files
- remove_CpG_vcf=6
- CpG_vcf2bcf=2
### process VCF files
- remove_snps_near_indels=2
- filter_vcfs_qual_dp=2
- filter_vcfs_allelic_balance=2
- remove_repeats_vcf=6
- filtered_vcf2bcf=2
### merge and process VCF files
- merge_all_vcfs=6
- filter_vcf_biallelic=2
- filter_vcf_missing=2
### PCA
- vcf2plink_pca=2
### runs of homozygosity
- filter_vcf_hwe=2
- vcf2plink_hwe=2
### snpEff
- repmasked_bcf2vcf_snpEff=2
- filter_biallelic_missing_vcf_snpEff=6
### gerp
- outgroup_fastqc=2
- align2target=8
- bam2fasta=2
- concatenate_fasta_per_contig=2
- compute_gerp=4
- gerp2coords=2
- get_ancestral_state=2
- produce_contig_out=2
- merge_per_chunk=2
- merge_gerp_gz=2
- repmasked_bcf2vcf_gerp=2
- filter_biallelic_missing_vcf_gerp=6
- gerp_derived_alleles=2
- merge_gerp_alleles_per_chunk=4
- merge_gerp_alleles_gz=4
- relative_mutational_load_per_sample=2
## set-resources: map rule names to resources in general
set-resources:
### repeat identification
- repeatmodeler:runtime=8640
- repeatmodeler:mem_mb=102400
- repeatclassifier:runtime=1440
- repeatclassifier:mem_mb=12800
- repeatmasker:runtime=8640
- repeatmasker:mem_mb=102400
### fastq processing
- historical_fastq_before_group:runtime=600
- historical_fastq_before_group:mem_mb=12800
- fastqc_historical_raw:mem_mb=12800
- fastqc_modern_raw:mem_mb=12800
- fastp_historical:runtime=600
- fastp_historical:mem_mb=25600
- fastp_modern:runtime=600
- fastp_modern:mem_mb=25600
- fastqc_historical_merged:mem_mb=12800
- fastqc_historical_unmerged:mem_mb=12800
- fastqc_modern_trimmed:mem_mb=12800
### map to mitochondrial genomes
- map_historical_merged_to_mito:runtime=360
- map_historical_unmerged_to_mito:runtime=360
- historical_mito_bams_group:runtime=2880
- merge_historical_mitogenome_bams_per_sample:runtime=2880
- historical_merged_mito_bams_group:runtime=2880
### map to reference genome
- map_historical:runtime=14400
- map_historical:mem_mb=51200
- sai2bam:runtime=14400
- sai2bam:mem_mb=51200
- map_modern:runtime=14400
- map_modern:mem_mb=51200
- sorted_bam_qualimap:runtime=5760
- sorted_bam_qualimap:mem_mb=51200
### process bam files
- merge_historical_bams_per_index:runtime=2880
- merge_historical_bams_per_index:mem_mb=12800
- merge_modern_bams_per_index:runtime=2880
- merge_modern_bams_per_index:mem_mb=12800
- merged_index_bam_qualimap:runtime=5760
- merged_index_bam_qualimap:mem_mb=51200
- rmdup_historical_bams:runtime=4320
- rmdup_historical_bams:mem_mb=38400
- rmdup_modern_bams:runtime=4320
- rmdup_modern_bams:mem_mb=12800
- rmdup_bam_qualimap:runtime=5760
- rmdup_bam_qualimap:mem_mb=51200
- merge_historical_bams_per_sample:runtime=2880
- merge_historical_bams_per_sample:mem_mb=12800
- merge_modern_bams_per_sample:runtime=2880
- merge_modern_bams_per_sample:mem_mb=12800
- merged_sample_bam_qualimap:runtime=5760
- merged_sample_bam_qualimap:mem_mb=51200
- indel_realigner_targets:runtime=7200
- indel_realigner_targets:mem_mb=51200
- indel_realigner:runtime=7200
- indel_realigner:mem_mb=51200
- realigned_bam_qualimap:runtime=5760
- realigned_bam_qualimap:mem_mb=51200
- realigned_bam_group:runtime=2880
- realigned_bam_group:mem_mb=12800
### rescale bam files
- rescale_historical:runtime=4320
- rescale_historical:mem_mb=25600
- rescaled_bam_qualimap:runtime=5760
- rescaled_bam_qualimap:mem_mb=51200
- rescaled_bam_group:runtime=600
- rescaled_bam_group:mem_mb=12800
### subsample bam files
- filter_bam_mapped_mq:runtime=1440
- subsample_bams:runtime=5760
- subsample_bams:mem_mb=12800
- subsampled_bam_qualimap:runtime=5760
- subsampled_bam_qualimap:mem_mb=38400
- subsampled_bam_group:runtime=2880
- subsampled_bam_group:runtime=2880
### genotyping
- variant_calling:runtime=2880
- variant_calling:mem_mb=19200
- sort_vcfs:runtime=1440
### identify CpG sites
- sorted_bcf2vcf:runtime=300
- make_CpG_genotype_bed:runtime=1440
- CpG_genotype_bed_formatting_group:runtime=1440
- CpG_genotype_bed_formatting_group:mem_mb=25600
- all_CpG_bed_formatting_group:runtime=1440
- all_CpG_bed_formatting_group:mem_mb=12800
- make_noCpG_bed:runtime=300
- make_noCpG_bed:mem_mb=12800
- CpG_repeats_bed_formatting_group:runtime=1440
- CpG_repeats_bed_formatting_group:mem_mb=12800
- merge_noCpG_noRepeats_beds:runtime=300
- merge_noCpG_noRepeats_beds:mem_mb=12800
- make_noCpG_repma_bed:runtime=300
- make_noCpG_repma_bed:mem_mb=12800
### create chromosome bed files
- make_autosomes_bed:runtime=300
- intersect_sexchr_repma_beds:mem_mb=12800
- intersect_autos_repma_beds:mem_mb=12800
- intersect_sexchr_CpG_repma_beds:mem_mb=12800
- intersect_autos_CpG_repma_beds:mem_mb=12800
### mlRho
- bam2pro_autos:runtime=1440
- bam2pro_sexchr:runtime=1440
- bam2pro_all:runtime=1440
### CpG filter VCF files
- remove_CpG_vcf:mem_mb=38400
- CpG_vcf2bcf:mem_mb=12800
### process VCF files
- remove_snps_near_indels:mem_mb=12800
- filter_vcfs_qual_dp:mem_mb=12800
- filter_vcfs_allelic_balance:mem_mb=12800
- remove_repeats_vcf:mem_mb=38400
- filtered_vcf2bcf:mem_mb=12800
### merge and process VCF files
- merge_all_vcfs:runtime=4320
- merge_all_vcfs:mem_mb=38400
- filter_vcf_biallelic:runtime=1440
- filter_vcf_biallelic:mem_mb=12800
- filter_vcf_missing:runtime=1440
- filter_vcf_missing:mem_mb=12800
- extract_historical_samples:runtime=300
- extract_modern_samples:runtime=300
### PCA
- vcf2plink_pca:runtime=300
- vcf2plink_pca:mem_mb=12800
### runs of homozygosity
- filter_vcf_hwe:runtime=300
- filter_vcf_hwe:mem_mb=12800
- vcf2plink_hwe:runtime=300
- vcf2plink_hwe:mem_mb=12800
### snpEff
- repmasked_bcf2vcf_snpEff:runtime=300
- repmasked_bcf2vcf_snpEff:mem_mb=12800
- build_snpEff_db:runtime=300
- filter_biallelic_missing_vcf_snpEff:runtime=1440
- filter_biallelic_missing_vcf_snpEff:mem_mb=38400
- annotate_vcf:runtime=300
### gerp
- outgroups2fastq:runtime=1440
- outgroup_fastqc:mem_mb=12800
- align2target:runtime=4320
- align2target:mem_mb=51200
- bam2fasta:runtime=1440
- bam2fasta:mem_mb=12800
- split_ref_contigs:runtime=1440
- concatenate_fasta_per_contig:runtime=1440
- concatenate_fasta_per_contig:mem_mb=12800
- compute_gerp:runtime=1440
- compute_gerp:mem_mb=25600
- gerp2coords:runtime=1440
- gerp2coords:mem_mb=12800
- get_ancestral_state:runtime=1440
- get_ancestral_state:mem_mb=12800
- produce_contig_out:runtime=1440
- produce_contig_out:mem_mb=12800
- merge_per_chunk:runtime=1440
- merge_per_chunk:mem_mb=12800
- merge_gerp_gz:runtime=1440
- merge_gerp_gz:mem_mb=12800
- repmasked_bcf2vcf_gerp:runtime=300
- repmasked_bcf2vcf_gerp:mem_mb=12800
- filter_biallelic_missing_vcf_gerp:runtime=1440
- filter_biallelic_missing_vcf_gerp:mem_mb=38400
- split_vcf_files:runtime=300
- gerp_derived_alleles:runtime=14400
- gerp_derived_alleles:mem_mb=12800
- merge_gerp_alleles_per_chunk:runtime=1440
- merge_gerp_alleles_per_chunk:mem_mb=25600
- merge_gerp_alleles_gz:runtime=1440
- merge_gerp_alleles_gz:mem_mb=25600
- relative_mutational_load_per_sample:runtime=1440
- relative_mutational_load_per_sample:mem_mb=12800
4 changes: 2 additions & 2 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ channels:
- bioconda
- conda-forge
dependencies:
- python=3.7.6
- snakemake=6.12.1
- python=3.8.15
- snakemake=7.20.0
- biopython=1.76
- matplotlib=3.4.2
- pandas=1.0.3
Expand Down
Loading

0 comments on commit 1e6dd36

Please sign in to comment.