Skip to content

Commit

Permalink
Merge pull request #57 from NBISweden/splitgenome
Browse files Browse the repository at this point in the history
Optional removal of sex chromosomes from VCF files
  • Loading branch information
verku authored Oct 17, 2023
2 parents f1865a9 + 3035286 commit 5b58893
Show file tree
Hide file tree
Showing 20 changed files with 813 additions and 701 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,5 @@ tmpConsensi.fa
.test/data/references/*pac
.test/data/references/*sa
.test/data/references/*genome
.test/data/references/*upper.fasta
.test/data/references/*upper.fasta
.test/data/references/gerp
55 changes: 28 additions & 27 deletions .test/config/config_mitogenomes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,21 @@
# The file name will be reused by the pipeline and can have the file
# name extensions *.fasta, *.fa or *.fna.
ref_path: ".test/data/references/sumatran_rhino.fasta"

# OPTIONAL:
# Relative path (from the main pipeline directory) to file listing
# scaffolds/contigs linked to sex chromosomes (one scaffold/contig
# name per line).
# The file is used to create BED files for running mlRho separately
# for autosomes and sex chromosomes or exclusively for autosomes,
# and/or to create autosome-only BCF files for PCA, ROH, snpEff and
# GERP analyses.
# Can also be used to specify any other contigs/scaffolds, e.g.
# unplaced or short scaffolds, for removal from mlRho analysis
# and BCF files.
# Leave empty ("") if identity of sex chromosomes is unknown and/or
# if the pipeline should be run on all scaffolds/contigs of the genome.
sexchromosomes: "" # for example, "config/chrX_candidate_scaffolds.txt"
#################################################################
#################################################################


Expand Down Expand Up @@ -281,26 +296,6 @@ CpG_samplenames: []
# Rules for BAM file processing for mlRho, and mlRho #
#################################################################

#####
# OPTIONAL:
# Generate BED files of autosomes and sex chromosomes for mlRho
# analyses, in case these should be analyzed separately from each
# other (see below for further options).
# Includes intersecting of the new chromosome-specific BED files
# with CpG- and repeat-masking BED files for downstream filtering.
autosome_sexchromosome_bed_files: False

# Relative path (from the main pipeline directory) to file listing
# scaffolds/contigs linked to sex chromosomes (one scaffold/contig
# name per line).
# Leave empty ("") if identity of sex chromosomes is unknown and/or
# if mlRho should be run on all scaffolds/contigs of the genome.
# Keep the path to the file when running the next step (mlRho)
# separately for autosomes and sex chromosomes or only for autosomes.
sexchromosomes: "" # for example, "config/chrX_candidate_scaffolds.txt"
#####


#####
# Run mlRho 2.9 on filtered BAM files.
# Automatically generates a PDF file with a plot of genome-wide
Expand All @@ -318,21 +313,22 @@ mlRho: False
# and/or mlRho should be run on all contigs/scaffolds,
# set mlRho_autosomes_sexchromosomes to False and do not provide
# a path to a text file with sex-chromosomal contigs/scaffolds
# above when running mlRho.
# with the reference genome ("sexchromosomes") when running mlRho.
#
# 2) If the identity of sex-chromosomal contigs/scaffolds is known,
# mlRho analyses can be run for autosomes and sex chromosomes
# separately from each other.
# In that case, set mlRho_autosomes_sexchromosomes to True and
# provide the path to the file with sex-chromosomal contigs/scaffolds
# above when running mlRho.
# with the reference genome ("sexchromosomes") when running mlRho.
#
# 3) If the identity of sex-chromosomal contigs/scaffolds is known,
# sex-chromosomal contigs/scaffolds can be entirely excluded from
# sex-chromosomal contigs/scaffolds (or other contigs/scaffolds such
# as unplaced or short scaffolds) can be entirely excluded from
# the analysis.
# In that case, set mlRho_autosomes_sexchromosomes to False and
# provide the path to the file with sex-chromosomal contigs/scaffolds
# above when running mlRho.
# with the reference genome ("sexchromosomes") when running mlRho.
mlRho_autosomes_sexchromosomes: False
#####
#################################################################
Expand Down Expand Up @@ -365,11 +361,16 @@ vcf_qual_repeat_filtering: False
#####
# Merge BCF files into a BCF file containing all samples and remove all
# sites that are not biallelic and with missing data across all samples
# up to a certain threshold as defined below.
# up to a certain threshold as defined below.
# If the path to a file with sex-chromosomal contigs/scaffolds is provided
# with the reference genome ("sexchromosomes"), these scaffolds/contigs are
# removed from the merged and filtered BCF file and all downstream analyses
# (optional).
# Extract 1) all historical and 2) all modern samples from the merged and
# filtered BCF file.
# Create a BED file of sites that remain after filtering across all samples
# to be used for downstream filtering of individual BCF files.
# Create a BED file of sites that remain after filtering and contig/scaffold
# removal across all samples to be used for downstream filtering of individual
# BCF files.
merge_vcfs_per_dataset: False

# Maximum allowed fraction of missing genotypes across all samples for a
Expand Down
55 changes: 28 additions & 27 deletions .test/config/config_mlRho_options.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,21 @@
# The file name will be reused by the pipeline and can have the file
# name extensions *.fasta, *.fa or *.fna.
ref_path: ".test/data/references/sumatran_rhino.fasta"

# OPTIONAL:
# Relative path (from the main pipeline directory) to file listing
# scaffolds/contigs linked to sex chromosomes (one scaffold/contig
# name per line).
# The file is used to create BED files for running mlRho separately
# for autosomes and sex chromosomes or exclusively for autosomes,
# and/or to create autosome-only BCF files for PCA, ROH, snpEff and
# GERP analyses.
# Can also be used to specify any other contigs/scaffolds, e.g.
# unplaced or short scaffolds, for removal from mlRho analysis
# and BCF files.
# Leave empty ("") if identity of sex chromosomes is unknown and/or
# if the pipeline should be run on all scaffolds/contigs of the genome.
sexchromosomes: "" # for example, "config/chrX_candidate_scaffolds.txt"
#################################################################
#################################################################


Expand Down Expand Up @@ -281,26 +296,6 @@ CpG_samplenames: ["S03", "S08"]
# Rules for BAM file processing for mlRho, and mlRho #
#################################################################

#####
# OPTIONAL:
# Generate BED files of autosomes and sex chromosomes for mlRho
# analyses, in case these should be analyzed separately from each
# other (see below for further options).
# Includes intersecting of the new chromosome-specific BED files
# with CpG- and repeat-masking BED files for downstream filtering.
autosome_sexchromosome_bed_files: False

# Relative path (from the main pipeline directory) to file listing
# scaffolds/contigs linked to sex chromosomes (one scaffold/contig
# name per line).
# Leave empty ("") if identity of sex chromosomes is unknown and/or
# if mlRho should be run on all scaffolds/contigs of the genome.
# Keep the path to the file when running the next step (mlRho)
# separately for autosomes and sex chromosomes or only for autosomes.
sexchromosomes: "" # for example, "config/chrX_candidate_scaffolds.txt"
#####


#####
# Run mlRho 2.9 on filtered BAM files.
# Automatically generates a PDF file with a plot of genome-wide
Expand All @@ -318,21 +313,22 @@ mlRho: True
# and/or mlRho should be run on all contigs/scaffolds,
# set mlRho_autosomes_sexchromosomes to False and do not provide
# a path to a text file with sex-chromosomal contigs/scaffolds
# above when running mlRho.
# with the reference genome ("sexchromosomes") when running mlRho.
#
# 2) If the identity of sex-chromosomal contigs/scaffolds is known,
# mlRho analyses can be run for autosomes and sex chromosomes
# separately from each other.
# In that case, set mlRho_autosomes_sexchromosomes to True and
# provide the path to the file with sex-chromosomal contigs/scaffolds
# above when running mlRho.
# with the reference genome ("sexchromosomes") when running mlRho.
#
# 3) If the identity of sex-chromosomal contigs/scaffolds is known,
# sex-chromosomal contigs/scaffolds can be entirely excluded from
# sex-chromosomal contigs/scaffolds (or other contigs/scaffolds such
# as unplaced or short scaffolds) can be entirely excluded from
# the analysis.
# In that case, set mlRho_autosomes_sexchromosomes to False and
# provide the path to the file with sex-chromosomal contigs/scaffolds
# above when running mlRho.
# with the reference genome ("sexchromosomes") when running mlRho.
mlRho_autosomes_sexchromosomes: False
#####
#################################################################
Expand Down Expand Up @@ -365,11 +361,16 @@ vcf_qual_repeat_filtering: False
#####
# Merge BCF files into a BCF file containing all samples and remove all
# sites that are not biallelic and with missing data across all samples
# up to a certain threshold as defined below.
# up to a certain threshold as defined below.
# If the path to a file with sex-chromosomal contigs/scaffolds is provided
# with the reference genome ("sexchromosomes"), these scaffolds/contigs are
# removed from the merged and filtered BCF file and all downstream analyses
# (optional).
# Extract 1) all historical and 2) all modern samples from the merged and
# filtered BCF file.
# Create a BED file of sites that remain after filtering across all samples
# to be used for downstream filtering of individual BCF files.
# Create a BED file of sites that remain after filtering and contig/scaffold
# removal across all samples to be used for downstream filtering of individual
# BCF files.
merge_vcfs_per_dataset: False

# Maximum allowed fraction of missing genotypes across all samples for a
Expand Down
55 changes: 28 additions & 27 deletions .test/config/config_pca_roh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,21 @@
# The file name will be reused by the pipeline and can have the file
# name extensions *.fasta, *.fa or *.fna.
ref_path: ".test/data/references/sumatran_rhino.fasta"

# OPTIONAL:
# Relative path (from the main pipeline directory) to file listing
# scaffolds/contigs linked to sex chromosomes (one scaffold/contig
# name per line).
# The file is used to create BED files for running mlRho separately
# for autosomes and sex chromosomes or exclusively for autosomes,
# and/or to create autosome-only BCF files for PCA, ROH, snpEff and
# GERP analyses.
# Can also be used to specify any other contigs/scaffolds, e.g.
# unplaced or short scaffolds, for removal from mlRho analysis
# and BCF files.
# Leave empty ("") if identity of sex chromosomes is unknown and/or
# if the pipeline should be run on all scaffolds/contigs of the genome.
sexchromosomes: "" # for example, "config/chrX_candidate_scaffolds.txt"
#################################################################
#################################################################


Expand Down Expand Up @@ -281,26 +296,6 @@ CpG_samplenames: []
# Rules for BAM file processing for mlRho, and mlRho #
#################################################################

#####
# OPTIONAL:
# Generate BED files of autosomes and sex chromosomes for mlRho
# analyses, in case these should be analyzed separately from each
# other (see below for further options).
# Includes intersecting of the new chromosome-specific BED files
# with CpG- and repeat-masking BED files for downstream filtering.
autosome_sexchromosome_bed_files: False

# Relative path (from the main pipeline directory) to file listing
# scaffolds/contigs linked to sex chromosomes (one scaffold/contig
# name per line).
# Leave empty ("") if identity of sex chromosomes is unknown and/or
# if mlRho should be run on all scaffolds/contigs of the genome.
# Keep the path to the file when running the next step (mlRho)
# separately for autosomes and sex chromosomes or only for autosomes.
sexchromosomes: "" # for example, "config/chrX_candidate_scaffolds.txt"
#####


#####
# Run mlRho 2.9 on filtered BAM files.
# Automatically generates a PDF file with a plot of genome-wide
Expand All @@ -318,21 +313,22 @@ mlRho: False
# and/or mlRho should be run on all contigs/scaffolds,
# set mlRho_autosomes_sexchromosomes to False and do not provide
# a path to a text file with sex-chromosomal contigs/scaffolds
# above when running mlRho.
# with the reference genome ("sexchromosomes") when running mlRho.
#
# 2) If the identity of sex-chromosomal contigs/scaffolds is known,
# mlRho analyses can be run for autosomes and sex chromosomes
# separately from each other.
# In that case, set mlRho_autosomes_sexchromosomes to True and
# provide the path to the file with sex-chromosomal contigs/scaffolds
# above when running mlRho.
# with the reference genome ("sexchromosomes") when running mlRho.
#
# 3) If the identity of sex-chromosomal contigs/scaffolds is known,
# sex-chromosomal contigs/scaffolds can be entirely excluded from
# sex-chromosomal contigs/scaffolds (or other contigs/scaffolds such
# as unplaced or short scaffolds) can be entirely excluded from
# the analysis.
# In that case, set mlRho_autosomes_sexchromosomes to False and
# provide the path to the file with sex-chromosomal contigs/scaffolds
# above when running mlRho.
# with the reference genome ("sexchromosomes") when running mlRho.
mlRho_autosomes_sexchromosomes: False
#####
#################################################################
Expand Down Expand Up @@ -365,11 +361,16 @@ vcf_qual_repeat_filtering: False
#####
# Merge BCF files into a BCF file containing all samples and remove all
# sites that are not biallelic and with missing data across all samples
# up to a certain threshold as defined below.
# up to a certain threshold as defined below.
# If the path to a file with sex-chromosomal contigs/scaffolds is provided
# with the reference genome ("sexchromosomes"), these scaffolds/contigs are
# removed from the merged and filtered BCF file and all downstream analyses
# (optional).
# Extract 1) all historical and 2) all modern samples from the merged and
# filtered BCF file.
# Create a BED file of sites that remain after filtering across all samples
# to be used for downstream filtering of individual BCF files.
# Create a BED file of sites that remain after filtering and contig/scaffold
# removal across all samples to be used for downstream filtering of individual
# BCF files.
merge_vcfs_per_dataset: False

# Maximum allowed fraction of missing genotypes across all samples for a
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,21 @@
# The file name will be reused by the pipeline and can have the file
# name extensions *.fasta, *.fa or *.fna.
ref_path: ".test/data/references/sumatran_rhino.fasta"

# OPTIONAL:
# Relative path (from the main pipeline directory) to file listing
# scaffolds/contigs linked to sex chromosomes (one scaffold/contig
# name per line).
# The file is used to create BED files for running mlRho separately
# for autosomes and sex chromosomes or exclusively for autosomes,
# and/or to create autosome-only BCF files for PCA, ROH, snpEff and
# GERP analyses.
# Can also be used to specify any other contigs/scaffolds, e.g.
# unplaced or short scaffolds, for removal from mlRho analysis
# and BCF files.
# Leave empty ("") if identity of sex chromosomes is unknown and/or
# if the pipeline should be run on all scaffolds/contigs of the genome.
sexchromosomes: ".test/config/seq_to_exclude.txt" # for example, "config/chrX_candidate_scaffolds.txt"
#################################################################
#################################################################


Expand Down Expand Up @@ -281,26 +296,6 @@ CpG_samplenames: []
# Rules for BAM file processing for mlRho, and mlRho #
#################################################################

#####
# OPTIONAL:
# Generate BED files of autosomes and sex chromosomes for mlRho
# analyses, in case these should be analyzed separately from each
# other (see below for further options).
# Includes intersecting of the new chromosome-specific BED files
# with CpG- and repeat-masking BED files for downstream filtering.
autosome_sexchromosome_bed_files: False

# Relative path (from the main pipeline directory) to file listing
# scaffolds/contigs linked to sex chromosomes (one scaffold/contig
# name per line).
# Leave empty ("") if identity of sex chromosomes is unknown and/or
# if mlRho should be run on all scaffolds/contigs of the genome.
# Keep the path to the file when running the next step (mlRho)
# separately for autosomes and sex chromosomes or only for autosomes.
sexchromosomes: "" # for example, "config/chrX_candidate_scaffolds.txt"
#####


#####
# Run mlRho 2.9 on filtered BAM files.
# Automatically generates a PDF file with a plot of genome-wide
Expand All @@ -318,21 +313,22 @@ mlRho: False
# and/or mlRho should be run on all contigs/scaffolds,
# set mlRho_autosomes_sexchromosomes to False and do not provide
# a path to a text file with sex-chromosomal contigs/scaffolds
# above when running mlRho.
# with the reference genome ("sexchromosomes") when running mlRho.
#
# 2) If the identity of sex-chromosomal contigs/scaffolds is known,
# mlRho analyses can be run for autosomes and sex chromosomes
# separately from each other.
# In that case, set mlRho_autosomes_sexchromosomes to True and
# provide the path to the file with sex-chromosomal contigs/scaffolds
# above when running mlRho.
# with the reference genome ("sexchromosomes") when running mlRho.
#
# 3) If the identity of sex-chromosomal contigs/scaffolds is known,
# sex-chromosomal contigs/scaffolds can be entirely excluded from
# sex-chromosomal contigs/scaffolds (or other contigs/scaffolds such
# as unplaced or short scaffolds) can be entirely excluded from
# the analysis.
# In that case, set mlRho_autosomes_sexchromosomes to False and
# provide the path to the file with sex-chromosomal contigs/scaffolds
# above when running mlRho.
# with the reference genome ("sexchromosomes") when running mlRho.
mlRho_autosomes_sexchromosomes: False
#####
#################################################################
Expand Down Expand Up @@ -365,11 +361,16 @@ vcf_qual_repeat_filtering: False
#####
# Merge BCF files into a BCF file containing all samples and remove all
# sites that are not biallelic and with missing data across all samples
# up to a certain threshold as defined below.
# up to a certain threshold as defined below.
# If the path to a file with sex-chromosomal contigs/scaffolds is provided
# with the reference genome ("sexchromosomes"), these scaffolds/contigs are
# removed from the merged and filtered BCF file and all downstream analyses
# (optional).
# Extract 1) all historical and 2) all modern samples from the merged and
# filtered BCF file.
# Create a BED file of sites that remain after filtering across all samples
# to be used for downstream filtering of individual BCF files.
# Create a BED file of sites that remain after filtering and contig/scaffold
# removal across all samples to be used for downstream filtering of individual
# BCF files.
merge_vcfs_per_dataset: False

# Maximum allowed fraction of missing genotypes across all samples for a
Expand Down
1 change: 1 addition & 0 deletions .test/config/seq_to_exclude.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Sc9M7eS_1280_HRSCAF_1917_split_75000
Loading

0 comments on commit 5b58893

Please sign in to comment.