Add doc and data updates for v0.1.7 release

PacificBiosciences · Aug 23, 2023 · 7b06227 · 7b06227
1 parent 3771c77
commit 7b06227
Show file tree

Hide file tree

Showing 17 changed files with 161 additions and 30 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,18 @@
 # Change Log
 
+## v0.1.7 - 2023-08-22
+
+### Added
+- Add new data files to partially support hg19 and hs37d5
+
+### Changed
+- Rename expected copy number example files from male/female to XY/XX
+
+### Fixed
+* New FAQ section to explain common errors
+* Improved error message for cov-regex mismatch
+* Improved error messaging for mismatches between aligned BAM contigs and provided reference file contigs
+
 ## v0.1.6 - 2023-03-29
 ### Additions
 * Added support for minimap2-aligned BAM files

diff --git a/README.md b/README.md
@@ -8,6 +8,8 @@ HiFiCNV is a copy number variant (CNV) caller optimized for HiFi reads. Key feat
 - CNV output in bedgraph and VCF formats
 - Efficient multi-threaded analysis
 
+Authors: [Chris Saunders](https://github.com/ctsa), [Matt Holt](https://github.com/holtjma)
+
 ## Early release warning
 Please note that HiFiCNV is still in early development. 
 We are still tweaking the input / output file formats and making changes that can affect the behavior of the program.
@@ -26,7 +28,7 @@ We are still tweaking the input / output file formats and making changes that ca
 
 ## Need help?
 If you notice any missing features, bugs, or need assistance with analyzing the output of HiFiCNV, 
-please don't hesitate to [reach out by email](mailto:mholt@pacificbiosciences.com) or open a GitHub issue.
+please open a GitHub issue.
 
 ## Support information
 HiFiCNV is a pre-release software intended for research use only and not for use in diagnostic procedures. 

diff --git a/data/excluded_regions/cnv.excluded_regions.hg19.bed.gz b/data/excluded_regions/cnv.excluded_regions.hg19.bed.gz
diff --git a/data/excluded_regions/cnv.excluded_regions.hg19.bed.gz.tbi b/data/excluded_regions/cnv.excluded_regions.hg19.bed.gz.tbi
diff --git a/data/excluded_regions/cnv.excluded_regions.hs37d5.bed.gz b/data/excluded_regions/cnv.excluded_regions.hs37d5.bed.gz
diff --git a/data/excluded_regions/cnv.excluded_regions.hs37d5.bed.gz.tbi b/data/excluded_regions/cnv.excluded_regions.hs37d5.bed.gz.tbi
diff --git a/data/excluded_regions/convert_hg19_regions_to_hs37d5.bash b/data/excluded_regions/convert_hg19_regions_to_hs37d5.bash
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+
+set -o nounset
+
+#
+# This script yields a recommended set of exclusion regions for CNV calling on hs37d5, by
+# converting excluded regions from hg19.
+#
+hg19_excluded_regions=cnv.excluded_regions.hg19.bed.gz
+
+# This script depends on bgzip and tabix, customize these values if they are not already in the path
+bgzip=bgzip
+tabix=tabix
+
+hg19_renamed() {
+  gzip -dc $hg19_excluded_regions |\
+  sed s/^chr// |\
+  awk '$1~/^([1-2]?[0-9]|[XY]|MT)$/'
+}
+
+other() {
+  wget -O - http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz.fai |\
+  awk '$1!~/^([1-2]?[0-9]|[XY]|MT)$/ {printf "%s\t0\t%s\tother\n",$1,$2}'
+}
+
+
+label=cnv.excluded_regions.hs37d5
+
+cat <(hg19_renamed) <(other) | $bgzip -c >| $label.bed.gz
+$tabix -p bed $label.bed.gz
+
diff --git a/data/excluded_regions/get_cnv_exclusion_regions.bash b/data/excluded_regions/get_cnv_exclusion_regions.bash
@@ -11,11 +11,11 @@ set -o nounset
 bgzip=bgzip
 tabix=tabix
 
-# The reference tag for the genome version to use. This script may work with other reference versions in the UCSC genome
-# browser database, but this hasn't been tested.
+# The reference tag for the genome version to use. This script has only been tested for ref values 'hg19' and 'hg38'.
+# It may work with other reference versions in the UCSC genome browser database, but this hasn't been tested.
 ref=hg38
 
-outfile=cnv.excluded_regions.hg38.bed.gz
+outfile=cnv.excluded_regions.${ref}.bed.gz
 
 base_url=http://hgdownload.cse.ucsc.edu/goldenPath/$ref
 
@@ -31,11 +31,18 @@ get_gaps() {
   }'
 }
 
-# Get UCSC Centromere track, simplify labels and convert to bed format:
+# Get UCSC Centromere track if it exists, simplify labels and convert to bed format
+#
+# This file is optional because in at least some older genomes, it may not exist. In such cases the centromere
+# regions are annotated in the gap track instead.
+#
 get_centromeres() {
-  wget -O - $base_url/database/centromeres.txt.gz |\
-  gzip -dc |\
-  awk -v OFS='\t' '{print $2,$3,$4,"centromere";}'
+  url=$base_url/database/centromeres.txt.gz
+  if wget --quiet --spider $url; then
+    wget -O - $url |\
+    gzip -dc |\
+    awk -v OFS='\t' '{print $2,$3,$4,"centromere";}'
+  fi
 }
 
 # Get alpha-satellite regions from the UCSC repeatmasker track

diff --git a/data/expected_cn/expected_cn.hg19.XX.bed b/data/expected_cn/expected_cn.hg19.XX.bed
@@ -0,0 +1,6 @@
+chrX	0	2699520	chrX_PAR_1	2
+chrX	2699520	154931043	chrX_uniq_1	2
+chrX	154931043	155260560	chrX_PAR_2	2
+chrY	0	2649520	chrY_PAR_1	0
+chrY	2649520	59034049	chrY_uniq_1	0
+chrY	59034049	59363566	chrY_PAR_2	0
diff --git a/data/expected_cn/expected_cn.hg19.XY.bed b/data/expected_cn/expected_cn.hg19.XY.bed
@@ -0,0 +1,6 @@
+chrX	0	2699520	chrX_PAR_1	2
+chrX	2699520	154931043	chrX_uniq_1	1
+chrX	154931043	155260560	chrX_PAR_2	2
+chrY	0	2649520	chrY_PAR_1	0
+chrY	2649520	59034049	chrY_uniq_1	1
+chrY	59034049	59363566	chrY_PAR_2	0
diff --git a/data/expected_cn/female_expected_cn.hg38.bed → data/expected_cn/expected_cn.hg38.XX.bed b/data/expected_cn/female_expected_cn.hg38.bed → data/expected_cn/expected_cn.hg38.XX.bed
diff --git a/data/expected_cn/male_expected_cn.hg38.bed → data/expected_cn/expected_cn.hg38.XY.bed b/data/expected_cn/male_expected_cn.hg38.bed → data/expected_cn/expected_cn.hg38.XY.bed
diff --git a/data/expected_cn/expected_cn.hs37d5.XX.bed b/data/expected_cn/expected_cn.hs37d5.XX.bed
@@ -0,0 +1,6 @@
+X	0	2699520	X_PAR_1	2
+X	2699520	154931043	X_uniq_1	2
+X	154931043	155260560	X_PAR_2	2
+Y	0	2649520	Y_PAR_1	0
+Y	2649520	59034049	Y_uniq_1	0
+Y	59034049	59363566	Y_PAR_2	0
diff --git a/data/expected_cn/expected_cn.hs37d5.XY.bed b/data/expected_cn/expected_cn.hs37d5.XY.bed
@@ -0,0 +1,6 @@
+X	0	2699520	X_PAR_1	2
+X	2699520	154931043	X_uniq_1	1
+X	154931043	155260560	X_PAR_2	2
+Y	0	2649520	Y_PAR_1	0
+Y	2649520	59034049	Y_uniq_1	1
+Y	59034049	59363566	Y_PAR_2	0
diff --git a/docs/aux_data.md b/docs/aux_data.md
@@ -3,10 +3,38 @@ Details on how HiFiCNV [auxiliary data files](../data) were generated.
 
 ## Excluded Regions
 Excluded regions can optionally be specified with a bed file.
-Two example exclusion files for hg38/GRCh38 are provided here:
 
-* [cnv.excluded_regions.hg38.bed.gz](../data/excluded_regions/cnv.excluded_regions.hg38.bed.gz) - Contains regions that are known to cause artifacts during data processing (e.g. centromeres).  Script to generate this file can be found [here](../data/excluded_regions/get_cnv_exclusion_regions.bash).
-* [cnv.excluded_regions.common_50.hg38.bed.gz](../data/excluded_regions/cnv.excluded_regions.common_50.hg38.bed.gz) - Contains all of the regions in the above file, plus regions that were frequently called as a duplication or deletion in a population.  The additional regions were generated by running HiFiCNV on our population (N=97), and then storing any bin where >50% of the population had a duplication or deletion overlapping that bin.
+### Pre-computed excluded regions files
+
+Several useful exclusions tracks are provided for commonly used genomes, this can be used directly for those genomes,
+or as examples to develop exclusion files for other genomes.
+
+Two pre-computed exclusion files are provided for hg38/GRCh38:
+
+* [cnv.excluded_regions.hg38.bed.gz](../data/excluded_regions/cnv.excluded_regions.hg38.bed.gz) - Contains regions that
+  are known to cause artifacts during data processing (e.g. centromeres). Script to generate this file can be found
+  [here](../data/excluded_regions/get_cnv_exclusion_regions.bash).
+* [cnv.excluded_regions.common_50.hg38.bed.gz](../data/excluded_regions/cnv.excluded_regions.common_50.hg38.bed.gz) -
+  Contains all the regions in the above file, plus regions that were frequently called as a duplication or deletion 
+  in a population.  The additional regions were generated by running HiFiCNV on our population (N=97), and then storing
+  any bin where >50% of the population had a duplication or deletion overlapping that bin. This is the recommended
+  excluded regions track for human sample analysis.
+
+More limited exclusion files are also provided for hg19 and hs37d5:
+
+* [cnv.excluded_regions.hg19.bed.gz](../data/excluded_regions/cnv.excluded_regions.hg19.bed.gz) - Contains regions that
+  are known to cause artifacts during data processing (e.g. centromeres). This file was generated with the following
+  [script](../data/excluded_regions/get_cnv_exclusion_regions.bash), modified with a 'ref' value of 'hg19'.
+* [cnv.excluded_regions.hs37d5.bed.gz](../data/excluded_regions/cnv.excluded_regions.hs37d5.bed.gz) - Contains regions 
+  that are known to cause artifacts during data processing (e.g. centromeres). This file was generated by the following 
+  [script](../data/excluded_regions/convert_hg19_regions_to_hs37d5.bash) which converts chromosome names from the hg19
+   exclusion file, removing hg19 non-canonical contigs and marking those from hs37d5 as excluded.
+
+Note that the common deletion and duplication calls in the population provided for GRCh38 are not available for the
+other reference genomes. To improve CNV precision, it is recommended to either use GRCh38 or create a similar track
+of common population calls for other reference genomes.
+
+### How excluded regions influence copy number calling
 
 All depth bins intersecting an excluded region are removed from the depth bins track. 
 All minor allele frequency evidence intersecting an excluded region are removed from the MAF track.
@@ -16,12 +44,17 @@ unknown copy-number state -- the probability of all other copy number states are
 state. This means that a copy number change can span through a short excluded region if there is sufficient evidence on
 the left or right flank, but longer excluded regions should be segmented into an unknown state.
 
-### Expected Copy Number
+## Expected Copy Number
+
 By default, HiFiCNV expects each chromosome to have two full copies (e.g. a diploid organism).
 When reporting variants to the output VCF file, it will only report deviations from this expectation.
 However, this expectation is undesirable for some chromosomes (e.g. sex chromosomes) or non-diploid organisms.
 The expectation can be overridden by providing a BED file with expected copy number values.
-Two examples corresponding to male/female in human hg38/GRCh38 are provided here:
+Examples corresponding to XX/XY karyotypes are provided for human GRCh38/hg38, hg19 and hs37d5 references:
 
-* [female_expected_cn.hg38.bed](../data/expected_cn/female_expected_cn.hg38.bed)
-* [male_expected_cn.hg38.bed](../data/expected_cn/male_expected_cn.hg38.bed)
+* [expected_cn.hg38.XX.bed](../data/expected_cn/expected_cn.hg38.XX.bed)
+* [expected_cn.hg38.XY.bed](../data/expected_cn/expected_cn.hg38.XY.bed)
+* [expected_cn.hg19.XX.bed](../data/expected_cn/expected_cn.hg19.XX.bed)
+* [expected_cn.hg19.XY.bed](../data/expected_cn/expected_cn.hg19.XY.bed)
+* [expected_cn.hs37d5.XX.bed](../data/expected_cn/expected_cn.hs37d5.XX.bed)
+* [expected_cn.hs37d5.XY.bed](../data/expected_cn/expected_cn.hs37d5.XY.bed)
diff --git a/docs/outputs.md b/docs/outputs.md
@@ -1,18 +1,30 @@
 # Output files
-All outputs are based on the provided `{OUTPUT_PREFIX}` and a inferred `{sample_name}`.
+All outputs are based on the provided `{OUTPUT_PREFIX}` and an inferred `{sample_name}`.
 The `{sample_name}` is extracted from the alignment file. 
 The name is taken from the first `@RG` tag in the alignment file header including a sample name.
 
 ## Primary outputs
 * `{OUTPUT_PREFIX}.{sample_name}.vcf.gz` - the primary VCF output containing copy number variant calls for the sample
-* `{OUTPUT_PREFIX}.{sample_name}.depth.bw` - a bigwig file containing the depth measurements
+* `{OUTPUT_PREFIX}.{sample_name}.depth.bw` - a bigwig depth track
 * `{OUTPUT_PREFIX}.{sample_name}.copynum.bedgraph` - the copy number values calculated for each region
 
 ## Secondary outputs
 * `{OUTPUT_PREFIX}.log` - the log file generated from running HiFiCNV
 * `{OUTPUT_PREFIX}.{sample_name}.maf.bw` - a bigwig file containing the minor allele frequency measurements, only generated if a VCF file is provided
 
+## Debug outputs
+Additional outputs related to GC correction can be obtained with the `--debug-gc-correction` option, these are debug
+outputs and may change in future updates:
+* `{OUTPUT_PREFIX}.gc_frac.bw` - A bigwig track of GC fraction windows (from the reference sequence) shared across all
+samples.
+* `{OUTPUT_PREFIX}.{sample_name}.gc_scaled_depth.bw` - A bigwig depth track, similar to the standard bigwig depth output 
+except that all depths are scaled by their region's GC correction factor. Note that the internal segmentation model uses
+GC correction factors directly instead of these adjusted depths, so these depths are only used for visualization.
+* `{OUTPUT_PREFIX}.{sample_name}.gc_correction_table.tsv` - Sample GC correction factors as a function of GC fraction
+* `{OUTPUT_PREFIX}.{sample_name}.gc_reduction_factor.bw` - A bigwig track of sample GC correction factors by region
+
 ## VCF notes
 HiFiCNV follows VCF format specification 4.2.
 The `QUAL` field is reported as an average of the next-most-likely copy-number state for each bin from the HMM (see Methods).
 It also includes a `TARGET_SIZE` filter flag for events that are smaller than 100kbp.
+This filter can be disabled using the `--disable-vcf-filters` option.
diff --git a/docs/quickstart.md b/docs/quickstart.md
@@ -4,6 +4,7 @@ Table of contents:
 * [Quickstart](#quickstart)
 * [Supported upstream processes](#supported-upstream-processes)
 * [Output files](./outputs.md)
+* [FAQ](#faq)
 
 # Quickstart
 ```
@@ -19,7 +20,7 @@ Parameters:
 * `{BAM}` - a BAM file containing reads from the sample
 * `{REF_FASTA}` - a FASTA file containing the reference genome, gzip allowed
 * `{EXCLUDE}` - a BED file of excluded regions, recommended for hg38: [cnv.excluded_regions.common_50.hg38.bed.gz](../data/excluded_regions/cnv.excluded_regions.common_50.hg38.bed.gz)
-* `{EXPECTED_CNV}` - a BED file containing regions with deviant copy number expectations (two copy is the default if unspecified), male and female expectation files are proved in the [expected_cn](../data/expected_cn) folder
+* `{EXPECTED_CNV}` - a BED file specifying expected copy-number by region (two copy is the default if unspecified), example files for human XX/XY karyotypes are provided in the [expected_cn](../data/expected_cn) folder
 * `{THREADS}` - number of threads to use
 * `{OUTPUT_PREFIX}` - the prefix for all output files
 
@@ -29,35 +30,35 @@ See [auxiliary file generation](aux_data.md) for details on the above recommende
 
 ## Example
 
-Example of running HiFiCNV on an HG002 (male) WGS sample.  An optional VCF file is provided for minor-allele frequency outputs in this example:
+Example of running HiFiCNV on an HG002 WGS sample.  An optional VCF file is provided for minor-allele frequency outputs in this example:
 
 ```
 $ hificnv \
 >     --bam /path/to/HG002.GRCh38.deepvariant.haplotagged.bam \
 >     --maf /path/to/HG002.GRCh38.deepvariant.phased.vcf.gz \
 >     --ref /path/to/human_GRCh38_no_alt_analysis_set.fasta \
->     --exclude /path/to/common_events.exclude-merged.bed.gz \
->     --expected-cn /path/to/male_ecn.bed \
+>     --exclude /path/to/cnv.excluded_regions.common_50.hg38.bed.gz \
+>     --expected-cn /path/to/expected_cn.hg38.XY.bed \
 >     --threads 8 \
->     --output-prefix HG002_male
+>     --output-prefix dtracks
 [2022-09-29][06:22:57][hificnv][INFO] Starting hificnv
-[2022-09-29][06:22:57][hificnv][INFO] cmdline: hificnv --bam /path/to/HG002.GRCh38.deepvariant.haplotagged.bam --ref /path/to/human_GRCh38_no_alt_analysis_set.fasta --exclude /path/to/common_events.exclude-merged.bed.gz --expected-cn /path/to/male_ecn.bed --threads 8 --output-prefix HG002_male
+[2022-09-29][06:22:57][hificnv][INFO] cmdline: hificnv --bam /path/to/HG002.GRCh38.deepvariant.haplotagged.bam --ref /path/to/human_GRCh38_no_alt_analysis_set.fasta --exclude /path/to/cnv.excluded_regions.common_50.hg38.bed.gz --expected-cn /path/to/expected_cn.hg38.XY.bed --threads 8 --output-prefix dtracks
 [2022-09-29][06:22:57][hificnv][INFO] Running on 8 threads
 [2022-09-29][06:22:57][hificnv][INFO] Reading reference genome from file '/path/to/human_GRCh38_no_alt_analysis_set.fasta'
-[2022-09-29][06:23:29][hificnv][INFO] Reading excluded regions from file '/path/to/common_events.exclude-merged.bed.gz'
-[2022-09-29][06:23:29][hificnv][INFO] Reading expected CN regions from file '/path/to/male_ecn.bed'
+[2022-09-29][06:23:29][hificnv][INFO] Reading excluded regions from file '/path/to/cnv.excluded_regions.common_50.hg38.bed.gz'
+[2022-09-29][06:23:29][hificnv][INFO] Reading expected CN regions from file '/path/to/expected_cn.hg38.XY.bed'
 [2022-09-29][06:23:29][hificnv][INFO] Processing alignment file '/path/to/HG002.GRCh38.deepvariant.haplotagged.bam'
-[2022-09-29][06:37:47][hificnv][INFO] Writing depth track to bigwig file: 'HG002_male.HG002.depth.bw'
+[2022-09-29][06:37:47][hificnv][INFO] Writing depth track to bigwig file: 'dtracks.HG002.depth.bw'
 [2022-09-29][06:37:48][hificnv][INFO] Scanning minor allele frequency data from file '/path/to/HG002.GRCh38.deepvariant.phased.vcf.gz'
-[2022-09-29][06:38:01][hificnv][INFO] Writing bigwig maf track to file: 'HG002_male.HG002.maf.bw'
+[2022-09-29][06:38:01][hificnv][INFO] Writing bigwig maf track to file: 'dtracks.HG002.maf.bw'
 [2022-09-29][06:38:18][hificnv][INFO] Segmenting copy number
 [2022-09-29][06:38:18][hificnv][INFO] Haploid coverage estimates for sample 'HG002', iteration 1. Uncorrected: 14.955 GC-Corrected: 15.708
 [2022-09-29][06:38:21][hificnv][INFO] Haploid coverage estimates for sample 'HG002', iteration 2. Uncorrected: 14.955 GC-Corrected: 15.708
-[2022-09-29][06:38:23][hificnv][INFO] Writing bedgraph copy number track to file: 'HG002_male.HG002.copynum.bedgraph'
-[2022-09-29][06:38:23][hificnv][INFO] Writing copy number variants to file: 'HG002_male.HG002.vcf.gz'
+[2022-09-29][06:38:23][hificnv][INFO] Writing bedgraph copy number track to file: 'dtracks.HG002.copynum.bedgraph'
+[2022-09-29][06:38:23][hificnv][INFO] Writing copy number variants to file: 'dtracks.HG002.vcf.gz'
 [2022-09-29][06:38:23][hificnv][INFO] hificnv completed. Total Runtime: 00:15:26.323
 $ ls
-HG002_male.HG002.copynum.bedgraph  HG002_male.HG002.depth.bw  HG002_male.HG002.maf.bw  HG002_male.HG002.vcf.gz  HG002_male.log
+dtracks.HG002.copynum.bedgraph  dtracks.HG002.depth.bw  dtracks.HG002.maf.bw  dtracks.HG002.vcf.gz  dtracks.log
 ```
 
 These tracks visualized in IGV appear as follows:
@@ -74,3 +75,11 @@ The following upstream processes are supported as inputs to HiFiCNV:
   * [DeepVariant](https://github.com/google/deepvariant) - for SNV/indel
 
 Other upstream processes may work with HiFiCNV, but there is no official support for them at this time.
+
+# FAQ
+## What does the error "Diploid chromosome regex does not match any sample chromosome names" mean?
+By default, HiFiCNV tries to find autosomes for determining normal single-copy coverage in a sample.
+The default regular expression searches for contigs named like "chr1" or "11" and uses those for normalization.
+If your reference is using a different contig labeling system (e.g. an NCBI name), then you will need to alter the `--cov-regex` option.
+Using `--cov-regex "."` is the easiest option, as it will match _all_ chromosomes available in your files.
+However, this can lead to subtle differences in the output files, primarily because it may match sex chromosomes (which are not always diploid) or decoy/alt contigs that are frequently packaged in reference genome files.