Skip to content

Commit

Permalink
Resolve ensemble calling issues: ensure input VCFs have consistent sa…
Browse files Browse the repository at this point in the history
…mple headers and correctly prioritize reads with depth genotype annotations. Thanks to Shalabh Suman
  • Loading branch information
chapmanb committed Mar 26, 2014
1 parent fc5bac4 commit ae632b0
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 10 deletions.
8 changes: 8 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
## 0.1.6 (in progress)

- Ensure Ensemble input files have consistent VCF headers for multi population
variant calling. Thanks to Shalabh Suman.
- Prioritize depth attribute when choosing representative callset from multiple
choices during Ensemble calling. Thanks to Shalabh Suman.
- Avoid identifying mitochondrial-only test datasets as haploid reference genomes.

## 0.1.5 (15 March 2014)

- Move to MIT licensed GATK 3.0 framework.
Expand Down
2 changes: 1 addition & 1 deletion project.clj
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
(defproject bcbio.variation "0.1.5"
(defproject bcbio.variation "0.1.6-SNAPSHOT"
:description "Toolkit to analyze genomic variation data, built on the GATK with Clojure"
:license {:name "MIT" :url "http://www.opensource.org/licenses/mit-license.html"}
:dependencies [[org.clojure/clojure "1.5.1"]
Expand Down
18 changes: 18 additions & 0 deletions src/bcbio/variation/ensemble.clj
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,23 @@
(spit out-file))
out-file))

(defn first-mismatch
"Return the first mismatched pair in the two sequences"
[c1 c2]
(first (drop-while (fn [[x1 x2]] (= x1 x2)) (map vector c1 c2))))

(defn- check-vcf-headers
"Ensure consistent headers for multi-sample inputs to Ensemble calling."
[vcf-files]
(reduce (fn [samples cmp-vcf]
(let [cmp-samples (gvc/get-samples cmp-vcf)]
(if (= samples cmp-samples)
samples
(throw (Exception. (str "VCF files do not have consistent headers: "
(vec (map fs/base-name vcf-files))
"\nFirst mismatch:" (first-mismatch samples cmp-samples)))))))
(gvc/get-samples (first vcf-files)) (rest vcf-files)))

(defn consensus-calls
"Provide a finalized set of consensus calls from multiple inputs.
Handles cleaning up and normalizing input files, generating consensus
Expand All @@ -90,6 +107,7 @@
ref-file (fsp/abspath ref-file)
dirs (setup-work-dir out-file)
config-file (create-ready-config vrn-files ref-file in-config dirs)]
(check-vcf-headers vrn-files)
(compare/variant-comparison-from-config config-file)
(let [prep-file (first (fs/glob (str (io/file (:prep dirs) "*cfilter.vcf"))))]
(assert prep-file (str "Did not find prepped and filtered consensus file. "
Expand Down
6 changes: 4 additions & 2 deletions src/bcbio/variation/phasing.clj
Original file line number Diff line number Diff line change
Expand Up @@ -452,12 +452,14 @@

(defn is-haploid?
"Is the provided VCF file a haploid genome (one genotype or all homozygous).
Samples the first set of variants, checking for haploid calls."
Samples the first set of variants, checking for haploid calls.
Avoids calling haploid based on chrM-only calls, used during testing."
[vcf-file ref-file]
(let [sample-size 10]
(letfn [(is-vc-haploid? [vc]
(when-not (= 0 (:num-samples vc))
(= 1 (apply max (map #(count (:alleles %)) (:genotypes vc))))))]
(when-not (contains? #{"chrM" "MT" "M" "chrMT"} (:chr vc))
(= 1 (apply max (map #(count (:alleles %)) (:genotypes vc)))))))]
(with-open [vcf-iter (get-vcf-iterator vcf-file ref-file)]
(let [vcf-iter (parse-vcf vcf-iter)]
(if-not (empty? vcf-iter)
Expand Down
12 changes: 5 additions & 7 deletions src/bcbio/variation/recall.clj
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@
(map vec)
(into {}))
g (->> (:genotypes vc)
(filter #(= sample (:sample-name %)))
first)]
(filter #(= sample (:sample-name %)))
first)]
(when g
{:sample-name sample
:qual (:qual vc)
Expand All @@ -64,7 +64,7 @@
(if (seq (get-in g [:attributes "PVAL"])) 1 0)
(if (seq (get-in g [:attributes "AD"])) 1 0)
(if (get-in g [:attributes "GQ"]) 1 0)
(if (get-in g [:attributes "DP"]) 1 0))
(if (pos? (get-in g [:attributes "DP"] -1)) 1 0))
:pl (attr/get-pl g)
})))

Expand Down Expand Up @@ -121,10 +121,8 @@
(let [match-fn (juxt :start :ref-allele)
other-vcs (filter #(= (match-fn %) (match-fn vc))
(gvc/variants-in-region input-vc-getter vc))
most-likely-gs (->> samples
(map (partial best-supported-sample-gs other-vcs))
(remove nil?))]
(when (seq most-likely-gs)
most-likely-gs (map (partial best-supported-sample-gs other-vcs) samples)]
(when (not-every? nil? most-likely-gs)
(-> (VariantContextBuilder. (:vc vc))
(.alleles (conj (set (remove #(.isNoCall %) (mapcat :alleles most-likely-gs)))
(:ref-allele vc)))
Expand Down
5 changes: 5 additions & 0 deletions src/bcbio/variation/variantcontext.clj
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,11 @@
(with-open [vcf-reader (AbstractFeatureReader/getFeatureReader vcf-file (VCFCodec.) false)]
(.getHeader vcf-reader)))

(defn get-samples
"Retrieve samples from VCF header"
[vcf-file]
(.getGenotypeSamples (get-vcf-header vcf-file)))

;; ## Writing VCF files

(defn merge-headers
Expand Down

0 comments on commit ae632b0

Please sign in to comment.