From 3bc069d2425b233942bd2618722fa0d73f8384ef Mon Sep 17 00:00:00 2001
From: Ilya Soifer
Date: Thu, 12 May 2022 16:50:39 +0300
Subject: [PATCH 01/46] Squashed changes brought in by flow based support
---
.../GATKAnnotationPluginDescriptor.java | 21 +
.../cmdline/StandardArgumentDefinitions.java | 1 +
...MarkDuplicatesSparkArgumentCollection.java | 57 +-
.../programgroups/FlowBasedProgramGroup.java | 16 +
.../engine/AssemblyRegionWalker.java | 10 +-
.../hellbender/engine/GATKTool.java | 10 +
.../hellbender/engine/PartialReadWalker.java | 75 +
.../hellbender/engine/filters/ReadFilter.java | 12 +
.../engine/filters/ReadFilterLibrary.java | 9 +
.../FlowBasedHmerBasedReadFilterHelper.java | 76 +
...lowBasedTPAttributeSymetricReadFilter.java | 34 +
.../FlowBasedTPAttributeValidReadFilter.java | 46 +
.../flow/HmerQualitySymetricReadFilter.java | 31 +
.../flow/ReadGroupHasFlowOrderReadFilter.java | 44 +
.../flow/WellformedFlowBasedReadFilter.java | 71 +
.../engine/spark/GATKRegistrator.java | 6 +-
.../CalculateAverageCombinedAnnotations.java | 78 +
.../hellbender/tools/ClipReads.java | 111 +-
.../FlowBasedAlignmentArgumentCollection.java | 26 +
.../tools/FlowBasedArgumentCollection.java | 154 +
.../tools/HaplotypeCallerSpark.java | 5 +-
.../hellbender/tools/SplitCRAM.java | 133 +
.../tools/genomicsdb/GenomicsDBUtils.java | 12 +
.../markduplicates/MarkDuplicatesSpark.java | 26 +-
.../MarkDuplicatesSparkUtils.java | 111 +-
.../tools/walkers/GenotypeGVCFs.java | 7 +-
.../tools/walkers/GenotypeGVCFsEngine.java | 20 +-
.../walkers/annotator/AnnotationUtils.java | 2 +-
.../walkers/annotator/AssemblyComplexity.java | 40 +-
.../HaplotypeFilteringAnnotation.java | 47 +
.../annotator/JumboInfoAnnotation.java | 19 +-
.../tools/walkers/annotator/RawGtCount.java | 120 +
.../walkers/annotator/StrandBiasBySample.java | 6 +
.../walkers/annotator/StrandBiasTest.java | 54 +-
.../walkers/annotator/StrandOddsRatio.java | 12 +-
.../annotator/VariantAnnotatorEngine.java | 36 +-
.../annotator/flow/CycleSkipStatus.java | 56 +
.../annotator/flow/FlowAnnotatorBase.java | 476 ++
.../walkers/annotator/flow/GcContent.java | 45 +
.../annotator/flow/HmerIndelLength.java | 46 +
.../walkers/annotator/flow/HmerIndelNuc.java | 25 +
.../walkers/annotator/flow/HmerMotifs.java | 46 +
.../walkers/annotator/flow/IndelClassify.java | 41 +
.../walkers/annotator/flow/IndelLength.java | 25 +
.../flow/StandardFlowBasedAnnotation.java | 9 +
.../walkers/annotator/flow/VariantType.java | 46 +
.../walkers/featuremapping/FeatureMapper.java | 12 +
.../featuremapping/FlowFeatureMapper.java | 626 ++
.../FlowFeatureMapperArgumentCollection.java | 105 +
.../walkers/featuremapping/SNVMapper.java | 213 +
.../IndependentSampleGenotypesModel.java | 6 +-
.../afcalc/AlleleFrequencyCalculator.java | 86 +-
.../AncestralContigLocationTranslator.java | 51 +
.../groundtruth/GroundTruthReadsBuilder.java | 986 +++
.../LocationTranslationException.java | 13 +
.../SingleFileLocationTranslator.java | 41 +
.../haplotypecaller/AlleleAndContext.java | 54 +
.../haplotypecaller/AlleleFiltering.java | 593 ++
.../haplotypecaller/AlleleFilteringHC.java | 70 +
.../AlleleFilteringMutect.java | 86 +
.../AlleleLikelihoodWriter.java | 106 +
...AssemblyBasedCallerArgumentCollection.java | 99 +-
.../AssemblyBasedCallerUtils.java | 132 +-
.../AssemblyRegionTrimmer.java | 10 +
.../haplotypecaller/AssemblyResultSet.java | 46 +-
.../FlowBasedAlignmentLikelihoodEngine.java | 418 +
.../haplotypecaller/FlowBasedHMMEngine.java | 270 +
.../haplotypecaller/HaplotypeCaller.java | 31 +-
.../HaplotypeCallerArgumentCollection.java | 86 +-
.../HaplotypeCallerEngine.java | 230 +-
.../HaplotypeCallerGenotypingEngine.java | 101 +-
...dThreadingAssemblerArgumentCollection.java | 10 +-
.../LikelihoodEngineArgumentCollection.java | 51 +-
.../haplotypecaller/LocationAndAlleles.java | 5 +-
...gHomopolymerHaplotypeCollapsingEngine.java | 460 +
...SymmetricalPairHMMInputScoreImputator.java | 60 +
.../haplotypecaller/OccurrenceMatrix.java | 132 +
.../PairHMMLikelihoodCalculationEngine.java | 182 +-
.../PairHMMNativeArgumentCollection.java | 2 +-
.../RampedHaplotypeCaller.java | 79 +
...mpedHaplotypeCallerArgumentCollection.java | 84 +
.../RampedHaplotypeCallerEngine.java | 619 ++
.../ReadLikelihoodCalculationEngine.java | 221 +-
...dThreadingAssemblerArgumentCollection.java | 65 +-
.../ReferenceConfidenceModel.java | 112 +-
.../haplotypecaller/graphs/BaseGraph.java | 5 +-
.../haplotypecaller/graphs/InverseAllele.java | 79 +
.../JunctionTreeKBestHaplotypeFinder.java | 2 +-
.../walkers/haplotypecaller/graphs/Path.java | 4 +-
.../graphs/SharedVertexSequenceSplitter.java | 4 +-
.../ramps/AssemblerOffRamp.java | 104 +
.../haplotypecaller/ramps/OffRampBase.java | 104 +
.../haplotypecaller/ramps/OnRampBase.java | 214 +
.../ramps/PostAssemblerOnRamp.java | 155 +
.../ramps/PostFilterOnRamp.java | 158 +
.../ramps/PreFilterOffRamp.java | 126 +
.../haplotypecaller/ramps/RampBase.java | 64 +
.../haplotypecaller/ramps/RampUtils.java | 174 +
.../AbstractReadThreadingGraph.java | 2 +-
.../JunctionTreeLinkedDeBruijnGraph.java | 10 +-
.../readthreading/ReadThreadingAssembler.java | 104 +-
.../readthreading/ReadThreadingGraph.java | 1 -
.../walkers/mutect/M2ArgumentCollection.java | 63 +-
.../tools/walkers/mutect/Mutect2.java | 23 +-
.../tools/walkers/mutect/Mutect2Engine.java | 92 +-
.../mutect/SomaticGenotypingEngine.java | 13 +-
.../SomaticReferenceConfidenceModel.java | 14 +-
.../mutect/SubsettedLikelihoodMatrix.java | 2 +-
.../FilterAlignmentArtifacts.java | 4 +-
.../HaplotypeBasedVariantRecaller.java | 239 +
...asedVariantRecallerArgumentCollection.java | 31 +
.../HaplotypeRegionWalker.java | 130 +
.../variantrecalling/TrimmedReadsReader.java | 112 +
.../VariantRecallerResultWriter.java | 226 +
.../LeftAlignAndTrimVariants.java | 61 +-
.../walkers/vqsr/CNNVariantWriteTensors.java | 1 -
.../hellbender/utils/BaseUtils.java | 75 +
.../hellbender/utils/clipping/ClippingOp.java | 1 +
.../utils/clipping/ReadClipper.java | 23 +-
.../downsampling/PositionalDownsampler.java | 4 +
.../downsampling/ReservoirDownsampler.java | 17 +-
.../utils/genotyper/AlleleLikelihoods.java | 133 +-
.../hellbender/utils/haplotype/EventMap.java | 19 +-
.../utils/haplotype/FlowBasedHaplotype.java | 73 +
.../hellbender/utils/haplotype/Haplotype.java | 55 +-
.../utils/haplotype/HaplotypeBAMWriter.java | 59 +-
.../hellbender/utils/help/HelpConstants.java | 9 +
.../utils/pairhmm/FlowBasedPairHMM.java | 267 +
.../utils/read/ArtificialReadUtils.java | 22 +-
.../utils/read/FlowBasedKeyCodec.java | 152 +
.../hellbender/utils/read/FlowBasedRead.java | 1183 +++
.../utils/read/FlowBasedReadUtils.java | 350 +
.../hellbender/utils/read/GATKRead.java | 9 +-
.../hellbender/utils/read/ReadUtils.java | 8 +-
.../read/SAMRecordToGATKReadAdapter.java | 62 +-
.../MarkDuplicatesScoringStrategy.java | 5 +-
.../utils/read/markduplicates/ReadsKey.java | 16 +-
.../sparkrecords/FlowModeFragment.java | 143 +
.../MarkDuplicatesSparkRecord.java | 13 +-
.../utils/runtime/RuntimeUtils.java | 2 -
.../utils/variant/GATKVCFConstants.java | 22 +
.../utils/variant/GATKVCFHeaderLines.java | 20 +-
.../variant/GATKVariantContextUtils.java | 63 +
...medFlowBasedReadFilterIntegrationTest.java | 53 +
...ageCombinedAnnotationsIntegrationTest.java | 48 +
.../tools/ClipReadsIntegrationTest.java | 2 +
.../tools/SplitCRAMIntegrationTest.java | 71 +
.../MarkDuplicatesSparkIntegrationTest.java | 34 +
.../MarkDuplicatesSparkUnitTest.java | 3 +-
.../MarkDuplicatesSparkUtilsUnitTest.java | 10 +-
.../walkers/GenotypeGVCFsIntegrationTest.java | 28 +
.../tools/walkers/GenotypeGVCFsUnitTest.java | 16 +-
.../annotator/flow/FlowAnnotatorUnitTest.java | 165 +
.../FlowFeatureMapperIntegrationTest.java | 54 +
...roundTruthReadsBuilderIntegrationTest.java | 88 +
.../AlleleFilteringUnitTest.java | 315 +
.../AssemblyBasedCallerUtilsUnitTest.java | 18 +-
.../FlowBasedAlignmentIntegrationTest.java | 55 +
...sedAlignmentLikelihoodEngineTestUtils.java | 42 +
...asedAlignmentLikelihoodEngineUnitTest.java | 153 +
.../FlowBasedHaplotypeIntegrationTest.java | 95 +
...plotypeCallerGenotypingEngineUnitTest.java | 2 +-
.../HaplotypeCallerIntegrationTest.java | 351 +-
...ymerHaplotypeCollapsingEngineUnitTest.java | 118 +
.../OccurrenceMatrixUnitTest.java | 96 +
...MMLikelihoodCalculationEngineUnitTest.java | 5 +-
.../RampedHaplotypeCallerIntegrationTest.java | 127 +
.../ReferenceConfidenceModelUnitTest.java | 162 +-
.../graphs/SharedSequenceMergerUnitTest.java | 32 +
.../ReadThreadingAssemblerUnitTest.java | 11 +-
.../mutect/Mutect2IntegrationTest.java | 25 +-
.../variantrecalling/FlowTestConstants.java | 11 +
...peBasedVariantRecallerIntegrationTest.java | 49 +
.../TrimmedReadsReaderUnitTest.java | 92 +
.../VariantRecallerResultWriterUnitTest.java | 45 +
.../LeftAlignAndTrimVariantsUnitTest.java | 160 -
.../hellbender/utils/BaseUtilsUnitTest.java | 65 +-
.../haplotype/FlowBasedHaplotypeUnitTest.java | 113 +
.../pairhmm/FlowBasedPairHMMUnitTest.java | 83 +
.../utils/read/FlowBasedKeyCodecUnitTest.java | 83 +
.../read/FlowBasedReadIntegrationTest.java | 72 +
.../utils/read/FlowBasedReadUnitTest.java | 169 +
.../utils/read/GATKReadAdaptersUnitTest.java | 69 +-
.../GATKVariantContextUtilsUnitTest.java | 144 +
.../FlowBasedHaplotype_HC_flow_chr9.part.bam | 3 +
...owBasedHaplotype_HC_flow_chr9.part.bam.bai | 3 +
.../large/expected_SplitCRAM_output_0000.cram | 3 +
.../large/expected_SplitCRAM_output_0001.cram | 3 +
.../snv_feature_mapper_input.bam | 3 +
.../snv_feature_mapper_input.bam.bai | 3 +
.../snv_feature_mapper_output.vcf | 3 +
.../snv_feature_mapper_output.vcf.idx | 3 +
.../input_jukebox_for_test.expected.alm | 3 +
...2.highconf.q60.chr6_30000000_40000000.cram | 3 +
...hconf.q60.chr6_30000000_40000000.cram.crai | 3 +
.../large/groundTruth/chr6_HG001.map | 3 +
.../groundTruth/chr6_HG001_maternal.dict | 3 +
.../large/groundTruth/chr6_HG001_maternal.fa | 3 +
.../groundTruth/chr6_HG001_maternal.fa.fai | 3 +
.../groundTruth/chr6_HG001_paternal.dict | 3 +
.../large/groundTruth/chr6_HG001_paternal.fa | 3 +
.../groundTruth/chr6_HG001_paternal.fa.fai | 3 +
.../large/groundTruth/ground_truth_output.csv | 3 +
.../ground_truth_output_limited.csv | 3 +
.../large/groundTruth/maternal.chr6.csv | 3 +
.../large/groundTruth/paternal.chr6.csv | 3 +
.../large/input_jukebox_for_test.bai | 3 +
.../large/input_jukebox_for_test.bam | 3 +
.../large/readFilter/read_filter_output.sam | 3 +
.../large/testFlowModeFlag_expected.bam | 3 +
.../large/testFlowModeFlag_expected.bam.bai | 3 +
.../large/testFlowModeFlag_expected.bam.sbi | 3 +
.../large/testFlowModeFlag_expected.txt | 3 +
.../large/variantRecalling/150292-BC05.vcf.gz | 3 +
.../variantRecalling/150292-BC05.vcf.gz.tbi | 3 +
.../variantRecalling/chr5.bam1.rename.bam | 3 +
.../variantRecalling/chr5.bam1.rename.bam.bai | 3 +
.../variantRecalling/chr5.bam2.rename.bam | 3 +
.../variantRecalling/chr5.bam2.rename.bam.bai | 3 +
.../large/variantRecalling/haps_chr5.bam | 3 +
.../large/variantRecalling/haps_chr5.bam.bai | 3 +
.../variantRecallerBasic.expected.csv | 3 +
.../tools/ClipReads/clipAdapters.bam | Bin 0 -> 111319 bytes
.../ClipReads/expected.clipAdapters.CA.bam | Bin 0 -> 97939 bytes
.../ClipReads/expected.clipAdapters.CA.tmp | 12 +
...calculate_average_combined_annotations.vcf | 70 +
...OfSmithWatermanParameters.HC.gatk4.vcf.idx | Bin 10511 -> 10536 bytes
...estGVCFMode.gatk4.alleleSpecific.g.vcf.idx | Bin 40797 -> 40822 bytes
.../expected.testGVCFMode.gatk4.g.vcf.idx | Bin 40782 -> 40807 bytes
...ltAlleleBasedOnHaptypeScores.gatk4.vcf.idx | Bin 2129 -> 2154 bytes
...ected.testLinkedDebruijnMode.gatk4.vcf.idx | Bin 2916 -> 2941 bytes
.../expected.testVCFMode.gatk4.DRAGEN.vcf.idx | Bin 2912 -> 2937 bytes
.../expected.testVCFMode.gatk4.FRDBQD.vcf.idx | Bin 3849 -> 3874 bytes
.../expected.testVCFMode.gatk4.vcf.idx | Bin 2904 -> 2929 bytes
...cted.testVCFMode.gatk4.withDDandDF.vcf.idx | Bin 2928 -> 2914 bytes
.../ramps/test_noramps.expected.vcf | 7550 ++++++++++++++++
.../ramps/test_noramps.expected.vcf.idx | Bin 0 -> 438900 bytes
.../test_post_assembler_offramp.expected.zip | Bin 0 -> 218158 bytes
.../test_post_assembler_output.expected.vcf | 7091 ++++++++++++++++
...est_post_assembler_output.expected.vcf.idx | Bin 0 -> 438915 bytes
.../test_pre_assembler_offramp.expected.zip | Bin 0 -> 911 bytes
.../test_pre_filter_offramp.expected.zip | Bin 0 -> 318323 bytes
...estGvcfBeforeRebase.expected.flowbased.vcf | 7543 ++++++++++++++++
...vcfBeforeRebase.expected.flowbased.vcf.idx | Bin 0 -> 438886 bytes
...ingFlowModeAdvanced.expected.flowbased.vcf | 7555 +++++++++++++++++
...lowModeAdvanced.expected.flowbased.vcf.idx | Bin 0 -> 438933 bytes
...ingFlowModeStandard.expected.flowbased.vcf | 7517 ++++++++++++++++
...lowModeStandard.expected.flowbased.vcf.idx | Bin 0 -> 438934 bytes
...GvcfKeepLoneAlleles.expected.flowbased.vcf | 7545 ++++++++++++++++
...KeepLoneAlleles.expected.flowbased.vcf.idx | Bin 0 -> 438887 bytes
...ityAnnotationRevamp.expected.flowbased.vcf | 6349 ++++++++++++++
...nnotationRevamp.expected.flowbased.vcf.idx | Bin 0 -> 438909 bytes
...testVcfBeforeRebase.expected.flowbased.vcf | 3443 ++++++++
...VcfBeforeRebase.expected.flowbased.vcf.idx | Bin 0 -> 114821 bytes
...omplexityAnnotation.expected.flowbased.vcf | 3446 ++++++++
...exityAnnotation.expected.flowbased.vcf.idx | Bin 0 -> 114841 bytes
.../test_flowBasedHMM.expected.vcf | 3444 ++++++++
.../test_flowBasedHMM.expected.vcf.idx | Bin 0 -> 114824 bytes
.../test_flowBasedHMM_Stepwise.expected.vcf | 3443 ++++++++
...est_flowBasedHMM_Stepwise.expected.vcf.idx | Bin 0 -> 114818 bytes
.../walkers/GenotypeGVCFs/twoReblocked.g.vcf | 48 +-
....AS.chr20snippet.reblocked.hiRes.g.vcf.idx | Bin 0 -> 331 bytes
.../testJustOneSample.expected.g.vcf.idx | Bin 0 -> 232 bytes
.../testNonRefADCorrection.expected.g.vcf.idx | Bin 0 -> 245 bytes
.../expected/fromHeaderSAM.sam | 2 +-
.../HaplotypeBAMWriter/expected/testBAM.bam | Bin 457 -> 421 bytes
.../HaplotypeBAMWriter/expected/testSAM.sam | 2 +-
.../utils/read/flow/reads/input/sample.bam | Bin 0 -> 126359 bytes
.../utils/read/flow/reads/input/sample.t0.bam | Bin 0 -> 123418 bytes
.../read/flow/reads/outputs/sample.0.key.txt | 371 +
.../flow/reads/outputs/sample.0.matrix.txt | 5565 ++++++++++++
.../read/flow/reads/outputs/sample.1.key.txt | 460 +
.../flow/reads/outputs/sample.1.matrix.txt | 6900 +++++++++++++++
.../read/flow/reads/outputs/sample.10.key.txt | 421 +
.../flow/reads/outputs/sample.10.matrix.txt | 6315 ++++++++++++++
.../read/flow/reads/outputs/sample.11.key.txt | 290 +
.../flow/reads/outputs/sample.11.matrix.txt | 4350 ++++++++++
.../read/flow/reads/outputs/sample.12.key.txt | 206 +
.../flow/reads/outputs/sample.12.matrix.txt | 3090 +++++++
.../read/flow/reads/outputs/sample.13.key.txt | 404 +
.../flow/reads/outputs/sample.13.matrix.txt | 6060 +++++++++++++
.../read/flow/reads/outputs/sample.14.key.txt | 312 +
.../flow/reads/outputs/sample.14.matrix.txt | 4680 ++++++++++
.../read/flow/reads/outputs/sample.15.key.txt | 358 +
.../flow/reads/outputs/sample.15.matrix.txt | 5370 ++++++++++++
.../read/flow/reads/outputs/sample.16.key.txt | 443 +
.../flow/reads/outputs/sample.16.matrix.txt | 6645 +++++++++++++++
.../read/flow/reads/outputs/sample.17.key.txt | 321 +
.../flow/reads/outputs/sample.17.matrix.txt | 4815 +++++++++++
.../read/flow/reads/outputs/sample.18.key.txt | 357 +
.../flow/reads/outputs/sample.18.matrix.txt | 5355 ++++++++++++
.../read/flow/reads/outputs/sample.19.key.txt | 409 +
.../flow/reads/outputs/sample.19.matrix.txt | 6135 +++++++++++++
.../read/flow/reads/outputs/sample.2.key.txt | 396 +
.../flow/reads/outputs/sample.2.matrix.txt | 5940 +++++++++++++
.../read/flow/reads/outputs/sample.20.key.txt | 257 +
.../flow/reads/outputs/sample.20.matrix.txt | 3855 +++++++++
.../read/flow/reads/outputs/sample.21.key.txt | 231 +
.../flow/reads/outputs/sample.21.matrix.txt | 3465 ++++++++
.../read/flow/reads/outputs/sample.22.key.txt | 184 +
.../flow/reads/outputs/sample.22.matrix.txt | 2760 ++++++
.../read/flow/reads/outputs/sample.23.key.txt | 412 +
.../flow/reads/outputs/sample.23.matrix.txt | 6180 ++++++++++++++
.../read/flow/reads/outputs/sample.24.key.txt | 267 +
.../flow/reads/outputs/sample.24.matrix.txt | 4005 +++++++++
.../read/flow/reads/outputs/sample.25.key.txt | 376 +
.../flow/reads/outputs/sample.25.matrix.txt | 5640 ++++++++++++
.../read/flow/reads/outputs/sample.3.key.txt | 425 +
.../flow/reads/outputs/sample.3.matrix.txt | 6375 ++++++++++++++
.../read/flow/reads/outputs/sample.4.key.txt | 444 +
.../flow/reads/outputs/sample.4.matrix.txt | 6660 +++++++++++++++
.../read/flow/reads/outputs/sample.5.key.txt | 326 +
.../flow/reads/outputs/sample.5.matrix.txt | 4890 +++++++++++
.../read/flow/reads/outputs/sample.6.key.txt | 396 +
.../flow/reads/outputs/sample.6.matrix.txt | 5940 +++++++++++++
.../read/flow/reads/outputs/sample.7.key.txt | 434 +
.../flow/reads/outputs/sample.7.matrix.txt | 6510 ++++++++++++++
.../read/flow/reads/outputs/sample.8.key.txt | 410 +
.../flow/reads/outputs/sample.8.matrix.txt | 6150 ++++++++++++++
.../read/flow/reads/outputs/sample.9.key.txt | 316 +
.../flow/reads/outputs/sample.9.matrix.txt | 4740 +++++++++++
.../flow/reads/outputs/sample.t0.0.key.txt | 369 +
.../flow/reads/outputs/sample.t0.0.matrix.txt | 5535 ++++++++++++
.../flow/reads/outputs/sample.t0.1.key.txt | 385 +
.../flow/reads/outputs/sample.t0.1.matrix.txt | 5775 +++++++++++++
.../flow/reads/outputs/sample.t0.10.key.txt | 339 +
.../reads/outputs/sample.t0.10.matrix.txt | 5085 +++++++++++
.../flow/reads/outputs/sample.t0.11.key.txt | 358 +
.../reads/outputs/sample.t0.11.matrix.txt | 5370 ++++++++++++
.../flow/reads/outputs/sample.t0.12.key.txt | 326 +
.../reads/outputs/sample.t0.12.matrix.txt | 4890 +++++++++++
.../flow/reads/outputs/sample.t0.13.key.txt | 377 +
.../reads/outputs/sample.t0.13.matrix.txt | 5655 ++++++++++++
.../flow/reads/outputs/sample.t0.14.key.txt | 300 +
.../reads/outputs/sample.t0.14.matrix.txt | 4500 ++++++++++
.../flow/reads/outputs/sample.t0.15.key.txt | 398 +
.../reads/outputs/sample.t0.15.matrix.txt | 5970 +++++++++++++
.../flow/reads/outputs/sample.t0.16.key.txt | 401 +
.../reads/outputs/sample.t0.16.matrix.txt | 6015 +++++++++++++
.../flow/reads/outputs/sample.t0.17.key.txt | 275 +
.../reads/outputs/sample.t0.17.matrix.txt | 4125 +++++++++
.../flow/reads/outputs/sample.t0.18.key.txt | 328 +
.../reads/outputs/sample.t0.18.matrix.txt | 4920 +++++++++++
.../flow/reads/outputs/sample.t0.19.key.txt | 402 +
.../reads/outputs/sample.t0.19.matrix.txt | 6030 +++++++++++++
.../flow/reads/outputs/sample.t0.2.key.txt | 398 +
.../flow/reads/outputs/sample.t0.2.matrix.txt | 5970 +++++++++++++
.../flow/reads/outputs/sample.t0.20.key.txt | 214 +
.../reads/outputs/sample.t0.20.matrix.txt | 3210 +++++++
.../flow/reads/outputs/sample.t0.21.key.txt | 372 +
.../reads/outputs/sample.t0.21.matrix.txt | 5580 ++++++++++++
.../flow/reads/outputs/sample.t0.22.key.txt | 399 +
.../reads/outputs/sample.t0.22.matrix.txt | 5985 +++++++++++++
.../flow/reads/outputs/sample.t0.23.key.txt | 400 +
.../reads/outputs/sample.t0.23.matrix.txt | 6000 +++++++++++++
.../flow/reads/outputs/sample.t0.24.key.txt | 399 +
.../reads/outputs/sample.t0.24.matrix.txt | 5985 +++++++++++++
.../flow/reads/outputs/sample.t0.3.key.txt | 353 +
.../flow/reads/outputs/sample.t0.3.matrix.txt | 5295 ++++++++++++
.../flow/reads/outputs/sample.t0.4.key.txt | 353 +
.../flow/reads/outputs/sample.t0.4.matrix.txt | 5295 ++++++++++++
.../flow/reads/outputs/sample.t0.5.key.txt | 400 +
.../flow/reads/outputs/sample.t0.5.matrix.txt | 6000 +++++++++++++
.../flow/reads/outputs/sample.t0.6.key.txt | 300 +
.../flow/reads/outputs/sample.t0.6.matrix.txt | 4500 ++++++++++
.../flow/reads/outputs/sample.t0.7.key.txt | 401 +
.../flow/reads/outputs/sample.t0.7.matrix.txt | 6015 +++++++++++++
.../flow/reads/outputs/sample.t0.8.key.txt | 381 +
.../flow/reads/outputs/sample.t0.8.matrix.txt | 5715 +++++++++++++
.../flow/reads/outputs/sample.t0.9.key.txt | 303 +
.../flow/reads/outputs/sample.t0.9.matrix.txt | 4545 ++++++++++
.../testutils/IntegrationTestSpec.java | 99 +-
.../testutils/SamAssertionUtils.java | 7 +-
373 files changed, 372274 insertions(+), 884 deletions(-)
create mode 100644 src/main/java/org/broadinstitute/hellbender/cmdline/programgroups/FlowBasedProgramGroup.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/engine/PartialReadWalker.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/engine/filters/flow/FlowBasedHmerBasedReadFilterHelper.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/engine/filters/flow/FlowBasedTPAttributeSymetricReadFilter.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/engine/filters/flow/FlowBasedTPAttributeValidReadFilter.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/engine/filters/flow/HmerQualitySymetricReadFilter.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/engine/filters/flow/ReadGroupHasFlowOrderReadFilter.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/engine/filters/flow/WellformedFlowBasedReadFilter.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/CalculateAverageCombinedAnnotations.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/FlowBasedAlignmentArgumentCollection.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/FlowBasedArgumentCollection.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/SplitCRAM.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/HaplotypeFilteringAnnotation.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/RawGtCount.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/CycleSkipStatus.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorBase.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/GcContent.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/HmerIndelLength.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/HmerIndelNuc.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/HmerMotifs.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/IndelClassify.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/IndelLength.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/StandardFlowBasedAnnotation.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/VariantType.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FeatureMapper.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapper.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapperArgumentCollection.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/SNVMapper.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/AncestralContigLocationTranslator.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilder.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/LocationTranslationException.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/SingleFileLocationTranslator.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleAndContext.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFilteringHC.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFilteringMutect.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleLikelihoodWriter.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedAlignmentLikelihoodEngine.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedHMMEngine.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/LongHomopolymerHaplotypeCollapsingEngine.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/NonSymmetricalPairHMMInputScoreImputator.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/OccurrenceMatrix.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/RampedHaplotypeCaller.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/RampedHaplotypeCallerArgumentCollection.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/RampedHaplotypeCallerEngine.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/graphs/InverseAllele.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/ramps/AssemblerOffRamp.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/ramps/OffRampBase.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/ramps/OnRampBase.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/ramps/PostAssemblerOnRamp.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/ramps/PostFilterOnRamp.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/ramps/PreFilterOffRamp.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/ramps/RampBase.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/ramps/RampUtils.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/variantrecalling/HaplotypeBasedVariantRecaller.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/variantrecalling/HaplotypeBasedVariantRecallerArgumentCollection.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/variantrecalling/HaplotypeRegionWalker.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/variantrecalling/TrimmedReadsReader.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/variantrecalling/VariantRecallerResultWriter.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/utils/haplotype/FlowBasedHaplotype.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/utils/pairhmm/FlowBasedPairHMM.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/utils/read/FlowBasedKeyCodec.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/utils/read/FlowBasedRead.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/utils/read/FlowBasedReadUtils.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/sparkrecords/FlowModeFragment.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/engine/filters/flow/WellFormedFlowBasedReadFilterIntegrationTest.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/CalculateAverageCombinedAnnotationsIntegrationTest.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/SplitCRAMIntegrationTest.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorUnitTest.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapperIntegrationTest.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilderIntegrationTest.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFilteringUnitTest.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedAlignmentIntegrationTest.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedAlignmentLikelihoodEngineTestUtils.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedAlignmentLikelihoodEngineUnitTest.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedHaplotypeIntegrationTest.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/LongHomopolymerHaplotypeCollapsingEngineUnitTest.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/OccurrenceMatrixUnitTest.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/RampedHaplotypeCallerIntegrationTest.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/graphs/SharedSequenceMergerUnitTest.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/variantrecalling/FlowTestConstants.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/variantrecalling/HaplotypeBasedVariantRecallerIntegrationTest.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/variantrecalling/TrimmedReadsReaderUnitTest.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/variantrecalling/VariantRecallerResultWriterUnitTest.java
delete mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/LeftAlignAndTrimVariantsUnitTest.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/utils/haplotype/FlowBasedHaplotypeUnitTest.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/utils/pairhmm/FlowBasedPairHMMUnitTest.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/utils/read/FlowBasedKeyCodecUnitTest.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/utils/read/FlowBasedReadIntegrationTest.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/utils/read/FlowBasedReadUnitTest.java
create mode 100644 src/test/resources/large/FlowBasedHaplotype_HC_flow_chr9.part.bam
create mode 100644 src/test/resources/large/FlowBasedHaplotype_HC_flow_chr9.part.bam.bai
create mode 100644 src/test/resources/large/expected_SplitCRAM_output_0000.cram
create mode 100644 src/test/resources/large/expected_SplitCRAM_output_0001.cram
create mode 100644 src/test/resources/large/featureMapping/snv_feature_mapper_input.bam
create mode 100644 src/test/resources/large/featureMapping/snv_feature_mapper_input.bam.bai
create mode 100644 src/test/resources/large/featureMapping/snv_feature_mapper_output.vcf
create mode 100644 src/test/resources/large/featureMapping/snv_feature_mapper_output.vcf.idx
create mode 100644 src/test/resources/large/flowBasedAlignment/input_jukebox_for_test.expected.alm
create mode 100644 src/test/resources/large/groundTruth/150548_1-UGAv3-2.highconf.q60.chr6_30000000_40000000.cram
create mode 100644 src/test/resources/large/groundTruth/150548_1-UGAv3-2.highconf.q60.chr6_30000000_40000000.cram.crai
create mode 100644 src/test/resources/large/groundTruth/chr6_HG001.map
create mode 100644 src/test/resources/large/groundTruth/chr6_HG001_maternal.dict
create mode 100644 src/test/resources/large/groundTruth/chr6_HG001_maternal.fa
create mode 100644 src/test/resources/large/groundTruth/chr6_HG001_maternal.fa.fai
create mode 100644 src/test/resources/large/groundTruth/chr6_HG001_paternal.dict
create mode 100644 src/test/resources/large/groundTruth/chr6_HG001_paternal.fa
create mode 100644 src/test/resources/large/groundTruth/chr6_HG001_paternal.fa.fai
create mode 100644 src/test/resources/large/groundTruth/ground_truth_output.csv
create mode 100644 src/test/resources/large/groundTruth/ground_truth_output_limited.csv
create mode 100644 src/test/resources/large/groundTruth/maternal.chr6.csv
create mode 100644 src/test/resources/large/groundTruth/paternal.chr6.csv
create mode 100644 src/test/resources/large/input_jukebox_for_test.bai
create mode 100644 src/test/resources/large/input_jukebox_for_test.bam
create mode 100644 src/test/resources/large/readFilter/read_filter_output.sam
create mode 100644 src/test/resources/large/testFlowModeFlag_expected.bam
create mode 100644 src/test/resources/large/testFlowModeFlag_expected.bam.bai
create mode 100644 src/test/resources/large/testFlowModeFlag_expected.bam.sbi
create mode 100644 src/test/resources/large/testFlowModeFlag_expected.txt
create mode 100644 src/test/resources/large/variantRecalling/150292-BC05.vcf.gz
create mode 100644 src/test/resources/large/variantRecalling/150292-BC05.vcf.gz.tbi
create mode 100644 src/test/resources/large/variantRecalling/chr5.bam1.rename.bam
create mode 100644 src/test/resources/large/variantRecalling/chr5.bam1.rename.bam.bai
create mode 100644 src/test/resources/large/variantRecalling/chr5.bam2.rename.bam
create mode 100644 src/test/resources/large/variantRecalling/chr5.bam2.rename.bam.bai
create mode 100644 src/test/resources/large/variantRecalling/haps_chr5.bam
create mode 100644 src/test/resources/large/variantRecalling/haps_chr5.bam.bai
create mode 100644 src/test/resources/large/variantRecalling/variantRecallerBasic.expected.csv
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/ClipReads/clipAdapters.bam
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/ClipReads/expected.clipAdapters.CA.bam
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/ClipReads/expected.clipAdapters.CA.tmp
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/calculate_average_combined_annotations.vcf
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/ramps/test_noramps.expected.vcf
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/ramps/test_noramps.expected.vcf.idx
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/ramps/test_post_assembler_offramp.expected.zip
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/ramps/test_post_assembler_output.expected.vcf
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/ramps/test_post_assembler_output.expected.vcf.idx
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/ramps/test_pre_assembler_offramp.expected.zip
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/ramps/test_pre_filter_offramp.expected.zip
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/testGvcfBeforeRebase.expected.flowbased.vcf
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/testGvcfBeforeRebase.expected.flowbased.vcf.idx
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/testGvcfBeforeRebaseUsingFlowModeAdvanced.expected.flowbased.vcf
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/testGvcfBeforeRebaseUsingFlowModeAdvanced.expected.flowbased.vcf.idx
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/testGvcfBeforeRebaseUsingFlowModeStandard.expected.flowbased.vcf
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/testGvcfBeforeRebaseUsingFlowModeStandard.expected.flowbased.vcf.idx
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/testGvcfKeepLoneAlleles.expected.flowbased.vcf
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/testGvcfKeepLoneAlleles.expected.flowbased.vcf.idx
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/testGvcfWithAssemblyComplexityAnnotationRevamp.expected.flowbased.vcf
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/testGvcfWithAssemblyComplexityAnnotationRevamp.expected.flowbased.vcf.idx
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/testVcfBeforeRebase.expected.flowbased.vcf
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/testVcfBeforeRebase.expected.flowbased.vcf.idx
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/testVcfWithAssemblyComplexityAnnotation.expected.flowbased.vcf
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/testVcfWithAssemblyComplexityAnnotation.expected.flowbased.vcf.idx
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/test_flowBasedHMM.expected.vcf
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/test_flowBasedHMM.expected.vcf.idx
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/test_flowBasedHMM_Stepwise.expected.vcf
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/test_flowBasedHMM_Stepwise.expected.vcf.idx
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF/expected.NA12878.AS.chr20snippet.reblocked.hiRes.g.vcf.idx
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF/testJustOneSample.expected.g.vcf.idx
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF/testNonRefADCorrection.expected.g.vcf.idx
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/input/sample.bam
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/input/sample.t0.bam
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.0.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.0.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.1.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.1.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.10.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.10.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.11.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.11.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.12.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.12.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.13.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.13.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.14.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.14.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.15.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.15.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.16.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.16.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.17.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.17.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.18.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.18.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.19.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.19.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.2.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.2.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.20.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.20.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.21.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.21.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.22.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.22.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.23.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.23.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.24.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.24.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.25.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.25.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.3.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.3.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.4.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.4.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.5.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.5.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.6.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.6.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.7.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.7.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.8.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.8.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.9.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.9.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.0.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.0.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.1.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.1.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.10.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.10.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.11.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.11.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.12.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.12.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.13.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.13.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.14.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.14.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.15.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.15.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.16.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.16.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.17.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.17.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.18.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.18.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.19.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.19.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.2.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.2.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.20.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.20.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.21.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.21.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.22.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.22.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.23.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.23.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.24.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.24.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.3.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.3.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.4.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.4.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.5.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.5.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.6.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.6.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.7.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.7.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.8.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.8.matrix.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.9.key.txt
create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/read/flow/reads/outputs/sample.t0.9.matrix.txt
diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/GATKPlugin/GATKAnnotationPluginDescriptor.java b/src/main/java/org/broadinstitute/hellbender/cmdline/GATKPlugin/GATKAnnotationPluginDescriptor.java
index 56f41e4ac00..f9aaa537b4b 100644
--- a/src/main/java/org/broadinstitute/hellbender/cmdline/GATKPlugin/GATKAnnotationPluginDescriptor.java
+++ b/src/main/java/org/broadinstitute/hellbender/cmdline/GATKPlugin/GATKAnnotationPluginDescriptor.java
@@ -11,6 +11,7 @@
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.walkers.annotator.Annotation;
import org.broadinstitute.hellbender.tools.walkers.annotator.PedigreeAnnotation;
+import org.broadinstitute.hellbender.tools.walkers.annotator.flow.FlowAnnotatorBase;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.config.ConfigFactory;
import org.broadinstitute.hellbender.utils.config.GATKConfig;
@@ -81,6 +82,9 @@ public class GATKAnnotationPluginDescriptor extends CommandLinePluginDescriptor<
@Argument(fullName = StandardArgumentDefinitions.PEDIGREE_FILE_LONG_NAME, shortName = StandardArgumentDefinitions.PEDIGREE_FILE_SHORT_NAME, doc="Pedigree file for determining the population \"founders\"", optional=true)
private GATKPath pedigreeFile;
+    @Argument(fullName = "flow-order-for-annotations", doc = "flow order used for these annotations. [readGroup:]flowOrder", optional = true)
+ private List flowOrder;
+
/**
* @return the class object for the base class of all plugins managed by this descriptor
*/
@@ -413,6 +417,23 @@ public void validateAndResolvePlugins() throws CommandLineException {
"founder-id",
allDiscoveredAnnotations.values().stream().filter(PedigreeAnnotation.class::isInstance).map(a -> a.getClass().getSimpleName()).collect(Collectors.joining(", "))));
}
+
+ // Populating any discovered flow annotations with the flowOrder arguments from the command line.
+ if (flowOrder!=null && !flowOrder.isEmpty() && getResolvedInstances().stream()
+ .filter(FlowAnnotatorBase.class::isInstance)
+ .map(a -> (FlowAnnotatorBase) a)
+ .peek(a -> {
+ a.setFlowOrder(flowOrder);
+ })
+ .count() == 0) {
+ // Throwing an exception if no flow based annotations were found
+ throw new CommandLineException(
+ String.format(
+                            "Flow argument \"%s\" was specified without a flow based annotation being requested (e.g. %s)",
+ StandardArgumentDefinitions.FLOW_ORDER_FOR_ANNOTATIONS,
+ allDiscoveredAnnotations.values().stream().filter(FlowAnnotatorBase.class::isInstance).map(a -> a.getClass().getSimpleName()).collect(Collectors.joining(", "))));
+ }
+
}
/**
diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java b/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java
index 50dddc82009..1c8596eb91b 100644
--- a/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java
+++ b/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java
@@ -45,6 +45,7 @@ private StandardArgumentDefinitions(){}
public static final String SITES_ONLY_LONG_NAME = "sites-only-vcf-output";
public static final String INVALIDATE_PREVIOUS_FILTERS_LONG_NAME = "invalidate-previous-filters";
public static final String SORT_ORDER_LONG_NAME = "sort-order";
+ public static final String FLOW_ORDER_FOR_ANNOTATIONS = "flow-order-for-annotations";
public static final String INPUT_SHORT_NAME = "I";
diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/argumentcollections/MarkDuplicatesSparkArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/cmdline/argumentcollections/MarkDuplicatesSparkArgumentCollection.java
index f02dc2eb1ad..c3b1a5bb452 100644
--- a/src/main/java/org/broadinstitute/hellbender/cmdline/argumentcollections/MarkDuplicatesSparkArgumentCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/cmdline/argumentcollections/MarkDuplicatesSparkArgumentCollection.java
@@ -1,5 +1,6 @@
package org.broadinstitute.hellbender.cmdline.argumentcollections;
+import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesScoringStrategy;
@@ -20,14 +21,24 @@ public final class MarkDuplicatesSparkArgumentCollection implements Serializable
public static final String REMOVE_ALL_DUPLICATE_READS = "remove-all-duplicates";
public static final String REMOVE_SEQUENCING_DUPLICATE_READS = "remove-sequencing-duplicates";
+ public static final String FLOW_MD_MODE_LONG_NAME = "flowbased";
+
+ public static final String FLOW_QUALITY_SUM_STRATEGY_LONG_NAME = "flow-quality-sum-strategy";
+ public static final String SINGLE_END_READS_END_POSITION_SIGNIFICANT = "single-end-reads-end-position-significant";
+ public static final String FLOW_END_POS_UNCERTAINTY_LONG_NAME = "flow-end-pos-uncertainty";
+ public static final String SINGLE_END_READS_CLIPPING_IS_END_LONG_NAME = "single-end-reads-clipping-is-end";
+ public static final String FLOW_SKIP_START_HOMOPOLYMERS_LONG_NAME = "flow-skip-start-homopolymers";
+ public static final String FLOW_Q_IS_KNOWN_END_LONG_NAME = "flow-q-is-known-end";
+
@Argument(shortName = StandardArgumentDefinitions.DUPLICATE_SCORING_STRATEGY_SHORT_NAME, fullName = StandardArgumentDefinitions.DUPLICATE_SCORING_STRATEGY_LONG_NAME, doc = "The scoring strategy for choosing the non-duplicate among candidates.")
public MarkDuplicatesScoringStrategy duplicatesScoringStrategy = MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES;
@Argument(fullName = MarkDuplicatesSparkArgumentCollection.DO_NOT_MARK_UNMAPPED_MATES_LONG_NAME, doc = "Enabling this option will mean unmapped mates of duplicate marked reads will not be marked as duplicates.")
public boolean dontMarkUnmappedMates = false;
+
@Argument(fullName = MarkDuplicatesSparkArgumentCollection.DUPLICATE_TAGGING_POLICY_LONG_NAME, doc = "Determines how duplicate types are recorded in the DT optional attribute.", optional = true,
- mutex = {REMOVE_ALL_DUPLICATE_READS, REMOVE_SEQUENCING_DUPLICATE_READS})
+ mutex = {REMOVE_ALL_DUPLICATE_READS, REMOVE_SEQUENCING_DUPLICATE_READS})
public MarkDuplicates.DuplicateTaggingPolicy taggingPolicy = MarkDuplicates.DuplicateTaggingPolicy.DontTag;
@Argument(fullName = MarkDuplicatesSparkArgumentCollection.REMOVE_ALL_DUPLICATE_READS, doc = "If true do not write duplicates to the output file instead of writing them with appropriate flags set.",
@@ -37,4 +48,48 @@ public final class MarkDuplicatesSparkArgumentCollection implements Serializable
@Argument(fullName = MarkDuplicatesSparkArgumentCollection.REMOVE_SEQUENCING_DUPLICATE_READS, doc = "If true do not write optical/sequencing duplicates to the output file instead of writing them with appropriate flags set.",
mutex = {MarkDuplicatesSparkArgumentCollection.DUPLICATE_TAGGING_POLICY_LONG_NAME, MarkDuplicatesSparkArgumentCollection.REMOVE_ALL_DUPLICATE_READS}, optional = true)
public boolean removeSequencingDuplicates = false;
+
+ @Advanced
+ @Argument(fullName = FLOW_QUALITY_SUM_STRATEGY_LONG_NAME, doc = "Use specific quality summing strategy for flow based reads. The strategy ensures that the same " +
+ "(and correct) quality value is used for all bases of the same homopolymer. Default false.", optional = true)
+ public boolean FLOW_QUALITY_SUM_STRATEGY = false;
+
+ @Advanced
+ @Argument(fullName = SINGLE_END_READS_END_POSITION_SIGNIFICANT, doc = "Make end location of read (fragment) be significant when considering duplicates, " +
+ "in addition to the start location, which is always significant (should only be applied to flow based reads). Default false.", optional = true)
+ public boolean FLOW_END_LOCATION_SIGNIFICANT = false;
+
+ @Advanced
+ @Argument(fullName = FLOW_END_POS_UNCERTAINTY_LONG_NAME, doc = "Maximal number of bases of reads (fragment) ends difference that is marked as match (should only be applied to flow based reads). Default 0.", optional = true)
+ public int ENDS_READ_UNCERTAINTY = 0;
+
+ @Advanced
+    @Argument(fullName = SINGLE_END_READS_CLIPPING_IS_END_LONG_NAME, doc = "Use clipped, rather than unclipped, read locations when considering duplicates (should only be applied to flow based reads). Default false.", optional = true)
+ public boolean FLOW_USE_CLIPPED_LOCATIONS = false;
+
+ @Advanced
+ @Argument(fullName = FLOW_SKIP_START_HOMOPOLYMERS_LONG_NAME, doc = "Skip first N flows, when considering duplicates (should only be applied to flow based reads). Default 0.", optional = true)
+ public int FLOW_SKIP_START_HOMOPOLYMERS = 0;
+
+ @Advanced
+ @Argument(fullName = FLOW_Q_IS_KNOWN_END_LONG_NAME, doc = "Treat reads (fragment) clipped on tm:Q as known end position (should only be applied to flow based reads) (default: false)", optional = true)
+ public boolean FLOW_Q_IS_KNOWN_END = false;
+
+ @Advanced
+ @Argument(fullName = FLOW_MD_MODE_LONG_NAME, optional = true, doc="Single argument for enabling the bulk of flow based features (should only be applied to flow based reads).")
+ public Boolean useFlowFragments = false;
+
+ public boolean isFlowEnabled() {
+ return FLOW_QUALITY_SUM_STRATEGY || FLOW_END_LOCATION_SIGNIFICANT || FLOW_USE_CLIPPED_LOCATIONS || FLOW_SKIP_START_HOMOPOLYMERS != 0;
+ }
+
+ public String[] getFlowModeArgValues() {
+ return new String[] {
+ MarkDuplicatesSparkArgumentCollection.SINGLE_END_READS_END_POSITION_SIGNIFICANT, "true",
+ MarkDuplicatesSparkArgumentCollection.SINGLE_END_READS_CLIPPING_IS_END_LONG_NAME, "true",
+ MarkDuplicatesSparkArgumentCollection.FLOW_END_POS_UNCERTAINTY_LONG_NAME, "1",
+ MarkDuplicatesSparkArgumentCollection.FLOW_SKIP_START_HOMOPOLYMERS_LONG_NAME, "0"
+ };
+
+ }
}
\ No newline at end of file
diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/programgroups/FlowBasedProgramGroup.java b/src/main/java/org/broadinstitute/hellbender/cmdline/programgroups/FlowBasedProgramGroup.java
new file mode 100644
index 00000000000..7ae8e60e1e9
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/cmdline/programgroups/FlowBasedProgramGroup.java
@@ -0,0 +1,16 @@
+package org.broadinstitute.hellbender.cmdline.programgroups;
+
+import org.broadinstitute.barclay.argparser.CommandLineProgramGroup;
+import org.broadinstitute.hellbender.utils.help.HelpConstants;
+
+/**
+ * Tools dedicated to processing and analyzing flow-based sequencing data
+ */
+public class FlowBasedProgramGroup implements CommandLineProgramGroup {
+
+ @Override
+ public String getName() { return HelpConstants.DOC_CAT_SHORT_FLOW_BASED; }
+
+ @Override
+ public String getDescription() { return HelpConstants.DOC_CAT_SHORT_FLOW_BASED_SUMMARY; }
+}
\ No newline at end of file
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/AssemblyRegionWalker.java b/src/main/java/org/broadinstitute/hellbender/engine/AssemblyRegionWalker.java
index 969d1ec3204..bf001eaa55a 100644
--- a/src/main/java/org/broadinstitute/hellbender/engine/AssemblyRegionWalker.java
+++ b/src/main/java/org/broadinstitute/hellbender/engine/AssemblyRegionWalker.java
@@ -67,6 +67,8 @@ public abstract class AssemblyRegionWalker extends WalkerBase {
private List readShards;
+ private boolean nonRandomDownsamplingMode;
+
/**
* Initialize data sources for traversal.
*
@@ -139,7 +141,7 @@ public List getDefaultReadFilters() {
}
protected ReadsDownsampler createDownsampler() {
- return assemblyRegionArgs.maxReadsPerAlignmentStart > 0 ? new PositionalDownsampler(assemblyRegionArgs.maxReadsPerAlignmentStart, getHeaderForReads()) : null;
+ return assemblyRegionArgs.maxReadsPerAlignmentStart > 0 ? new PositionalDownsampler(assemblyRegionArgs.maxReadsPerAlignmentStart, getHeaderForReads(), nonRandomDownsamplingMode) : null;
}
/**
@@ -252,4 +254,10 @@ protected final void onShutdown() {
* @param featureContext features overlapping the padded span of the assembly region
*/
public abstract void apply( final AssemblyRegion region, final ReferenceContext referenceContext, final FeatureContext featureContext );
+
+ public void setNonRandomDownsamplingMode(boolean nonRandomDownsamplingMode) {
+ this.nonRandomDownsamplingMode = nonRandomDownsamplingMode;
+ }
+
+
}
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java b/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java
index a6b5ae36dcc..b6400b71d11 100644
--- a/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java
+++ b/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java
@@ -1021,6 +1021,16 @@ public String getToolName() {
return String.format("%s %s", getToolkitShortName(), getClass().getSimpleName());
}
+ /**
+ * Expose a read-only version of the raw user-supplied intervals. This can be used by tools that need to explicitly
+ * traverse the intervals themselves (rather than, for example, walking the reads based on the intervals)
+ *
+ * @return - the raw user-supplied intervals, as an unmodifiable list
+ */
+ public List getUserSuppliedIntervals() {
+ return Collections.unmodifiableList(userIntervals);
+ }
+
/**
* Returns the list of intervals to iterate, either limited to the user-supplied intervals or the entire reference genome if none were specified.
* If no reference was supplied, null is returned
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/PartialReadWalker.java b/src/main/java/org/broadinstitute/hellbender/engine/PartialReadWalker.java
new file mode 100644
index 00000000000..48d15e263ae
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/engine/PartialReadWalker.java
@@ -0,0 +1,75 @@
+package org.broadinstitute.hellbender.engine;
+
+import org.broadinstitute.hellbender.engine.filters.CountingReadFilter;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+
+import java.util.Spliterator;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.function.BiConsumer;
+import java.util.stream.Stream;
+
+/**
+ * A specialized read walker that may be gracefully stopped before the input stream ends
+ *
+ * A tool derived from this class should implement {@link PartialReadWalker#shouldExitEarly(GATKRead)}
+ * to indicate when to stop. This method is called before {@link ReadWalker#apply(GATKRead, ReferenceContext, FeatureContext)}
+ *
+ */
+abstract public class PartialReadWalker extends ReadWalker {
+
+ /**
+ * traverse is overridden to consult the implementation class whether to stop
+ *
+ * The stoppage is implemented using a custom forEach method to compensate for the
+ * lack of .takeWhile() in Java 8
+ */
+
+ @Override
+ public void traverse() {
+
+ final CountingReadFilter countedFilter = makeReadFilter();
+ breakableForEach(getTransformedReadStream(countedFilter), (read, breaker) -> {
+
+ // check if we should stop
+ if ( shouldExitEarly(read) ) {
+ breaker.set(true);
+ } else {
+ // this is the body of the iteration
+ final SimpleInterval readInterval = getReadInterval(read);
+ apply(read,
+ new ReferenceContext(reference, readInterval), // Will create an empty ReferenceContext if reference or readInterval == null
+ new FeatureContext(features, readInterval)); // Will create an empty FeatureContext if features or readInterval == null
+
+ progressMeter.update(readInterval);
+ }
+ });
+
+ logger.info(countedFilter.getSummaryLine());
+ }
+
+ /**
+ * Method to be overridden by the implementation class to determine when to stop the read stream traversal
+ * @param read - the read to be processed next (in case it is needed)
+ * @return boolean indicator: true means stop!
+ */
+ protected abstract boolean shouldExitEarly(GATKRead read);
+
+ /**
+ * Java 8 does not have a .takeWhile() on streams. The code below implements a custom forEach to allow
+ * breaking out of a stream prematurely.
+ *
+ * code adapted from: https://www.baeldung.com/java-break-stream-foreach
+ */
+ private static void breakableForEach(Stream stream, BiConsumer consumer) {
+ Spliterator spliterator = stream.spliterator();
+ boolean hadNext = true;
+ AtomicBoolean breaker = new AtomicBoolean();
+
+ while (hadNext && !breaker.get()) {
+ hadNext = spliterator.tryAdvance(elem -> {
+ consumer.accept(elem, breaker);
+ });
+ }
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/filters/ReadFilter.java b/src/main/java/org/broadinstitute/hellbender/engine/filters/ReadFilter.java
index 6d186b921cb..f3003b00cd0 100644
--- a/src/main/java/org/broadinstitute/hellbender/engine/filters/ReadFilter.java
+++ b/src/main/java/org/broadinstitute/hellbender/engine/filters/ReadFilter.java
@@ -58,6 +58,18 @@ public ReadFilterBinOp(final ReadFilter lhs, final ReadFilter rhs) {
this.lhs = lhs;
this.rhs = rhs;
}
+
+ @Override
+ public void setHeader(SAMFileHeader samHeader) {
+ super.setHeader(samHeader);
+ if ( lhs != null ) {
+ lhs.setHeader(samHeader);
+ }
+ if ( rhs != null ) {
+ rhs.setHeader(samHeader);
+ }
+ }
+
}
@VisibleForTesting
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/filters/ReadFilterLibrary.java b/src/main/java/org/broadinstitute/hellbender/engine/filters/ReadFilterLibrary.java
index a0906bf9f91..eeaf34a6fcf 100644
--- a/src/main/java/org/broadinstitute/hellbender/engine/filters/ReadFilterLibrary.java
+++ b/src/main/java/org/broadinstitute/hellbender/engine/filters/ReadFilterLibrary.java
@@ -2,6 +2,10 @@
import htsjdk.samtools.Cigar;
import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.engine.filters.flow.FlowBasedTPAttributeSymetricReadFilter;
+import org.broadinstitute.hellbender.engine.filters.flow.FlowBasedTPAttributeValidReadFilter;
+import org.broadinstitute.hellbender.engine.filters.flow.HmerQualitySymetricReadFilter;
+import org.broadinstitute.hellbender.engine.filters.flow.ReadGroupHasFlowOrderReadFilter;
import org.broadinstitute.hellbender.tools.AddOriginalAlignmentTags;
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
@@ -328,4 +332,9 @@ public static class MateUnmappedAndUnmappedReadFilter extends ReadFilter {
public static final ValidAlignmentEndReadFilter VALID_ALIGNMENT_END = new ValidAlignmentEndReadFilter();
public static final NonChimericOriginalAlignmentReadFilter NON_CHIMERIC_ORIGINAL_ALIGNMENT_READ_FILTER = new NonChimericOriginalAlignmentReadFilter();
public static final MateUnmappedAndUnmappedReadFilter MATE_UNMAPPED_AND_UNMAPPED_READ_FILTER = new MateUnmappedAndUnmappedReadFilter();
+
+ public static final ReadGroupHasFlowOrderReadFilter READ_GROUP_HAS_FLOW_ORDER_READ_FILTER = new ReadGroupHasFlowOrderReadFilter();
+ public static final HmerQualitySymetricReadFilter HMER_QUALITY_SYMETRIC_READ_FILTER = new HmerQualitySymetricReadFilter();
+ public static final FlowBasedTPAttributeValidReadFilter FLOW_BASED_TP_ATTRIBUTE_VALID_READ_FILTER = new FlowBasedTPAttributeValidReadFilter();
+ public static final FlowBasedTPAttributeSymetricReadFilter FLOW_BASED_TP_ATTRIBUTE_SYMETRIC_READ_FILTER = new FlowBasedTPAttributeSymetricReadFilter();
}
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/filters/flow/FlowBasedHmerBasedReadFilterHelper.java b/src/main/java/org/broadinstitute/hellbender/engine/filters/flow/FlowBasedHmerBasedReadFilterHelper.java
new file mode 100644
index 00000000000..63935b73b64
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/engine/filters/flow/FlowBasedHmerBasedReadFilterHelper.java
@@ -0,0 +1,76 @@
+package org.broadinstitute.hellbender.engine.filters.flow;
+
+import htsjdk.samtools.CigarOperator;
+import htsjdk.samtools.SAMFileHeader;
+import org.apache.commons.lang3.tuple.Pair;
+import org.broadinstitute.hellbender.engine.filters.ReadFilter;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.utils.BaseUtils;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+
+/**
+ * A common helper class for flow based filters which test for conditions on an hmer basis
+ */
+public class FlowBasedHmerBasedReadFilterHelper {
+
+ interface FilterImpl {
+
+ // provide the array of values associated with read hmers
+ byte[] getValuesOfInterest(final GATKRead read);
+
+ // check that the range of values associated with a single hmer are passing the filter
+ boolean testHmer(final byte[] values, final int hmerStartingOffset, final int hmerLength);
+ }
+
+ // check if an area is a palindrome
+ static boolean isPalindrome(final byte[] values, final int ofs, final int length) {
+
+ // check that a range of bytes in the array forms a palindrome
+ for (int i = 0; i < length / 2; i++) {
+ if (values[ofs + i] != values[ofs + length - 1 - i]) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ static boolean test(final GATKRead read, FilterImpl impl) {
+
+ // access qualities
+ final byte[] values = impl.getValuesOfInterest(read);
+ if ( values == null )
+ return false;
+
+ // establish if edges are hard clipped
+ final boolean startHardClipped = read.getCigar().getFirstCigarElement().getOperator() == CigarOperator.HARD_CLIP;
+ final boolean endHardClipped = read.getCigar().getLastCigarElement().getOperator() == CigarOperator.HARD_CLIP;
+
+ // iterate over hmers
+ final BaseUtils.HmerIterator iter = new BaseUtils.HmerIterator(read.getBasesNoCopy());
+ int ofs = 0;
+ while ( iter.hasNext() ) {
+
+ // find hmer
+ final Pair hmer = iter.next();
+ final int hmerLength = hmer.getRight();
+
+ // establish first/last
+ final boolean first = ofs == 0;
+ final boolean last = !iter.hasNext();
+
+ // skip edge hmers if hard clipped
+ if ( !((first && startHardClipped) || (last && endHardClipped)) ) {
+ if (!impl.testHmer(values, ofs, hmerLength)) {
+ return false;
+ }
+ }
+
+ // advance
+ ofs += hmerLength;
+ }
+
+ // if here, all symmetric
+ return true;
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/filters/flow/FlowBasedTPAttributeSymetricReadFilter.java b/src/main/java/org/broadinstitute/hellbender/engine/filters/flow/FlowBasedTPAttributeSymetricReadFilter.java
new file mode 100644
index 00000000000..ee149dfde27
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/engine/filters/flow/FlowBasedTPAttributeSymetricReadFilter.java
@@ -0,0 +1,34 @@
+package org.broadinstitute.hellbender.engine.filters.flow;
+
+import org.broadinstitute.hellbender.engine.filters.ReadFilter;
+import org.broadinstitute.hellbender.utils.read.FlowBasedRead;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+
+/**
+ * A read filter to test if the TP values for each hmer in a flow based read form
+ * a polindrome (as they should)
+ */
+public class FlowBasedTPAttributeSymetricReadFilter extends ReadFilter implements FlowBasedHmerBasedReadFilterHelper.FilterImpl {
+ private static final long serialVersionUID = 1l;
+
+ public FlowBasedTPAttributeSymetricReadFilter() {
+ super();
+ }
+
+ @Override
+ public boolean test(final GATKRead read) {
+
+ return FlowBasedHmerBasedReadFilterHelper.test(read, this);
+ }
+
+ @Override
+ public byte[] getValuesOfInterest(GATKRead read) {
+ return read.getAttributeAsByteArray(FlowBasedRead.FLOW_MATRIX_TAG_NAME);
+ }
+
+ @Override
+ public boolean testHmer(byte[] values, int hmerStartingOffset, int hmerLength) {
+ return FlowBasedHmerBasedReadFilterHelper.isPalindrome(values, hmerStartingOffset, hmerLength);
+ }
+
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/filters/flow/FlowBasedTPAttributeValidReadFilter.java b/src/main/java/org/broadinstitute/hellbender/engine/filters/flow/FlowBasedTPAttributeValidReadFilter.java
new file mode 100644
index 00000000000..6e52cbe4556
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/engine/filters/flow/FlowBasedTPAttributeValidReadFilter.java
@@ -0,0 +1,46 @@
+package org.broadinstitute.hellbender.engine.filters.flow;
+
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.hellbender.engine.filters.ReadFilter;
+import org.broadinstitute.hellbender.utils.read.FlowBasedRead;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+
+/**
+ * A read filter to test if the TP values for each hmer in a flow based read
+ * are within the allowed range (being the possible lengths of hmers - maxHmer)
+ */public class FlowBasedTPAttributeValidReadFilter extends ReadFilter implements FlowBasedHmerBasedReadFilterHelper.FilterImpl {
+ private static final long serialVersionUID = 1l;
+
+ @Argument(fullName = "read-filter-max-hmer",
+ doc = "maxHmer to use for testing in the filter", optional = true)
+ public int maxHmer = 12;
+
+ public FlowBasedTPAttributeValidReadFilter() {
+ super();
+ }
+
+ @Override
+ public boolean test(final GATKRead read) {
+ return FlowBasedHmerBasedReadFilterHelper.test(read, this);
+ }
+
+ @Override
+ public byte[] getValuesOfInterest(final GATKRead read) {
+ return read.getAttributeAsByteArray(FlowBasedRead.FLOW_MATRIX_TAG_NAME);
+ }
+
+ @Override
+ public boolean testHmer(final byte[] values, final int hmerStartingOffset, final int hmerLength) {
+
+ // check matrix index resulting from tp value does not exceed limits
+ // (note that tp value is a 1/0/-1 adjustment of the hmer length)
+ for ( int i = 0 ; i < hmerLength ; i++ ) {
+ final int targetValue = values[hmerStartingOffset + i] + hmerLength;
+
+ if (targetValue < 0 || targetValue > maxHmer)
+ return false;
+ }
+
+ return true;
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/filters/flow/HmerQualitySymetricReadFilter.java b/src/main/java/org/broadinstitute/hellbender/engine/filters/flow/HmerQualitySymetricReadFilter.java
new file mode 100644
index 00000000000..9d19e7306da
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/engine/filters/flow/HmerQualitySymetricReadFilter.java
@@ -0,0 +1,31 @@
+package org.broadinstitute.hellbender.engine.filters.flow;
+
+import org.broadinstitute.hellbender.engine.filters.ReadFilter;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+
+/**
+ * A read filter to test if the quality values for each hmer in a flow based read form
+ * a palindrome (as they should)
+ */
+public class HmerQualitySymetricReadFilter extends ReadFilter implements FlowBasedHmerBasedReadFilterHelper.FilterImpl {
+ private static final long serialVersionUID = 1l;
+
+ public HmerQualitySymetricReadFilter() {
+ super();
+ }
+
+ @Override
+ public boolean test(final GATKRead read) {
+ return FlowBasedHmerBasedReadFilterHelper.test(read, this);
+ }
+
+ @Override
+ public byte[] getValuesOfInterest(GATKRead read) {
+ return read.getBaseQualitiesNoCopy();
+ }
+
+ @Override
+ public boolean testHmer(byte[] values, int hmerStartingOffset, int hmerLength) {
+ return FlowBasedHmerBasedReadFilterHelper.isPalindrome(values, hmerStartingOffset, hmerLength);
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/filters/flow/ReadGroupHasFlowOrderReadFilter.java b/src/main/java/org/broadinstitute/hellbender/engine/filters/flow/ReadGroupHasFlowOrderReadFilter.java
new file mode 100644
index 00000000000..e3d8a3ad366
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/engine/filters/flow/ReadGroupHasFlowOrderReadFilter.java
@@ -0,0 +1,44 @@
+package org.broadinstitute.hellbender.engine.filters.flow;
+
+import htsjdk.samtools.SAMFileHeader;
+import org.broadinstitute.hellbender.engine.filters.ReadFilter;
+import org.broadinstitute.hellbender.utils.logging.OneShotLogger;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+
+/**
+ * A read filter to test if the read's readGroup has a flow order associated with it
+ */
+public class ReadGroupHasFlowOrderReadFilter extends ReadFilter {
+ private static final long serialVersionUID = 1l;
+ private final static OneShotLogger readGroupFiltered = new OneShotLogger(ReadGroupHasFlowOrderReadFilter.class);
+
+ public ReadGroupHasFlowOrderReadFilter() {
+
+ }
+
+ public ReadGroupHasFlowOrderReadFilter(final SAMFileHeader header) {
+ setHeader(header);
+ }
+
+ @Override
+ public boolean test(final GATKRead read) {
+
+ final boolean result;
+
+ if ( read.getReadGroup() == null ) {
+ result = false;
+ } else if ( samHeader.getReadGroup(read.getReadGroup()) == null ) {
+ result = false;
+ } else if ( samHeader.getReadGroup(read.getReadGroup()).getFlowOrder() == null ) {
+ result = false;
+ } else {
+ result = true;
+ }
+
+ if ( !result ) {
+ readGroupFiltered.warn("at least one of readgroup is missing or missing a flow order.");
+ }
+
+ return result;
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/filters/flow/WellformedFlowBasedReadFilter.java b/src/main/java/org/broadinstitute/hellbender/engine/filters/flow/WellformedFlowBasedReadFilter.java
new file mode 100644
index 00000000000..88fb16e30db
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/engine/filters/flow/WellformedFlowBasedReadFilter.java
@@ -0,0 +1,71 @@
+package org.broadinstitute.hellbender.engine.filters.flow;
+
+import htsjdk.samtools.SAMFileHeader;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.engine.filters.AlignmentAgreesWithHeaderReadFilter;
+import org.broadinstitute.hellbender.engine.filters.ReadFilter;
+import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
+import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
+import org.broadinstitute.hellbender.utils.help.HelpConstants;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+
+/**
+ * Tests whether a flow based read is "well-formed" -- that is, is free of major internal inconsistencies and issues that could lead
+ * to errors downstream. If a read passes this filter, the rest of the engine should be able to process it without
+ * blowing up. Note that checks already present in WellformedReadFilter are not duplicated here.
+ *
+ * Well-formed flow based reads definition
+ *
+ * Flow order: read group must have flow order
+ * Quality: should be symmetrical within each hmer.
+ * tp attribute: should be symmetrical within each hmer.
+ * tp attribute: tp+hmer_length should be within [0, maxhmer],
+ * Hardclipped hmer: is exempted from above checks.
+ *
+ * @see ReadGroupHasFlowOrderReadFilter
+ * @see HmerQualitySymetricReadFilter
+ * @see FlowBasedTPAttributeSymetricReadFilter
+ * @see FlowBasedTPAttributeValidReadFilter
+ *
+ *
+ */
+@DocumentedFeature(groupName=HelpConstants.DOC_CAT_READFILTERS,
+ groupSummary=HelpConstants.DOC_CAT_READFILTERS_SUMMARY,
+ summary = "Keep only flow based reads that are well-formed",
+ extraDocs = {
+ ReadGroupHasFlowOrderReadFilter.class,
+ HmerQualitySymetricReadFilter.class,
+ FlowBasedTPAttributeSymetricReadFilter.class,
+ FlowBasedTPAttributeValidReadFilter.class
+ }
+)
+public final class WellformedFlowBasedReadFilter extends ReadFilter {
+ private static final long serialVersionUID = 1l;
+
+ private ReadFilter wellFormedFilter = null;
+
+ public WellformedFlowBasedReadFilter() {
+
+ }
+
+ @Override
+ public void setHeader(SAMFileHeader header) {
+ super.setHeader(header);
+ createFilter();
+ }
+
+ private void createFilter() {
+
+ wellFormedFilter = (new WellformedReadFilter(samHeader))
+ .and(new ReadGroupHasFlowOrderReadFilter(samHeader))
+ .and(ReadFilterLibrary.HMER_QUALITY_SYMETRIC_READ_FILTER)
+ .and(ReadFilterLibrary.FLOW_BASED_TP_ATTRIBUTE_VALID_READ_FILTER)
+ .and(ReadFilterLibrary.FLOW_BASED_TP_ATTRIBUTE_SYMETRIC_READ_FILTER);
+
+ }
+
+ @Override
+ public boolean test(final GATKRead read ) {
+ return wellFormedFilter.test(read);
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKRegistrator.java b/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKRegistrator.java
index 0f3f1709939..8204f427b2e 100644
--- a/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKRegistrator.java
+++ b/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKRegistrator.java
@@ -20,10 +20,7 @@
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.read.SAMRecordToGATKReadAdapter;
import org.broadinstitute.hellbender.utils.read.markduplicates.ReadsKey;
-import org.broadinstitute.hellbender.utils.read.markduplicates.sparkrecords.EmptyFragment;
-import org.broadinstitute.hellbender.utils.read.markduplicates.sparkrecords.Fragment;
-import org.broadinstitute.hellbender.utils.read.markduplicates.sparkrecords.Pair;
-import org.broadinstitute.hellbender.utils.read.markduplicates.sparkrecords.Passthrough;
+import org.broadinstitute.hellbender.utils.read.markduplicates.sparkrecords.*;
import org.objenesis.instantiator.ObjectInstantiator;
import java.util.ArrayList;
@@ -138,6 +135,7 @@ private void registerGATKClasses(Kryo kryo) {
kryo.register(SAMReadGroupRecord.class);
kryo.register(EmptyFragment.class, new FieldSerializer(kryo, EmptyFragment.class));
kryo.register(Fragment.class, new FieldSerializer(kryo, Fragment.class));
+ kryo.register(FlowModeFragment.class, new FieldSerializer(kryo, FlowModeFragment.class));
kryo.register(Pair.class, new Pair.Serializer());
kryo.register(Passthrough.class, new FieldSerializer(kryo, Passthrough.class));
kryo.register(MarkDuplicatesSparkUtils.IndexPair.class, new FieldSerializer(kryo, MarkDuplicatesSparkUtils.IndexPair.class));
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/CalculateAverageCombinedAnnotations.java b/src/main/java/org/broadinstitute/hellbender/tools/CalculateAverageCombinedAnnotations.java
new file mode 100644
index 00000000000..5d9812f04da
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/CalculateAverageCombinedAnnotations.java
@@ -0,0 +1,78 @@
+package org.broadinstitute.hellbender.tools;
+
+import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.variantcontext.VariantContextBuilder;
+import htsjdk.variant.variantcontext.writer.VariantContextWriter;
+import htsjdk.variant.vcf.VCFHeader;
+import htsjdk.variant.vcf.VCFHeaderLineType;
+import htsjdk.variant.vcf.VCFInfoHeaderLine;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.barclay.argparser.ExperimentalFeature;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.cmdline.programgroups.FlowBasedProgramGroup;
+import org.broadinstitute.hellbender.engine.*;
+import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
+
+import java.util.*;
+
+@CommandLineProgramProperties(
+ summary = "Divides annotations that were summed across samples by genomicsDB by the number of samples with het or hom var calls. " +
+ "This is an approximation of taking the average of these annotations.",
+ oneLineSummary = "Divides annotations that were summed by genomicsDB by number of samples to calculate average.",
+ programGroup = FlowBasedProgramGroup.class,
+ omitFromCommandLine = true
+)
+@ExperimentalFeature
+public final class CalculateAverageCombinedAnnotations extends VariantWalker {
+ public static final String ANNOTATION_LIST_LONG_NAME = "summed-annotation-to-divide";
+ public static final String ANNOTATION_LIST_SHORT_NAME = "summed-annotation";
+
+ @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "Output file (if not provided, defaults to STDOUT)", common = false, optional = false)
+ private GATKPath outputFile = null;
+
+ @Argument(fullName = ANNOTATION_LIST_LONG_NAME, shortName = ANNOTATION_LIST_SHORT_NAME, doc = "INFO Annotations in VCF that have been summed by GenomicsDB and need to be divided by the number of het or homvar samples to calculate the average value. Must use annotation string as it's defined in the VCF.", common = false, optional = false)
+ private List annotations = new ArrayList<>();
+
+ private VariantContextWriter vcfWriter = null;
+
+ @Override
+ public void onTraversalStart() {
+ vcfWriter = createVCFWriter(outputFile);
+ if (annotations.size() == 0) {
+ throw new UserException("--" + ANNOTATION_LIST_LONG_NAME + " must be provided.");
+ }
+ VCFHeader header = getHeaderForVariants();
+ for(String annot : annotations) {
+ header.addMetaDataLine(new VCFInfoHeaderLine("AVERAGE_" + annot, 1, VCFHeaderLineType.Float, "Average of "+ annot +" annotation across samples. See "+ annot +" header line for more information."));
+ }
+ vcfWriter.writeHeader(header);
+ }
+
+ @Override
+ public void apply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) {
+ if (!variant.hasAttribute(GATKVCFConstants.RAW_GENOTYPE_COUNT_KEY)){
+ throw new UserException(String.format("Need annotation %s at site %s:%d", GATKVCFConstants.RAW_GENOTYPE_COUNT_KEY, variant.getContig(), variant.getStart()));
+ }
+ List genotypeCounts = variant.getAttributeAsStringList(GATKVCFConstants.RAW_GENOTYPE_COUNT_KEY, "");
+ double counter = Double.parseDouble(genotypeCounts.get(1)) + Double.parseDouble(genotypeCounts.get(2)); //Het and hom var counts, all alleles are lumped together.
+ if (counter > 0 ) {
+ Map finalAnnotations = new HashMap<>();
+ for (String annot : annotations) {
+ if (variant.hasAttribute(annot)) {
+ finalAnnotations.put("AVERAGE_" + annot, variant.getAttributeAsDouble(annot, 0) / counter);
+ }
+ }
+ VariantContext vc = new VariantContextBuilder(variant).putAttributes(finalAnnotations).make();
+ vcfWriter.add(vc);
+ } else {
+ vcfWriter.add(variant);
+ }
+ }
+
+ @Override
+ public void closeTool() {
+ vcfWriter.close();
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/ClipReads.java b/src/main/java/org/broadinstitute/hellbender/tools/ClipReads.java
index 57d2b4c88b1..ab8444b923c 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/ClipReads.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/ClipReads.java
@@ -11,21 +11,22 @@
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
-import org.broadinstitute.barclay.argparser.WorkflowProperties;
import org.broadinstitute.barclay.argparser.WorkflowOutput;
+import org.broadinstitute.barclay.argparser.WorkflowProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
-import org.broadinstitute.hellbender.engine.GATKPath;
-import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
import org.broadinstitute.hellbender.engine.FeatureContext;
+import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.ReadWalker;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.utils.BaseUtils;
import org.broadinstitute.hellbender.utils.clipping.ClippingOp;
import org.broadinstitute.hellbender.utils.clipping.ClippingRepresentation;
import org.broadinstitute.hellbender.utils.clipping.ReadClipper;
+import org.broadinstitute.hellbender.utils.logging.OneShotLogger;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter;
+import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
import java.io.PrintStream;
import java.util.*;
@@ -68,6 +69,8 @@
* filtering only bases whose sequence exactly matches SEQ.
*
*
+ * Adapter locations
+ * Only on uBAM: if adapter on five prime XF or adapter in three prime XT is given - clip it.
* Input
* Any number of SAM/BAM/CRAM files.
*
@@ -86,6 +89,9 @@
* Number of quality-score clipped bases 126
* Number of range clipped bases 0
* Number of sequence clipped bases 0
+ *
+ * if --clip-adapter is provided, an additional line will be appended:
+ * Number of adapter clipped bases 18228
*
*
* Example usage
@@ -142,6 +148,8 @@
public final class ClipReads extends ReadWalker {
private final Logger logger = LogManager.getLogger(ClipReads.class);
+ private static final OneShotLogger tooShortOneShotLogger = new OneShotLogger(ClipReads.class);
+ private static final OneShotLogger noAttrOneShotLogger = new OneShotLogger(ClipReads.class);
public static final String OUTPUT_STATISTICS_LONG_NAME = "output-statistics";
public static final String OUTPUT_STATISTICS_SHORT_NAME = "os";
@@ -155,8 +163,18 @@ public final class ClipReads extends ReadWalker {
public static final String CLIP_SEQUENCE_SHORT_NAME = "X";
public static final String CLIP_REPRESENTATION_LONG_NAME = "clip-representation";
public static final String CLIP_REPRESENTATION_SHORT_NAME = "CR";
+ public static final String CLIP_ADAPTER_LONG_NAME = "clip-adapter";
+ public static final String CLIP_ADAPTER_SHORT_NAME = "CA";
public static final String READ_LONG_NAME = "read";
public static final String READ_SHORT_NAME = READ_LONG_NAME;
+ public static final String MIN_READ_LENGTH_TO_REPORT_LONG_NAME = "min-read-length-to-output";
+ public static final String FIVE_PRIME_TRIMMING_TAG = "tf";
+ public static final String THREE_PRIME_TRIMMING_TAG = "tm";
+ public static final String FIVE_PRIME_ADAPTER_LOCATION_TAG = "XF";
+ public static final String THREE_PRIME_ADAPTER_LOCATION_TAG = "XT";
+
+
+
/**
* The output SAM/BAM/CRAM file will be written here
@@ -179,6 +197,7 @@ public final class ClipReads extends ReadWalker {
@Argument(fullName = Q_TRIMMING_THRESHOLD_LONG_NAME, shortName = Q_TRIMMING_THRESHOLD_SHORT_NAME, doc = "If provided, the Q-score clipper will be applied", optional = true)
int qTrimmingThreshold = -1;
+
/**
* Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc.
* For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based
@@ -211,6 +230,13 @@ public final class ClipReads extends ReadWalker {
@Argument(fullName=READ_LONG_NAME, shortName = READ_SHORT_NAME, doc="", optional = true)
String onlyDoRead = null;
+ @Argument(fullName = CLIP_ADAPTER_LONG_NAME, shortName = CLIP_ADAPTER_SHORT_NAME, doc = "Clip locations according to XF, XT tags. This will destroy reads which have none of these tags", optional = true)
+ boolean clipAdapter = false;
+
+ //Note: relevant only to the single ended reads
+ @Argument(fullName = MIN_READ_LENGTH_TO_REPORT_LONG_NAME, doc = "Shortest read to output. Note that this works correctly only on non-paired reads.", optional = true)
+ private final Integer minReadLength = 0;
+
/**
* List of sequence that should be clipped from the reads
*/
@@ -313,6 +339,7 @@ public void apply( GATKRead read, ReferenceContext ref, FeatureContext featureCo
clipBadQualityScores(clipper);
clipCycles(clipper);
clipSequences(clipper);
+ clipAdapter(clipper);
accumulate(clipper);
}
}
@@ -400,6 +427,7 @@ private Pair strandAwarePositions(GATKRead read, int start, in
return new MutablePair<>(start, stop);
}
+
/**
* clip bases at cycles between the ranges in cyclesToClip by adding appropriate ClippingOps to clipper.
*
@@ -477,15 +505,73 @@ private void clipBadQualityScores(ReadClipperWithData clipper) {
clipper.setData(data);
}
+ /**
+ * Clip bases on the reads that have a trimming mark. Some programs use tags to indicate
+ * the number of bases that should be trimmed from the read to trim the adapter and do not
+ * trim the adapter itself. This function uses these tags (XF and XT) to make the trimming
+ *
+ * @param clipper
+ */
+ private void clipAdapter(ReadClipperWithData clipper) {
+ if (clipAdapter) {
+ GATKRead read = clipper.getRead();
+ ClippingData data = clipper.getData();
+ Integer xf = read.getAttributeAsInteger(FIVE_PRIME_ADAPTER_LOCATION_TAG);
+ Integer xt = read.getAttributeAsInteger(THREE_PRIME_ADAPTER_LOCATION_TAG);
+ if ((xf != null) && (xt != null) && (xf == 0) && (xt == 0)) {
+ ClippingOp clip = new ClippingOp(0, read.getLength());
+ clipper.addOp(clip);
+ data.incNAdapterClippedBases((read.getLength()));
+ return;
+ }
+ if ((xt != null) && (xt <= read.getLength())) { //XT is the location of the first nucleotide in the 3' adapter to be clipped (one-based)
+ ClippingOp xt_clip = new ClippingOp(xt - 1, read.getLength());
+ clipper.addOp(xt_clip);
+ addAdapterTag(clipper, THREE_PRIME_TRIMMING_TAG);
+ data.incNAdapterClippedBases(read.getLength() - xt + 1);
+ }
+
+ if ((xf != null) && (xf > 1)) { // XF is the location of the first nucleotide to be not-clipped (one-based to be consistent with XT)
+ ClippingOp xf_clip = new ClippingOp(0, xf - 2); //stop is included
+ clipper.addOp(xf_clip);
+ addAdapterTag(clipper, FIVE_PRIME_TRIMMING_TAG);
+ data.incNAdapterClippedBases(xf);
+ }
+
+ if ( (xf == null) && (xt == null)) {
+ noAttrOneShotLogger.warn("clipAdapter requested, yet neither " + FIVE_PRIME_ADAPTER_LOCATION_TAG + " nor " +
+ THREE_PRIME_ADAPTER_LOCATION_TAG + " attributes found. first read: " + read);
+ }
+ }
+
+ }
+
+ private void addAdapterTag(ReadClipperWithData clipper,final String tag){
+ String curTagValue = clipper.getRead().getAttributeAsString(tag);
+ if (curTagValue == null ){
+ clipper.getRead().setAttribute(tag, "A");
+ } else if (!curTagValue.contains("A")){
+ clipper.getRead().setAttribute(tag, curTagValue + "A");
+ }
+ }
+
+
private void accumulate(ReadClipperWithData clipper) {
if ( clipper == null )
return;
GATKRead clippedRead = clipper.clipRead(clippingRepresentation);
- outputBam.addRead(clippedRead);
-
+ if ( minReadLength > 0 ) {
+ if (clippedRead.isPaired()){
+ tooShortOneShotLogger.warn("Limit on the read length is not implemented for PE reads. continuing anyway. first read:" + clippedRead);
+ }
+ }
+ if (clippedRead.getLength() >= minReadLength) {
+ outputBam.addRead(clippedRead);
+ }
accumulator.nTotalReads++;
accumulator.nTotalBases += clipper.getRead().getLength();
+
if (clipper.wasClipped()) {
accumulator.nClippedReads++;
accumulator.addData(clipper.getData());
@@ -512,7 +598,7 @@ public SeqToClip(String name, byte[] bytez) {
}
}
- public static final class ClippingData {
+ public final class ClippingData {
public long nTotalReads = 0;
public long nTotalBases = 0;
public long nClippedReads = 0;
@@ -520,6 +606,7 @@ public static final class ClippingData {
public long nQClippedBases = 0;
public long nRangeClippedBases = 0;
public long nSeqClippedBases = 0;
+ public long nAdapterClippedBases = 0;
SortedMap seqClipCounts = new TreeMap<>();
@@ -539,6 +626,11 @@ public void incNRangeClippedBases(int n) {
nClippedBases += n;
}
+ public void incNAdapterClippedBases(int n){
+ nAdapterClippedBases += n;
+ nClippedBases +=n;
+ }
+
public void incSeqClippedBases(final String seq, int n) {
nSeqClippedBases += n;
nClippedBases += n;
@@ -553,7 +645,7 @@ public void addData (ClippingData data) {
nQClippedBases += data.nQClippedBases;
nRangeClippedBases += data.nRangeClippedBases;
nSeqClippedBases += data.nSeqClippedBases;
-
+ nAdapterClippedBases += data.nAdapterClippedBases;
for (String seqClip : data.seqClipCounts.keySet()) {
Long count = data.seqClipCounts.get(seqClip);
if (seqClipCounts.containsKey(seqClip))
@@ -579,13 +671,16 @@ public String toString() {
for (Map.Entry elt : seqClipCounts.entrySet()) {
s.append(String.format(" %8d clip sites matching %s%n", elt.getValue(), elt.getKey()));
}
+ if ( clipAdapter ) {
+ s.append(String.format("Number of adapter clipped bases %d%n", nAdapterClippedBases));
+ }
s.append(StringUtils.repeat('-', 80) + "\n");
return s.toString();
}
}
- public static final class ReadClipperWithData extends ReadClipper {
+ public final class ReadClipperWithData extends ReadClipper {
private ClippingData data;
public ReadClipperWithData(GATKRead read, List clipSeqs) {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/FlowBasedAlignmentArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/FlowBasedAlignmentArgumentCollection.java
new file mode 100644
index 00000000000..9a9f0468388
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/FlowBasedAlignmentArgumentCollection.java
@@ -0,0 +1,26 @@
+package org.broadinstitute.hellbender.tools;
+
+import org.broadinstitute.barclay.argparser.Advanced;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.Hidden;
+
+public class FlowBasedAlignmentArgumentCollection extends FlowBasedArgumentCollection {
+
+ private static final long serialVersionUID = 0;
+
+ public static final String FLOW_LIKELIHOOD_PARALLEL_THREADS_LONG_NAME = "flow-likelihood-parallel-threads";
+ public static final String FLOW_LIKELIHOOD_OPTIMIZED_COMP_LONG_NAME = "flow-likelihood-optimized-comp";
+
+ @Advanced
+ @Hidden
+ @Argument(fullName = FLOW_LIKELIHOOD_PARALLEL_THREADS_LONG_NAME, doc = "Number of threads to parallelize likelihood computation inner (read) loop with", optional=true)
+ public int flowLikelihoodParallelThreads = 0;
+
+ @Advanced
+ @Hidden
+ @Argument(fullName = FLOW_LIKELIHOOD_OPTIMIZED_COMP_LONG_NAME, doc = "Use optimized likelihood computation version. The code is optimized in that it performs fewer log10 calls - which are expensive - by using precomputed values " +
+ "for common probability values", optional=true)
+ public boolean flowLikelihoodOptimizedComp = false;
+
+
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/FlowBasedArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/FlowBasedArgumentCollection.java
new file mode 100644
index 00000000000..98733d00ba7
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/FlowBasedArgumentCollection.java
@@ -0,0 +1,154 @@
+package org.broadinstitute.hellbender.tools;
+
+import org.apache.commons.lang3.ArrayUtils;
+import org.broadinstitute.barclay.argparser.Advanced;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.Hidden;
+import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions;
+import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.*;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.cmdline.ModeArgumentUtils;
+import org.broadinstitute.hellbender.utils.read.FlowBasedRead;
+
+import java.io.Serializable;
+
+public class FlowBasedArgumentCollection implements Serializable {
+ private static final long serialVersionUID = 0;
+
+ public static final String FLOW_USE_T0_TAG = "flow-use-t0-tag";
+ public static final String PROBABILITY_RATIO_THRESHOLD_LONG_NAME = "flow-probability-threshold";
+ public static final String REMOVE_LONGER_THAN_ONE_INDELS_LONG_NAME = "flow-remove-non-single-base-pair-indels";
+ public static final String REMOVE_ONE_TO_ZERO_PROBS_LONG_NAME = "flow-remove-one-zero-probs";
+ public static final String NUMBER_OF_POSSIBLE_PROBS_LONG_NAME = "flow-quantization-bins";
+ public static final String FILLING_VALUE_LONG_NAME = "flow-fill-empty-bins-value";
+ public static final String SYMMETRIC_INDELS_LONG_NAME = "flow-symmetric-indel-probs";
+ public static final String REPORT_INS_OR_DEL_LONG_NAME = "flow-report-insertion-or-deletion";
+ public static final String DISALLOW_LARGER_PROBS_LONG_NAME = "flow-disallow-probs-larger-than-call";
+ public static final String LUMP_PROBS_LONG_NAME = "flow-lump-probs";
+ public static final String PROB_SF_LONG_NAME = "flow-probability-scaling-factor";
+ public static final String RETAIN_MAX_N_PROBS_BASE_LONG_NAME = "flow-retain-max-n-probs-base-format";
+ public static final String FLOW_ORDER_CYCLE_LENGTH_LONG_NAME = "flow-order-cycle-length";
+ public static final String NUM_UNCERTAIN_FLOWS_LONG_NAME = "flow-number-of-uncertain-flows-to-clip";
+ public static final String FIRST_UNCERTAIN_FLOW_LONG_NAME = "flow-nucleotide-of-first-uncertain-flow";
+ public static final String FLOW_MATRIX_MODS_LONG_NAME = "flow-matrix-mods";
+ public static final String FLOW_KEEP_BOUNDARY_FLOWS_LONG_NAME = "keep-boundary-flows";
+
+
+
+ private static final double DEFAULT_RATIO_THRESHOLD = 0.003;
+ private static final double DEFAULT_FILLING_VALUE = 0.001;
+ private static final boolean DEFAULT_REMOVE_LONGER_INDELS = false;
+ private static final boolean DEFAULT_REMOVE_ONE_TO_ZERO = false;
+ private static final boolean DEFAULT_SYMMETRIC_INDELS = false;
+ private static final int DEFAULT_QUANTIZATION = 121;
+ private static final boolean DEFAULT_ONLY_INS_OR_DEL = false;
+ private static final boolean DEFAULT_DISALLOW_LARGER_PROBS = false;
+ private static final boolean DEFAULT_LUMP_PROBS = false;
+ private static final boolean DEFAULT_RETAIN_MAX_N_PROBS = false;
+ private static final int DEFAULT_PROB_SCALING_FACTOR = 10;
+ private static final int DEFAULT_FLOW_ORDER_CYCLE_LENGTH = 4;
+ private static final int DEFAULT_NUM_UNCERTAIN_FLOWS = 0;
+ private static final String DEFAULT_FIRST_UNCERTAIN_FLOW = "T";
+ private static final boolean DEFAULT_FLOW_USE_T0_TAG = false;
+
+ @Advanced
+ @Argument(fullName = FLOW_USE_T0_TAG, doc = "Use t0 tag if exists in the read to create flow matrix", optional = true)
+ public boolean useT0Tag = DEFAULT_FLOW_USE_T0_TAG;
+
+ @Advanced
+ @Argument(fullName = PROBABILITY_RATIO_THRESHOLD_LONG_NAME, doc = "Lowest probability ratio to be used as an option", optional = true)
+ public double probabilityRatioThreshold = DEFAULT_RATIO_THRESHOLD;
+
+ @Advanced
+ @Argument(fullName = REMOVE_LONGER_THAN_ONE_INDELS_LONG_NAME, doc = "Should the probabilities of more than 1 indel be used", optional = true)
+ public boolean removeLongerThanOneIndels = DEFAULT_REMOVE_LONGER_INDELS;
+
+ @Advanced
+ @Argument(fullName = REMOVE_ONE_TO_ZERO_PROBS_LONG_NAME, doc = "Remove probabilities of basecall of zero from non-zero genome", optional = true)
+ public boolean removeOneToZeroProbs = DEFAULT_REMOVE_ONE_TO_ZERO;
+
+ @Advanced
+ @Argument(fullName = NUMBER_OF_POSSIBLE_PROBS_LONG_NAME, doc = "Number of bins for probability quantization", optional = true)
+ public int probabilityQuantization = DEFAULT_QUANTIZATION;
+
+ @Advanced
+ @Argument(fullName = FILLING_VALUE_LONG_NAME, doc = "Value to fill the zeros of the matrix with", optional=true)
+ public double fillingValue = DEFAULT_FILLING_VALUE;
+
+ @Advanced
+ @Argument(fullName = SYMMETRIC_INDELS_LONG_NAME, doc = "Should indel probabilities be symmetric in flow", optional=true)
+ public boolean symmetricIndels = DEFAULT_SYMMETRIC_INDELS;
+
+ @Advanced
+ @Argument(fullName = REPORT_INS_OR_DEL_LONG_NAME, doc = "Report either insertion or deletion, probability, not both", optional=true)
+ public boolean onlyInsOrDel = DEFAULT_ONLY_INS_OR_DEL;
+
+ @Advanced
+ @Argument(fullName = DISALLOW_LARGER_PROBS_LONG_NAME, doc = "Cap probabilities of error to 1 relative to base call", optional=true)
+ public boolean disallowLargerProbs = DEFAULT_DISALLOW_LARGER_PROBS;
+
+ @Advanced
+ @Argument(fullName = LUMP_PROBS_LONG_NAME, doc = "Should all probabilities of insertion or deletion in the flow be combined together", optional=true)
+ public boolean lumpProbs = DEFAULT_LUMP_PROBS;
+
+ @Advanced
+ @Argument(fullName = RETAIN_MAX_N_PROBS_BASE_LONG_NAME, doc = "Keep only hmer/2 probabilities (like in base format)", optional=true)
+ public boolean retainMaxNProbs = DEFAULT_RETAIN_MAX_N_PROBS;
+
+ @Advanced
+ @Argument(fullName = PROB_SF_LONG_NAME, doc = "probability scaling factor for (phred=10) for probability quantization", optional=true)
+ public int probabilityScalingFactor = DEFAULT_PROB_SCALING_FACTOR;
+
+ @Advanced
+ @Hidden
+ @Argument(fullName = FLOW_ORDER_CYCLE_LENGTH_LONG_NAME, doc = "Length of flow order cycle", optional=true)
+ public int flowOrderCycleLength = DEFAULT_FLOW_ORDER_CYCLE_LENGTH;
+
+ @Advanced
+ @Hidden
+ @Argument(fullName = NUM_UNCERTAIN_FLOWS_LONG_NAME, doc = "Number of uncertain flows to trim on the 5' end of the read", optional=true)
+ public int flowNumUncertainFlows = DEFAULT_NUM_UNCERTAIN_FLOWS;
+
+ @Advanced
+ @Hidden
+ @Argument(fullName = FIRST_UNCERTAIN_FLOW_LONG_NAME, doc = "Nucleotide that is being read in the first uncertain (5') flow", optional=true)
+ public String flowFirstUncertainFlowBase = DEFAULT_FIRST_UNCERTAIN_FLOW;
+
+ @Advanced
+ @Argument(fullName=FLOW_MATRIX_MODS_LONG_NAME, doc="Modifications instructions to the read flow matrix. " +
+ "Format is src,dst{,src,dst}+. Example: 10,12,11,12 - these instructions will copy element 10 into 11 and 12", optional = true)
+ public String flowMatrixMods = null;
+
+ @Advanced
+ @Argument(fullName=FLOW_KEEP_BOUNDARY_FLOWS_LONG_NAME, doc="prevent spreading of boundary flows.", optional = true)
+ public boolean keepBoundaryFlows = false;
+
+ public FlowBasedArgumentCollection() {}
+
+
+ /**
+ * This matrix contains logic for modifying the flow matrix as it is read in.
+ *
+ * If the value of [n] is not zero, then the hmer probability for hmer length n will be copied to the [n] position
+ * For the implementation logic, see fillFlowMatrix
+ */
+ private int[] flowMatrixModsInstructions = null;
+
+ public int[] getFlowMatrixModsInstructions() {
+
+ if ( flowMatrixMods != null && flowMatrixModsInstructions == null ) {
+ flowMatrixModsInstructions = new int[FlowBasedRead.MAX_CLASS + 1];
+
+ final String[] toks = flowMatrixMods.split(",");
+ for ( int i = 0 ; i < toks.length - 1 ; i += 2 ) {
+ final int hmer = Utils.validIndex(Integer.parseInt(toks[i]), flowMatrixModsInstructions.length);
+ flowMatrixModsInstructions[hmer] = Integer.parseInt(toks[i + 1]);
+ }
+ }
+
+ return flowMatrixModsInstructions;
+ }
+
+
+
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/HaplotypeCallerSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/HaplotypeCallerSpark.java
index c14a8c2bef4..8ff76a1ae6c 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/HaplotypeCallerSpark.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/HaplotypeCallerSpark.java
@@ -29,10 +29,7 @@
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.annotator.Annotation;
import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine;
-import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.HaplotypeCaller;
-import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.HaplotypeCallerArgumentCollection;
-import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.HaplotypeCallerEngine;
-import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.ReferenceConfidenceMode;
+import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.*;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.hellbender.utils.io.IOUtils;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/SplitCRAM.java b/src/main/java/org/broadinstitute/hellbender/tools/SplitCRAM.java
new file mode 100644
index 00000000000..efd5fceff9b
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/SplitCRAM.java
@@ -0,0 +1,133 @@
+package org.broadinstitute.hellbender.tools;
+
+import htsjdk.samtools.cram.build.CramContainerIterator;
+import htsjdk.samtools.cram.build.CramIO;
+import htsjdk.samtools.cram.structure.Container;
+import htsjdk.samtools.cram.structure.CramHeader;
+import org.broadinstitute.barclay.argparser.*;
+import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.cmdline.programgroups.FlowBasedProgramGroup;
+import org.broadinstitute.hellbender.engine.GATKPath;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import picard.cmdline.programgroups.OtherProgramGroup;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.regex.Pattern;
+
+/***
+ * SplitCRAM - split a cram file into smaller cram files (shards) containing a minimal number of records
+ * while still respecting container boundaries.
+ *
+ * The tool operates on a CRAM container level and therefore is efficient but not exact in the number of
+ * records on each output file (container boundaries are maintained)
+ *
+ * Note that CRAM files have relative record counters embedded in each container. These are not reset by
+ * this tool. Therefore, the resulting files may not contain correct record counter values.
+ *
+ * Usage
+ *
+ * ./gatk SplitCRAM \
+ * -I
+ * input.cram
+ * -O
+ * output_%04d.cram
+ * --shard-records
+ * 5000000
+ *
+ *
+ * Notes:
+ * 1. shard-records is optional. defaults to 10M
+ * 2. output filename should contain a %d formatter pattern
+ */
+
+@CommandLineProgramProperties(
+ summary = "Splits CRAM files efficiently by taking advantage of their container based structure",
+ oneLineSummary = "Split CRAM files to smaller files efficiently",
+ programGroup = FlowBasedProgramGroup.class
+)
+@WorkflowProperties
+@ExperimentalFeature
+public class SplitCRAM extends CommandLineProgram {
+
+ public static final int DEFAULT_SHARD_RECORDS = 10000000;
+ public static final String SHARD_RECORDS_FULL_NAME = "shard-records";
+ public static final Pattern numeratorFormat = Pattern.compile("%[0-9]*d");
+
+ @Argument(fullName = StandardArgumentDefinitions.INPUT_LONG_NAME, shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME,
+ doc = "input cram file to split")
+ private GATKPath cramInput = null;
+
+ @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
+ doc = "output cram file template. should contain %d, which will be replaced by shard index", optional = true)
+ private String cramOutputTemplate = "output_%04d.cram";
+
+ @Argument(fullName = SHARD_RECORDS_FULL_NAME, doc = "minimum threshold for number of records per shard.", optional = true)
+ private long shardRecords = DEFAULT_SHARD_RECORDS;
+
+ // locals
+ CramContainerIterator cramContainerIterator;
+ int shard;
+
+ @Override
+ protected void onStartup() {
+ super.onStartup();
+
+ // check that output template contains a %d formatter
+ if ( !numeratorFormat.matcher(cramOutputTemplate).find() ) {
+ throw new IllegalArgumentException("output template missing a %d enumerator formatter: " + cramOutputTemplate);
+ }
+ }
+
+ @Override
+ protected Object doWork() {
+
+ try (final CramContainerIterator cramContainerIterator = new CramContainerIterator(new BufferedInputStream(cramInput.getInputStream())) ){
+
+ // get header
+ final CramHeader cramHeader = cramContainerIterator.getCramHeader();
+
+ // iterate
+ while (cramContainerIterator.hasNext()) {
+
+ try (final OutputStream os = nextOutputStream()) {
+
+ // write headers
+ CramIO.writeCramHeader(cramContainerIterator.getCramHeader(), os);
+ Container.writeSAMFileHeaderContainer(cramContainerIterator.getCramHeader().getCRAMVersion(), cramContainerIterator.getSamFileHeader(), os);
+
+ // iterate
+ long records = 0;
+ while (cramContainerIterator.hasNext() && (records < shardRecords)) {
+
+ // get next container
+ final Container container = cramContainerIterator.next();
+
+ // write container to output stream
+ container.write(cramHeader.getCRAMVersion(), os);
+
+ // update record count
+ records += container.getContainerHeader().getNumberOfRecords();
+ }
+
+ CramIO.writeCramEOF(cramContainerIterator.getCramHeader().getCRAMVersion(), os);
+ }
+ }
+ } catch (IOException e) {
+ throw new GATKException(e.getMessage(), e);
+ }
+
+ return null;
+ }
+
+ private OutputStream nextOutputStream() {
+
+ final String filename = String.format(cramOutputTemplate, shard++);
+ final GATKPath path = new GATKPath(filename);
+
+ return new BufferedOutputStream(path.getOutputStream());
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBUtils.java
index a18747c4552..f81e825e39e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBUtils.java
@@ -84,6 +84,18 @@ public static void updateImportProtobufVidMapping(GenomicsDBImporter importer) {
GATKVCFConstants.AS_RAW_RMS_MAPPING_QUALITY_KEY, true);
vidMapPB = updateFieldSetDisableRemapMissingAlleleToNonRef(vidMapPB, fieldNameToIndexInVidFieldsList,
GATKVCFConstants.AS_SB_TABLE_KEY, true);
+ vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
+ GATKVCFConstants.TREE_SCORE, SUM);
+ vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
+ GATKVCFConstants.HAPLOTYPE_COMPLEXITY_KEY, ELEMENT_WISE_SUM);
+ vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
+ GATKVCFConstants.HAPLOTYPE_DOMINANCE_KEY, ELEMENT_WISE_SUM);
+ vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
+ GATKVCFConstants.HAPLOTYPES_BEFORE_FILTERING_KEY, SUM);
+ vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
+ GATKVCFConstants.HAPLOTYPES_FILTERED_KEY, SUM);
+
+
importer.updateProtobufVidMapping(vidMapPB);
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSpark.java
index d67c4b09ddc..d5adf93e978 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSpark.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSpark.java
@@ -21,6 +21,7 @@
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.cmdline.ModeArgumentUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import org.broadinstitute.hellbender.utils.read.markduplicates.GATKDuplicationMetrics;
@@ -198,7 +199,8 @@ public static JavaRDD mark(final JavaRDD reads, final SAMFil
final MarkDuplicatesScoringStrategy scoringStrategy,
final OpticalDuplicateFinder opticalDuplicateFinder,
final int numReducers, final boolean dontMarkUnmappedMates,
- final MarkDuplicates.DuplicateTaggingPolicy taggingPolicy) {
+ final MarkDuplicates.DuplicateTaggingPolicy taggingPolicy,
+ final MarkDuplicatesSparkArgumentCollection mdArgs) {
final boolean markUnmappedMates = !dontMarkUnmappedMates;
SAMFileHeader headerForTool = header.clone();
@@ -208,7 +210,7 @@ public static JavaRDD mark(final JavaRDD reads, final SAMFil
// If we need to remove optical duplicates or tag them, then make sure we are keeping track
final boolean markOpticalDups = (taggingPolicy != MarkDuplicates.DuplicateTaggingPolicy.DontTag);
- final JavaPairRDD, Integer> namesOfNonDuplicates = MarkDuplicatesSparkUtils.transformToDuplicateNames(headerForTool, scoringStrategy, opticalDuplicateFinder, sortedReadsForMarking, numReducers, markOpticalDups);
+ final JavaPairRDD, Integer> namesOfNonDuplicates = MarkDuplicatesSparkUtils.transformToDuplicateNames(headerForTool, scoringStrategy, opticalDuplicateFinder, sortedReadsForMarking, numReducers, markOpticalDups, mdArgs);
// Here we explicitly repartition the read names of the unmarked reads to match the partitioning of the original bam
final JavaRDD> repartitionedReadNames = namesOfNonDuplicates
@@ -273,7 +275,9 @@ public static JavaRDD mark(final JavaRDD reads, final SAMFil
finder,
numReducers,
mdArgs.dontMarkUnmappedMates,
- mdArgs.taggingPolicy);
+ mdArgs.taggingPolicy,
+ mdArgs
+ );
}
@@ -379,4 +383,20 @@ private boolean treatAsReadGroupOrdered(SAMFileHeader header, boolean treatUnsor
return false;
}
+ /**
+ * mode adjustments
+ * @return error messages
+ */
+ @Override
+ protected String[] customCommandLineValidation() {
+ if (markDuplicatesSparkArgumentCollection.useFlowFragments) {
+ ModeArgumentUtils.setArgValues(
+ getCommandLineParser(),
+ markDuplicatesSparkArgumentCollection.getFlowModeArgValues(),
+ MarkDuplicatesSparkArgumentCollection.FLOW_MD_MODE_LONG_NAME);
+ }
+ return null;
+ }
+
+
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtils.java
index e152c881aec..aa2332dd216 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtils.java
@@ -11,11 +11,13 @@
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
+import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.metrics.MetricsUtils;
import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.read.FlowBasedReadUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import org.broadinstitute.hellbender.utils.read.SAMRecordToGATKReadAdapter;
@@ -115,7 +117,7 @@ public String toString() {
* highest scoring as duplicates.
* (b) Determine which duplicates are optical duplicates and increase the overall count.
*/
- static JavaPairRDD, Integer> transformToDuplicateNames(final SAMFileHeader header, final MarkDuplicatesScoringStrategy scoringStrategy, final OpticalDuplicateFinder finder, final JavaRDD reads, final int numReducers, final boolean markOpticalDups) {
+ static JavaPairRDD, Integer> transformToDuplicateNames(final SAMFileHeader header, final MarkDuplicatesScoringStrategy scoringStrategy, final OpticalDuplicateFinder finder, final JavaRDD reads, final int numReducers, final boolean markOpticalDups, final MarkDuplicatesSparkArgumentCollection mdArgs) {
// we treat these specially and don't mark them as duplicates
final JavaRDD mappedReads = reads.filter(ReadFilterLibrary.MAPPED::test);
@@ -136,8 +138,8 @@ static JavaPairRDD, Integer> transformToDuplicateNames(final S
final GATKRead read = readWithIndex.getValue();
if (!(read.isSecondaryAlignment()||read.isSupplementaryAlignment())) {
PairedEnds fragment = (ReadUtils.readHasMappedMate(read)) ?
- MarkDuplicatesSparkRecord.newEmptyFragment(read, header, libraryIndex.getValue()) :
- MarkDuplicatesSparkRecord.newFragment(read, header, readWithIndex.getIndex(), scoringStrategy, libraryIndex.getValue());
+ MarkDuplicatesSparkRecord.newEmptyFragment(read, header, libraryIndex.getValue(), mdArgs) :
+ MarkDuplicatesSparkRecord.newFragment(read, header, readWithIndex.getIndex(), scoringStrategy, libraryIndex.getValue(), mdArgs);
out.add(new Tuple2<>(fragment.key(), fragment));
} else {
@@ -195,7 +197,7 @@ static JavaPairRDD, Integer> transformToDuplicateNames(final S
final JavaPairRDD> keyedPairs = pairedEnds.groupByKey(); //TODO evaluate replacing this with a smart aggregate by key.
- return markDuplicateRecords(keyedPairs, finder, markOpticalDups);
+ return markDuplicateRecords(keyedPairs, finder, markOpticalDups, mdArgs.FLOW_END_LOCATION_SIGNIFICANT, mdArgs.ENDS_READ_UNCERTAINTY);
}
/**
@@ -285,7 +287,8 @@ private static JavaPairRDD>> spanReadsByKey
*/
@SuppressWarnings("unchecked")
private static JavaPairRDD, Integer> markDuplicateRecords(final JavaPairRDD> keyedPairs,
- final OpticalDuplicateFinder finder, final boolean markOpticalDups) {
+ final OpticalDuplicateFinder finder, final boolean markOpticalDups,
+ final boolean handleFragmentEnds, final int flowEndUncert) {
return keyedPairs.flatMapToPair(keyedPair -> {
Iterable pairGroups = keyedPair._2();
@@ -301,8 +304,13 @@ private static JavaPairRDD, Integer> markDuplicateRecords(fina
//empty MarkDuplicatesSparkRecord signify that a pair has a mate somewhere else
// If there are any non-fragment placeholders at this site, mark everything as duplicates, otherwise compute the best score
if (Utils.isNonEmpty(fragments) && !Utils.isNonEmpty(emptyFragments)) {
- final Tuple2, Integer> bestFragment = handleFragments(fragments, finder);
- nonDuplicates.add(bestFragment);
+ if ( !handleFragmentEnds ) {
+ final Tuple2, Integer> bestFragment = handleFragments(fragments, finder);
+ nonDuplicates.add(bestFragment);
+ } else {
+ nonDuplicates.addAll(handleFragmentsWithEndPosition(fragments, finder, flowEndUncert));
+ }
+
}
if (Utils.isNonEmpty(pairs)) {
@@ -344,6 +352,93 @@ private static List,Integer>> handlePassthroughs(List, Integer>> handleFragmentsWithEndPosition(List duplicateFragmentGroup, OpticalDuplicateFinder finder, final int endUncert) {
+
+ // easy case? (there is only one member)
+ if (duplicateFragmentGroup.size() == 1) {
+ return Collections.singletonList(new Tuple2<>(new IndexPair<>(duplicateFragmentGroup.get(0).getName(), duplicateFragmentGroup.get(0).getPartitionIndex()), 0));
+ }
+
+ // this should only be called with FlowModeFragments
+ if (duplicateFragmentGroup.stream().filter(r -> !(r instanceof FlowModeFragment )).count() > 0 ) {
+ throw new IllegalArgumentException("handleFragmentsWithEndPosition currently only supports FlowModeFragment(s) in duplicateFragmentGroup");
+ }
+
+ // collect as flow mode fragments and sort on end to ensure consistency
+ final List flowDuplicateFragmentGroup = duplicateFragmentGroup.stream()
+ .map(r -> (FlowModeFragment)r)
+ .sorted(Comparator.comparingInt(FlowModeFragment::getEnd))
+ .collect(Collectors.toList());
+
+ // this will accumulate the primary from each subgroup
+ List, Integer>> output = new ArrayList<>();
+
+ // loop on fragments, break into subgroups
+ List subGroup = new LinkedList<>();
+ int subGroupMinEnd = 0;
+ int subGroupMaxEnd = 0;
+ for ( FlowModeFragment fragment : flowDuplicateFragmentGroup ) {
+
+ final int end = fragment.getEnd();
+
+ if ( subGroup.size() == 0 ) {
+ // first one?
+ subGroup.add(fragment);
+ if ( end != FlowBasedReadUtils.FLOW_BASED_INSIGNIFICANT_END) {
+ subGroupMinEnd = end - endUncert;
+ subGroupMaxEnd = end + endUncert;
+ }
+ } else if ( end == FlowBasedReadUtils.FLOW_BASED_INSIGNIFICANT_END) {
+ // insignificant end, simply accumulate
+ subGroup.add(fragment);
+ } else if ( subGroupMinEnd == 0 ) {
+ // first significant, make it dominate end range
+ subGroup.add(fragment);
+ subGroupMinEnd = end - endUncert;
+ subGroupMaxEnd = end + endUncert;
+ } else if ( end >= subGroupMinEnd && end <= subGroupMaxEnd ) {
+ // fits into existing group w/ proper end
+ subGroup.add(fragment);
+ subGroupMinEnd = Math.min(subGroupMinEnd, end - endUncert);
+ subGroupMaxEnd = Math.max(subGroupMaxEnd, end + endUncert);
+ } else {
+ // does not belong to subgroup, pick best from existing and start new
+ output.add(handleFragments(subGroup, finder));
+ subGroup.clear();
+ subGroup.add(fragment);
+ if ( end != FlowBasedReadUtils.FLOW_BASED_INSIGNIFICANT_END) {
+ subGroupMinEnd = end - endUncert;
+ subGroupMaxEnd = end + endUncert;
+ } else {
+ subGroupMinEnd = 0;
+ subGroupMaxEnd = 0;
+ }
+ }
+ }
+
+ // handle leftovers
+ if ( subGroup.size() != 0 )
+ output.add(handleFragments(subGroup, finder));
+
+ return output;
+ }
+
private static List, Integer>> handlePairs(final List pairs, final OpticalDuplicateFinder finder, final boolean markOpticalDups) {
// save ourselves the trouble when there are no optical duplicates to worry about
if (pairs.size() == 1) {
@@ -391,7 +486,7 @@ private static int countOpticalDuplicates(OpticalDuplicateFinder finder, List, Integer> handleFragments(List duplicateFragmentGroup, OpticalDuplicateFinder finder) {
return duplicateFragmentGroup.stream()
- .map(f -> (Fragment)f)
+ .map(f -> (TransientFieldPhysicalLocation)f)
.peek(f -> finder.addLocationInformation(f.getName(), f))
.max(PAIRED_ENDS_SCORE_COMPARATOR)
.map(best -> new Tuple2<>(new IndexPair<>(best.getName(), best.getPartitionIndex()), -1))
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFs.java
index 98c03b75179..3b358306704 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFs.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFs.java
@@ -29,6 +29,7 @@
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import java.util.*;
+import java.util.stream.Collectors;
/**
* Perform joint genotyping on one or more samples pre-called with HaplotypeCaller
@@ -259,7 +260,8 @@ public void onTraversalStart() {
intervals = hasUserSuppliedIntervals() ? intervalArgumentCollection.getIntervals(getBestAvailableSequenceDictionary()) :
Collections.emptyList();
- annotationEngine = new VariantAnnotatorEngine(makeVariantAnnotations(), dbsnp.dbsnp, Collections.emptyList(), false, keepCombined);
+ Collection variantAnnotations = makeVariantAnnotations();
+ annotationEngine = new VariantAnnotatorEngine(variantAnnotations, dbsnp.dbsnp, Collections.emptyList(), false, keepCombined);
merger = new ReferenceConfidenceVariantContextMerger(annotationEngine, getHeaderForVariants(), somaticInput, false, true);
@@ -268,7 +270,8 @@ public void onTraversalStart() {
vcfWriter = createVCFWriter(outputFile);
//create engine object
- gvcfEngine = new GenotypeGVCFsEngine(annotationEngine, genotypeArgs, includeNonVariants, inputVCFHeader);
+ final boolean keepSB = variantAnnotations.stream().map(a -> a.getClass().getSimpleName()).collect(Collectors.toList()).contains("StrandBiasBySample");
+ gvcfEngine = new GenotypeGVCFsEngine(annotationEngine, genotypeArgs, includeNonVariants, inputVCFHeader, keepSB);
//call initialize method in engine class that creates VCFWriter object and writes a header to it
vcfWriter = gvcfEngine.setupVCFWriter(defaultToolVCFHeaderLines, keepCombined, dbsnp, vcfWriter);
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsEngine.java
index ad280bce6f3..312a90872e7 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsEngine.java
@@ -66,21 +66,24 @@ public class GenotypeGVCFsEngine
final VCFHeader inputVCFHeader;
+ final boolean keepSB;
+
/**
* Create and initialize a new GenotypeGVCFsEngine given a collection of GenotypeGVCF arguments and a VCF header
- *
- * @param annotationEngine variantAnnotatorEngine with annotations to process already added
+ * @param annotationEngine variantAnnotatorEngine with annotations to process already added
* @param genotypeArgs command-line arguments for the GenotypeGVCFs caller
* @param includeNonVariants true to save INFO header names that require alt alleles
* @param inputVCFHeader header for the VCF
+ * @param keepSB keep SB attribute (STRAND_BIAS_BY_SAMPLE)
*/
public GenotypeGVCFsEngine(final VariantAnnotatorEngine annotationEngine, final GenotypeCalculationArgumentCollection genotypeArgs,
- final boolean includeNonVariants, final VCFHeader inputVCFHeader)
+ final boolean includeNonVariants, final VCFHeader inputVCFHeader, final boolean keepSB)
{
this.annotationEngine = annotationEngine;
this.genotypeArgs = genotypeArgs;
this.includeNonVariants = includeNonVariants;
this.inputVCFHeader = inputVCFHeader;
+ this.keepSB = keepSB;
initialize();
}
@@ -183,10 +186,10 @@ private VariantContext regenotypeVC(final VariantContext originalVC, final Refer
//don't count sites with no depth and no confidence towards things like AN and InbreedingCoeff
vcBuilder.genotypes(assignNoCallsAnnotationExcludedGenotypes(result.getGenotypes()));
VariantContext annotated = annotationEngine.annotateContext(vcBuilder.make(), features, ref, null, a -> true);
- return new VariantContextBuilder(annotated).genotypes(cleanupGenotypeAnnotations(result, false)).make();
+ return new VariantContextBuilder(annotated).genotypes(cleanupGenotypeAnnotations(result, false, keepSB)).make();
} else if (includeNonVariants) {
// For monomorphic sites we need to make sure e.g. the hom ref genotypes are created and only then are passed to the annotation engine.
- VariantContext preannotated = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, true)).make();
+ VariantContext preannotated = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, true, false)).make();
return annotationEngine.annotateContext(preannotated, features, ref, null, GenotypeGVCFsEngine::annotationShouldBeSkippedForHomRefSites);
} else {
return null;
@@ -429,10 +432,11 @@ public VariantContextWriter setupVCFWriter(Set defaultToolVCFHead
*
* @param vc the VariantContext with the Genotypes to fix
* @param createRefGTs if true we will also create proper hom ref genotypes since we assume the site is monomorphic
+ * @param keepSB keep value of SB attribute
* @return a new set of Genotypes
*/
@VisibleForTesting
-    static List<Genotype> cleanupGenotypeAnnotations(final VariantContext vc, final boolean createRefGTs) {
+    static List<Genotype> cleanupGenotypeAnnotations(final VariantContext vc, final boolean createRefGTs, final boolean keepSB) {
final GenotypesContext oldGTs = vc.getGenotypes();
final List recoveredGs = new ArrayList<>(oldGTs.size());
for ( final Genotype oldGT : oldGTs ) {
@@ -448,7 +452,9 @@ static List cleanupGenotypeAnnotations(final VariantContext vc, final
attrs.remove(GATKVCFConstants.MIN_DP_FORMAT_KEY);
}
- attrs.remove(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY);
+ if ( !keepSB ) {
+ attrs.remove(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY);
+ }
// update PGT for hom vars
if ( oldGT.isHomVar() && oldGT.hasExtendedAttribute(GATKVCFConstants.HAPLOTYPE_CALLER_PHASING_GT_KEY) ) {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java
index 83683e4f161..73a07a5dea4 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java
@@ -92,7 +92,7 @@ public static List getAlleleLengthListOfString(String rawDataString) {
return Arrays.asList(rawDataString.split(ALLELE_SPECIFIC_SPLIT_REGEX, -1)); //-1 to keep empty data
}
-    static String generateMissingDataWarning(final VariantContext vc, final Genotype g, final AlleleLikelihoods<GATKRead, Allele> likelihoods) {
+    public static String generateMissingDataWarning(final VariantContext vc, final Genotype g, final AlleleLikelihoods<GATKRead, Allele> likelihoods) {
final StringBuilder outString = new StringBuilder("Annotation will not be calculated at position " + vc.getContig() + ":" + vc.getStart() +
" and possibly subsequent");
if (!g.isCalled()) {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java
index 91403e9cf9f..13b003af94d 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java
@@ -2,32 +2,22 @@
import htsjdk.variant.variantcontext.Allele;
-import htsjdk.variant.variantcontext.Genotype;
-import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
-import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.mutable.MutableInt;
+import org.broadinstitute.barclay.argparser.Argument;
import org.apache.commons.lang3.tuple.Triple;
import org.broadinstitute.barclay.help.DocumentedFeature;
-import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.utils.MathUtils;
-import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.haplotype.EventMap;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
-import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
-import org.broadinstitute.hellbender.utils.read.CigarUtils;
-import org.broadinstitute.hellbender.utils.read.Fragment;
import org.broadinstitute.hellbender.utils.read.GATKRead;
-import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
-import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignment;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import java.util.*;
-import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
@@ -37,6 +27,11 @@
summary="Describe the complexity of an assembly region")
public class AssemblyComplexity implements JumboInfoAnnotation {
+ @Argument(fullName = "assembly-complexity-reference-mode",
+ doc="If enabled will treat the reference as the basis for assembly complexity as opposed to estimated germline haplotypes",
+ optional=true)
+ public boolean germlineMode = false;
+
public AssemblyComplexity() { }
@Override
@@ -44,8 +39,8 @@ public Map annotate(final ReferenceContext ref,
final FeatureContext features,
final VariantContext vc,
                                        final AlleleLikelihoods<GATKRead, Allele> likelihoods,
-                                       final AlleleLikelihoods<Fragment, Allele> fragmentLikelihoods,
-                                       final AlleleLikelihoods<Fragment, Haplotype> haplotypeLikelihoods) {
+                                       final AlleleLikelihoods<? extends Locatable, Allele> fragmentLikelihoods,
+                                       final AlleleLikelihoods<? extends Locatable, Haplotype> haplotypeLikelihoods) {
        final Triple<int[], int[], double[]> annotations = annotate(vc, haplotypeLikelihoods);
        final Map<String, Object> result = new HashMap<>();
@@ -88,13 +83,21 @@ public static Triple annotate(final VariantContext vc, f
.map(entry -> entry.getKey())
.collect(Collectors.toList());
- final List germlineHaplotypes = new ArrayList<>();
- germlineHaplotypes.add(haplotypesByDescendingSupport.get(0));
- if (haplotypesByDescendingSupport.size() > 1 && haplotypeSupportCounts.get(haplotypesByDescendingSupport.get(1)).intValue() >= haplotypeSupportCounts.get(haplotypesByDescendingSupport.get(0)).intValue()/2) {
- germlineHaplotypes.add(haplotypesByDescendingSupport.get(1));
+ final List germlineHaplotypes;
+ if (germlineMode) {
+ germlineHaplotypes = Collections.singletonList(haplotypeLikelihoods.getAllele(haplotypeLikelihoods.indexOfReference()));
+ } else {
+ germlineHaplotypes = new ArrayList<>();
+ germlineHaplotypes.add(haplotypesByDescendingSupport.get(0));
+ if (haplotypesByDescendingSupport.size() > 1 && haplotypeSupportCounts.get(haplotypesByDescendingSupport.get(1)).intValue() >= haplotypeSupportCounts.get(haplotypesByDescendingSupport.get(0)).intValue() / 2) {
+ germlineHaplotypes.add(haplotypesByDescendingSupport.get(1));
+ }
}
final int[] editDistances = IntStream.range(0, vc.getNAlleles() - 1).map(altAlleleIndex -> {
+ if (vc.getAlternateAllele(altAlleleIndex).isSymbolic() || vc.getAlternateAllele(altAlleleIndex).getBases()[0] == '*') {
+ return 0;
+ }
final Haplotype mostSupportedHaplotypeWithAllele = haplotypesByDescendingSupport.stream()
.filter(hap -> containsAltAllele(hap.getEventMap(), vc, altAlleleIndex))
.findFirst().get();
@@ -104,6 +107,9 @@ public static Triple annotate(final VariantContext vc, f
// measure which proportion of reads supporting each alt allele fit the most-supported haplotype for that allele
final double[] haplotypeDominance = IntStream.range(0, vc.getNAlleles() - 1).mapToDouble(altAlleleIndex -> {
+ if (vc.getAlternateAllele(altAlleleIndex).isSymbolic() || vc.getAlternateAllele(altAlleleIndex).getBases()[0] == '*') {
+ return 0;
+ }
final int[] counts = haplotypesByDescendingSupport.stream()
.filter(hap -> containsAltAllele(hap.getEventMap(), vc, altAlleleIndex))
.mapToInt(hap -> haplotypeSupportCounts.get(hap).intValue())
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/HaplotypeFilteringAnnotation.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/HaplotypeFilteringAnnotation.java
new file mode 100644
index 00000000000..2204c00cc9f
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/HaplotypeFilteringAnnotation.java
@@ -0,0 +1,47 @@
+package org.broadinstitute.hellbender.tools.walkers.annotator;
+
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.VariantContext;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.engine.FeatureContext;
+import org.broadinstitute.hellbender.engine.ReferenceContext;
+import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
+import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
+import org.broadinstitute.hellbender.utils.help.HelpConstants;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
+
+import java.util.*;
+
+/**
+ * Set of annotations meant to be reflective of HaplotypeFiltering operations that were applied in FlowBased HaplotypeCaller.
+ */
+@DocumentedFeature(groupName= HelpConstants.DOC_CAT_FLOW_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_FLOW_ANNOTATORS_SUMMARY,
+ summary="Summary of the haplotype filtering steps.")
+public class HaplotypeFilteringAnnotation implements JumboInfoAnnotation {
+
+
+ public HaplotypeFilteringAnnotation() {
+ }
+
+ @Override
+    public Map<String, Object> annotate(final ReferenceContext ref,
+                                        final FeatureContext features,
+                                        final VariantContext vc,
+                                        final AlleleLikelihoods<GATKRead, Allele> likelihoods,
+                                        final AlleleLikelihoods<? extends Locatable, Allele> fragmentLikelihoods,
+                                        final AlleleLikelihoods<? extends Locatable, Haplotype> haplotypeLikelihoods) {
+
+        final Map<String, Object> result = new HashMap<>();
+ result.put(GATKVCFConstants.HAPLOTYPES_BEFORE_FILTERING_KEY, haplotypeLikelihoods.alleles().size());
+ result.put(GATKVCFConstants.HAPLOTYPES_FILTERED_KEY, haplotypeLikelihoods.getFilteredHaplotypeCount());
+
+ return result;
+ }
+
+
+    @Override
+    public List<String> getKeyNames() {
+        return Arrays.asList(GATKVCFConstants.HAPLOTYPES_BEFORE_FILTERING_KEY, GATKVCFConstants.HAPLOTYPES_FILTERED_KEY);
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/JumboInfoAnnotation.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/JumboInfoAnnotation.java
index 0247d9b4ed0..7bde3ce4e6b 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/JumboInfoAnnotation.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/JumboInfoAnnotation.java
@@ -1,23 +1,14 @@
package org.broadinstitute.hellbender.tools.walkers.annotator;
import htsjdk.variant.variantcontext.Allele;
-import htsjdk.variant.variantcontext.Genotype;
-import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFCompoundHeaderLine;
-import htsjdk.variant.vcf.VCFFormatHeaderLine;
-import htsjdk.variant.vcf.VCFHeaderLine;
-import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
-import org.broadinstitute.hellbender.utils.read.Fragment;
import org.broadinstitute.hellbender.utils.read.GATKRead;
-import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;
-import java.util.ArrayList;
-import java.util.List;
import java.util.Map;
/**
@@ -28,9 +19,9 @@ public interface JumboInfoAnnotation extends VariantAnnotation{
default VCFCompoundHeaderLine.SupportedHeaderLineType annotationType() { return VCFCompoundHeaderLine.SupportedHeaderLineType.INFO; }
    Map<String, Object> annotate(final ReferenceContext ref,
-                                 final FeatureContext features,
-                                 final VariantContext vc,
-                                 final AlleleLikelihoods<GATKRead, Allele> likelihoods,
-                                 final AlleleLikelihoods<Fragment, Allele> fragmentLikelihoods,
-                                 final AlleleLikelihoods<Fragment, Haplotype> haplotypeLikelihoods);
+                                 final FeatureContext features,
+                                 final VariantContext vc,
+                                 final AlleleLikelihoods<GATKRead, Allele> likelihoods,
+                                 final AlleleLikelihoods<? extends Locatable, Allele> fragmentLikelihoods,
+                                 final AlleleLikelihoods<? extends Locatable, Haplotype> haplotypeLikelihoods);
}
\ No newline at end of file
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/RawGtCount.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/RawGtCount.java
new file mode 100644
index 00000000000..cbf34f767f5
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/RawGtCount.java
@@ -0,0 +1,120 @@
+package org.broadinstitute.hellbender.tools.walkers.annotator;
+
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.vcf.VCFCompoundHeaderLine;
+import htsjdk.variant.vcf.VCFStandardHeaderLines;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.engine.ReferenceContext;
+import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.ReducibleAnnotation;
+import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.ReducibleAnnotationData;
+import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
+import org.broadinstitute.hellbender.utils.help.HelpConstants;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
+import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;
+
+import java.util.*;
+
+@DocumentedFeature(groupName= HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, summary="Counts of genotypes w.r.t. the reference allele: 0/0, 0/*, */*, i.e. all alts lumped together")
+public class RawGtCount implements InfoFieldAnnotation, ReducibleAnnotation {
+ private static final String SEPARATOR = ",";
+
+ @Override
+ public String getPrimaryRawKey() { return GATKVCFConstants.RAW_GENOTYPE_COUNT_KEY; }
+
+ @Override
+ public boolean hasSecondaryRawKeys() {
+ return false;
+ }
+
+ @Override
+    public List<String> getSecondaryRawKeys() {
+ return null;
+ }
+
+ @Override
+    public Map<String, Object> annotateRawData(ReferenceContext ref, VariantContext vc, AlleleLikelihoods<GATKRead, Allele> likelihoods) {
+ return null;
+ }
+
+ @Override
+ @SuppressWarnings({"unchecked", "rawtypes"})//FIXME generics here blow up
+    public Map<String, Object> combineRawData(List<Allele> allelesList, List<ReducibleAnnotationData<?>> listOfRawData) {
+ ReducibleAnnotationData combinedData = new ReducibleAnnotationData(null);
+
+ for (final ReducibleAnnotationData currentValue : listOfRawData) {
+ parseRawDataString(currentValue);
+ combineAttributeMap(currentValue, combinedData);
+ }
+        final Map<String, Object> annotations = new HashMap<>();
+ String annotationString = makeRawAnnotationString(allelesList, combinedData.getAttributeMap());
+ annotations.put(getPrimaryRawKey(), annotationString);
+ return annotations;
+ }
+
+    private String makeRawAnnotationString(final List<Allele> vcAlleles, final Map<Allele, List<Integer>> perAlleleData) {
+ //TODO: We can't calculate the true hom ref count since there is no annotation on hom ref calls that we are combining
+ //TODO: For now it's better not to include the incorrect value of 0.
+ return "." + SEPARATOR + perAlleleData.get(Allele.NO_CALL).get(1) + SEPARATOR + perAlleleData.get(Allele.NO_CALL).get(2);
+ }
+
+    private void parseRawDataString(ReducibleAnnotationData<List<Integer>> myData) {
+        myData.putAttribute(Allele.NO_CALL, parseRawDataString(myData.getRawData()));
+    }
+
+    private List<Integer> parseRawDataString(String rawDataString) {
+ try {
+ final String[] parsed = rawDataString.trim().replaceAll(AnnotationUtils.BRACKET_REGEX, "").split(", *");
+ if (parsed.length != 3) {
+ throw new UserException.BadInput(String.format("Raw value for %s has %d values, expected 3. Annotation value is %s", GATKVCFConstants.RAW_GENOTYPE_COUNT_KEY, parsed.length, rawDataString));
+ }
+ final int homRefCount = Integer.parseInt(parsed[0]);
+ final int hetCount = Integer.parseInt(parsed[1]);
+ final int homVarCount = Integer.parseInt(parsed[2]);
+ return Arrays.asList(homRefCount, hetCount, homVarCount);
+ } catch (final NumberFormatException e) {
+ throw new UserException.BadInput("malformed " + GATKVCFConstants.RAW_GENOTYPE_COUNT_KEY + " annotation: " + rawDataString, e);
+ }
+ }
+
+    private void combineAttributeMap(ReducibleAnnotationData<List<Integer>> toAdd, ReducibleAnnotationData<List<Integer>> combined) {
+ if (combined.getAttribute(Allele.NO_CALL) != null) {
+ combined.putAttribute(Allele.NO_CALL, Arrays.asList(combined.getAttribute(Allele.NO_CALL).get(0) + toAdd.getAttribute(Allele.NO_CALL).get(0),
+ combined.getAttribute(Allele.NO_CALL).get(1) + toAdd.getAttribute(Allele.NO_CALL).get(1),
+ combined.getAttribute(Allele.NO_CALL).get(2) + toAdd.getAttribute(Allele.NO_CALL).get(2)));
+ } else {
+ combined.putAttribute(Allele.NO_CALL, toAdd.getAttribute(Allele.NO_CALL));
+ }
+ }
+
+ @Override
+    public Map<String, Object> finalizeRawData(VariantContext vc, VariantContext originalVC) {
+ return null;
+ }
+
+ @Override
+    public List<String> getKeyNames() {
+ return getRawKeyNames();
+ }
+
+ @Override
+    public List<VCFCompoundHeaderLine> getDescriptions() {
+ return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0)));
+ }
+
+ @Override
+    public List<VCFCompoundHeaderLine> getRawDescriptions() {
+        final List<VCFCompoundHeaderLine> lines = new ArrayList<>(1);
+ for (final String rawKey : getRawKeyNames()) {
+ lines.add(GATKVCFHeaderLines.getInfoLine(rawKey));
+ }
+ return lines;
+ }
+
+ @Override
+    public Map<String, Object> annotate(ReferenceContext ref, VariantContext vc, AlleleLikelihoods<GATKRead, Allele> likelihoods) {
+ return null;
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/StrandBiasBySample.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/StrandBiasBySample.java
index baddefa797b..a1fd6c829e5 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/StrandBiasBySample.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/StrandBiasBySample.java
@@ -68,6 +68,12 @@ public void annotate(final ReferenceContext ref,
Utils.nonNull(g);
Utils.nonNull(gb);
+ // Do not recalculate StrandBiasBySampleKey when likelihoods is null (in genotypeGVCF) and the variant
+ // already has StrandBiasTable
+ if ( g.hasExtendedAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY) && (likelihoods == null)) {
+ return;
+ }
+
if ( likelihoods == null || !g.isCalled() ) {
droppedElementLogger.warn(() -> AnnotationUtils.generateMissingDataWarning(vc, g, likelihoods));
return;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/StrandBiasTest.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/StrandBiasTest.java
index d65f1a6a7ff..327f0ad5408 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/StrandBiasTest.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/StrandBiasTest.java
@@ -6,6 +6,7 @@
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.InverseAllele;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.read.GATKRead;
@@ -120,9 +121,48 @@ else if (((ArrayList)g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_S
    public static int[][] getContingencyTable( final AlleleLikelihoods<GATKRead, Allele> likelihoods,
final VariantContext vc,
final int minCount) {
- return getContingencyTable(likelihoods, vc, minCount, likelihoods.samples());
+ if( likelihoods == null || vc == null) {
+ return null;
+ }
+ final Allele ref = vc.getReference();
+        final List<Allele> allAlts = vc.getAlternateAlleles();
+
+ return getContingencyTable(likelihoods, ref, allAlts, minCount);
+ }
+
+    public static int[][] getContingencyTable( final AlleleLikelihoods<GATKRead, Allele> likelihoods,
+                                               final Allele ref,
+                                               final List<Allele> alts,
+                                               final int minCount) {
+ return getContingencyTable(likelihoods, ref, alts, minCount, likelihoods.samples());
}
+
+ /**
+ * Generates a contingency table where the strand bias of an allele is estimated relative to all alleles
+ * (total coverage) in the location. Useful when the question is about strand bias of an allele rather
+ * than of a location (like in AlleleFiltering)
+ * @param likelihoods likelihood matrix
+ * @param ref reference allele
+ * @param alts alternative alleles
+ * @param minCount minimal count (pseudocount)
+ * @return 2x2 contingency table
+ */
+    public static int[][] getContingencyTableWrtAll( final AlleleLikelihoods<GATKRead, Allele> likelihoods,
+                                                     final Allele ref,
+                                                     final List<Allele> alts,
+                                                     final int minCount) {
+ int [][] table = getContingencyTable(likelihoods, ref, alts, minCount);
+ for (int i =0 ; i < ARRAY_DIM; i ++ ) {
+ for (int j = 1; j < ARRAY_DIM; j++) {
+ table[0][i] += table[j][i];
+ }
+ }
+ return table;
+ }
+
+
+
/**
Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this:
* fw rc
@@ -140,6 +180,15 @@ public static int[][] getContingencyTable( final AlleleLikelihoods<GATKRead, Allele> likelihoods,
                                               final VariantContext vc,
                                               final int minCount,
                                               final Collection<String> samples) {
+        final Allele ref = vc.getReference();
+        final List<Allele> allAlts = vc.getAlternateAlleles();
+ return getContingencyTable(likelihoods, ref, allAlts, minCount, samples);
+ }
+
+    private static int[][] getContingencyTable( final AlleleLikelihoods<GATKRead, Allele> likelihoods,
+                                                final Allele ref,
+                                                final List<Allele> allAlts,
+                                                final int minCount,
+                                                final Collection<String> samples) {
+
final int[][] table = new int[ARRAY_DIM][ARRAY_DIM];
for (final String sample : samples) {
@@ -155,6 +204,7 @@ public static int[][] getContingencyTable( final AlleleLikelihoods allAlts) {
- final boolean matchesRef = allele.equals(ref, true);
+ final boolean matchesRef = (ref instanceof InverseAllele) ? (ref).equals(allele) : allele.equals(ref, true);
final boolean matchesAnyAlt = allAlts.contains(allele);
if ( matchesRef || matchesAnyAlt ) {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/StrandOddsRatio.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/StrandOddsRatio.java
index 9340bd04435..b97541cd447 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/StrandOddsRatio.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/StrandOddsRatio.java
@@ -117,9 +117,10 @@
@DocumentedFeature(groupName=HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, summary="Strand bias estimated by the symmetric odds ratio test (SOR)")
public final class StrandOddsRatio extends StrandBiasTest implements StandardAnnotation {
- private static final double PSEUDOCOUNT = 1.0;
+ private static final double PSEUDOCOUNT = 1;
private static final int MIN_COUNT = 0;
+
@Override
protected Map calculateAnnotationFromGTfield(final GenotypesContext genotypes){
final int[][] tableFromPerSampleAnnotations = getTableFromSamples(genotypes, MIN_COUNT);
@@ -133,6 +134,8 @@ protected Map calculateAnnotationFromLikelihoods(final AlleleLik
return annotationForOneTable(calculateSOR(table));
}
+
+
/**
* Computes the SOR value of a table after augmentation. Based on the symmetric odds ratio but modified to take on
* low values when the reference +/- read count ratio is skewed but the alt count ratio is not. Natural log is taken
@@ -143,6 +146,7 @@ protected Map calculateAnnotationFromLikelihoods(final AlleleLik
* @param table The table before adding pseudocounts
* @return the SOR annotation value
*/
+
public static double calculateSOR(final int[][] table) {
final double t00 = table[0][0] + PSEUDOCOUNT;
final double t01 = table[0][1] + PSEUDOCOUNT;
@@ -151,12 +155,12 @@ public static double calculateSOR(final int[][] table) {
final double ratio = (t00 / t01) * (t11 / t10) + (t01 / t00) * (t10 / t11);
- final double refRatio = min(t00, t01)/ max(t00, t01);
- final double altRatio = min(t10, t11)/ max(t10, t11);
+ final double refRatio = min(t00, t01) / max(t00, t01);
+ final double altRatio = min(t10, t11) / max(t10, t11);
return Math.log(ratio) + Math.log(refRatio) - Math.log(altRatio);
}
-
+
/**
* Returns an annotation result given a sor
*
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java
index ae67a7a7a23..a3b3d0ade69 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java
@@ -303,7 +303,7 @@ public VariantContext annotateContext(final VariantContext vc,
final ReferenceContext ref,
                                          final AlleleLikelihoods<GATKRead, Allele> likelihoods,
                                          final Predicate<VariantAnnotation> addAnnot) {
- return annotateContext(vc, features, ref, likelihoods, Optional.empty(), Optional.empty(), addAnnot);
+ return annotateContext(vc, features, ref, likelihoods, Optional.empty(), Optional.empty(), Optional.empty(), addAnnot);
}
/**
@@ -320,7 +320,8 @@ public VariantContext annotateContext(final VariantContext vc,
final ReferenceContext ref,
                                          final AlleleLikelihoods<GATKRead, Allele> readLikelihoods,
                                          final Optional<AlleleLikelihoods<Fragment, Allele>> fragmentLikelihoods,
-                                          final Optional<AlleleLikelihoods<Fragment, Haplotype>> haplotypeLikelihoods,
+                                          final Optional<AlleleLikelihoods<Fragment, Haplotype>> fragmentHaplotypeLikelihoods,
+                                          final Optional<AlleleLikelihoods<GATKRead, Haplotype>> readHaplotypeAlleleLikelihoods,
                                          final Predicate<VariantAnnotation> addAnnot) {
Utils.nonNull(vc, "vc cannot be null");
Utils.nonNull(features, "features cannot be null");
@@ -328,10 +329,10 @@ public VariantContext annotateContext(final VariantContext vc,
// annotate genotypes, creating another new VC in the process
final VariantContextBuilder builder = new VariantContextBuilder(vc);
- builder.genotypes(annotateGenotypes(ref, features, vc, readLikelihoods, fragmentLikelihoods, haplotypeLikelihoods, addAnnot));
+ builder.genotypes(annotateGenotypes(ref, features, vc, readLikelihoods, fragmentLikelihoods, fragmentHaplotypeLikelihoods, addAnnot));
final VariantContext newGenotypeAnnotatedVC = builder.make();
- final Map infoAnnotMap = addInfoAnnotations(vc, features, ref, readLikelihoods, fragmentLikelihoods, haplotypeLikelihoods, addAnnot, newGenotypeAnnotatedVC);
+ final Map infoAnnotMap = addInfoAnnotations(vc, features, ref, readLikelihoods, fragmentLikelihoods, fragmentHaplotypeLikelihoods, readHaplotypeAlleleLikelihoods, addAnnot, newGenotypeAnnotatedVC);
// create a new VC with info and genotype annotations
final VariantContext annotated = builder.attributes(infoAnnotMap).make();
@@ -342,7 +343,8 @@ public VariantContext annotateContext(final VariantContext vc,
    private Map<String, Object> addInfoAnnotations(VariantContext vc, FeatureContext features, ReferenceContext ref,
                                                   AlleleLikelihoods<GATKRead, Allele> likelihoods, final Optional<AlleleLikelihoods<Fragment, Allele>> fragmentLikelihoods,
-                                                  final Optional<AlleleLikelihoods<Fragment, Haplotype>> haplotypeLikelihoods, Predicate<VariantAnnotation> addAnnot, VariantContext newGenotypeAnnotatedVC) {
+                                                  final Optional<AlleleLikelihoods<Fragment, Haplotype>> haplotypeLikelihoods, final Optional<AlleleLikelihoods<GATKRead, Haplotype>> readHaplotypeAlleleLikelihoods,
+                                                  Predicate<VariantAnnotation> addAnnot, VariantContext newGenotypeAnnotatedVC) {
        final Map<String, Object> infoAnnotMap = new LinkedHashMap<>(newGenotypeAnnotatedVC.getAttributes());
annotateExpressions(vc, features, ref, infoAnnotMap);
@@ -359,9 +361,12 @@ private Map addInfoAnnotations(VariantContext vc, FeatureContext
}
}
}
- if (fragmentLikelihoods.isPresent() && haplotypeLikelihoods.isPresent()) {
+ //TODO this whole thing should be refactored if this is useful or ripped out.
+ if ((fragmentLikelihoods.isPresent() && haplotypeLikelihoods.isPresent()) || readHaplotypeAlleleLikelihoods.isPresent()) {
jumboInfoAnnotations.stream()
- .map(annot -> annot.annotate(ref, features, vc, likelihoods, fragmentLikelihoods.get(), haplotypeLikelihoods.get()))
+ .map(annot -> annot.annotate(ref, features, vc, likelihoods,
+ fragmentLikelihoods.isPresent()? fragmentLikelihoods.get() : null,
+ haplotypeLikelihoods.isPresent()? haplotypeLikelihoods.get(): readHaplotypeAlleleLikelihoods.get()))
.forEach(infoAnnotMap::putAll);
}
return infoAnnotMap;
@@ -384,11 +389,20 @@ private GenotypesContext annotateGenotypes(final ReferenceContext ref,
final GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples());
for ( final Genotype genotype : vc.getGenotypes() ) {
final GenotypeBuilder gb = new GenotypeBuilder(genotype);
- genotypeAnnotations.stream().filter(addAnnot).forEach(annot -> annot.annotate(ref, vc, genotype, gb, likelihoods));
+ for ( final GenotypeAnnotation annotation : genotypeAnnotations) {
- if (fragmentLikelihoods.isPresent() && haplotypeLikelihoods.isPresent()) {
- jumboGenotypeAnnotations.stream().filter(addAnnot).forEach(annot ->
- annot.annotate(ref, features, vc, genotype, gb, likelihoods, fragmentLikelihoods.get(), haplotypeLikelihoods.get()));
+ genotypeAnnotations.stream().filter(addAnnot)
+ .forEach(annot -> annot.annotate(ref, vc, genotype, gb, likelihoods));
+
+ if (fragmentLikelihoods.isPresent() && haplotypeLikelihoods.isPresent()) {
+ jumboGenotypeAnnotations.stream().filter(addAnnot).forEach(annot ->
+ annot.annotate(ref, features, vc, genotype, gb, likelihoods, fragmentLikelihoods.get(), haplotypeLikelihoods.get()));
+ }
+
+
+ if (addAnnot.test(annotation)) {
+ annotation.annotate(ref, vc, genotype, gb, likelihoods);
+ }
}
genotypes.add(gb.make());
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/CycleSkipStatus.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/CycleSkipStatus.java
new file mode 100644
index 00000000000..d31c0dfa600
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/CycleSkipStatus.java
@@ -0,0 +1,56 @@
+package org.broadinstitute.hellbender.tools.walkers.annotator.flow;
+
+import com.google.common.annotations.VisibleForTesting;
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.VariantContext;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.engine.ReferenceContext;
+import org.broadinstitute.hellbender.tools.walkers.annotator.StandardMutectAnnotation;
+import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
+import org.broadinstitute.hellbender.utils.help.HelpConstants;
+import org.broadinstitute.hellbender.utils.logging.OneShotLogger;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+@DocumentedFeature(groupName=HelpConstants.DOC_CAT_FLOW_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_FLOW_ANNOTATORS_SUMMARY, summary="Cycle Skip Status Flow Annotation")
+public class CycleSkipStatus extends FlowAnnotatorBase implements StandardFlowBasedAnnotation {
+ private final Logger logger = LogManager.getLogger(CycleSkipStatus.class);
+
+ @Override
+ public Map annotate(ReferenceContext ref,
+ VariantContext vc,
+ AlleleLikelihoods likelihoods) {
+
+ final LocalContext localContext = new LocalContext(ref, vc, likelihoods, true);
+
+ if ( localContext.generateAnnotation ) {
+ indelClassify(vc, localContext);
+ isHmerIndel(vc, localContext);
+ getLeftMotif(vc, localContext);
+ getRightMotif(vc, localContext);
+ cycleSkip(vc, localContext);
+ }
+
+ return localContext.asAttributes();
+ }
+
+ @Override
+ public List getKeyNames() {
+
+ return Collections.singletonList(GATKVCFConstants.FLOW_CYCLESKIP_STATUS);
+ }
+
+ protected boolean isActualFlowOrderRequired() {
+ return true;
+ }
+
+
+}
+
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorBase.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorBase.java
new file mode 100644
index 00000000000..30531f9b153
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorBase.java
@@ -0,0 +1,476 @@
+package org.broadinstitute.hellbender.tools.walkers.annotator.flow;
+
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.VariantContext;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.broadinstitute.hellbender.engine.ReferenceContext;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.tools.walkers.annotator.InfoFieldAnnotation;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
+import org.broadinstitute.hellbender.utils.logging.OneShotLogger;
+import org.broadinstitute.hellbender.utils.read.FlowBasedKeyCodec;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
+import org.broadinstitute.hellbender.utils.read.FlowBasedRead;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * Base class for flow based annotations
+ *
+ * Some flow based annotations depend on the results from other annotations, regardless
+ * if they were called for by user arguments. To overcome this, this class contains all shared
+ * code to compute flow based annotations.
+ *
+ * Each (specific) annotation is implemented as a subclass of this class. It then invokes the
+ * annotation calculation methods contained here (the shared code) to compute its prerequisite
+ * and itself.
+ *
+ * State between such calls is kept in a LocalContext, a local class. It is there where annotations
+ * are accumulated as well.
+ */
+public abstract class FlowAnnotatorBase implements InfoFieldAnnotation {
+ private final static Logger logger = LogManager.getLogger(FlowAnnotatorBase.class);
+ protected final OneShotLogger flowMissingOneShotLogger = new OneShotLogger(FlowAnnotatorBase.class);
+
+
+ // additional constants
+ protected static final String C_INSERT = "ins";
+ protected static final String C_DELETE = "del";
+ protected static final String C_NA = "NA";
+ protected static final String C_CSS_CS = "cycle-skip";
+ protected static final String C_CSS_PCS = "possible-cycle-skip";
+ protected static final String C_CSS_NS = "non-skip";
+
+ protected static final String C_SNP = "snp";
+ protected static final String C_NON_H_MER = "non-h-indel";
+ protected static final String C_H_MER = "h-indel";
+
+
+ protected static final int MOTIF_SIZE = 5;
+ protected static final int GC_CONTENT_SIZE = 10;
+ protected static final int BASE_TYPE_COUNT = 4;
+
+ private List flowOrder;
+
+
+ protected class LocalContext {
+ ReferenceContext ref;
+ AlleleLikelihoods likelihoods;
+ String flowOrder;
+
+ List indel;
+ List indelLength;
+ List hmerIndelLength;
+ List leftMotif;
+ List rightMotif;
+
+ Map attributes = new LinkedHashMap<>();
+
+ boolean generateAnnotation;
+
+ protected LocalContext(final ReferenceContext ref,
+ final VariantContext vc,
+ final AlleleLikelihoods likelihoods,
+ final boolean needsRef) {
+ Utils.nonNull(vc);
+ if ( needsRef ) {
+ Utils.validate(ref == null || ref.hasBackingDataSource(), "-R (reference) argument must be provided");
+ }
+
+ // some annotators share results
+ this.ref = ref;
+ this.likelihoods = likelihoods;
+
+ // annotation will be generated by default
+ this.generateAnnotation = !needsRef || (ref != null);
+ }
+
+ protected Map asAttributes() {
+
+ if ( !generateAnnotation ) {
+ return Collections.emptyMap();
+ } else {
+ return attributes.entrySet().stream()
+ .filter(x -> getKeyNames().contains(x.getKey()))
+ .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
+ }
+ }
+ }
+
+ /*
+ This function establishes the flow order to be used for manipulating reads in flow space.
+
+    The most natural source for the flow order is the reads themselves. Alas reads will
+ not always be sourced from a bam file with a flow order. In these cases, we can either get it from a
+ --flow-order parameter (VariantAnnotator tool) or the default input source (bam)
+ */
+ private String establishFlowOrder(final LocalContext localContext, final AlleleLikelihoods likelihoods) {
+
+ // extract from a read
+ if ( (likelihoods != null) && (likelihoods.numberOfSamples() > 0) ) {
+ final List reads = likelihoods.sampleEvidence(0);
+ if ( reads.size() > 0 ) {
+ GATKRead read = reads.get(0);
+ if ( read instanceof FlowBasedRead ) {
+ return ((FlowBasedRead)read).getFlowOrder();
+ } else if ( flowOrder != null ) {
+                return establishReadGroupFlowOrder(localContext, read.getReadGroup());
+ }
+ }
+ }
+
+ // use global
+ return establishReadGroupFlowOrder(localContext, null);
+ }
+
+ /*
+ the flow order might be different for each read group.
+ provided flow order can be a list of [group:]flowOrder separated by a comma
+ no group: means all/rest
+ */
+ private String establishReadGroupFlowOrder(final LocalContext localContext, final String readGroup) {
+
+ // find flow order for the readGroup
+ if ( flowOrder != null ) {
+ for (String elem : flowOrder) {
+ final String toks[] = elem.split(":");
+ if (toks.length == 1) {
+ return toks[0];
+ } else if (toks[0].equals(readGroup)) {
+ return toks[1];
+ }
+ }
+ }
+
+ // if here, no flow order was found. may we use a default?
+ if ( isActualFlowOrderRequired() ) {
+ localContext.generateAnnotation = false;
+            flowMissingOneShotLogger.warn(this.getClass().getSimpleName() + " annotation will not be calculated, no '" + org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions.FLOW_ORDER_FOR_ANNOTATIONS + "' argument provided");
+ }
+
+ return FlowBasedRead.DEFAULT_FLOW_ORDER;
+ }
+
+ protected boolean isActualFlowOrderRequired() {
+ return false;
+ }
+
+ // "indel_classify" and "indel_length"
+ protected void indelClassify(final VariantContext vc, final LocalContext localContext) {
+
+ final List indelClassify = new LinkedList<>();
+ final List indelLength = new LinkedList<>();
+ final int refLength = vc.getReference().length();
+ for ( Allele a : vc.getAlleles() ) {
+ if ( !a.isReference() ) {
+ indelClassify.add(refLength == a.length() ? C_NA : (refLength < a.length() ? C_INSERT : C_DELETE));
+ if ( !isSpecial(a) && (a.length() != refLength) ) {
+ indelLength.add(Math.abs(refLength - a.length()));
+ } else {
+ indelLength.add(null);
+ }
+ }
+ }
+ localContext.attributes.put(GATKVCFConstants.FLOW_INDEL_CLASSIFY, localContext.indel = indelClassify);
+ localContext.attributes.put(GATKVCFConstants.FLOW_INDEL_LENGTH, localContext.indelLength = indelLength);
+ }
+
+ // "indel_classify" and "indel_length"
+ protected void variantType(final VariantContext vc, final LocalContext localContext) {
+ List alleles = vc.getAlternateAlleles();
+ boolean isSnp = true;
+ for (int i = 0; i < alleles.size(); i++){
+ if (isSpecial(alleles.get(i))){
+ continue;
+ }
+ if (!localContext.indel.get(i).equals(C_NA)){
+ isSnp=false;
+ }
+ }
+ if (isSnp){
+ localContext.attributes.put(GATKVCFConstants.FLOW_VARIANT_TYPE, C_SNP);
+ return;
+ }
+
+ boolean isHmer = true;
+ for (int i = 0; i < alleles.size(); i++){
+ if (isSpecial(alleles.get(i))){
+ continue;
+ }
+ if ((localContext.hmerIndelLength.get(i)==null) || (localContext.hmerIndelLength.get(i)==0)){
+ isHmer=false;
+ }
+            if ((localContext.hmerIndelLength.get(i)==1) && (localContext.indel.get(i).equals(C_DELETE))){ // new definition 1->0 has hmer indel 1
+ isHmer=false;
+ }
+
+ }
+
+ if (isHmer){
+ localContext.attributes.put(GATKVCFConstants.FLOW_VARIANT_TYPE, C_H_MER);
+ return;
+ }
+
+ localContext.attributes.put(GATKVCFConstants.FLOW_VARIANT_TYPE, C_NON_H_MER);
+ }
+
+
+ /*
+ This function determines if the vc is an hmer indel. If so, it marks it as such
+ */
+ protected void isHmerIndel(final VariantContext vc, final LocalContext localContext) {
+
+        // loop over all alleles
+ final List hmerIndelLength = new LinkedList<>();
+ final List hmerIndelNuc = new LinkedList<>();
+ final List rightMotif = new LinkedList<>();
+ for ( Allele a : vc.getAlleles() ) {
+
+ // skip reference
+ if ( a.isReference() ) {
+ continue;
+ }
+
+ // establish flow order
+ if ( localContext.flowOrder == null ) {
+ localContext.flowOrder = establishFlowOrder(localContext, localContext.likelihoods);}
+
+ // assume no meaningful result
+ hmerIndelLength.add(0);
+ hmerIndelNuc.add(null);
+ rightMotif.add(null);
+
+ // access alleles
+ final Allele ref = vc.getReference();
+ final Allele alt = a;
+ if ( isSpecial(a) )
+            continue;
+
+ // get byte before and after
+ final byte before = getReferenceNucleotide(localContext, vc.getStart() - 1);
+ final byte[] after = getReferenceHmerPlus(localContext, vc.getEnd() + 1, MOTIF_SIZE);
+
+ // build two haplotypes. add byte before and after
+ final byte[] refHap = buildHaplotype(before, ref.getBases(), after);
+ final byte[] altHap = buildHaplotype(before, alt.getBases(), after);
+
+ // convert to flow space
+ final int[] refKey = FlowBasedKeyCodec.baseArrayToKey(refHap, localContext.flowOrder);
+ final int[] altKey = FlowBasedKeyCodec.baseArrayToKey(altHap, localContext.flowOrder);
+ if ( refKey == null || altKey == null ) {
+ throw new GATKException("failed to generate key from reference or alternate sequence");
+ }
+
+ // key must be the same length to begin with
+ if ( refKey.length != altKey.length ) {
+ continue;
+ }
+
+ // key must have only one difference, which should not be between a zero and something
+ int diffIndex = -1;
+ int refBasesCountUpInclHmer = 0;
+ for ( int n = 0 ; n < refKey.length ; n++ ) {
+ // count ref bases up to and including difference key
+ if ( diffIndex < 0 ) {
+ refBasesCountUpInclHmer += refKey[n];
+ }
+
+ // is this the (one) difference key?
+ if ( refKey[n] != altKey[n] ) {
+ if ( diffIndex >= 0 ) {
+ // break away
+ diffIndex = -1;
+ break;
+ } else {
+ diffIndex = n;
+ }
+ }
+ }
+
+ // check if we've actually encountered a significant different key
+ if ( diffIndex < 0 ) {
+ continue;
+ }
+ if ( Math.max(refKey[diffIndex], altKey[diffIndex]) == 0 ) {
+ continue;
+ }
+
+ // if here, we found the difference. replace last element of list
+ final int length = Math.max(refKey[diffIndex], altKey[diffIndex]);
+ final byte nuc = localContext.flowOrder.getBytes()[diffIndex % localContext.flowOrder.length()];
+ hmerIndelLength.set(hmerIndelLength.size() - 1, length);
+ hmerIndelNuc.set(hmerIndelNuc.size() - 1, Character.toString((char)nuc));
+
+ // at this point, we can generate the right motif (for the hmer indel) as we already have the location
+ // of the hmer-indel and the bases following it
+ if ( a.length() != ref.length() ) {
+ final String motif = new String(Arrays.copyOfRange(refHap, refBasesCountUpInclHmer, Math.min(refHap.length, refBasesCountUpInclHmer + MOTIF_SIZE)));
+ rightMotif.set(rightMotif.size() - 1, motif);
+ }
+ }
+
+        // reflect back to attributes and context
+ localContext.attributes.put(GATKVCFConstants.FLOW_HMER_INDEL_LENGTH, localContext.hmerIndelLength = hmerIndelLength);
+ localContext.attributes.put(GATKVCFConstants.FLOW_HMER_INDEL_NUC, hmerIndelNuc);
+ localContext.rightMotif = rightMotif;
+ }
+
+ private byte[] buildHaplotype(final byte before, final byte[] bases, final byte[] after) {
+
+ final byte[] hap = new byte[1 + bases.length + after.length];
+
+ hap[0] = before;
+ System.arraycopy(bases, 0, hap, 1, bases.length);
+ System.arraycopy(after, 0, hap, 1 + bases.length, after.length);
+
+ return hap;
+ }
+
+ protected void getLeftMotif(final VariantContext vc, final LocalContext localContext) {
+
+ final int refLength = vc.getReference().length();
+ final List leftMotif = new LinkedList<>();
+
+ for ( Allele a : vc.getAlleles() ) {
+ if ( a.isReference() ) {
+ continue;
+ }
+
+ String motif = getRefMotif(localContext, vc.getStart() - MOTIF_SIZE, MOTIF_SIZE);
+ if ( a.length() != refLength ) {
+ motif = motif.substring(1) + vc.getReference().getBaseString().substring(0, 1);
+ }
+ leftMotif.add(motif);
+ }
+
+ localContext.attributes.put(GATKVCFConstants.FLOW_LEFT_MOTIF, localContext.leftMotif = leftMotif);
+ }
+
+ protected void getRightMotif(final VariantContext vc, final LocalContext localContext) {
+
+ final int refLength = vc.getReference().length();
+ final String motif = getRefMotif(localContext, vc.getStart() + refLength, MOTIF_SIZE);
+
+        // fill empty entries (non indel alleles)
+ for ( int i = 0 ; i < localContext.rightMotif.size() ; i++ ) {
+ if ( localContext.rightMotif.get(i) == null ) {
+ localContext.rightMotif.set(i, motif);
+ }
+ }
+
+ localContext.attributes.put(GATKVCFConstants.FLOW_RIGHT_MOTIF, localContext.rightMotif);
+ }
+
+ protected void gcContent(final VariantContext vc, final LocalContext localContext) {
+
+ final int begin = vc.getStart() - (GC_CONTENT_SIZE / 2);
+ final String seq = getRefMotif(localContext, begin + 1, GC_CONTENT_SIZE);
+ int gcCount = 0;
+ for ( byte b : seq.getBytes() ) {
+ if ( b == 'G' || b == 'C' ) {
+ gcCount++;
+ }
+ }
+ localContext.attributes.put(GATKVCFConstants.FLOW_GC_CONTENT, (float)gcCount / seq.length());
+ }
+
+ protected void cycleSkip(final VariantContext vc, final LocalContext localContext) {
+
+ // establish flow order
+ if ( localContext.flowOrder == null ) {
+ localContext.flowOrder = establishFlowOrder(localContext, localContext.likelihoods);
+ }
+
+ // loop over alleles
+ final List css = new LinkedList<>();
+ final int refLength = vc.getReference().length();
+ for ( Allele a : vc.getAlleles() ) {
+ if ( a.isReference() ) {
+ continue;
+ }
+
+ // meaningful only for non indels
+ if ( isSpecial(a) || (a.length() != refLength) ) {
+ css.add(C_NA);
+ } else {
+
+ // access alleles
+ final Allele ref = vc.getReference();
+ final Allele alt = a;
+
+ // convert to flow space
+ final int i = css.size(); // always working on the last
+ final int[] refKey = FlowBasedKeyCodec.baseArrayToKey((localContext.leftMotif.get(i) + ref.getBaseString() + localContext.rightMotif.get(i)).getBytes(), localContext.flowOrder);
+ final int[] altKey = FlowBasedKeyCodec.baseArrayToKey((localContext.leftMotif.get(i) + (!isSpecial(alt) ? alt.getBaseString() : "") + localContext.rightMotif.get(i)).getBytes(), localContext.flowOrder);
+
+ // assign initial css
+ String cssValue = (refKey.length != altKey.length) ? C_CSS_CS : C_CSS_NS;
+
+ // if same length (NS) then see if it is possible-cycle-skip
+            if ( C_CSS_NS.equals(cssValue) ) {
+ for ( int n = 0 ; n < refKey.length ; n++ ) {
+ if ( (refKey[n] == 0) ^ (altKey[n] == 0) ) {
+ cssValue = C_CSS_PCS;
+ break;
+ }
+ }
+ }
+
+ css.add(cssValue);
+ }
+ }
+
+ localContext.attributes.put(GATKVCFConstants.FLOW_CYCLESKIP_STATUS, css);
+ }
+
+ // get a single nucleoid from reference
+ private byte getReferenceNucleotide(final LocalContext localContext, final int start) {
+ final int index = start - localContext.ref.getWindow().getStart();
+ final byte[] bases = localContext.ref.getBases();
+ Utils.validIndex(index, bases.length);
+ return bases[index];
+ }
+
+ // get an hmer from reference plus a number of additional bases
+ private byte[] getReferenceHmerPlus(final LocalContext localContext, final int start, final int additional) {
+ int index = start - localContext.ref.getWindow().getStart();
+ final byte[] bases = localContext.ref.getBases();
+ Utils.validIndex(index, bases.length);
+
+ // get hmer
+ final StringBuilder sb = new StringBuilder();
+ final byte base0 = bases[index++];
+ sb.append((char)base0);
+ for ( ; index < bases.length && bases[index] == base0 ; index++ ) {
+ sb.append((char) bases[index]);
+ }
+
+ // get additional
+ for ( int n = 0 ; n < additional && index < bases.length ; n++, index++ ) {
+ sb.append((char) bases[index]);
+ }
+
+ return sb.toString().getBytes();
+ }
+ // get motif from reference
+ private String getRefMotif(final LocalContext localContext, final int start, final int length) {
+ final byte[] bases = localContext.ref.getBases();
+ final int startIndex = start - localContext.ref.getWindow().getStart();
+ final int endIndex = startIndex + length;
+ Utils.validIndex(startIndex, bases.length);
+ Utils.validIndex(endIndex-1, bases.length);
+ return new String(Arrays.copyOfRange(bases, startIndex, endIndex));
+ }
+
+ public void setFlowOrder(final List flowOrder) {
+ this.flowOrder = flowOrder;
+ }
+
+ private boolean isSpecial(Allele a) {
+ return a.equals(Allele.SPAN_DEL) || a.equals(Allele.NON_REF_ALLELE);
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/GcContent.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/GcContent.java
new file mode 100644
index 00000000000..a1e22f6fa06
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/GcContent.java
@@ -0,0 +1,45 @@
+package org.broadinstitute.hellbender.tools.walkers.annotator.flow;
+
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.VariantContext;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.engine.ReferenceContext;
+import org.broadinstitute.hellbender.tools.walkers.annotator.StandardMutectAnnotation;
+import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
+import org.broadinstitute.hellbender.utils.help.HelpConstants;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+@DocumentedFeature(groupName=HelpConstants.DOC_CAT_FLOW_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_FLOW_ANNOTATORS_SUMMARY, summary="GC Content Flow Annotation")
+public class GcContent extends FlowAnnotatorBase implements StandardFlowBasedAnnotation {
+ private final Logger logger = LogManager.getLogger(GcContent.class);
+
+ @Override
+ public Map annotate(ReferenceContext ref,
+ VariantContext vc,
+ AlleleLikelihoods likelihoods) {
+
+ final LocalContext localContext = new LocalContext(ref, vc, likelihoods, true);
+
+ if ( localContext.generateAnnotation ) {
+ gcContent(vc, localContext);
+ }
+
+ return localContext.asAttributes();
+ }
+
+ @Override
+ public List getKeyNames() {
+
+ return Collections.singletonList(GATKVCFConstants.FLOW_GC_CONTENT);
+ }
+
+
+}
+
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/HmerIndelLength.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/HmerIndelLength.java
new file mode 100644
index 00000000000..d88906444e8
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/HmerIndelLength.java
@@ -0,0 +1,46 @@
+package org.broadinstitute.hellbender.tools.walkers.annotator.flow;
+
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.VariantContext;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.engine.ReferenceContext;
+import org.broadinstitute.hellbender.tools.walkers.annotator.StandardMutectAnnotation;
+import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
+import org.broadinstitute.hellbender.utils.help.HelpConstants;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+@DocumentedFeature(groupName=HelpConstants.DOC_CAT_FLOW_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_FLOW_ANNOTATORS_SUMMARY, summary="Hmer Indel Length Flow Annotation")
+public class HmerIndelLength extends FlowAnnotatorBase implements StandardFlowBasedAnnotation {
+ private final Logger logger = LogManager.getLogger(HmerIndelLength.class);
+
+ @Override
+ public Map annotate(ReferenceContext ref,
+ VariantContext vc,
+ AlleleLikelihoods likelihoods) {
+
+ final LocalContext localContext = new LocalContext(ref, vc, likelihoods, true);
+
+ if ( localContext.generateAnnotation ) {
+ indelClassify(vc, localContext);
+ isHmerIndel(vc, localContext);
+ }
+
+ return localContext.asAttributes();
+ }
+
+ @Override
+ public List getKeyNames() {
+
+ return Collections.singletonList(GATKVCFConstants.FLOW_HMER_INDEL_LENGTH);
+ }
+
+
+}
+
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/HmerIndelNuc.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/HmerIndelNuc.java
new file mode 100644
index 00000000000..222f27e8925
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/HmerIndelNuc.java
@@ -0,0 +1,25 @@
+package org.broadinstitute.hellbender.tools.walkers.annotator.flow;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.tools.walkers.annotator.StandardMutectAnnotation;
+import org.broadinstitute.hellbender.utils.help.HelpConstants;
+import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
+
+import java.util.Collections;
+import java.util.List;
+
+@DocumentedFeature(groupName=HelpConstants.DOC_CAT_FLOW_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_FLOW_ANNOTATORS_SUMMARY, summary="Hmer Indel Nucleotide Flow Annotation")
+public class HmerIndelNuc extends HmerIndelLength implements StandardFlowBasedAnnotation {
+ private final Logger logger = LogManager.getLogger(HmerIndelNuc.class);
+
+ @Override
+ public List getKeyNames() {
+
+ return Collections.singletonList(GATKVCFConstants.FLOW_HMER_INDEL_NUC);
+ }
+
+
+}
+
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/HmerMotifs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/HmerMotifs.java
new file mode 100644
index 00000000000..99b1f8fff96
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/HmerMotifs.java
@@ -0,0 +1,46 @@
+package org.broadinstitute.hellbender.tools.walkers.annotator.flow;
+
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.VariantContext;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.engine.ReferenceContext;
+import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
+import org.broadinstitute.hellbender.utils.help.HelpConstants;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+@DocumentedFeature(groupName=HelpConstants.DOC_CAT_FLOW_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_FLOW_ANNOTATORS_SUMMARY, summary="Right Motif Flow Annotation")
+public class HmerMotifs extends FlowAnnotatorBase implements StandardFlowBasedAnnotation {
+
+ @Override
+ public Map annotate(ReferenceContext ref,
+ VariantContext vc,
+ AlleleLikelihoods likelihoods) {
+
+ final LocalContext localContext = new LocalContext(ref, vc, likelihoods, true);
+
+ if ( localContext.generateAnnotation ) {
+ getLeftMotif(vc, localContext);
+ indelClassify(vc, localContext);
+ isHmerIndel(vc, localContext);
+ getRightMotif(vc, localContext);
+ }
+
+ return localContext.asAttributes();
+ }
+
+ @Override
+ public List getKeyNames() {
+
+ return Arrays.asList(GATKVCFConstants.FLOW_LEFT_MOTIF, GATKVCFConstants.FLOW_RIGHT_MOTIF);
+ }
+
+
+}
+
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/IndelClassify.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/IndelClassify.java
new file mode 100644
index 00000000000..75ea4ac42bb
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/IndelClassify.java
@@ -0,0 +1,41 @@
+package org.broadinstitute.hellbender.tools.walkers.annotator.flow;
+
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.VariantContext;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.engine.ReferenceContext;
+import org.broadinstitute.hellbender.tools.walkers.annotator.StandardMutectAnnotation;
+import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
+import org.broadinstitute.hellbender.utils.help.HelpConstants;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
+
+import java.util.*;
+
+@DocumentedFeature(groupName=HelpConstants.DOC_CAT_FLOW_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_FLOW_ANNOTATORS_SUMMARY, summary="Indel Classify Flow Annotation")
+public class IndelClassify extends FlowAnnotatorBase implements StandardFlowBasedAnnotation {
+ private final Logger logger = LogManager.getLogger(IndelClassify.class);
+
+ @Override
+ public Map annotate(ReferenceContext ref,
+ VariantContext vc,
+ AlleleLikelihoods likelihoods) {
+
+ final LocalContext localContext = new LocalContext(ref, vc, likelihoods, false);
+
+ if ( localContext.generateAnnotation ) {
+ indelClassify(vc, localContext);
+ }
+ return localContext.asAttributes();
+ }
+
+ @Override
+ public List getKeyNames() {
+ return Collections.singletonList(GATKVCFConstants.FLOW_INDEL_CLASSIFY);
+ }
+
+
+}
+
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/IndelLength.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/IndelLength.java
new file mode 100644
index 00000000000..8c3dcbe96a0
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/IndelLength.java
@@ -0,0 +1,25 @@
+package org.broadinstitute.hellbender.tools.walkers.annotator.flow;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.tools.walkers.annotator.StandardMutectAnnotation;
+import org.broadinstitute.hellbender.utils.help.HelpConstants;
+import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
+
+import java.util.Collections;
+import java.util.List;
+
+@DocumentedFeature(groupName=HelpConstants.DOC_CAT_FLOW_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_FLOW_ANNOTATORS_SUMMARY, summary="Indel Length Flow Annotation")
+public class IndelLength extends IndelClassify implements StandardFlowBasedAnnotation {
+ private final Logger logger = LogManager.getLogger(IndelLength.class);
+
+ @Override
+ public List getKeyNames() {
+
+ return Collections.singletonList(GATKVCFConstants.FLOW_INDEL_LENGTH);
+ }
+
+
+}
+
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/StandardFlowBasedAnnotation.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/StandardFlowBasedAnnotation.java
new file mode 100644
index 00000000000..4d947ef7d57
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/StandardFlowBasedAnnotation.java
@@ -0,0 +1,9 @@
+package org.broadinstitute.hellbender.tools.walkers.annotator.flow;
+
+import org.broadinstitute.hellbender.tools.walkers.annotator.Annotation;
+
+/**
+ * This is a marker interface used to indicate which annotations are part of the standard flow based group
+ */
+public interface StandardFlowBasedAnnotation extends Annotation {
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/VariantType.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/VariantType.java
new file mode 100644
index 00000000000..32962073413
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/VariantType.java
@@ -0,0 +1,46 @@
+package org.broadinstitute.hellbender.tools.walkers.annotator.flow;
+
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.VariantContext;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.engine.ReferenceContext;
+import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
+import org.broadinstitute.hellbender.utils.help.HelpConstants;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+@DocumentedFeature(groupName= HelpConstants.DOC_CAT_FLOW_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_FLOW_ANNOTATORS_SUMMARY, summary="Variant type Flow Annotation")
+public class VariantType extends FlowAnnotatorBase implements StandardFlowBasedAnnotation {
+ private final Logger logger = LogManager.getLogger(org.broadinstitute.hellbender.tools.walkers.annotator.flow.IndelClassify.class);
+
+ @Override
+ public Map annotate(ReferenceContext ref,
+ VariantContext vc,
+ AlleleLikelihoods likelihoods) {
+
+ final LocalContext localContext = new LocalContext(ref, vc, likelihoods, true);
+
+ if ( localContext.generateAnnotation ) {
+ indelClassify(vc, localContext);
+ isHmerIndel(vc, localContext);
+ variantType(vc, localContext);
+ }
+
+ return localContext.asAttributes();
+ }
+
+ @Override
+ public List getKeyNames() {
+ return Collections.singletonList(GATKVCFConstants.FLOW_VARIANT_TYPE);
+ }
+
+
+}
+
+
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FeatureMapper.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FeatureMapper.java
new file mode 100644
index 00000000000..3845620294e
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FeatureMapper.java
@@ -0,0 +1,12 @@
+package org.broadinstitute.hellbender.tools.walkers.featuremapping;
+
+import org.broadinstitute.hellbender.engine.ReferenceContext;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+
+import java.util.function.Consumer;
+
+/**
+ * Strategy interface for locating features (e.g. SNVs) on a single read relative to the reference.
+ */
+public interface FeatureMapper {
+
+    /**
+     * Finds all features on the given read and feeds each one to the supplied consumer.
+     *
+     * @param read             the read to scan
+     * @param referenceContext reference bases spanning the read
+     * @param action           consumer invoked once per mapped feature
+     */
+    void forEachOnRead(GATKRead read, ReferenceContext referenceContext, Consumer<? super FlowFeatureMapper.MappedFeature> action);
+
+    /**
+     * Checks whether the read has no feature at the given reference position but would still
+     * pass this mapper's surrounding-context filter (i.e. it agrees with the reference there).
+     *
+     * @param read             the read to inspect
+     * @param referenceContext reference bases spanning the read
+     * @param start            reference coordinate to test
+     * @return true when the read matches the reference at {@code start} with sufficient context
+     */
+    boolean noFeatureButFilterAt(GATKRead read, ReferenceContext referenceContext, int start);
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapper.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapper.java
new file mode 100644
index 00000000000..ab6e2be4d91
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapper.java
@@ -0,0 +1,626 @@
+package org.broadinstitute.hellbender.tools.walkers.featuremapping;
+
+import htsjdk.samtools.*;
+import htsjdk.samtools.util.Locatable;
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.variantcontext.VariantContextBuilder;
+import htsjdk.variant.variantcontext.writer.Options;
+import htsjdk.variant.variantcontext.writer.VariantContextWriter;
+import htsjdk.variant.vcf.*;
+import org.apache.commons.math3.util.Precision;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.broadinstitute.barclay.argparser.*;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.cmdline.programgroups.FlowBasedProgramGroup;
+import org.broadinstitute.hellbender.engine.*;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.tools.FlowBasedArgumentCollection;
+import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.*;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
+import org.broadinstitute.hellbender.utils.read.FlowBasedReadUtils;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
+import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;
+import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
+import org.broadinstitute.hellbender.utils.variant.writers.GVCFWriter;
+import org.broadinstitute.hellbender.utils.haplotype.FlowBasedHaplotype;
+import org.broadinstitute.hellbender.utils.read.FlowBasedRead;
+
+import java.util.*;
+
+
+/**
+ * Finds specific features in reads, scores the confidence of each feature relative to the
+ * reference in each read and writes them into a VCF file.
+ *
+ * The notion of a 'feature' is left somewhat open. In the most general sense, it is a haplotype
+ * located in a specific location on the read. It is not necessarily defined as a deviation from the reference.
+ *
+ * A feature is indeed scored against the reference (in terms of its deviation).
+ *
+ * The current version implements a single type of feature: a SNP (aka SNV).
+ *
+ * Input
+ *
+ * Coordinate-sorted and indexed SAM/BAM/CRAM
+ *
+ * Output
+ *
+ * Coordinate-sorted and indexed VCF
+ *
+ * Usage examples
+ * Find SNVs in chromosome 20.
+ *
+ * gatk FlowFeatureMapper \
+ * -I input.bam \
+ * -L 20 \
+ * -O chr20_snv.vcf
+ *
+ * {@GATK.walkertype ReadWalker}
+ */
+@CommandLineProgramProperties(
+        summary = "Mapping features (flow space processing)",
+        oneLineSummary = "Map/find features in BAM file, output VCF. Initially mapping SNVs",
+        programGroup = FlowBasedProgramGroup.class
+)
+@DocumentedFeature
+@ExperimentalFeature
+public final class FlowFeatureMapper extends ReadWalker {
+
+    private static final Logger logger = LogManager.getLogger(FlowFeatureMapper.class);
+
+    // source tag stamped on every emitted VariantContext
+    private static final String VCB_SOURCE = "fm";
+
+    // custom INFO attribute keys written for each emitted feature
+    private static final String VCF_READ_NAME = "X_RN";
+    private static final String VCF_SCORE = "X_SCORE";
+    private static final String VCF_FLAGS = "X_FLAGS";
+    private static final String VCF_MAPQ = "X_MAPQ";
+    private static final String VCF_CIGAR = "X_CIGAR";
+    private static final String VCF_READ_COUNT = "X_READ_COUNT";
+    private static final String VCF_FILTERED_COUNT = "X_FILTERED_COUNT";
+    private static final String VCF_FC1 = "X_FC1";
+    private static final String VCF_FC2 = "X_FC2";
+    private static final String VCF_LENGTH = "X_LENGTH";
+    private static final String VCF_EDIST = "X_EDIST";
+    private static final String VCF_INDEX = "X_INDEX";
+
+    // floor probability substituted for an exact zero, so log10() stays finite
+    private static final double LOWEST_PROB = 0.0001;
+
+    @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
+            shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
+            doc = "File to which variants should be written")
+    public GATKPath outputVCF = null;
+
+    @ArgumentCollection
+    private FlowFeatureMapperArgumentCollection fmArgs = new FlowFeatureMapperArgumentCollection();
+
+    @Advanced
+    @Argument(fullName= AssemblyBasedCallerArgumentCollection.EMIT_REF_CONFIDENCE_LONG_NAME, shortName= AssemblyBasedCallerArgumentCollection.EMIT_REF_CONFIDENCE_SHORT_NAME, doc="Mode for emitting reference confidence scores (For Mutect2, this is a BETA feature)", optional = true)
+    public ReferenceConfidenceMode emitReferenceConfidence = ReferenceConfidenceMode.NONE;
+
+    @Advanced
+    @Argument(fullName = HaplotypeCallerArgumentCollection.GQ_BAND_LONG_NAME, shortName = HaplotypeCallerArgumentCollection.GQ_BAND_SHORT_NAME, doc= "Exclusive upper bounds for reference confidence GQ bands " +
+            "(must be in [1, 100] and specified in increasing order)", optional = true)
+    public List<Integer> GVCFGQBands = new ArrayList<>(70);
+    {
+        // default bands: 1..60, then 70, 80, 90, 99
+        for (int i=1; i<=60; ++i) {
+            GVCFGQBands.add(i);
+        }
+        GVCFGQBands.add(70); GVCFGQBands.add(80); GVCFGQBands.add(90); GVCFGQBands.add(99);
+    }
+
+    @Advanced
+    @Argument(fullName=HaplotypeCallerArgumentCollection.OUTPUT_BLOCK_LOWER_BOUNDS, doc = "Output the band lower bound for each GQ block regardless of the data it represents", optional = true)
+    public boolean floorBlocks = false;
+
+    @ArgumentCollection
+    public FlowBasedArgumentCollection fbargs = new FlowBasedArgumentCollection();
+
+    /** A read paired with the reference context it was traversed with; ordered by genomic position. */
+    protected static class ReadContext implements Comparable<ReadContext> {
+        final GATKRead read;
+        final ReferenceContext referenceContext;
+
+        ReadContext(final GATKRead read, final ReferenceContext referenceContext) {
+            this.read = read;
+            this.referenceContext = referenceContext;
+        }
+
+        @Override
+        public int compareTo(final ReadContext o) {
+            // order by contig, then start, then end
+            int delta = read.getContig().compareTo(o.read.getContig());
+            delta = (delta != 0) ? delta : Integer.compare(read.getStart(), o.read.getStart());
+            delta = (delta != 0) ? delta : Integer.compare(read.getEnd(), o.read.getEnd());
+            return delta;
+        }
+    }
+
+    /** A feature located on a read, with its scoring and bookkeeping fields; ordered by genomic position. */
+    protected static class MappedFeature implements Comparable<MappedFeature> {
+
+        GATKRead read;
+        FlowFeatureMapperArgumentCollection.MappingFeatureEnum type;
+        byte[] readBases;
+        byte[] refBases;
+        int readBasesOffset; // offset of read bases array
+        int start;           // location (on reference)
+        int offsetDelta;
+        double score;        // confidence score relative to reference (filled in later)
+        int readCount;       // number of queued reads containing this location
+        int filteredCount;   // of those, how many agree with the reference per the mapper's filter
+        int nonIdentMBasesOnRead;
+        int featuresOnRead;
+        int refEditDistance;
+        int index;           // ordinal index, from start of read, where the feature was found
+
+        public MappedFeature(GATKRead read, FlowFeatureMapperArgumentCollection.MappingFeatureEnum type, byte[] readBases,
+                             byte[] refBases, int readBasesOffset, int start, int offsetDelta) {
+            this.read = read;
+            this.type = type;
+            this.readBases = readBases;
+            this.refBases = refBases;
+            this.readBasesOffset = readBasesOffset;
+            this.start = start;
+            this.offsetDelta = offsetDelta;
+        }
+
+        /** Convenience factory for a single-base SNV feature. */
+        static MappedFeature makeSNV(GATKRead read, int offset, byte refBase, int start, int offsetDelta) {
+            byte[] readBases = {read.getBasesNoCopy()[offset]};
+            byte[] refBases = {refBase};
+            return new MappedFeature(
+                    read,
+                    FlowFeatureMapperArgumentCollection.MappingFeatureEnum.SNV,
+                    readBases,
+                    refBases,
+                    offset,
+                    start,
+                    offsetDelta);
+        }
+
+        @Override
+        public String toString() {
+            return "Feature{" +
+                    "read=" + read +
+                    ", type=" + type +
+                    ", readBases=" + Arrays.toString(readBases) +
+                    ", refBases=" + Arrays.toString(refBases) +
+                    ", readBasesOffset=" + readBasesOffset +
+                    ", start=" + start +
+                    '}';
+        }
+
+        @Override
+        public int compareTo(final MappedFeature o) {
+            // order by contig, then start; Integer.compare avoids subtraction overflow
+            final int delta = this.read.getContig().compareTo(o.read.getContig());
+            if ( delta != 0 ) {
+                return delta;
+            } else {
+                return Integer.compare(this.start, o.start);
+            }
+        }
+    }
+
+    // locals
+    private VariantContextWriter vcfWriter;
+    private final PriorityQueue<MappedFeature> featureQueue = new PriorityQueue<>();
+    private final PriorityQueue<ReadContext> readQueue = new PriorityQueue<>();
+    private FeatureMapper mapper;
+
+    @Override
+    public void onTraversalStart() {
+        super.onTraversalStart();
+        mapper = buildMapper();
+
+        // enforce requirement for sorted input
+        if ( getHeaderForReads().getSortOrder() != SAMFileHeader.SortOrder.coordinate ) {
+            throw new IllegalArgumentException("input file must be coordinated sorted");
+        }
+
+        // open output vcf
+        // The HC engine will make the right kind (VCF or GVCF) of writer for us
+        final SAMSequenceDictionary sequenceDictionary = getHeaderForReads().getSequenceDictionary();
+        vcfWriter = makeVCFWriter(outputVCF, sequenceDictionary, createOutputVariantIndex, createOutputVariantMD5, outputSitesOnlyVCFs);
+        vcfWriter.writeHeader(makeVCFHeader(sequenceDictionary, getDefaultToolVCFHeaderLines()));
+    }
+
+    @Override
+    public void closeTool() {
+        // drain any features still pending before shutting down the writer
+        flushQueue(null, null);
+        super.closeTool();
+        if ( vcfWriter != null ) {
+            vcfWriter.close();
+        }
+    }
+
+    /**
+     * Creates the output writer; wraps it in a GVCFWriter when GVCF emission is requested.
+     *
+     * @param outputVCF                path of output file
+     * @param readsDictionary          sequence dictionary taken from the reads header
+     * @param createOutputVariantIndex whether to index on the fly
+     * @param createOutputVariantMD5   whether to write an MD5 alongside
+     * @param sitesOnlyMode            whether to drop genotypes from output
+     * @return a ready-to-use VariantContextWriter
+     */
+    public VariantContextWriter makeVCFWriter( final GATKPath outputVCF, final SAMSequenceDictionary readsDictionary,
+                                               final boolean createOutputVariantIndex, final boolean createOutputVariantMD5,
+                                               final boolean sitesOnlyMode ) {
+        Utils.nonNull(outputVCF);
+        Utils.nonNull(readsDictionary);
+
+        final List<Options> options = new ArrayList<>(2);
+        if (createOutputVariantIndex) {options.add(Options.INDEX_ON_THE_FLY);}
+        if (sitesOnlyMode) {options.add(Options.DO_NOT_WRITE_GENOTYPES);}
+
+        VariantContextWriter writer = GATKVariantContextUtils.createVCFWriter(
+                outputVCF.toPath(),
+                readsDictionary,
+                createOutputVariantMD5,
+                options.toArray(new Options[options.size()])
+        );
+
+        if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) {
+            try {
+                writer = new GVCFWriter(writer, new ArrayList<Number>(GVCFGQBands), floorBlocks);
+            } catch ( IllegalArgumentException e ) {
+                throw new CommandLineException.BadArgumentValue("GQBands", "are malformed: " + e.getMessage());
+            }
+        }
+
+        return writer;
+    }
+
+    /**
+     * Builds the VCF header: standard caller lines plus this tool's custom INFO attributes.
+     *
+     * @param sequenceDictionary     dictionary to stamp on the header
+     * @param defaultToolHeaderLines default tool header lines to include
+     * @return the assembled VCFHeader
+     */
+    public VCFHeader makeVCFHeader(final SAMSequenceDictionary sequenceDictionary, final Set<VCFHeaderLine> defaultToolHeaderLines ) {
+        final Set<VCFHeaderLine> headerInfo = new HashSet<>();
+        headerInfo.addAll(defaultToolHeaderLines);
+
+        // all callers need to add these standard annotation header lines
+        headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_COUNT_KEY));
+        headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY));
+
+        // all callers need to add these standard FORMAT field header lines
+        VCFStandardHeaderLines.addStandardFormatLines(headerInfo, true,
+                VCFConstants.GENOTYPE_KEY,
+                VCFConstants.GENOTYPE_QUALITY_KEY,
+                VCFConstants.DEPTH_KEY,
+                VCFConstants.GENOTYPE_PL_KEY);
+
+        // add our own headers
+        headerInfo.add(new VCFInfoHeaderLine(VCF_READ_NAME, 1, VCFHeaderLineType.String, "Read name"));
+        headerInfo.add(new VCFInfoHeaderLine(VCF_SCORE, 1, VCFHeaderLineType.Float, "Mapping score"));
+        headerInfo.add(new VCFInfoHeaderLine(VCF_FLAGS, 1, VCFHeaderLineType.Integer, "Read flags"));
+        headerInfo.add(new VCFInfoHeaderLine(VCF_MAPQ, 1, VCFHeaderLineType.Integer, "Read mapqe"));
+        headerInfo.add(new VCFInfoHeaderLine(VCF_CIGAR, 1, VCFHeaderLineType.String, "Read CIGAR"));
+        headerInfo.add(new VCFInfoHeaderLine(VCF_READ_COUNT, 1, VCFHeaderLineType.Integer, "Number of reads containing this location"));
+        headerInfo.add(new VCFInfoHeaderLine(VCF_FILTERED_COUNT, 1, VCFHeaderLineType.Integer, "Number of reads containing this location that agree with reference according to fitler"));
+        headerInfo.add(new VCFInfoHeaderLine(VCF_FC1, 1, VCFHeaderLineType.Integer, "Number of M bases different on read from references"));
+        headerInfo.add(new VCFInfoHeaderLine(VCF_FC2, 1, VCFHeaderLineType.Integer, "Number of features before score threshold filter"));
+        headerInfo.add(new VCFInfoHeaderLine(VCF_LENGTH, 1, VCFHeaderLineType.Integer, "Read length"));
+        headerInfo.add(new VCFInfoHeaderLine(VCF_EDIST, 1, VCFHeaderLineType.Integer, "Read Levenshtein edit distance from reference"));
+        headerInfo.add(new VCFInfoHeaderLine(VCF_INDEX, 1, VCFHeaderLineType.Integer, "Ordinal index, from start of the read, where the feature was found"));
+        for ( String name : fmArgs.copyAttr ) {
+            headerInfo.add(new VCFInfoHeaderLine(fmArgs.copyAttrPrefix + name, 1, VCFHeaderLineType.String, "copy-attr: " + name));
+        }
+
+        final VCFHeader vcfHeader = new VCFHeader(headerInfo);
+        vcfHeader.setSequenceDictionary(sequenceDictionary);
+        return vcfHeader;
+    }
+
+    @Override
+    public void apply(final GATKRead read, final ReferenceContext referenceContext, final FeatureContext featureContext) {
+
+        // include dups?
+        if ( read.isDuplicate() && !fmArgs.includeDupReads ) {
+            return;
+        }
+
+        // include supplementary alignments?
+        if ( read.isSupplementaryAlignment() && !fmArgs.keepSupplementaryAlignments ) {
+            return;
+        }
+
+        // flush queues up to this read
+        flushQueue(read, referenceContext);
+
+        // find features in read
+        mapper.forEachOnRead(read, referenceContext, fr -> {
+            if ( logger.isDebugEnabled() ) {
+                logger.debug("fr: " + fr);
+            }
+
+            // score the feature
+            fr.score = scoreFeature(fr);
+
+            // emit feature if filters in
+            if ( filterFeature(fr) ) {
+                featureQueue.add(fr);
+            }
+        });
+    }
+
+    /**
+     * Emits queued features and drops queued reads that can no longer overlap future features.
+     * A null read flushes everything (used at end of traversal).
+     */
+    private void flushQueue(final GATKRead read, final ReferenceContext referenceContext) {
+
+        // emit all?
+        if ( read == null ) {
+            while ( featureQueue.size() != 0 ) {
+                final MappedFeature fr = featureQueue.poll();
+                enrichFeature(fr);
+                emitFeature(fr);
+            }
+        } else {
+            // enter read into the queue
+            readQueue.add(new ReadContext(read, referenceContext));
+
+            // emit all features that start before this read
+            while ( featureQueue.size() != 0 ) {
+                MappedFeature fr = featureQueue.peek();
+                if ( !fr.read.getContig().equals(read.getContig())
+                        || (fr.start < read.getStart()) ) {
+                    fr = featureQueue.poll();
+                    enrichFeature(fr);
+                    emitFeature(fr);
+                }
+                else {
+                    break;
+                }
+            }
+
+            // remove all reads that start before this read
+            while ( readQueue.size() != 0 ) {
+                ReadContext rc = readQueue.peek();
+
+                if ( !rc.read.getContig().equals(read.getContig())
+                        || (rc.read.getEnd() < read.getStart()) ) {
+                    rc = readQueue.poll();
+                }
+                else {
+                    break;
+                }
+            }
+        }
+    }
+
+    /** Fills in readCount/filteredCount by scanning the queued reads overlapping the feature's locus. */
+    private void enrichFeature(final MappedFeature fr) {
+
+        // loop on queued reads, count and check if should be counted as filtered
+        final Locatable loc = new SimpleInterval(fr.read.getContig(), fr.start, fr.start);
+        for ( ReadContext rc : readQueue ) {
+            if ( rc.read.contains(loc) ) {
+                fr.readCount++;
+                if ( mapper.noFeatureButFilterAt(rc.read, rc.referenceContext, fr.start) ) {
+                    fr.filteredCount++;
+                }
+            }
+        }
+    }
+
+    /**
+     * Scores a feature as the difference between the read-haplotype and reference-haplotype
+     * flow likelihoods. Returns -1 when the clipped flow read is invalid.
+     */
+    private double scoreFeature(final MappedFeature fr) {
+
+        // build haplotypes
+        final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), fr.read);
+        final FlowBasedHaplotype[] haplotypes = buildHaplotypes(fr, rgInfo.flowOrder);
+
+        // create flow read
+        final FlowBasedRead flowRead = new FlowBasedRead(fr.read, rgInfo.flowOrder,
+                rgInfo.maxClass, fbargs);
+        final int diffLeft = haplotypes[0].getStart() - flowRead.getStart() + fr.offsetDelta;
+        final int diffRight = flowRead.getEnd() - haplotypes[0].getEnd();
+        flowRead.applyBaseClipping(Math.max(0, diffLeft), Math.max(diffRight, 0), false);
+
+        if ( !flowRead.isValid() ) {
+            return -1;
+        }
+
+        // compute alternative score
+        final int hapKeyLength = Math.min(haplotypes[0].getKeyLength(), haplotypes[1].getKeyLength());
+        final double readScore = computeLikelihoodLocal(flowRead, haplotypes[0], hapKeyLength, false);
+        final double refScore = computeLikelihoodLocal(flowRead, haplotypes[1], hapKeyLength, false);
+        double score = readScore - refScore;
+        if ( !Double.isNaN(fmArgs.limitScore) ) {
+            score = Math.min(score, fmArgs.limitScore);
+        }
+
+        if ( ((Double.isNaN(score) || (score < 0)) && fmArgs.debugNegatives)
+                || (fmArgs.debugReadName != null && fmArgs.debugReadName.contains(fr.read.getName())) ) {
+            logger.info("**** debug read: " + fr.read);
+            logger.info("readBases: " + fr.read.getBasesString());
+            logger.info("flowRead: " + flowRead);
+            logger.info("flowBases: " + flowRead.getBasesString());
+            logger.info("flowOrder: " + flowRead.getFlowOrder());
+            logger.info("flowKey: " + flowRead.getKeyLength() + " " + Arrays.toString(flowRead.getKey()));
+            logger.info("readHaplotype: " + haplotypes[0]);
+            logger.info("readHapKey: " + haplotypes[0].getKeyLength() + " " + Arrays.toString(haplotypes[0].getKey()));
+            computeLikelihoodLocal(flowRead, haplotypes[0], hapKeyLength, true);
+            logger.info("refrHaplotype: " + haplotypes[1]);
+            logger.info("refrHapKey: " + haplotypes[1].getKeyLength() + " " + Arrays.toString(haplotypes[1].getKey()));
+            computeLikelihoodLocal(flowRead, haplotypes[1], hapKeyLength, true);
+            logger.info("score: " + score);
+
+            // analyze read
+            final FlowBasedRead flowRead2 = new FlowBasedRead(fr.read, rgInfo.flowOrder, rgInfo.maxClass, fbargs);
+            final int[] key2 = flowRead2.getKey();
+            for ( int i = 0 ; i < key2.length ; i++ ) {
+                final double p1 = flowRead2.getProb(i, key2[i]);
+                for ( int j = 0 ; j < rgInfo.maxClass ; j++ ) {
+                    final double p2 = flowRead2.getProb(i, j);
+                    if ( p2 > p1 )
+                        logger.info(String.format("prob at %s key[%d]=%d, %f is lower than at %d which is %f",
+                                flowRead2.getName(), i, key2[i], p1, j, p2));
+                }
+            }
+        }
+
+        // clamp negative scores to zero unless asked to keep them; -1.0 (invalid read) passes through
+        if ( score < 0 && !fmArgs.keepNegatives && score != -1.0 ) {
+            score = 0;
+        }
+
+        return score;
+    }
+
+    /**
+     * Computes the log10 likelihood of the read against the haplotype in flow space.
+     *
+     * @param read         clipped flow-based read
+     * @param haplotype    haplotype to score against
+     * @param hapKeyLength number of haplotype key positions usable for scoring
+     * @param debug        when true, logs a per-hmer breakdown of the computation
+     * @return sum of log10 per-hmer probabilities (zero probabilities floored at LOWEST_PROB)
+     */
+    public static double computeLikelihoodLocal(final FlowBasedRead read, final FlowBasedHaplotype haplotype, final int hapKeyLength, final boolean debug) {
+
+        // align the read's first flow to the haplotype's flow order
+        final byte[] flowOrder = haplotype.getFlowOrderArray();
+        final byte readFlowOrder0 = read.getFlowOrderArray()[0];
+        int startingPoint = 0;
+        for (int i = 0; i < flowOrder.length; i++) {
+            if (flowOrder[i] == readFlowOrder0) {
+                startingPoint = i;
+                break;
+            }
+        }
+        final int[] key = haplotype.getKey();
+
+        // debug support
+        StringBuilder debugMessage = null;
+        if ( debug )
+            debugMessage = new StringBuilder(Integer.toString(startingPoint) + " hmer prob |");
+        double result = 0 ;
+        for (int i = 0; i < read.getKeyLength(); i++) {
+            int index = i + startingPoint;
+            double prob = 0;
+            int locationToFetch = 0;
+            if ( index < hapKeyLength ) {
+                locationToFetch = Math.min(key[index] & 0xff, read.getMaxHmer() + 1);
+                prob = read.getProb(i, locationToFetch);
+            } else {
+                if ( debug ) {
+                    debugMessage.append(" clip");
+                }
+                break;
+            }
+            if ( Precision.equals(prob, 0.0) ) {
+                prob = LOWEST_PROB;
+            }
+            result += Math.log10(prob);
+
+            if ( debug ) {
+                debugMessage.append(String.format(" %d %.4f", locationToFetch, prob));
+            }
+        }
+
+        if ( debug ) {
+            debugMessage.append(" | " + result);
+            logger.info("debugMessage: " + debugMessage);
+        }
+
+        return result;
+    }
+
+    /**
+     * Builds the alternate (read) and reference flow haplotypes around a feature.
+     *
+     * @return array of two haplotypes: [0] = alternate (from read bases), [1] = reference
+     */
+    private FlowBasedHaplotype[] buildHaplotypes(final MappedFeature fr, final String flowOrder) {
+
+        // build bases for flow haplotypes
+        // NOTE!!!: this code assumes length of feature on read and reference is the same
+        // this is true for SNP but not for INDELs - it will have to be re-written!
+        // TODO: write for INDEL
+        final byte[] bases = fr.read.getBasesNoCopy();
+        int offset = fr.readBasesOffset;
+        int refStart = fr.start;
+        int refModOfs = 0;
+        if ( offset > 0 ) {
+            // reach into hmer before
+            offset--;
+            refModOfs++;
+            refStart--;
+
+            // extend until start of hmer
+            final byte hmerBase = bases[offset];
+            while ( offset > 0 && bases[offset-1] == hmerBase ) {
+                offset--;
+                refModOfs++;
+                refStart--;
+            }
+        }
+        final byte[] sAltBases = Arrays.copyOfRange(bases, offset, bases.length);
+        final byte[] sRefBases = Arrays.copyOf(sAltBases, sAltBases.length);
+        System.arraycopy(fr.refBases, 0, sRefBases, refModOfs, fr.refBases.length);
+
+        // construct haplotypes
+        final SimpleInterval genomeLoc = new SimpleInterval(fr.read.getContig(), refStart, refStart + sAltBases.length - 1);
+        final Cigar cigar = new Cigar();
+        cigar.add(new CigarElement(sAltBases.length, CigarOperator.M));
+        final Haplotype altHaplotype = new Haplotype(sAltBases, false);
+        final Haplotype refHaplotype = new Haplotype(sRefBases, true);
+        altHaplotype.setGenomeLocation(genomeLoc);
+        refHaplotype.setGenomeLocation(genomeLoc);
+        altHaplotype.setCigar(cigar);
+        refHaplotype.setCigar(cigar);
+
+        // prepare flow based haplotypes
+        final FlowBasedHaplotype[] result = {
+                new FlowBasedHaplotype(altHaplotype, flowOrder),
+                new FlowBasedHaplotype(refHaplotype, flowOrder)
+        };
+
+        // return
+        return result;
+    }
+
+    /** @return true when the feature passes the NaN/min/max score filters and should be emitted */
+    private boolean filterFeature(final MappedFeature fr) {
+
+        if ( fmArgs.excludeNaNScores && Double.isNaN(fr.score) ) {
+            return false;
+        } else if ( fr.score > fmArgs.maxScore ) {
+            return false;
+        } else if ( fr.score < fmArgs.minScore ) {
+            return false;
+        }
+
+        return true;
+    }
+
+    /** Converts a feature into a VariantContext and writes it to the output VCF. */
+    private void emitFeature(final MappedFeature fr) {
+
+        // create alleles
+        final Collection<Allele> alleles = new LinkedList<>();
+        alleles.add(Allele.create(fr.readBases, false));
+        alleles.add(Allele.create(fr.refBases, true));
+
+        // create variant context builder
+        final VariantContextBuilder vcb = new VariantContextBuilder(
+                VCB_SOURCE,
+                fr.read.getContig(),
+                fr.start,
+                fr.start + fr.refBases.length - 1,
+                alleles);
+
+        // copy attributes
+        vcb.attribute(VCF_READ_NAME, fr.read.getName());
+        vcb.attribute(VCF_SCORE, String.format("%.5f", fr.score));
+        vcb.attribute(VCF_FLAGS, fr.read.getFlags());
+        vcb.attribute(VCF_MAPQ, fr.read.getMappingQuality());
+        vcb.attribute(VCF_CIGAR, fr.read.getCigar().toString());
+        vcb.attribute(VCF_READ_COUNT, fr.readCount);
+        vcb.attribute(VCF_FILTERED_COUNT, fr.filteredCount);
+        vcb.attribute(VCF_FC1, fr.nonIdentMBasesOnRead);
+        vcb.attribute(VCF_FC2, fr.featuresOnRead);
+        vcb.attribute(VCF_LENGTH, fr.read.getLength());
+        vcb.attribute(VCF_EDIST, fr.refEditDistance);
+        vcb.attribute(VCF_INDEX, fr.index);
+        for ( String name : fmArgs.copyAttr ) {
+            if ( fr.read.hasAttribute(name) ) {
+                vcb.attribute(fmArgs.copyAttrPrefix + name, fr.read.getAttributeAsString(name));
+            }
+        }
+        final VariantContext vc = vcb.make();
+
+        // write to file
+        vcfWriter.add(vc);
+    }
+
+    /** @return the feature mapper implementation selected by the --mapping-feature argument */
+    private FeatureMapper buildMapper() {
+
+        // build appropriate mapper
+        if ( fmArgs.mappingFeature == FlowFeatureMapperArgumentCollection.MappingFeatureEnum.SNV ) {
+            return new SNVMapper(fmArgs);
+        } else {
+            throw new GATKException("unsupported mappingFeature: " + fmArgs.mappingFeature);
+        }
+    }
+}
+
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapperArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapperArgumentCollection.java
new file mode 100644
index 00000000000..9b554deb8c6
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapperArgumentCollection.java
@@ -0,0 +1,105 @@
+package org.broadinstitute.hellbender.tools.walkers.featuremapping;
+
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.Hidden;
+
+import java.io.Serializable;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * Set of arguments for the {@link FlowFeatureMapper}
+ */
+public class FlowFeatureMapperArgumentCollection implements Serializable {
+    private static final long serialVersionUID = 1L;
+
+    /** Kinds of features the mapper can look for. Currently only SNV is implemented. */
+    enum MappingFeatureEnum {
+        SNV
+    }
+
+    /**
+     * kind of feature we are mapping (looking for)
+     **/
+    @Argument(fullName = "mapping-feature", doc = "Kind of feature being mapped", optional = true)
+    public MappingFeatureEnum mappingFeature = MappingFeatureEnum.SNV;
+
+    /**
+     * maximum value for delta in score
+     **/
+    @Argument(fullName = "limit-score", doc = "Limit value for score", optional = true)
+    public double limitScore = Double.NaN;
+
+    /**
+     * attributes to copy from bam
+     **/
+    @Argument(fullName = "copy-attr", doc = "attributes to copy from bam", optional = true)
+    public List<String> copyAttr = new LinkedList<>();
+
+    /**
+     * prefix to add to attributes to copy from bam
+     **/
+    @Argument(fullName = "copy-attr-prefix", doc = "prefix to add to attributes to copy from bam", optional = true)
+    public String copyAttrPrefix = "";
+
+    /**
+     * number of bases that need to be identical before the snv
+     **/
+    @Argument(fullName = "snv-identical-bases", doc = "number of bases that need to be identical before the snv", optional = true)
+    public int snvIdenticalBases = 1;
+
+    /**
+     * number of bases that need to be identical after the snv
+     **/
+    @Argument(fullName = "snv-identical-bases-after", doc = "number of bases that need to be identical after the snv. 0 means same as number of bases before", optional = true)
+    public int snvIdenticalBasesAfter = 0;
+
+    /**
+     * threshold of score delta to for emitting (will be emitted if lower)
+     **/
+    @Argument(fullName = "max-score", doc = "threshold of score delta to for emitting (will be emitted if lower)", optional = true)
+    public double maxScore = Double.POSITIVE_INFINITY;
+
+    /**
+     * minimal threshold of score delta to for emitting (will be emitted if higher)
+     **/
+    @Argument(fullName = "min-score", doc = "minimal threshold of score delta to for emitting (will be emitted if higher)", optional = true)
+    public double minScore = Double.NEGATIVE_INFINITY;
+
+    /**
+     * exclude NaN score records?
+     **/
+    @Argument(fullName = "exclude-nan-scores", doc = "exclude nan scores", optional = true)
+    public boolean excludeNaNScores = false;
+
+    /**
+     * include duplicate read?
+     **/
+    @Argument(fullName = "include-dup-reads", doc = "include duplicate reads", optional = true)
+    public boolean includeDupReads = false;
+
+    /**
+     * keep negatives?
+     **/
+    @Argument(fullName = "keep-negatives", doc = "keep negative scores?", optional = true)
+    public boolean keepNegatives = false;
+
+    /**
+     * keep supplementary alignments?
+     **/
+    @Argument(fullName = "keep-supplementary-alignments", doc = "keep supplementary alignments ?", optional = true)
+    public boolean keepSupplementaryAlignments = false;
+
+    /**
+     * debug negatives?
+     **/
+    @Hidden
+    @Argument(fullName = "debug-negatives", doc = "debug negative scores?", optional = true)
+    public boolean debugNegatives = false;
+
+    /**
+     * debug read names?
+     **/
+    @Hidden
+    @Argument(fullName = "debug-read-name", doc = "debug specific reads?", optional = true)
+    public List<String> debugReadName = null;
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/SNVMapper.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/SNVMapper.java
new file mode 100644
index 00000000000..cfc197dc471
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/SNVMapper.java
@@ -0,0 +1,213 @@
+package org.broadinstitute.hellbender.tools.walkers.featuremapping;
+
+import htsjdk.samtools.CigarElement;
+import org.apache.commons.text.similarity.LevenshteinDistance;
+import org.broadinstitute.hellbender.engine.ReferenceContext;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+import org.broadinstitute.hellbender.utils.read.FlowBasedRead;
+
+import java.util.Arrays;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.function.Consumer;
+
+/**
+ * An implementation of a feature mapper that finds SNPs (SVN)
+ *
+ * This class only finds SNP that are surrounded by a specific number of bases identical to the reference.
+ */
+
+public class SNVMapper implements FeatureMapper {
+
+    // number of reference-identical bases required before/after a candidate SNV
+    final int identBefore;
+    final int identAfter;
+    // an M cigar element must be at least this long to possibly contain a qualifying SNV
+    final int minCigarElementLength;
+    final LevenshteinDistance levDistance = new LevenshteinDistance();
+
+    public SNVMapper(final FlowFeatureMapperArgumentCollection fmArgs) {
+        identBefore = fmArgs.snvIdenticalBases;
+        // 0 means "same as before"
+        identAfter = (fmArgs.snvIdenticalBasesAfter != 0) ? fmArgs.snvIdenticalBasesAfter : identBefore;
+        minCigarElementLength = identBefore + 1 + identAfter;
+
+        // adjust minimal read length
+        FlowBasedRead.setMinimalReadLength(1 + 1 + identAfter);
+    }
+
+    /**
+     * Scans the read's M cigar elements for SNVs surrounded by the configured number of
+     * reference-identical bases, and feeds each found feature to the consumer.
+     */
+    @Override
+    public void forEachOnRead(GATKRead read, ReferenceContext referenceContext, Consumer<? super FlowFeatureMapper.MappedFeature> action) {
+
+        // prepare list
+        List<FlowFeatureMapper.MappedFeature> features = new LinkedList<>();
+
+        // access bases
+        final byte[] bases = read.getBasesNoCopy();
+        final byte[] ref = referenceContext.getBases();
+
+        // calculate edit distance between the aligned (non-soft-clipped) bases and the reference
+        int startSoftClip = read.getStart() - read.getSoftStart();
+        int endSoftClip = read.getSoftEnd() - read.getEnd();
+        String basesString;
+        if ( startSoftClip == 0 && endSoftClip == 0 ) {
+            basesString = new String(bases);
+        } else {
+            basesString = new String(Arrays.copyOfRange(bases, startSoftClip, bases.length - endSoftClip));
+        }
+        int refEditDistance = levDistance.apply(basesString, new String(ref));
+
+        // count bases delta on M cigar elements (mismatches against non-N reference)
+        int nonIdentMBases = 0;
+        int readOfs = 0;
+        int refOfs = 0;
+        for ( final CigarElement cigarElement : read.getCigarElements() ) {
+            final int length = cigarElement.getLength();
+            if ( cigarElement.getOperator().consumesReadBases() && cigarElement.getOperator().consumesReferenceBases() ) {
+                for ( int ofs = 0 ; ofs < length ; ofs++ ) {
+                    if ( ref[refOfs+ofs] != 'N' && bases[readOfs+ofs] != ref[refOfs+ofs] ) {
+                        nonIdentMBases++;
+                    }
+                }
+            }
+            if (cigarElement.getOperator().consumesReadBases()) {
+                readOfs += length;
+            }
+            if (cigarElement.getOperator().consumesReferenceBases()) {
+                refOfs += length;
+            }
+        }
+        int hardLength = read.getUnclippedEnd() - read.getUnclippedStart() + 1;
+
+        // walk the cigar (again) looking for features
+        readOfs = 0;
+        refOfs = 0;
+        for ( final CigarElement cigarElement : read.getCigarElements() ) {
+
+            final int length = cigarElement.getLength();
+
+            // worth looking into?
+            if ( length >= minCigarElementLength &&
+                    cigarElement.getOperator().consumesReadBases() &&
+                    cigarElement.getOperator().consumesReferenceBases() ) {
+                readOfs += identBefore;
+                refOfs += identBefore;
+                for ( int ofs = identBefore ; ofs < length - identAfter ; ofs++, readOfs++, refOfs++ ) {
+
+                    if ( ref[refOfs] != 'N' && bases[readOfs] != ref[refOfs] ) {
+
+                        // check that this is really a SNV (must be surrounded by identical ref)
+                        boolean surrounded = true;
+                        for ( int i = 0 ; i < identBefore && surrounded ; i++ ) {
+                            if ( bases[readOfs-1-i] != ref[refOfs-1-i] ) {
+                                surrounded = false;
+                            }
+                        }
+                        for ( int i = 0 ; i < identAfter && surrounded ; i++ ) {
+                            if ( bases[readOfs+1+i] != ref[refOfs+1+i] ) {
+                                surrounded = false;
+                            }
+                        }
+                        if ( !surrounded ) {
+                            continue;
+                        }
+
+                        // add this feature
+                        FlowFeatureMapper.MappedFeature feature = FlowFeatureMapper.MappedFeature.makeSNV(read, readOfs, ref[refOfs], referenceContext.getStart() + refOfs, readOfs - refOfs);
+                        feature.nonIdentMBasesOnRead = nonIdentMBases;
+                        feature.refEditDistance = refEditDistance;
+                        // index is counted from the sequencing start of the read
+                        if ( !read.isReverseStrand() )
+                            feature.index = readOfs;
+                        else
+                            feature.index = hardLength - readOfs;
+                        features.add(feature);
+                    }
+                }
+                readOfs += identAfter;
+                refOfs += identAfter;
+
+            } else {
+
+                // manual advance
+                if (cigarElement.getOperator().consumesReadBases()) {
+                    readOfs += length;
+                }
+                if (cigarElement.getOperator().consumesReferenceBases()) {
+                    refOfs += length;
+                }
+            }
+        }
+
+        // report features (each carries the total feature count for this read)
+        for ( FlowFeatureMapper.MappedFeature feature : features ) {
+            feature.featuresOnRead = features.size();
+            action.accept(feature);
+        }
+    }
+
+    /**
+     * Checks whether the read agrees with the reference at {@code start} while also satisfying
+     * the surrounding-identity requirement (i.e. no feature there, but it would pass the filter).
+     */
+    public boolean noFeatureButFilterAt(GATKRead read, ReferenceContext referenceContext, int start) {
+
+        // access bases
+        final byte[] bases = read.getBasesNoCopy();
+        final byte[] ref = referenceContext.getBases();
+
+        // walk the cigar
+        int readOfs = 0;
+        int refOfs = 0;
+        for ( final CigarElement cigarElement : read.getCigarElements() ) {
+
+            final int length = cigarElement.getLength();
+
+            // worth looking into?
+            boolean includes = (start >= referenceContext.getStart() + refOfs) &&
+                    (start < referenceContext.getStart() + refOfs + length);
+            if ( includes && length >= minCigarElementLength &&
+                    cigarElement.getOperator().consumesReadBases() &&
+                    cigarElement.getOperator().consumesReferenceBases() ) {
+
+                // break out if not enough clearing
+                if ( (start < referenceContext.getStart() + refOfs + identBefore) ||
+                        (start >= referenceContext.getStart() + refOfs + length - identAfter) )
+                    return false;
+
+                int delta = start - (referenceContext.getStart() + refOfs);
+                readOfs += delta;
+                refOfs += delta;
+
+                if ( bases[readOfs] == ref[refOfs] ) {
+
+                    // check that this is really a SNV (must be surrounded by identical ref)
+                    boolean surrounded = true;
+                    for ( int i = 0 ; i < identBefore && surrounded ; i++ ) {
+                        if ( bases[readOfs-1-i] != ref[refOfs-1-i] ) {
+                            surrounded = false;
+                        }
+                    }
+                    for ( int i = 0 ; i < identAfter && surrounded ; i++ ) {
+                        if ( bases[readOfs+1+i] != ref[refOfs+1+i] ) {
+                            surrounded = false;
+                        }
+                    }
+                    if ( !surrounded ) {
+                        // NOTE(review): this 'continue' resumes the cigar scan after advancing
+                        // readOfs/refOfs by 'delta' only (not the full element length), so later
+                        // elements see shifted offsets — confirm intended behavior
+                        continue;
+                    }
+
+                    // this is it! no feature but filtered in
+                    return true;
+                } else
+                    return false;
+
+            } else {
+
+                // manual advance
+                if (cigarElement.getOperator().consumesReadBases()) {
+                    readOfs += length;
+                }
+                if (cigarElement.getOperator().consumesReferenceBases()) {
+                    refOfs += length;
+                }
+            }
+        }
+
+        // if here, false
+        return false;
+    }
+
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/IndependentSampleGenotypesModel.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/IndependentSampleGenotypesModel.java
index faa20cd4d5d..f16999f5a43 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/IndependentSampleGenotypesModel.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/IndependentSampleGenotypesModel.java
@@ -37,7 +37,11 @@ public IndependentSampleGenotypesModel(final int calculatorCachePloidyCapacity,
calculators = new GenotypeLikelihoodCalculators();
}
- public GenotypingLikelihoods calculateLikelihoods(final AlleleList genotypingAlleles, final GenotypingData data, final byte[] paddedReference, final int offsetForRefIntoEvent, final DragstrReferenceAnalyzer dragstrs) {
+ public GenotypingLikelihoods calculateLikelihoods(final AlleleList genotypingAlleles,
+ final GenotypingData data,
+ final byte[] paddedReference,
+ final int offsetForRefIntoEvent,
+ final DragstrReferenceAnalyzer dragstrs) {
Utils.nonNull(genotypingAlleles, "the allele cannot be null");
Utils.nonNull(data, "the genotyping data cannot be null");
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java
index 8943594d57f..05261b90ace 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java
@@ -1,19 +1,24 @@
package org.broadinstitute.hellbender.tools.walkers.genotyper.afcalc;
-import htsjdk.variant.variantcontext.Allele;
-import htsjdk.variant.variantcontext.Genotype;
-import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.variantcontext.*;
import it.unimi.dsi.fastutil.doubles.DoubleArrayList;
import it.unimi.dsi.fastutil.ints.Int2ObjectArrayMap;
import org.apache.commons.math3.special.Gamma;
import org.apache.commons.math3.util.MathArrays;
import org.broadinstitute.hellbender.utils.*;
+import org.broadinstitute.hellbender.tools.walkers.genotyper.*;
import org.broadinstitute.hellbender.utils.dragstr.DragstrParams;
+import org.broadinstitute.hellbender.utils.Dirichlet;
+import org.broadinstitute.hellbender.utils.IndexRange;
+import org.broadinstitute.hellbender.utils.MathUtils;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AlleleAndContext;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAlleleCounts;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeCalculationArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeLikelihoodCalculator;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeLikelihoodCalculators;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
@@ -26,8 +31,8 @@
public final class AlleleFrequencyCalculator {
private static final GenotypeLikelihoodCalculators GL_CALCS = new GenotypeLikelihoodCalculators();
- private static final double THRESHOLD_FOR_ALLELE_COUNT_CONVERGENCE = 0.1;
- private static final int HOM_REF_GENOTYPE_INDEX = 0;
+ public static final double THRESHOLD_FOR_ALLELE_COUNT_CONVERGENCE = 0.1;
+ public static final int HOM_REF_GENOTYPE_INDEX = 0;
private final double refPseudocount;
private final double snpPseudocount;
@@ -74,6 +79,7 @@ public static AlleleFrequencyCalculator makeCalculator(final DragstrParams drags
* @return
*/
private static double[] log10NormalizedGenotypePosteriors(final Genotype g, final GenotypeLikelihoodCalculator glCalc, final double[] log10AlleleFrequencies) {
+
final double[] log10Likelihoods;
if (g.hasLikelihoods()) {
log10Likelihoods = g.getLikelihoods().getAsVector();
@@ -138,16 +144,60 @@ public AFCalculationResult calculate(final VariantContext vc, final int defaultP
final List alleles = vc.getAlleles();
Utils.validateArg( numAlleles > 1, () -> "VariantContext at " + vc.getContig() + ":" + vc.getStart() +
"has only a single reference allele, but getLog10PNonRef requires at least alternate allele");
+ return calculate(numAlleles, alleles, vc.getGenotypes(), defaultPloidy, vc.getReference().length());
+
+ }
+
+ /**
+ * Identical to the VariantContext-based calculate() but does not require a VariantContext, and assumes
+ * only a single alternative allele. This is useful in the context of AlleleFiltering where
+ * we are basically genotyping the allele (all haplotypes that contain the allele) versus ~allele
+ * (all haplotypes that do not contain the allele).
+ *
+ * @param gls the GenotypingLikelihoods holding the alleles and sample information.
+ * @return result (for programming convenience)
+ */
+ public AFCalculationResult fastCalculateDiploidBasedOnGLs(final GenotypingLikelihoods gls, final int defaultPloidy) {
+ Utils.nonNull(gls, "Likelihoods can only be non-null");
+ Utils.validateArg(gls.numberOfAlleles()==2, "Only case of two alleles is supported");
+ final int numAlleles = gls.numberOfAlleles();
+ final List alleles = gls.asListOfAlleles();
+
+ final List alleleLengths = new ArrayList<>();
+ for (Allele al : gls.asListOfAlleles()) {
+ if (al instanceof AlleleAndContext) {
+ alleleLengths.add(((AlleleAndContext) al).maxAlleleLength());
+ } else {
+ alleleLengths.add(al.length());
+ }
+ }
+ final int alleleLength = alleleLengths.stream().max(Integer::compare).get();
+
+ final List samples = gls.asListOfSamples();
+ final List genotypes = IntStream.range(0, samples.size()).mapToObj(idx -> new GenotypeBuilder(samples.get(idx)).alleles(alleles).PL(gls.sampleLikelihoods(idx).getAsPLs()).make()).collect(Collectors.toList());
+ return calculate(numAlleles, alleles, genotypes, defaultPloidy, alleleLength);
+ }
+
+
+ /**
+ * Private function that actually calculates allele frequencies etc.
+ *
+ */
+ private AFCalculationResult calculate(final int numAlleles,
+ final List alleles,
+ final List genotypes,
+ final int defaultPloidy,
+ final int refLength) {
final double[] priorPseudocounts = alleles.stream()
- .mapToDouble(a -> a.isReference() ? refPseudocount : (a.length() == vc.getReference().length() ? snpPseudocount : indelPseudocount)).toArray();
+ .mapToDouble(a -> a.isReference() ? refPseudocount : (a.length() == refLength ? snpPseudocount : indelPseudocount)).toArray();
double[] alleleCounts = new double[numAlleles];
final double flatLog10AlleleFrequency = -MathUtils.log10(numAlleles); // log10(1/numAlleles)
double[] log10AlleleFrequencies = new IndexRange(0, numAlleles).mapToDouble(n -> flatLog10AlleleFrequency);
- for (double alleleCountsMaximumDifference = Double.POSITIVE_INFINITY; alleleCountsMaximumDifference > THRESHOLD_FOR_ALLELE_COUNT_CONVERGENCE; ) {
- final double[] newAlleleCounts = effectiveAlleleCounts(vc, log10AlleleFrequencies);
+ for (double alleleCountsMaximumDifference = Double.POSITIVE_INFINITY; alleleCountsMaximumDifference > AlleleFrequencyCalculator.THRESHOLD_FOR_ALLELE_COUNT_CONVERGENCE; ) {
+ final double[] newAlleleCounts = effectiveAlleleCounts(genotypes, log10AlleleFrequencies);
alleleCountsMaximumDifference = Arrays.stream(MathArrays.ebeSubtract(alleleCounts, newAlleleCounts)).map(Math::abs).max().getAsDouble();
alleleCounts = newAlleleCounts;
final double[] posteriorPseudocounts = MathArrays.ebeAdd(priorPseudocounts, alleleCounts);
@@ -159,6 +209,7 @@ public AFCalculationResult calculate(final VariantContext vc, final int defaultP
}
double[] log10POfZeroCountsByAllele = new double[numAlleles];
+
double log10PNoVariant = 0;
final boolean spanningDeletionPresent = alleles.contains(Allele.SPAN_DEL);
@@ -166,7 +217,8 @@ public AFCalculationResult calculate(final VariantContext vc, final int defaultP
// re-usable buffers of the log10 genotype posteriors of genotypes missing each allele
final List log10AbsentPosteriors = IntStream.range(0,numAlleles).mapToObj(n -> new DoubleArrayList()).collect(Collectors.toList());
- for (final Genotype g : vc.getGenotypes()) {
+
+ for (final Genotype g : genotypes) {
if (!GenotypeUtils.genotypeIsUsableForAFCalculation(g)) {
continue;
}
@@ -177,7 +229,7 @@ public AFCalculationResult calculate(final VariantContext vc, final int defaultP
//the total probability
if (!spanningDeletionPresent) {
- log10PNoVariant += log10GenotypePosteriors[HOM_REF_GENOTYPE_INDEX];
+ log10PNoVariant += log10GenotypePosteriors[AlleleFrequencyCalculator.HOM_REF_GENOTYPE_INDEX];
} else {
nonVariantIndicesByPloidy.computeIfAbsent(ploidy, p -> genotypeIndicesWithOnlyRefAndSpanDel(p, alleles));
final int[] nonVariantIndices = nonVariantIndicesByPloidy.get(ploidy);
@@ -196,6 +248,7 @@ public AFCalculationResult calculate(final VariantContext vc, final int defaultP
// for each allele, we collect the log10 probabilities of genotypes in which the allele is absent, then add (in log space)
// to get the log10 probability that the allele is absent in this sample
log10AbsentPosteriors.forEach(DoubleArrayList::clear); // clear the buffers. Note that this is O(1) due to the primitive backing array
+
for (int genotype = 0; genotype < glCalc.genotypeCount(); genotype++) {
final double log10GenotypePosterior = log10GenotypePosteriors[genotype];
glCalc.genotypeAlleleCountsAt(genotype).forEachAbsentAlleleIndex(a -> log10AbsentPosteriors.get(a).add(log10GenotypePosterior), numAlleles);
@@ -226,6 +279,8 @@ public AFCalculationResult calculate(final VariantContext vc, final int defaultP
return new AFCalculationResult(integerAltAlleleCounts, alleles, log10PNoVariant, log10PRefByAllele);
}
+
+
/**
* Calculate the posterior probability that a single biallelic genotype is non-ref
*
@@ -253,12 +308,12 @@ public double calculateSingleSampleBiallelicNonRefPosterior(final double[] log10
// for numerical stability we will do this in log space:
// count = SUM 10^(log (n_g p_g)) = SUM 10^(log n_g + log p_g)
// thanks to the log-sum-exp trick this lets us work with log posteriors alone
- private double[] effectiveAlleleCounts(final VariantContext vc, final double[] log10AlleleFrequencies) {
- final int numAlleles = vc.getNAlleles();
+ private double[] effectiveAlleleCounts(List genotypes, final double[] log10AlleleFrequencies) {
+ final int numAlleles = log10AlleleFrequencies.length;
Utils.validateArg(numAlleles == log10AlleleFrequencies.length, "number of alleles inconsistent");
final double[] log10Result = new double[numAlleles];
Arrays.fill(log10Result, Double.NEGATIVE_INFINITY);
- for (final Genotype g : vc.getGenotypes()) {
+ for (final Genotype g : genotypes) {
if (!GenotypeUtils.genotypeIsUsableForAFCalculation(g)) {
continue;
}
@@ -267,9 +322,10 @@ private double[] effectiveAlleleCounts(final VariantContext vc, final double[] l
final double[] log10GenotypePosteriors = log10NormalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies);
new IndexRange(0, glCalc.genotypeCount()).forEach(genotypeIndex ->
- glCalc.genotypeAlleleCountsAt(genotypeIndex).forEachAlleleIndexAndCount((alleleIndex, count) ->
- log10Result[alleleIndex] = MathUtils.log10SumLog10(log10Result[alleleIndex], log10GenotypePosteriors[genotypeIndex] + MathUtils.log10(count))));
+ glCalc.genotypeAlleleCountsAt(genotypeIndex).forEachAlleleIndexAndCount((alleleIndex, count) ->
+ log10Result[alleleIndex] = MathUtils.log10SumLog10(log10Result[alleleIndex], log10GenotypePosteriors[genotypeIndex] + MathUtils.log10(count))));
}
return MathUtils.applyToArrayInPlace(log10Result, x -> Math.pow(10.0, x));
}
+
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/AncestralContigLocationTranslator.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/AncestralContigLocationTranslator.java
new file mode 100644
index 00000000000..278dcbbf792
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/AncestralContigLocationTranslator.java
@@ -0,0 +1,51 @@
+package org.broadinstitute.hellbender.tools.walkers.groundtruth;
+
+import htsjdk.samtools.util.Locatable;
+import htsjdk.samtools.util.Tuple;
+import org.broadinstitute.hellbender.engine.GATKPath;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+
+import java.io.IOException;
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+ class AncestralContigLocationTranslator {
+
+ // locals
+ final private GATKPath basePath;
+ final private Map translators = new LinkedHashMap<>();
+
+ AncestralContigLocationTranslator(GATKPath basePath) {
+ this.basePath = basePath;
+ }
+
+ protected Tuple translate(final Locatable loc) throws IOException {
+ return new Tuple<>(translate(GroundTruthReadsBuilder.C_MATERNAL, loc),
+ translate(GroundTruthReadsBuilder.C_PATERNAL, loc));
+ }
+
+ private SimpleInterval translate(final String ancestor, final Locatable loc) throws IOException {
+
+ int start = translate(ancestor, loc.getContig(), loc.getStart());
+ int end = translate(ancestor, loc.getContig(), loc.getEnd());
+
+ if ( end > start ) {
+ return new SimpleInterval(loc.getContig() + "_" + ancestor, start, end);
+ } else {
+ throw new LocationTranslationException("location " + loc + " failed to translate for " + ancestor + ", start:" + start + " ,end:" + end);
+ }
+ }
+
+ private int translate(final String ancestor, final String contig, final int from) throws IOException {
+
+ // check-for/create translator
+ final String key = ancestor + "." + contig + ".csv";
+ if ( !translators.containsKey(key) ) {
+ final GATKPath path = new GATKPath(basePath.getURIString() + key);
+ translators.put(key, new SingleFileLocationTranslator(path));
+ }
+
+ // translate
+ return translators.get(key).translate(from);
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilder.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilder.java
new file mode 100644
index 00000000000..cee262d277c
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilder.java
@@ -0,0 +1,986 @@
+package org.broadinstitute.hellbender.tools.walkers.groundtruth;
+
+import htsjdk.samtools.CigarElement;
+import htsjdk.samtools.CigarOperator;
+import htsjdk.samtools.util.Locatable;
+import htsjdk.samtools.util.SequenceUtil;
+import htsjdk.samtools.util.Tuple;
+import org.apache.commons.lang.ArrayUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.broadinstitute.barclay.argparser.*;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.programgroups.FlowBasedProgramGroup;
+import org.broadinstitute.hellbender.engine.*;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.tools.FlowBasedArgumentCollection;
+import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.*;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
+import org.broadinstitute.hellbender.utils.read.CigarBuilder;
+import org.broadinstitute.hellbender.utils.read.FlowBasedReadUtils;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+import org.broadinstitute.hellbender.tools.walkers.featuremapping.FlowFeatureMapper;
+import org.broadinstitute.hellbender.utils.haplotype.FlowBasedHaplotype;
+import org.broadinstitute.hellbender.utils.read.FlowBasedRead;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.Arrays;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.Random;
+import java.util.zip.GZIPOutputStream;
+
+/**
+ * An internal tool to produce a flexible and robust ground truth set for base calling training.
+ *
+ *
+ * Input
+ *
+ * Coordinate-sorted and indexed SAM/BAM/CRAM
+ * Maternal and Parental references (fa)
+ * Folder with address translation files from reference to maternal/parental references (filename example: maternal.chr9.csv)
+ *
+ *
+ * Output
+ *
+ * CSV file containing maternal/parental haplotype scores and many more columns (.csv or .csv.gz supported)
+ *
+ *
+ * At present, the output will contain the following columns (not necessarily in this order) for each processed read:
+ *
+ * ReadName, ReadChrom, ReadStart, ReadEnd, tm, mapq, flags, ReadCigar, ReadSequence, ReadUnclippedStart,
+ * ReadUnclippedEnd - information directly extracted from the input read
+ * PaternalHaplotypeInterval, BestHaplotypeSequence, PaternalHaplotypeScore - parental haplotype information.
+ * First the read interval is translated into parental space, the haplotype sequence extracted and then scored
+ * Maternal* - same for maternal
+ * RefHaplotypeScore - score computed from the reference haplotype
+ * BestHaplotypeKey - flow based key for the 'best' (score wise) haplotype (out of the maternal/paternal pair)
+ * ConsensusHaplotypeKey - a flow based key constructed from the flow keys of the maternal and paternal
+ * haplotypes, containing only keys that agree (other keys filled with fixed/special value)
+ *
+ *
+ *
+ * Usage examples
+ *
+ * gatk GroundTruthReadsBuilder \
+ * -R
+ * ../../../ref/Homo_sapiens_assembly38.fasta
+ * -I
+ * 150548-UGAv3-4.chr9.cram
+ * --maternal-ref
+ * chr9_HG001_maternal.fa
+ * --paternal-ref
+ * chr9_HG001_paternal.fa
+ * --ancestral-translators-base-path
+ * ./
+ * --output-csv
+ * output-small.csv
+ * --subsampling-ratio
+ * 1.0
+ * --max-output-reads
+ * 100000000
+ * --intervals
+ * chr9:109991494-109991494
+ * --smith-waterman
+ * FASTEST_AVAILABLE
+ * --likelihood-calculation-engine
+ * FlowBased
+ * -mbq
+ * 0
+ * --kmer-size
+ * 10
+ * --gt-debug
+ * --output-flow-length
+ * 1000
+ * --haplotype-output-padding-size
+ * 8
+ * --prepend-sequence
+ * TTTT
+ * --append-sequence
+ * CCCC
+ *
+ *
+ * {@GATK.walkertype ReadWalker}
+ */
+@CommandLineProgramProperties(
+ summary = "Ground Truth Reads Builder",
+ oneLineSummary = "Produces a flexible and robust ground truth set for base calling training",
+ programGroup = FlowBasedProgramGroup.class
+)
+
+@DocumentedFeature
+@ExperimentalFeature
+public final class GroundTruthReadsBuilder extends PartialReadWalker {
+
+ // constants
+ private static final Logger logger = LogManager.getLogger(GroundTruthReadsBuilder.class);
+ public static final int DEFAULT_FILL_VALUE = -65;
+ public static final int NONREF_FILL_VALUE = -80;
+ public static final int UNKNOWN_FILL_VALUE = -85;
+ public static final int SOFTCLIP_FILL_VALUE = -83;
+ private static final int EXTRA_FILL_FROM_HAPLOTYPE = 50;
+ static final String C_MATERNAL = "maternal";
+ static final String C_PATERNAL = "paternal";
+
+ @Argument(fullName = "maternal-ref", doc="maternal reference file")
+ public GATKPath maternalRefPath = null;
+ @Argument(fullName = "paternal-ref", doc="paternal reference file")
+ public GATKPath paternalRefPath = null;
+ @Argument(fullName = "ancestral-translators-base-path", doc="base path for ancestral translation ancestral.contig.csv files")
+ public GATKPath ancestralTranslatorsBasePath = null;
+
+ @Argument(fullName = "subsampling-ratio", doc = "subsampling ratio, should be between 0 and 1", optional = true)
+ public double subsamplingRatio = 1.0;
+ @Argument(fullName = "max-output-reads", doc = "maximal number of reads to output", optional = true)
+ public int maxOutputReads = 20000000;
+
+ @Argument(fullName = "output-flow-length", doc = "Required length of output flows", optional = true)
+ public int outputFlowLength = 0;
+ @Argument(fullName = "prepend-sequence", doc = "Sequence to prepend (barcode)", optional = true)
+ public String prependSequence;
+ @Argument(fullName = "append-sequence", doc = "Sequence to append (adapter)", optional = true)
+ public String appendSequence;
+
+ @Argument(fullName = "min-mq", doc = "Minimal mapping quality", optional = true)
+ public double minMappingQuality = 0;
+ @Argument(fullName = "max-rq", doc = "Maximal read quality", optional = true)
+ public double maxReadQuality = 0;
+ @Argument(fullName = "include-supp-align", doc = "Include supplementary alignments", optional = true)
+ public boolean includeSuppAlign = false;
+ @Argument(fullName = "min-haplotype-score", doc = "Minimal score (likelihood) on either haplotype", optional = true)
+ public double minHaplotypeScore = 0;
+ @Argument(fullName = "min-haplotype-score-delta", doc = "Minimal score (likelihood) delta between haplotypes", optional = true)
+ public double minHaplotypeScoreDelta = 0;
+ @Argument(fullName = "haplotype-output-padding-size", doc = "Number of N to append to best haplotype on output", optional = true)
+ public int haplotypeOutputPaddingSize = 8;
+ @Argument(fullName = "discard-non-polyt-softclipped-reads", doc = "Discard reads which are softclipped, unless the softclip is polyT, defaults to true", optional = true)
+ public boolean discardNonPolytSoftclippedReads = false;
+
+ @Argument(fullName = "fill-trimmed-reads-Q", doc = "Reads with tm:Q should be filled from haplotype, otherwise (default) filled with -80", optional = true)
+ public boolean fillTrimmedReadsQ;
+ @Argument(fullName = "fill-trimmed-reads-Z", doc = "Reads with tm:Z should be filled from haplotype, otherwise (default) filled with -80", optional = true)
+ public boolean fillTrimmedReadsZ;
+ @Argument(fullName = "fill-trimmed-reads", doc = "Reads with tm:Q or tm:Z should be filled from haplotype, otherwise (default) filled with -80", optional = true)
+ public boolean fillTrimmedReads;
+ @Argument(fullName = "fill-softclipped-reads", doc = "Softclipped reads should be filled from haplotype, otherwise (default) filled with -83", optional = true)
+ public boolean fillSoftclippedReads;
+ @Argument(fullName = "false-snp-compensation", doc = "skip haplotype bases until same base as read starts (false SNP compensation)", optional = true)
+ public boolean falseSnpCompensation;
+
+ @Argument(fullName = "output-csv", doc="main CSV output file. the file containing maternal/parental "
+ + "maternal and paternal haplotype sequences and scores (and many more columns). supported file extensions: .csv, .csv.gz.")
+ public GATKPath outputCsvPath = null;
+
+ @Hidden
+ @Argument(fullName = "gt-debug", doc = "Turn additional internal logging on", optional = true)
+ public boolean debugMode = false;
+
+ @Argument(fullName = "gt-no-output", doc = "do not generate output records", optional = true)
+ public boolean noOutput = false;
+
+ @ArgumentCollection
+ public LikelihoodEngineArgumentCollection likelihoodArgs = new LikelihoodEngineArgumentCollection();
+
+ @ArgumentCollection
+ public FlowBasedArgumentCollection fbargs = new FlowBasedArgumentCollection();
+
+ // locals
+ private final Random random = new Random();
+ private int outputReadsCount = 0;
+ private ReferenceDataSource maternalReference;
+ private ReferenceDataSource paternalReference;
+ private AncestralContigLocationTranslator locationTranslator;
+ private FlowBasedAlignmentLikelihoodEngine likelihoodCalculationEngine;
+ private PrintWriter outputCsv;
+ private int locationTranslationErrors;
+
+ // static/const
+ static final private String[] CSV_FIELD_ORDER = {
+ "ReadName", "ReadChrom", "ReadStart", "ReadEnd",
+ "PaternalHaplotypeScore", "MaternalHaplotypeScore", "RefHaplotypeScore",
+ "ReadKey", "BestHaplotypeKey", "ConsensusHaplotypeKey",
+ "tm", "mapq", "flags", "ReadCigar",
+ "ReadSequence", "PaternalHaplotypeSequence", "MaternalHaplotypeSequence", "BestHaplotypeSequence",
+ "ReadUnclippedStart", "ReadUnclippedEnd", "PaternalHaplotypeInterval", "MaternalHaplotypeInterval"
+ };
+
+ private static class ScoredHaplotype {
+ ReferenceContext ref;
+ ReferenceContext clippedRef;
+ ReferenceContext unclippedRef;
+ int softclipFrontFillCount;
+ Haplotype haplotype;
+ double score;
+ }
+
+ @Override
+ public void onTraversalStart() {
+ super.onTraversalStart();
+
+ // initialize references
+ maternalReference = ReferenceDataSource.of(maternalRefPath.toPath());
+ paternalReference = ReferenceDataSource.of(paternalRefPath.toPath());
+ locationTranslator = new AncestralContigLocationTranslator(ancestralTranslatorsBasePath);
+
+ // create likelihood engine
+ ReadLikelihoodCalculationEngine engine = AssemblyBasedCallerUtils.createLikelihoodCalculationEngine(likelihoodArgs, false);
+ if ( engine instanceof FlowBasedAlignmentLikelihoodEngine) {
+ likelihoodCalculationEngine = (FlowBasedAlignmentLikelihoodEngine)engine;
+ } else {
+ throw new GATKException("must use a flow based likelihood calculation engine");
+ }
+
+ // open output, write header
+ try {
+ if (outputCsvPath.toPath().toString().endsWith(".gz")) {
+ outputCsv = new PrintWriter(new GZIPOutputStream(outputCsvPath.getOutputStream()));
+ } else {
+ outputCsv = new PrintWriter(outputCsvPath.getOutputStream());
+ }
+ } catch (IOException e) {
+ throw new GATKException("failed to open csv output: " + outputCsvPath, e);
+ }
+ emitCsvHeaders();
+ }
+
+ @Override
+ public void closeTool() {
+
+ if ( locationTranslationErrors != 0 ) {
+ logger.warn("" + locationTranslationErrors + " location translation errors detected");
+ }
+
+ outputCsv.close();
+ super.closeTool();
+ }
+
+ @Override
+ protected boolean shouldExitEarly(GATKRead read) {
+
+ // limit number of output reads
+ return ( maxOutputReads != 0) && (outputReadsCount >= maxOutputReads);
+ }
+
+ @Override
+ public void apply(final GATKRead read, final ReferenceContext referenceContext, final FeatureContext featureContext) {
+
+ // filter out due to mapping quality
+ if ( minMappingQuality != 0 && read.getMappingQuality() < minMappingQuality ) {
+ return;
+ }
+
+ // supplemental alignment filter
+ if ( read.isSupplementaryAlignment() && !includeSuppAlign ) {
+ return;
+ }
+
+ // discard because softclipped
+ if ( discardNonPolytSoftclippedReads && isEndSoftclipped(read) && !isEndPolyTSoftclipped(read) ) {
+ return;
+ }
+
+ // subsample
+ // NOTE: this is done BEFORE read quality and haplotype scoring
+ if ( random.nextDouble() > subsamplingRatio ) {
+ return;
+ }
+
+ // filter out due to read quality
+ FlowBasedRead flowRead = null;
+ if ( maxReadQuality != 0 ) {
+ flowRead = buildFlowRead(read);
+ if (getFlowBasedReadQuality(flowRead, flowRead.getMaxHmer()) > maxReadQuality) {
+ return;
+ }
+ }
+
+ // process the read
+ try {
+ // make sure we have a flow read
+ if ( flowRead == null ) {
+ flowRead = buildFlowRead(read);
+ }
+
+ // prepare
+ final ScoredHaplotype maternal = new ScoredHaplotype();
+ final ScoredHaplotype paternal = new ScoredHaplotype();
+
+ // translate location to ancestors
+ final Tuple ancestralLocs = locationTranslator.translate(read);
+ maternal.ref = new ReferenceContext(maternalReference, ancestralLocs.a);
+ paternal.ref = new ReferenceContext(paternalReference, ancestralLocs.b);
+
+ // build haplotypes
+ maternal.haplotype = buildReferenceHaplotype(maternal.ref, read);
+ paternal.haplotype = buildReferenceHaplotype(paternal.ref, read);
+ buildExtendedRef(maternal, maternalReference, ancestralLocs.a, read);
+ buildExtendedRef(paternal, paternalReference, ancestralLocs.b, read);
+
+ // generate score for reference
+ final double refScore = scoreReadAgainstReference(read, referenceContext);
+
+ // score read against haplotypes, create flow versions of read and haplotype. NOTE(review): in the branch below, the arsSame() else-if arm assigns the same value as the final else arm — likely meant to reuse maternal.score instead of rescoring; verify (also note the 'arsSame' name typo).
+ if ( areSame(maternal.haplotype, referenceContext, read.isReverseStrand()) ) {
+ maternal.score = refScore;
+ } else {
+ maternal.score = scoreReadAgainstHaplotype(read, maternal);
+ }
+ if ( areSame(paternal.haplotype, referenceContext, read.isReverseStrand()) ) {
+ paternal.score = refScore;
+ } else if ( arsSame(maternal.haplotype, paternal.haplotype, read.isReverseStrand()) ) {
+ paternal.score = scoreReadAgainstHaplotype(read, paternal);
+ } else {
+ paternal.score = scoreReadAgainstHaplotype(read, paternal);
+ }
+
+ // debug printing (in INFO for now, will be changed to DEBUG)
+ debugLog(read, referenceContext, maternal, paternal);
+
+ // filter on min score
+ // TODO: this is probably wrong since the scores are negative. To be handled later
+ if ( minHaplotypeScore != 0 && Math.min(maternal.score, paternal.score) > minHaplotypeScore ) {
+ return;
+ }
+
+ // filter on score delta
+ if ( minHaplotypeScoreDelta != 0 && Math.abs(maternal.score - paternal.score) > minHaplotypeScoreDelta ) {
+ return;
+ }
+
+ // if here, emit this read
+ outputReadsCount++;
+ emit(read, flowRead, refScore, maternal, paternal);
+
+ } catch (LocationTranslationException e) {
+ logger.warn("location translation exception: " + e.getMessage());
+ locationTranslationErrors++;
+ } catch (IOException e) {
+ throw new GATKException("failed to process read: " + read.getName(), e);
+ }
+ }
+
+ private boolean shouldFillFromHaplotype(final GATKRead read) {
+
+ // softclip has priority
+ if ( isEndSoftclipped(read) )
+ return fillSoftclippedReads;
+
+ // extend trimmed reads as well?
+ final String tm = read.getAttributeAsString(FlowBasedRead.CLIPPING_TAG_NAME);
+ if ( tm == null ) {
+ return true;
+ } else {
+ boolean hasA = tm.indexOf('A') >= 0;
+ boolean hasQ = tm.indexOf('Q') >= 0;
+ boolean hasZ = tm.indexOf('Z') >= 0;
+ if ( hasA ) {
+ return false;
+ }
+ else if ( hasZ && (fillTrimmedReads || fillTrimmedReadsZ) ) {
+ return true;
+ }
+ else if ( hasQ && (fillTrimmedReads || fillTrimmedReadsQ) ) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+ }
+
+ private void buildExtendedRef(final ScoredHaplotype scoredHaplotype, final ReferenceDataSource ref, final SimpleInterval loc, final GATKRead read) {
+
+ // assume no extension
+ int extendStart = 0;
+ int extendEnd = 0;
+
+ // calc soft extension
+ if ( fillSoftclippedReads ) {
+ final CigarElement elem = !read.isReverseStrand()
+ ? read.getCigar().getLastCigarElement() : read.getCigar().getFirstCigarElement();
+ if (elem.getOperator() == CigarOperator.S) {
+ if (!read.isReverseStrand()) {
+ extendEnd += elem.getLength();
+ } else {
+ extendStart += elem.getLength();
+ }
+ }
+ }
+
+ // add padding
+ if ( !read.isReverseStrand() ) {
+ extendEnd += haplotypeOutputPaddingSize;
+ } else {
+ extendStart += haplotypeOutputPaddingSize;
+ }
+
+ // add extra fill from haplotype
+ if ( (outputFlowLength != 0) && shouldFillFromHaplotype(read) ) {
+ int length = (loc.getEnd() + extendEnd) - (loc.getStart() - extendStart);
+ int delta = Math.max(0, outputFlowLength - length) + EXTRA_FILL_FROM_HAPLOTYPE;
+ if ( !read.isReverseStrand() ) {
+ extendEnd += delta;
+ } else {
+ extendStart += delta;
+ }
+ }
+
+ // compensate for skipping of the first hmer
+ final int delta = scoredHaplotype.ref.getInterval().size() - scoredHaplotype.haplotype.getGenomeLocation().getLengthOnReference();
+ if ( delta != 0 ) {
+ if ( !read.isReverseStrand() ) {
+ extendStart -= delta;
+ } else {
+ extendEnd -= delta;
+ }
+ }
+
+
+ int minStart = ref.getSequenceDictionary().getSequence(loc.getContig()).getStart();
+ int maxEnd = ref.getSequenceDictionary().getSequence(loc.getContig()).getEnd();
+ if ( isStartSoftclipped(read) ) {
+ scoredHaplotype.clippedRef = new ReferenceContext(ref,
+ new SimpleInterval(loc.getContig(), Math.max(minStart, loc.getStart() - extendStart), Math.min(maxEnd, loc.getEnd() + extendEnd)));
+ }
+
+ // add front unclipped
+ {
+ final CigarElement frontElem = !read.isReverseStrand()
+ ? read.getCigar().getFirstCigarElement() : read.getCigar().getLastCigarElement();
+ if (frontElem.getOperator() == CigarOperator.S) {
+ if (!read.isReverseStrand()) {
+ extendStart += frontElem.getLength();
+ } else {
+ extendEnd += frontElem.getLength();
+ }
+ }
+ scoredHaplotype.unclippedRef = new ReferenceContext(ref,
+ new SimpleInterval(loc.getContig(), Math.max(minStart, loc.getStart() - extendStart), Math.min(maxEnd, loc.getEnd() + extendEnd)));
+ }
+ }
+
+ private boolean arsSame(final Haplotype h1, final Haplotype h2, boolean isReversed) {
+
+ return Arrays.equals(h1.getBases(), h2.getBases());
+ }
+
+ private boolean areSame(final Haplotype h, final ReferenceContext ref, boolean isReversed) {
+ return Arrays.equals(h.getBases(), reverseComplement(ref.getBases(), isReversed));
+ }
+
+ private FlowBasedRead buildFlowRead(final GATKRead read) {
+
+ FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read);
+
+ return new FlowBasedRead(read, rgInfo.flowOrder, rgInfo.maxClass, fbargs);
+ }
+
+ private boolean isEndSoftclipped(final GATKRead read) {
+
+ if ( !read.isReverseStrand() ) {
+ return read.getCigar().getLastCigarElement().getOperator() == CigarOperator.S;
+ } else {
+ return read.getCigar().getFirstCigarElement().getOperator() == CigarOperator.S;
+ }
+ }
+
+ private boolean isStartSoftclipped(final GATKRead read) {
+
+ if ( !read.isReverseStrand() ) {
+ return read.getCigar().getFirstCigarElement().getOperator() == CigarOperator.S;
+ } else {
+ return read.getCigar().getLastCigarElement().getOperator() == CigarOperator.S;
+ }
+ }
+
+ /**
+  * Tests whether the read's synthesis-order end soft-clip consists entirely of T
+  * bases (stored as 'A' for reverse-strand reads, since bases are kept in
+  * reference orientation).
+  *
+  * Fixed: the original inspected the FIRST cigar element (and leading bases) for
+  * forward reads and the LAST (and trailing bases) for reverse reads, which
+  * contradicts isEndSoftclipped() — where the end clip is the last element for
+  * forward reads and the first for reverse reads — so it measured the wrong clip
+  * and scanned the wrong side of the read.
+  */
+ private boolean isEndPolyTSoftclipped(final GATKRead read) {
+
+     // must be softclipped at the synthesis-order end
+     if ( !isEndSoftclipped(read) )
+         return false;
+
+     // are all softclipped bases T ('A' in storage orientation for reverse reads)?
+     final byte[] bases = read.getBasesNoCopy();
+     if ( !read.isReverseStrand() ) {
+         // forward read: end clip is the last cigar element, trailing bases
+         final int length = read.getCigar().getLastCigarElement().getLength();
+         for ( int n = 0 ; n < length ; n++ ) {
+             if (bases[bases.length - n - 1] != 'T') {
+                 return false;
+             }
+         }
+     } else {
+         // reverse read: end clip is the first cigar element, leading bases;
+         // synthesis-order T is stored as its complement A
+         final int length = read.getCigar().getFirstCigarElement().getLength();
+         for ( int n = 0 ; n < length ; n++ ) {
+             if (bases[n] != 'A') {
+                 return false;
+             }
+         }
+     }
+
+     // if here, all softclipped bases are T
+     return true;
+ }
+
+ /**
+  * Logs (info level, only when debugMode is set) the read, the reference context,
+  * both scored haplotypes and a per-position diff of the two haplotype reference
+  * sequences.
+  */
+ private void debugLog(final GATKRead read, final ReferenceContext referenceContext, final ScoredHaplotype maternal, final ScoredHaplotype paternal) {
+
+ if ( debugMode ) {
+ logger.info("read: " + read.getName() + " " + read.getCigar() + " " + read.getFlags());
+ logger.info("read: " + new SimpleInterval(read) + " " + new String(read.getBases()));
+ logger.info("ref: " + new SimpleInterval(referenceContext) + " " + new String(referenceContext.getBases()));
+ logger.info("mRef: " + maternal.ref.getInterval() + " " + new String(maternal.ref.getBases()));
+ logger.info("pRef: " + paternal.ref.getInterval() + " " + new String(paternal.ref.getBases()));
+ logger.info("pmDiff: " + new String(debugBinDiff(maternal.ref.getBases(), paternal.ref.getBases())));
+ logger.info("mHap: " + maternal.score + " " + maternal.haplotype);
+ logger.info("pHap: " + paternal.score + " " + paternal.haplotype);
+ }
+ }
+
+ /**
+  * Produces a per-position diff mask of two byte arrays (up to the shorter
+  * length): '_' where the bytes agree, '1' where they differ.
+  */
+ private byte[] debugBinDiff(final byte[] b1, final byte[] b2) {
+     final int len = Math.min(b1.length, b2.length);
+     final byte[] mask = new byte[len];
+     for ( int i = 0 ; i < len ; i++ ) {
+         mask[i] = (b1[i] != b2[i]) ? (byte)'1' : (byte)'_';
+     }
+     return mask;
+ }
+
+ private double getFlowBasedReadQuality(final FlowBasedRead read, final int maxClass) {
+
+ double sum = 0;
+ for ( int n = 0 ; n < read.getKeyLength() ; n++ ) {
+ sum += read.getProb(n, maxClass);
+ }
+ return sum;
+ }
+
+ /**
+  * Builds a reference-backed haplotype for the read, in the read's synthesis
+  * orientation, with an all-M cigar. When falseSnpCompensation is enabled and the
+  * haplotype and read disagree on their first base, a few leading haplotype bases
+  * may be skipped (see detectFalseSNP) and the haplotype's location is shrunk on
+  * the matching side to stay consistent.
+  */
+ private Haplotype buildReferenceHaplotype(final ReferenceContext ref, final GATKRead read) {
+
+ Locatable loc = new SimpleInterval(ref.getInterval());
+ byte[] haplotypeBases = reverseComplement(ref.getBases(), read.isReverseStrand());
+ final byte[] readBases = reverseComplement(getSoftclippedBases(read), read.isReverseStrand());
+
+ // skip haplotype bases until same base as read starts (false SNP compensation)
+ if ( falseSnpCompensation && (haplotypeBases[0] != readBases[0]) ) {
+ final int skip = detectFalseSNP(haplotypeBases, readBases);
+ if ( skip != 0 ) {
+ haplotypeBases = Arrays.copyOfRange(haplotypeBases, skip, haplotypeBases.length);
+ // the skip is applied in synthesis orientation, so the genomic interval
+ // shrinks from the start for forward reads and from the end for reverse reads
+ if ( !read.isReverseStrand() ) {
+ loc = new SimpleInterval(loc.getContig(), loc.getStart() + skip, loc.getEnd());
+ } else {
+ loc = new SimpleInterval(loc.getContig(), loc.getStart(), loc.getEnd() - skip);
+ }
+ }
+
+ }
+
+ // wrap as a haplotype with a single M cigar element covering its full length
+ final Haplotype haplotype = new Haplotype(haplotypeBases, loc);
+ haplotype.setCigar(new CigarBuilder(false)
+ .add(new CigarElement(haplotype.length(), CigarOperator.M)).make());
+
+ return haplotype;
+ }
+
+ // Returns the read's bases with leading/trailing soft-clipped bases removed.
+ // When there is no soft clip, the underlying (uncopied) base array is returned as-is.
+ private byte[] getSoftclippedBases(final GATKRead read) {
+
+     final CigarElement first = read.getCigar().getFirstCigarElement();
+     final CigarElement last = read.getCigar().getLastCigarElement();
+     final int frontClip = (first.getOperator() == CigarOperator.SOFT_CLIP) ? first.getLength() : 0;
+     final int backClip = (last.getOperator() == CigarOperator.SOFT_CLIP) ? last.getLength() : 0;
+
+     final byte[] bases = read.getBasesNoCopy();
+     if ( frontClip != 0 || backClip != 0 ) {
+         return Arrays.copyOfRange(bases, frontClip, bases.length - backClip);
+     }
+     return bases;
+ }
+
+ /**
+  * Detects a "false SNP": a leading haplotype/read mismatch that can be explained
+  * by skipping a few haplotype bases into the read's leading homopolymer.
+  *
+  * @return the number of haplotype bases to skip, or 0 when no valid skip point exists
+  */
+ private int detectFalseSNP(final byte[] haplotypeBases, final byte[] readBases) {
+
+ // parametrisation
+ final int maxSkipageSize = 5; // will not skip more than this
+ final int requiredRemainingMatchSize = 5; // after skipage, this number of bases must match
+
+ // this might be redundant, but just in case: no mismatch means nothing to skip
+ if ( haplotypeBases[0] == readBases[0] )
+ return 0;
+
+ // establish size of the leading homopolymer on the read
+ int readHomoSize = 0;
+ for ( ; readHomoSize < readBases.length ; readHomoSize++ )
+ if ( readBases[readHomoSize] != readBases[0] )
+ break;
+
+ // loop on possible skip sizes, smallest first
+ for ( int skip = 1 ; skip <= maxSkipageSize ; skip++ ) {
+
+ // there must be enough bases left in both arrays
+ if ( skip + requiredRemainingMatchSize > haplotypeBases.length ) {
+ break;
+ }
+ if ( Math.max(skip + 1, requiredRemainingMatchSize) > readBases.length ) {
+ break;
+ }
+ // skip + 1 must stay inside the read's leading homopolymer
+ if ( skip + 1 > readHomoSize ) {
+ break;
+ }
+
+ // remaining bases must match
+ if ( arrayRangeEquals(readBases, skip, requiredRemainingMatchSize, haplotypeBases, skip) ) {
+ // found skip point
+ return skip;
+ }
+ }
+
+ // if here, did not find a skip point
+ return 0;
+ }
+
+ // Compares len bytes of a1 (starting at ofs1) with a2 (starting at ofs2);
+ // true when every byte in the range matches.
+ private boolean arrayRangeEquals(final byte[] a1, final int ofs1, final int len, final byte[] a2, final int ofs2) {
+     int matched = 0;
+     while ( matched < len && a1[ofs1 + matched] == a2[ofs2 + matched] ) {
+         matched++;
+     }
+     return matched == len;
+ }
+
+ /**
+  * Scores a read against the given scored haplotype in flow space.
+  * Reverse-strand reads are flipped into synthesis direction before scoring.
+  *
+  * @return the flow-based likelihood score, or -1 as a sentinel when the flow
+  *         read is not valid
+  */
+ private double scoreReadAgainstHaplotype(final GATKRead read, final ScoredHaplotype sh) {
+
+ // build haplotypes
+ final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read);
+ final FlowBasedHaplotype flowHaplotype = new FlowBasedHaplotype(sh.haplotype, rgInfo.flowOrder);
+
+ // create flow read
+ final FlowBasedRead flowRead = new FlowBasedRead(read, rgInfo.flowOrder, rgInfo.maxClass, fbargs);
+ if ( read.isReverseStrand() ) {
+ flowRead.setDirection(FlowBasedRead.Direction.SYNTHESIS);
+ flowRead.applyAlignment();
+ }
+
+ if ( !flowRead.isValid() ) {
+ return -1;
+ }
+
+ // compute alternative score
+ final int hapKeyLength = flowHaplotype.getKeyLength();
+ final double score = FlowFeatureMapper.computeLikelihoodLocal(flowRead, flowHaplotype, hapKeyLength, false);
+
+ return score;
+ }
+
+ /**
+  * Scores a read against a haplotype built directly from the reference context.
+  * Reverse-strand reads are flipped into synthesis direction before scoring.
+  * NOTE(review): this is nearly identical to scoreReadAgainstHaplotype except for
+  * the haplotype source and debug logging — consider folding the two together.
+  *
+  * @return the flow-based likelihood score, or -1 as a sentinel when the flow
+  *         read is not valid
+  */
+ private double scoreReadAgainstReference(final GATKRead read, final ReferenceContext ref) {
+
+ // build haplotypes
+ final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read);
+ final FlowBasedHaplotype flowHaplotype = new FlowBasedHaplotype(buildReferenceHaplotype(ref, read), rgInfo.flowOrder);
+
+ // create flow read
+ final FlowBasedRead flowRead = new FlowBasedRead(read, rgInfo.flowOrder, rgInfo.maxClass, fbargs);
+ if ( read.isReverseStrand() ) {
+ flowRead.setDirection(FlowBasedRead.Direction.SYNTHESIS);
+ flowRead.applyAlignment();
+ }
+
+ if ( !flowRead.isValid() ) {
+ return -1;
+ }
+
+ // compute alternative score
+ final int hapKeyLength = flowHaplotype.getKeyLength();
+ final double score = FlowFeatureMapper.computeLikelihoodLocal(flowRead, flowHaplotype, hapKeyLength, false);
+
+ // debug
+ if ( debugMode ) {
+ logger.info("flowRead: " + flowRead);
+ logger.info("flowHaplotype: " + flowHaplotype);
+ logger.info("flowRead.key: " + Arrays.toString(flowRead.getKey()));
+ logger.info("flowHaplotype.key: " + Arrays.toString(flowHaplotype.getKey()));
+ logger.info("scoreReadAgainstReference: score: " + score);
+ }
+
+ return score;
+ }
+
+ /**
+  * Converts a haplotype base sequence into a flow-space key aligned to start on
+  * the first 'T' flow: leading zero flows are trimmed, then zero flows are
+  * prepended so the first key element lines up with the sequence's first base on
+  * the flow order.
+  *
+  * Fixed: the leading-zero trim no longer assumes the key contains a non-zero
+  * element (an all-zero key previously threw ArrayIndexOutOfBoundsException), and
+  * the per-element copyOfRange loop was replaced with a single copy.
+  *
+  * @param haplotypeSeq haplotype bases as a string
+  * @param rgInfo read-group flow information (flow order, max hmer class)
+  * @param isReversed whether the read is on the reverse strand
+  * @return flow key of the (possibly reverse-complemented) sequence
+  */
+ private int[] buildHaplotypeKey(final String haplotypeSeq, final FlowBasedReadUtils.ReadGroupInfo rgInfo, final boolean isReversed) {
+
+     // create a haplotype to contain the sequence
+     final byte[] seq = reverseComplement(haplotypeSeq.getBytes(), isReversed);
+     final Haplotype h = new Haplotype(seq);
+     final FlowBasedHaplotype flowHaplotype = new FlowBasedHaplotype(h, !isReversed ? rgInfo.flowOrder : rgInfo.getReversedFlowOrder());
+
+     // trim leading zero flows (single copy instead of one copy per trimmed element)
+     int[] hapKey = flowHaplotype.getKey();
+     final byte[] hapFlowOrder = flowHaplotype.getFlowOrderArray();
+     int lead = 0;
+     while ( lead < hapKey.length && hapKey[lead] == 0 ) {
+         lead++;
+     }
+     if ( lead > 0 ) {
+         hapKey = Arrays.copyOfRange(hapKey, lead, hapKey.length);
+     }
+
+     // need to start on a T - find out how many zero flows to prepend so the key
+     // reaches the sequence's first base, counting from the first 'T' on the flow order.
+     // NOTE(review): assumes the flow order contains both 'T' and the first base of
+     // seq — otherwise these scans would run past the end / loop forever; confirm upstream.
+     int appendZeroCount = 0;
+     if ( (seq[0] != 'T') && (seq[0] != 'N') ) {
+         int ofs = 0;
+         while ( hapFlowOrder[ofs] != 'T' )
+             ofs++;
+         while ( hapFlowOrder[ofs] != seq[0] ) {
+             appendZeroCount++;
+             ofs = (ofs + 1) % hapFlowOrder.length;
+         }
+     }
+
+     if ( appendZeroCount == 0 ) {
+         return hapKey;
+     } else {
+         // prepend appendZeroCount zero flows in front of the key
+         final int[] shifted = new int[appendZeroCount + hapKey.length];
+         System.arraycopy(hapKey, 0, shifted, appendZeroCount, hapKey.length);
+         return shifted;
+     }
+ }
+
+ /**
+  * Builds the output flow key for a scored haplotype from its unclipped reference
+  * sequence, padded (or truncated) to outputFlowLength with fillValue.
+  *
+  * Side effect: sets scoredHaplotype.softclipFrontFillCount to the number of key
+  * elements contributed by the front soft clip (0 when the read is not
+  * start-softclipped).
+  */
+ private int[] buildHaplotypeKeyForOutput(ScoredHaplotype scoredHaplotype, final FlowBasedReadUtils.ReadGroupInfo rgInfo, final int fillValue, final GATKRead read) {
+
+ boolean isReversed = read.isReverseStrand();
+
+ // create key from filled and unclipped version
+ int[] hapKey = buildHaplotypeKey(new String(scoredHaplotype.unclippedRef.getBases()), rgInfo, isReversed);
+ if ( isStartSoftclipped(read) ) {
+ // the clipped key is shorter; the difference is the softclip's contribution
+ int[] hapKeyClipped = buildHaplotypeKey(new String(scoredHaplotype.clippedRef.getBases()), rgInfo, isReversed);
+ scoredHaplotype.softclipFrontFillCount = hapKey.length - hapKeyClipped.length;
+ } else {
+ scoredHaplotype.softclipFrontFillCount = 0;
+ }
+
+ // prepare key (outputFlowLength == 0 means "use the natural key length")
+ final int flowLength = (outputFlowLength != 0) ? outputFlowLength : hapKey.length;
+ final int[] key = new int[flowLength];
+ int ofs;
+ System.arraycopy(hapKey, 0, key, 0, ofs = Math.min(flowLength, hapKey.length));
+
+ // adjust to a fixed length by padding with fillValue
+ for ( ; ofs < flowLength ; ofs++ ) {
+ key[ofs] = fillValue;
+ }
+
+ return key;
+ }
+
+ // Builds the output haplotype sequence: optional prepend sequence, then the first
+ // keyBaseCount bases of the (possibly reverse-complemented) unclipped reference,
+ // then optional append sequence.
+ private String buildHaplotypeSequenceForOutput(final ScoredHaplotype haplotype, final boolean isReversed, final int keyBaseCount) {
+
+     final String refSeq = new String(reverseComplement(haplotype.unclippedRef.getBases(), isReversed));
+     final StringBuilder out = new StringBuilder();
+
+     if ( prependSequence != null ) {
+         out.append(prependSequence);
+     }
+     out.append(refSeq.substring(0, keyBaseCount));
+     if ( appendSequence != null ) {
+         out.append(appendSequence);
+     }
+
+     return out.toString();
+ }
+
+ // Merges two flow keys position by position (up to the shorter length):
+ // agreeing positions keep their value, disagreements are marked with -72.
+ private int[] buildConsensusKey(final int[] k1, final int[] k2) {
+
+     final int len = Math.min(k1.length, k2.length);
+     final int[] consensus = new int[len];
+
+     for ( int i = 0 ; i < len ; i++ ) {
+         final int v = k1[i];
+         consensus[i] = (v == k2[i]) ? v : -72;
+     }
+
+     return consensus;
+ }
+
+
+ private String flowKeyAsCsvString(final int[] key) {
+ return "\"" + Arrays.toString(key).replaceAll("\\[|\\]|\\s", "") + "\"";
+ }
+
+ /**
+  * Formats a flow key as a quoted CSV field, first trimming leading zero flows and
+  * then prepending "0," entries so that the key starts on the flow of the
+  * sequence's first base (searching from the first 'T' on the flow order).
+  *
+  * Fixed: trimming no longer assumes the key contains a non-zero element (an
+  * all-zero key previously indexed past the end of the array), and the
+  * per-element copyOfRange loop was replaced with a single copy.
+  */
+ private String flowKeyAsCsvString(int[] key, final String seq, final String flowOrder) {
+     final StringBuilder sb = new StringBuilder();
+
+     sb.append("\"");
+
+     // trim leading zero flows
+     int lead = 0;
+     while ( lead < key.length && key[lead] == 0 ) {
+         lead++;
+     }
+     if ( lead > 0 ) {
+         key = Arrays.copyOfRange(key, lead, key.length);
+     }
+
+     // NOTE(review): assumes flowOrder contains both 'T' and seq's first base —
+     // otherwise these scans would run past the end / loop forever; confirm upstream
+     if ( (seq.charAt(0) != 'T') && (seq.charAt(0) != 'N') ) {
+         int ofs = 0;
+         while ( flowOrder.charAt(ofs) != 'T' )
+             ofs++;
+         while ( flowOrder.charAt(ofs) != seq.charAt(0) ) {
+             sb.append("0,");
+             ofs = (ofs + 1) % flowOrder.length();
+         }
+     }
+
+     sb.append(Arrays.toString(key).replaceAll("\\[|\\]|\\s", ""));
+
+     sb.append("\"");
+
+     return sb.toString();
+ }
+
+ // Writes the CSV header row (column names in CSV_FIELD_ORDER, comma separated).
+ private void emitCsvHeaders() {
+
+ outputCsv.println(StringUtils.join(CSV_FIELD_ORDER, ","));
+ }
+
+ /**
+  * Emits a single CSV line for a read: haplotype/reference scores, flow keys,
+  * haplotype sequences and read metadata, ordered by CSV_FIELD_ORDER.
+  *
+  * Fixed: cols was declared as a raw Map; it is now Map<String, Object>
+  * (column name -> column value).
+  *
+  * @param read the read being reported
+  * @param flowRead flow-space representation of the read
+  * @param refScore read-vs-reference likelihood score
+  * @param maternal scored maternal haplotype
+  * @param paternal scored paternal haplotype
+  * @throws IOException declared for output errors
+  */
+ private void emit(final GATKRead read, final FlowBasedRead flowRead, final double refScore, final ScoredHaplotype maternal, final ScoredHaplotype paternal) throws IOException {
+
+     // build line columns (insertion order preserved for debugging readability)
+     final Map<String, Object> cols = new LinkedHashMap<>();
+
+     // establish fill value from the end-softclip state and the tm (clipping) tag
+     final String tm = read.getAttributeAsString(FlowBasedRead.CLIPPING_TAG_NAME);
+     final boolean hasA = (tm != null) && tm.indexOf('A') >= 0;
+     final boolean hasQ = (tm != null) && tm.indexOf('Q') >= 0;
+     final boolean hasZ = (tm != null) && tm.indexOf('Z') >= 0;
+     final int fillValue;
+     if ( isEndSoftclipped(read) ) {
+         fillValue = SOFTCLIP_FILL_VALUE;
+     } else if ( hasQ || hasZ ) {
+         fillValue = hasA ? UNKNOWN_FILL_VALUE : NONREF_FILL_VALUE;
+     } else {
+         fillValue = DEFAULT_FILL_VALUE;
+     }
+
+     // read name
+     cols.put("ReadName", read.getName());
+
+     // haplotypes and reference scores
+     cols.put("PaternalHaplotypeScore", paternal.score);
+     cols.put("MaternalHaplotypeScore", maternal.score);
+     cols.put("RefHaplotypeScore", refScore);
+
+     // build haplotype keys (side effect: sets softclipFrontFillCount on each haplotype)
+     final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read);
+     final int[] paternalHaplotypeKey = buildHaplotypeKeyForOutput(paternal, rgInfo, fillValue, read);
+     final int[] maternalHaplotypeKey = buildHaplotypeKeyForOutput(maternal, rgInfo, fillValue, read);
+
+     // build haplotype sequences
+     final String paternalHaplotypeSeq = buildHaplotypeSequenceForOutput(paternal, read.isReverseStrand(), keyBases(paternalHaplotypeKey));
+     final String maternalHaplotypeSeq = buildHaplotypeSequenceForOutput(maternal, read.isReverseStrand(), keyBases(maternalHaplotypeKey));
+
+     // fill softclip at front
+     softclipFill(paternal, paternalHaplotypeKey);
+     softclipFill(maternal, maternalHaplotypeKey);
+
+     // select best and establish consensus
+     final boolean ancestralHaplotypesSame = paternalHaplotypeSeq.equals(maternalHaplotypeSeq);
+     final ScoredHaplotype bestHaplotype = (paternal.score > maternal.score) ? paternal : maternal;
+     final int[] bestHaplotypeKey = (bestHaplotype == paternal) ? paternalHaplotypeKey : maternalHaplotypeKey;
+     final int[] consensus = buildConsensusKey(paternalHaplotypeKey, maternalHaplotypeKey);
+
+     // emit best haplotype; when both ancestral sequences are identical, emit the consensus key
+     cols.put("BestHaplotypeSequence", (bestHaplotype == paternal) ? paternalHaplotypeSeq : maternalHaplotypeSeq);
+     if ( !ancestralHaplotypesSame ) {
+         cols.put("BestHaplotypeKey", flowKeyAsCsvString(bestHaplotypeKey));
+     } else {
+         cols.put("BestHaplotypeKey", flowKeyAsCsvString(consensus));
+     }
+
+     // write consensus haplotype
+     cols.put("ConsensusHaplotypeKey", flowKeyAsCsvString(consensus));
+
+     // additional fields
+     cols.put("ReadChrom", read.getContig());
+     cols.put("ReadStart", read.getStart());
+     cols.put("ReadEnd", read.getEnd());
+     cols.put("ReadUnclippedStart", read.getUnclippedStart());
+     cols.put("ReadUnclippedEnd", read.getUnclippedEnd());
+     cols.put("ReadCigar", read.getCigar());
+
+     final String readSeq = reverseComplement(read.getBasesString(), read.isReverseStrand());
+     final int[] readKey = !read.isReverseStrand() ? flowRead.getKey() : reversedCopy(flowRead.getKey());
+     final String readFlowOrder = reverseComplement(FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read).flowOrder, read.isReverseStrand());
+     cols.put("ReadSequence", readSeq);
+     cols.put("ReadKey", flowKeyAsCsvString(readKey, readSeq, readFlowOrder));
+     cols.put("PaternalHaplotypeInterval", paternal.ref.getInterval());
+     cols.put("PaternalHaplotypeSequence", paternalHaplotypeSeq);
+     cols.put("MaternalHaplotypeInterval", maternal.ref.getInterval());
+     cols.put("MaternalHaplotypeSequence", maternalHaplotypeSeq);
+
+     cols.put("tm", (read.hasAttribute(FlowBasedRead.CLIPPING_TAG_NAME) ? read.getAttributeAsString(FlowBasedRead.CLIPPING_TAG_NAME) : ""));
+     cols.put("mapq", read.getMappingQuality());
+     cols.put("flags", read.getFlags());
+
+     // construct line in CSV_FIELD_ORDER; every ordered field must have been filled
+     // in, and no extra columns may remain afterwards
+     final StringBuilder sb = new StringBuilder();
+     int colIndex = 0;
+     for ( String field : CSV_FIELD_ORDER ) {
+         if ( colIndex++ > 0 ) {
+             sb.append(',');
+         }
+         if ( !cols.containsKey(field) ) {
+             throw new GATKException("column missing from csv line: " + field);
+         }
+         sb.append(cols.get(field));
+         cols.remove(field);
+     }
+     if ( cols.size() > 0 ) {
+         throw new GATKException("invalid columns on csv line: " + cols.keySet());
+     }
+
+     // output line
+     if ( !noOutput ) {
+         outputCsv.println(sb);
+     }
+ }
+
+ /**
+  * Overwrites the leading softclipFrontFillCount key elements with
+  * SOFTCLIP_FILL_VALUE (mutates key in place), capped at the key length.
+  * NOTE(review): the guard is !fillSoftclippedReads — i.e. filling happens when
+  * the flag is OFF, which reads inverted relative to the flag name; confirm the
+  * intended semantics of fillSoftclippedReads.
+  */
+ private void softclipFill(ScoredHaplotype scoredHaplotype, int[] key) {
+ if ( !fillSoftclippedReads ) {
+ int limit = Math.min(scoredHaplotype.softclipFrontFillCount, key.length);
+ for (int n = 0; n < limit ; n++) {
+ key[n] = SOFTCLIP_FILL_VALUE;
+ }
+ }
+ }
+
+ // Total number of bases represented by a flow key: the sum of the positive
+ // counts (negative fill markers are ignored).
+ private int keyBases(int[] key) {
+     int total = 0;
+     for ( int i = 0 ; i < key.length ; i++ ) {
+         if ( key[i] > 0 ) {
+             total += key[i];
+         }
+     }
+     return total;
+ }
+
+ // Returns a reverse-complemented copy of the given bases; the input array is
+ // not modified.
+ private byte[] reverseComplement(final byte[] bases) {
+     final byte[] copy = Arrays.copyOf(bases, bases.length);
+     SequenceUtil.reverseComplement(copy);
+     return copy;
+ }
+
+ // Conditionally reverse-complements: returns the input array unchanged when
+ // isReversed is false.
+ private byte[] reverseComplement(final byte[] bases, final boolean isReversed) {
+     if ( isReversed ) {
+         return reverseComplement(bases);
+     }
+     return bases;
+ }
+
+ // String convenience wrapper over the byte[] reverse-complement.
+ // (Default-charset conversion is fine here: base characters are plain ASCII.)
+ private String reverseComplement(final String bases) {
+ return new String(reverseComplement(bases.getBytes()));
+ }
+
+ // Conditionally reverse-complements a string; returns the input unchanged when
+ // isReversed is false.
+ private String reverseComplement(final String bases, final boolean isReversed) {
+     if ( isReversed ) {
+         return reverseComplement(bases);
+     }
+     return bases;
+ }
+
+ private int[] reversedCopy(final int[] bytes) {
+ int[] copy = ArrayUtils.clone(bytes);
+ ArrayUtils.reverse(copy);
+ return copy;
+ }
+
+}
+
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/LocationTranslationException.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/LocationTranslationException.java
new file mode 100644
index 00000000000..56fd84b9c49
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/LocationTranslationException.java
@@ -0,0 +1,13 @@
+package org.broadinstitute.hellbender.tools.walkers.groundtruth;
+
+import org.broadinstitute.hellbender.exceptions.GATKException;
+
+ /**
+  * Exception thrown when a genomic location cannot be translated by the
+  * ground-truth location-translation machinery.
+  */
+ public class LocationTranslationException extends GATKException {
+
+     private static final long serialVersionUID = 0;
+
+     LocationTranslationException(final String msg) {
+         super(msg);
+     }
+ }
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/SingleFileLocationTranslator.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/SingleFileLocationTranslator.java
new file mode 100644
index 00000000000..b2ff4559fe0
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/SingleFileLocationTranslator.java
@@ -0,0 +1,41 @@
+package org.broadinstitute.hellbender.tools.walkers.groundtruth;
+
+import htsjdk.samtools.util.Tuple;
+import org.broadinstitute.hellbender.engine.GATKPath;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.Arrays;
+import java.util.LinkedList;
+import java.util.List;
+
+ /**
+  * Translates positions using a two-column CSV file of (position, offset)
+  * records: a query position maps to itself plus the offset of the last record
+  * whose position is at or before it.
+  *
+  * Fixed: the BufferedReader was never closed (resource leak) — now opened in a
+  * try-with-resources; also restored the generic type of the record list and
+  * simplified the boxed map/mapToInt chains.
+  */
+ public class SingleFileLocationTranslator {
+
+     final private int[] pos;     // record positions, ascending
+     final private int[] offset;  // offset to add for positions at/after pos[i]
+
+     SingleFileLocationTranslator(final GATKPath path) throws IOException {
+
+         // read the file in. we assume it is sorted and starts with pos=1
+         final List<Tuple<Integer, Integer>> data = new LinkedList<>();
+         try ( BufferedReader reader = new BufferedReader(new InputStreamReader(path.getInputStream())) ) {
+             String line = reader.readLine(); // ignore first (header) line
+             while ( (line = reader.readLine()) != null ) {
+                 final String[] toks = line.split(",");
+                 data.add(new Tuple<>(Integer.parseInt(toks[0]), Integer.parseInt(toks[1])));
+             }
+         }
+         pos = data.stream().mapToInt(t -> t.a).toArray();
+         offset = data.stream().mapToInt(t -> t.b).toArray();
+     }
+
+     int translate(final int from) {
+
+         // binarySearch returns the match index on a hit, otherwise (-insertionPoint - 1),
+         // so -index - 2 selects the last record whose position is <= from.
+         // NOTE(review): assumes the file indeed starts at pos=1 — a query below the
+         // first record would index offset[-1]; confirm the input contract.
+         final int index = Arrays.binarySearch(pos, from);
+         if ( index >= 0 ) {
+             return from + offset[index];
+         }
+         return from + offset[-index - 2];
+     }
+ }
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleAndContext.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleAndContext.java
new file mode 100644
index 00000000000..ffb3480d617
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleAndContext.java
@@ -0,0 +1,54 @@
+package org.broadinstitute.hellbender.tools.walkers.haplotypecaller;
+import htsjdk.variant.variantcontext.Allele;
+
+/**
+ * This class is similar to {@link org.broadinstitute.hellbender.tools.walkers.haplotypecaller.LocationAndAlleles} but
+ * allows keeping only an allele/ref pair rather than a list of alleles. The comparison is done on allele by allele basis and
+ * not in the way it is done on LocationAndAlleles
+ */
+
+ public class AlleleAndContext extends Allele {
+     final static public long serialVersionUID = 1L;
+     private final int loc;              // genomic position of the allele
+     private final String contig;        // contig the allele lies on
+     private final Allele refAllele;     // reference allele at the same position
+
+     /**
+      * @param contig contig the allele is placed on
+      * @param loc genomic position of the allele on the contig
+      * @param allele the observed (alt) allele; its bases become this object's bases
+      * @param refAllele the reference allele at the same location
+      */
+     public AlleleAndContext(final String contig, final int loc, final Allele allele, final Allele refAllele) {
+         super(allele, false);
+         this.loc = loc;
+         this.contig = contig;
+         this.refAllele = refAllele;
+     }
+
+     public int getLoc() {
+         return loc;
+     }
+
+     public String getContig() { return contig; }
+
+     /** Returns this object viewed as a plain Allele. */
+     public Allele getAllele() {
+         return this;
+     }
+
+     @Override
+     public boolean equals(final Object o) {
+         if (this == o) return true;
+         if (o == null || getClass() != o.getClass()) return false;
+
+         final AlleleAndContext that = (AlleleAndContext) o;
+
+         if (loc != that.loc) return false;
+         return super.equals(that) && this.refAllele.equals(that.getRefAllele());
+     }
+
+     @Override
+     public int hashCode() {
+         // Fixed: the original wrote (this != null ? super.hashCode() : 0); "this"
+         // can never be null inside an instance method, so the guard was dead code.
+         // Equal objects (same loc, super-equal) still hash equally, consistent
+         // with equals().
+         return 31 * loc + super.hashCode();
+     }
+
+     public String toString() {return String.format("(%d) %s/%s", loc, getBaseString(), getRefAllele().getBaseString());}
+     public Allele getRefAllele() { return refAllele;}
+
+     /** Length of the longer of the alt/ref alleles. */
+     public int maxAlleleLength() {
+         return Math.max(getAllele().length(), refAllele.length());
+     }
+ }
+
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java
new file mode 100644
index 00000000000..2611d20c44d
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java
@@ -0,0 +1,593 @@
+package org.broadinstitute.hellbender.tools.walkers.haplotypecaller;
+
+import htsjdk.samtools.util.CollectionUtil;
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.VariantContext;
+import org.apache.commons.lang3.tuple.ImmutablePair;
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.tools.walkers.annotator.StrandOddsRatio;
+import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.InverseAllele;
+import org.broadinstitute.hellbender.utils.BaseUtils;
+import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
+import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+import org.jgrapht.graph.DefaultDirectedWeightedGraph;
+import org.jgrapht.graph.DefaultWeightedEdge;
+import org.jgrapht.io.ComponentNameProvider;
+import org.jgrapht.io.DOTExporter;
+import org.jgrapht.io.IntegerComponentNameProvider;
+
+import java.io.*;
+import java.util.*;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+/**
+ * Filtering haplotypes that contribute weak alleles to the genotyping.
+ *
+ * @author Ilya Soifer <ilya.soifer@ultimagen.com>
+ * @author Yossi Farjoun <farjoun@broadinstitute.org>
+ *
+ */
+
+public abstract class AlleleFiltering {
+
+ final protected static Logger logger = LogManager.getLogger(AlleleFiltering.class);
+ // caller arguments controlling the filtering thresholds (prefilterQualThreshold etc.)
+ final protected AssemblyBasedCallerArgumentCollection assemblyArgs;
+ // optional assembly debug output stream; may be null (checked before every use)
+ final private OutputStreamWriter assemblyDebugOutStream;
+ /**
+  * @param assemblyArgs HaplotypeCaller/Mutect2 assembly-based caller arguments
+  * @param assemblyDebugOutStream optional assembly debug output stream (may be null)
+  */
+ AlleleFiltering(final AssemblyBasedCallerArgumentCollection assemblyArgs, final OutputStreamWriter assemblyDebugOutStream){
+ this.assemblyArgs = assemblyArgs;
+ this.assemblyDebugOutStream = assemblyDebugOutStream;
+ }
+
+ /**
+ * Finds alleles that are likely not contributing much to explaining the data and remove the haplotypes
+ * that contribute them.
+ *
+ * The alleles from the active region are divided into clusters of alleles that likely "compete" with each
+ * other, where compete means that they are the same allele up to a sequencing error although they might
+ * be assigned to a different genomic location. In each cluster we iteratively calculate the quality of
+ * each allele relative to other alleles in the cluster and remove the allele with the lowest quality.
+ * We then also select in each cluster alleles with high SOR and remove them.
+ *
+ * Every haplotype that contributes a filtered allele is filtered out.
+ *
+ * @param readLikelihoods unfiltered read x haplotype likelihood matrix
+ * @param activeWindowStart location of the active windows (assemblyResult.getPaddedReferenceLoc().getStart()
+ * @param suspiciousLocations set of suspicious locations for further marking in genotyping
+ * @return Subsetted read x haplotype where only the haplotypes that do not contribute filtered alleles show. Also
+ * locations of filtered alleles on the genome added to `suspiciousLocations` list
+ */
+
+ public AlleleLikelihoods<GATKRead, Haplotype> filterAlleles(final AlleleLikelihoods<GATKRead, Haplotype> readLikelihoods,
+                                                             final int activeWindowStart, final Set<Integer> suspiciousLocations){
+
+     logger.debug("SHA:: filter alleles - start");
+     // fixed: restored the generic types (AlleleLikelihoods<GATKRead, Haplotype>,
+     // Set<Integer>) that appeared here as raw types
+     final AlleleLikelihoods<GATKRead, Haplotype> subsettedReadLikelihoodsFinal =
+             subsetHaplotypesByAlleles(readLikelihoods, assemblyArgs, activeWindowStart, suspiciousLocations);
+     logger.debug("SHA:: filter alleles - end");
+
+     // record how many haplotypes were removed by the filtering
+     readLikelihoods.setFilteredHaplotypeCount(readLikelihoods.numberOfAlleles() - subsettedReadLikelihoodsFinal.numberOfAlleles());
+
+     if (assemblyDebugOutStream != null) {
+         try {
+             assemblyDebugOutStream.write("\nThere were " + subsettedReadLikelihoodsFinal.alleles().size() + " haplotypes found after subsetting by alleles. Here they are:\n");
+             subsettedReadLikelihoodsFinal.alleles().forEach(h -> {
+                 try {
+                     assemblyDebugOutStream.write(h.toString());
+                     assemblyDebugOutStream.append("\n");
+                 } catch (IOException e) {
+                     throw new UserException("Error writing to debug output stream", e);
+                 }
+             });
+         } catch (IOException e) {
+             throw new UserException("Error writing to debug output stream", e);
+         }
+     }
+
+     return subsettedReadLikelihoodsFinal;
+ }
+
+ /**
+ * Returns all alleles from haplotype
+ * @param haplotype Input
+ * @return set of AlleleAndContext
+ */
+ // fixed: restored the generic types (Set<AlleleAndContext>, Collection<VariantContext>)
+ // that appeared here as raw types; without them the call sites
+ // (getAlleles(h).stream().filter(al -> !al.isReference())) would not type-check
+ static private Set<AlleleAndContext> getAlleles(final Haplotype haplotype){
+     final Collection<VariantContext> vcs = haplotype.getEventMap().getVariantContexts();
+
+     // one AlleleAndContext per allele of every variant in the haplotype's event map
+     return vcs.stream().flatMap(
+             vc -> vc.getAlleles().stream().map(
+                     al -> new AlleleAndContext(vc.getContig(), vc.getStart(), al, vc.getReference()))
+     ).collect(Collectors.toSet());
+ }
+
+ /**
+ * Main function that filters haplotypes that contribute weak alleles
+ * @param readLikelihoods read x haplotype matrix
+ * @param assemblyArgs HaplotypeCaller/Mutect2 parameters
+ * @param activeWindowStart Genomic location of the start
+ * @param suspiciousLocations set of positions where the alleles are being filtered (modified)
+ * @return read x haplotype matrix where the filtered haplotypes are removed
+ * @throws IOException if output file can't be written
+ */
+ private AlleleLikelihoods subsetHaplotypesByAlleles(final AlleleLikelihoods readLikelihoods,
+ final AssemblyBasedCallerArgumentCollection assemblyArgs,
+ final int activeWindowStart, Set suspiciousLocations) {
+ // 1. Collect all alleles in the active region
+ final Set disabledHaplotypes = new HashSet<>();
+ final Map> haplotypeAlleleMap = new CollectionUtil.DefaultingMap<>((k) -> new ArrayList<>(), true);
+ readLikelihoods.alleles().forEach(h -> getAlleles(h).stream().filter(al -> !al.isReference()).forEach(jh -> haplotypeAlleleMap.get(h).add(jh)));
+
+ // 2. Split them into sets to genotype together. The goal is to cluster true allele with all its variants from
+ // biased seq. error.
+ // The alleles split into clusters of alleles that potentially interact (compete with each other for reads)
+ // First we generate a graph with edge for each pair of alleles that do not occur in the same haplotype
+ // Then we only keep the edges where the alleles are close or up to hmer indel from each other
+ // the connected components of the graph are genotyped together
+ final OccurrenceMatrix occm = new OccurrenceMatrix<>(haplotypeAlleleMap);
+ List> nonCoOcurringAlleles = occm.nonCoOcurringColumns();
+ final List> closeNonCoOccurringAlleles = filterByDistance(nonCoOcurringAlleles, 0, 3);
+ nonCoOcurringAlleles = filterSameUpToHmerPairs(filterByDistance(nonCoOcurringAlleles,0,20),
+ findReferenceHaplotype(readLikelihoods.alleles()), activeWindowStart);
+ nonCoOcurringAlleles.addAll(closeNonCoOccurringAlleles);
+ final List> independentAlleles = occm.getIndependentSets(nonCoOcurringAlleles);
+
+ // 3. For each cluster - remove weak alleles
+ for (final Set alleleSet : independentAlleles) {
+
+ // debugging - write the interaction map of the location (we will keep this function from the unused approach
+ // where we only attempted to filter alleles that strongly affect an another allele's quality. This approach
+ // failed to deliver a significant improvement and thus is not used.
+ // interaction map is the graph of how much quality of each allele is improved when another allele is removed
+ if (assemblyArgs.writeFilteringGraphs) {
+ if (alleleSet.size() > 1 ) {
+ final List alleleSetAsList = new ArrayList<>(alleleSet);
+ final Map initialRPLsMap = new HashMap<>();
+ final DefaultDirectedWeightedGraph intm =
+ interactionMatrixToGraph(getInteractionMatrix(alleleSetAsList, haplotypeAlleleMap,
+ readLikelihoods, initialRPLsMap), initialRPLsMap);
+ printInteractionGraph(intm, initialRPLsMap, alleleSet);
+ }
+ }
+
+ boolean removedAlleles = true;
+ final Set activeHaplotypes = new HashSet<>(readLikelihoods.alleles());
+
+ while (removedAlleles) {
+ removedAlleles = false;
+ // b. Marginalize: calculate quality of each allele relative to all other alleles
+ logger.debug("GAL::start of iteration");
+ final List activeAlleles = activeHaplotypes.stream()
+ .flatMap(h -> getAlleles(h).stream().filter(alleleSet::contains))
+ .distinct()
+ .collect(Collectors.toList());;
+
+ final Map> alleleHaplotypeMap = new CollectionUtil.DefaultingMap<>((k) -> new ArrayList<>(), true);
+ readLikelihoods.alleles().stream().filter(activeHaplotypes::contains)
+ .forEach(h ->
+ getAlleles(h).stream()
+ .filter(alleleSet::contains)
+ .filter(al -> !al.isReference())
+ .forEach(jh -> alleleHaplotypeMap.get(jh).add(h))
+
+ );
+
+ logger.debug("AHM::printout start");
+ for (final AlleleAndContext al : alleleHaplotypeMap.keySet()) {
+ logger.debug("AHM::allele block ---> ");
+ for (final Allele h : alleleHaplotypeMap.get(al)) {
+ logger.debug(() -> String.format("AHM:: (%d) %s/%s: %s", al.getLoc(), al.getAllele().getBaseString(), al.getRefAllele().getBaseString(), h.getBaseString()));
+ }
+ logger.debug("AHM::allele block ---< ");
+
+ }
+ logger.debug("AHM::printout end");
+
+
+
+ final List> alleleLikelihoods =
+ activeAlleles.stream().map(al -> getAlleleLikelihoodMatrix(readLikelihoods, al,
+ haplotypeAlleleMap, activeHaplotypes)).collect(Collectors.toList());
+ // c. Calculate SOR and RPL
+ // Note that the QUAL is calculated as a PL, that is -10*log likelihood. This means that high PL is low quality allele
+ final List collectedRPLs = IntStream.range(0, activeAlleles.size()).mapToObj(i -> getAlleleLikelihoodVsInverse(alleleLikelihoods.get(i), activeAlleles.get(i))).collect(Collectors.toList());
+ final List collectedSORs = IntStream.range(0, activeAlleles.size()).mapToObj(i -> getAlleleSOR(alleleLikelihoods.get(i), activeAlleles.get(i))).collect(Collectors.toList());
+
+ // d. Generate variants that are below SOR threshold and below RPL threshold
+ final List filteringCandidates = identifyBadAlleles(collectedRPLs,
+ collectedSORs,
+ activeAlleles,
+ assemblyArgs.prefilterQualThreshold,
+ assemblyArgs.prefilterSorThreshold);
+
+
+ //very weak candidates are filtered out in any case, even if they are alone (they will be filtered anyway even in the GVCF mode)
+ // the very weak quality is hardcoded
+ final List filteringCandidatesStringent = identifyBadAlleles(collectedRPLs,
+ collectedSORs,
+ activeAlleles,
+ 1,
+ Integer.MAX_VALUE);
+
+
+ //for now we just mark all locations with close alleles, one of which is weak.
+ //We write them in suspiciousLocations and they will be then annotated as SUSP_NOISY... in the VCF
+ if ((filteringCandidates.size() > 0 ) && (alleleSet.size()>0)) {
+ activeAlleles.forEach(laa -> suspiciousLocations.add(laa.getLoc()));
+ }
+
+ // e. For every variant - calculate what is the effect of its deletion and if higher than threshold - delete and continue
+
            // (The commented-out call below belongs to a currently disabled approach that would remove only the
            // candidates that strongly affect other alleles:
+ //AlleleAndContext candidateToDisable = identifyStrongInteractingAllele(filteringCandidates,
+ // hcargs.prefilterQualThreshold, activeAlleles, collectedRPLs, readLikelihoods, haplotypeAlleleMap, alleleHaplotypeMap); )
+
+ // if weak candidate had been identified - add its haplotypes into blacklist, remove the allele from the
+ // current cluster and genotype again.
+ if ((filteringCandidates.size()>0 && activeAlleles.size()>1) ||
+ (activeAlleles.size()==1 && filteringCandidatesStringent.size()>0) ||
+ (filteringCandidates.size()>0 && this.assemblyArgs.filterLoneAlleles)) {
+
+ if ((filteringCandidatesStringent.size()>0) && (filteringCandidates.size() == 0 )) {
+ throw new GATKException.ShouldNeverReachHereException("The thresholds for stringent allele " +
+ "filtering should always be higher than for the relaxed one");
+ }
+
+ final AlleleAndContext candidateToDisable = filteringCandidates.get(0);
+ logger.debug(() -> String.format("GAL:: Remove %s", candidateToDisable.toString()));
+ removedAlleles = true;
+ final List haplotypesToRemove = alleleHaplotypeMap.get(candidateToDisable);
+ disabledHaplotypes.addAll(haplotypesToRemove);
+ activeHaplotypes.removeAll(haplotypesToRemove);
+ }
+ logger.debug("GAL::end of iteration");
+
+ }
+ }
+
+ // finalizing: remove all disabled genotypes
+ logger.debug("----- SHA list of removed haplotypes start ----");
+ for (Haplotype hap : disabledHaplotypes) {
+ logger.debug(() -> String.format("SHA :: Removed haplotype : %s ", hap.toString()));
+ }
+ logger.debug("----- SHA list of removed haplotypes end ----");
+
+ final Set eventualAlleles = new HashSet<>();
+ readLikelihoods.alleles().stream().filter(al -> !disabledHaplotypes.contains(al)).forEach(eventualAlleles::add);
+ logger.debug("----- SHA list of remaining haplotypes start ----");
+ for (Haplotype hap : eventualAlleles) {
+ logger.debug(() -> String.format("SHA :: Remaining haplotype : %s ", hap.toString()));
+ }
+ logger.debug("----- SHA list of remaining haplotypes end ----");
+
+
+
+ final AlleleLikelihoods currentReadLikelihoods = readLikelihoods.removeAllelesToSubset(eventualAlleles);
+ logger.debug("----- SHA list of remaining alleles start ----");
+ final Set locAllele = new HashSet<>();
+ currentReadLikelihoods.alleles().forEach(h -> getAlleles(h).stream().filter(al -> !al.isReference()).forEach(locAllele::add));
+ for (final AlleleAndContext al: locAllele) {
+ logger.debug(() -> String.format("---- SHA :: %s ", al.toString()));
+ }
+ logger.debug("----- SHA list of remaining alleles end ----");
+
+ return currentReadLikelihoods;
+ }
+
+
+ /**
+ * Finds a list of alleles that are candidate for removal in the order of precedence (first - the best candidate to be removed)
+ *
+ * @param collectedRPLs list of each allele qualities (collected by {@link AlleleFiltering#getAlleleLikelihoodVsInverse}
+ * @param collectedSORs list of each allele SORs (collected by {@link AlleleFiltering#getAlleleSOR(AlleleLikelihoods, Allele)}
+ * @param alleles list of alleles in the same order as in collectedRPLs/collectedSORs
+ * @param qualThreshold only variants with quality below qualThreshold will be considered
+ * @param sorThreshold only variants with SOR above threshold will be considered
+ * @return list of alleles that can be removed
+ */
+ private List identifyBadAlleles(final List collectedRPLs, final List collectedSORs,
+ final List alleles,
+ final double qualThreshold,
+ final double sorThreshold) {
+
+ //collected RPLs are the -10*QUAL of the alleles. high RPL means low quality.
+ // SORs are regular: high SOR - strongly biased
+ final int[] rplsIndices = getSortedIndexList(collectedRPLs);
+ final int[] sorIndices = rplsIndices; // the variants that have high sor are ordered according to their quality
+
+
+ //this list will contain all alleles that should be filtered in the order of priority
+ final List result = new ArrayList<>();
+ final double THRESHOLD = -1 * qualThreshold; // quality threshold is like in GATK (GL) and we collected PL, so QUAL 30 will appear as -30.
+ final double SOR_THRESHOLD = sorThreshold;
+
+ //note that high value is a poor quality allele, so the worst allele is the highest collectedRPL
+ //we first collect all allleles with low quality: from the lowest
+ for (int i = rplsIndices.length-1 ; (i >= 0) && (collectedRPLs.get(rplsIndices[i])>THRESHOLD) ; i--) {
+ result.add(alleles.get(rplsIndices[i]));
+ }
+ int rplCount = result.size();
+ //we then add alleles with high SOR. Note that amongh all allleles with the SOR higher than the SOR_THRESHOLD
+ //we will first filter the one with the lowest QUAL.
+ logger.debug(() -> String.format("SHA:: Have %d candidates with low QUAL", rplCount));
+ for (int i = sorIndices.length-1 ; (i >= 0) && (collectedSORs.get(sorIndices[i])>SOR_THRESHOLD) ; i--) {
+ if (!result.contains(alleles.get(sorIndices[i]))) {
+ result.add(alleles.get(sorIndices[i]));
+ }
+ }
+ logger.debug(() -> String.format("SHA:: Have %d candidates with high SOR", result.size() - rplCount));
+ return result;
+ }
+
+
+ /**
+ * Generates from read x haplotype matrix a read x allele matrix with two alleles: Allele and ~Allele where Allele
+ * is supported by all haplotypes that contain this allele and ~Allele is supported by all other haplotypes.
+ * Similar to {@link AlleleLikelihoods#marginalize(Map)}.
+ *
+ * @param readLikelihoods read x haplotype matrix
+ * @param allele allele to consider
+ * @param haplotypeAlleleMap map between alleles and haplotypes
+ * @param enabledHaplotypes list of haplotypes that are currently inactive
+ * @return read x allele matrix
+ */
+ private AlleleLikelihoods getAlleleLikelihoodMatrix(final AlleleLikelihoods readLikelihoods,
+ final AlleleAndContext allele,
+ final Map> haplotypeAlleleMap,
+ final Set enabledHaplotypes
+ ){
+ final Map> alleleHaplotypeMap = new CollectionUtil.DefaultingMap<>((k) -> new ArrayList<>(), true);
+
+ final Allele notAllele= InverseAllele.of(allele.getAllele(), true);
+ readLikelihoods.alleles().stream().filter(enabledHaplotypes::contains)
+ .filter(h->haplotypeAlleleMap.get(h).contains(allele))
+ .forEach(alleleHaplotypeMap.get(allele)::add);
+ readLikelihoods.alleles().stream().filter(enabledHaplotypes::contains)
+ .filter(h -> !haplotypeAlleleMap.get(h).contains(allele))
+ .forEach(alleleHaplotypeMap.get(notAllele)::add);
+
+ final AlleleLikelihoods alleleLikelihoods = readLikelihoods.marginalize(alleleHaplotypeMap);
+ logger.debug(() -> String.format("GALM: %s %d %d", allele.toString(), alleleHaplotypeMap.get(allele).size(), alleleHaplotypeMap.get(notAllele).size()));
+ return alleleLikelihoods;
+ }
+
+ //functions to get allele likelihoods and SOR. Differ between the mutect and the HC implementations
+ abstract int getAlleleLikelihoodVsInverse(final AlleleLikelihoods alleleLikelihoods, final Allele allele);
+
+ private double getAlleleSOR(final AlleleLikelihoods alleleLikelihoods, final Allele allele) {
+ final Allele notAllele = InverseAllele.of(allele, true);
+ final int [][] contingency_table = StrandOddsRatio.getContingencyTableWrtAll(alleleLikelihoods, notAllele, Collections.singletonList(allele), 1);
+ final double sor = StrandOddsRatio.calculateSOR(contingency_table);
+ logger.debug(() -> String.format("GAS:: %s: %f (%d %d %d %d)", allele.toString(), sor, contingency_table[0][0], contingency_table[0][1], contingency_table[1][0], contingency_table[1][1]));
+ return sor;
+
+ }
+
+ //filters pairs of alleles by distance
+ private List> filterByDistance(
+ final List> allelePairs,
+ final int minDist, final int maxDist) {
+ logger.debug(() -> String.format("FBD: input %d pairs ", allelePairs.size()));
+ final List> result = new ArrayList<>(allelePairs);
+ result.removeIf(v -> Math.abs(v.getLeft().getLoc() - v.getRight().getLoc())>maxDist);
+ result.removeIf(v -> Math.abs(v.getLeft().getLoc() - v.getRight().getLoc()) String.format("FBD: output %d pairs ", allelePairs.size()));
+
+ return result;
+ }
+
+ //filters pairs of alleles that are not same up to hmer indel
+ private List> filterSameUpToHmerPairs(final List> allelePairs, final Haplotype refHaplotype, final int activeWindowStart) {
+
+ final List> result = new ArrayList<>();
+ for (final Pair allelePair: allelePairs) {
+
+ final int commonPrefixLengthLeft = getCommonPrefixLength(allelePair.getLeft().getAllele(), allelePair.getLeft().getRefAllele());
+ final int commonPrefixLengthRight = getCommonPrefixLength(allelePair.getRight().getAllele(), allelePair.getRight().getRefAllele());
+
+ final Pair modifiedHaplotypes = new ImmutablePair<>(
+ refHaplotype.insertAllele(
+ allelePair.getLeft().getRefAllele(),
+ allelePair.getLeft().getAllele(),
+ allelePair.getLeft().getLoc()-activeWindowStart,
+ -1,
+ commonPrefixLengthLeft),
+ refHaplotype.insertAllele(
+ allelePair.getRight().getRefAllele(),
+ allelePair.getRight().getAllele(),
+ allelePair.getRight().getLoc() - activeWindowStart,
+ -1,
+ commonPrefixLengthRight));
+
+ if ( BaseUtils.equalUpToHmerChange(modifiedHaplotypes.getLeft().getBases(), modifiedHaplotypes.getRight().getBases()) ) {
+ result.add(allelePair);
+ }
+
+ }
+
+ return result;
+ }
+
+
+ // find (the) reference haplotype within a list of haplotypes
+ static Haplotype findReferenceHaplotype(final List haplotypeList) {
+ for (final Haplotype h: haplotypeList ) {
+ if (h.isReference()) {
+ return h;
+ }
+ }
+ return null;
+ }
+
+ // if alleles are different in length, return their the length of their (potentially) common prefix, otherwise return 0
+ private int getCommonPrefixLength(final Allele al1, final Allele al2){
+ if (al1.length()!=al2.length()){
+ return Math.min(al1.length(), al2.length());
+ } else {
+ return 0;
+ }
+ }
+
+ // sort an integer list
+ private int[] getSortedIndexList(final List values) {
+ return IntStream.range(0, values.size()).
+ mapToObj(i -> new ImmutablePair<>(i, values.get(i)))
+ .sorted(Comparator.comparingInt( v -> (int)v.getRight()))
+ .mapToInt(v-> v.getLeft()).toArray();
+
+ }
+
+ // the functions below are currently unused but kept for potential future uses.
+ // The goal of these functions is to look at how one allele affects the other and make decisions
+ // only for the alleles that really affect others. The approach did not currently work that well
+ @SuppressWarnings("unused")
+ private AlleleAndContext identifyStrongInteractingAllele(final List candidateList,
+ final float prefilterThreshold,
+ final List allAlleles,
+ final List rpls,
+ final AlleleLikelihoods readLikelihoods,
+ final Map> haplotypeAlleleMap,
+ final Map> alleleHaplotypeMap
+ ){
+
+
+ logger.debug("ISIA :: start");
+ final Map initialRPLsMap = new HashMap<>();
+ IntStream.range(0, allAlleles.size()).forEach(i -> initialRPLsMap.put(allAlleles.get(i), rpls.get(i)));
+
+ for (final AlleleAndContext cand: candidateList){
+ logger.debug(() -> String.format("ISIA :: test %s", cand.toString()));
+ if ( initialRPLsMap.get(cand) > (-1)*prefilterThreshold){
+ logger.debug( String.format("ISIA:: selected %s due to low QUAL", cand));
+ return cand;
+ }
+
+ if (allAlleles.size() <=1) {
+ return null;
+ }
+
+ final Map