Skip to content

Commit

Permalink
some clean up on util classes in SV packages
Browse files Browse the repository at this point in the history
and
  get default headers into output VCF
  • Loading branch information
SHuang-Broad committed Aug 15, 2018
1 parent 352673a commit e64a3ea
Show file tree
Hide file tree
Showing 18 changed files with 228 additions and 235 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeaderLine;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
Expand Down Expand Up @@ -47,6 +48,7 @@
import java.io.IOException;
import java.nio.file.Paths;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

Expand Down Expand Up @@ -188,8 +190,9 @@ protected void runTool( final JavaSparkContext ctx ) {

final String outputPath = svDiscoveryInputMetaData.getOutputPath();
final SAMSequenceDictionary refSeqDictionary = svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue();
final Set<VCFHeaderLine> defaultToolVCFHeaderLines = svDiscoveryInputMetaData.getDefaultToolVCFHeaderLines();
final Logger toolLogger = svDiscoveryInputMetaData.getToolLogger();
SVVCFWriter.writeVCF(annotatedVariants, outputPath + "inv_del_ins.vcf", refSeqDictionary, toolLogger);
SVVCFWriter.writeVCF(annotatedVariants, outputPath + "inv_del_ins.vcf", refSeqDictionary, defaultToolVCFHeaderLines, toolLogger);

// TODO: 1/14/18 this is the next version of precise variant calling
if ( expInterpret != null ) {
Expand Down Expand Up @@ -231,7 +234,7 @@ private SvDiscoveryInputMetaData getSvDiscoveryInputData(final JavaSparkContext
outputPrefixWithSampleName,
assembledEvidenceResults.getReadMetadata(), assembledEvidenceResults.getAssembledIntervals(),
makeEvidenceLinkTree(assembledEvidenceResults.getEvidenceTargetLinks()),
cnvCallsBroadcast, getHeaderForReads(), getReference(), localLogger);
cnvCallsBroadcast, getHeaderForReads(), getReference(), getDefaultToolVCFHeaderLines(), localLogger);
}

public static Broadcast<SVIntervalTree<VariantContext>> broadcastCNVCalls(final JavaSparkContext ctx,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ protected void runTool(final JavaSparkContext ctx) {
new SvDiscoveryInputMetaData(ctx, discoverStageArgs, null, vcfOutputPath,
null, null, null,
cnvCallsBroadcast,
getHeaderForReads(), getReference(), localLogger);
getHeaderForReads(), getReference(), getDefaultToolVCFHeaderLines(), localLogger);

final JavaRDD<AlignedContig> parsedContigAlignments =
new SvDiscoverFromLocalAssemblyContigAlignmentsSpark
Expand All @@ -146,7 +146,7 @@ protected void runTool(final JavaSparkContext ctx) {
.discoverVariantsFromChimeras(svDiscoveryInputMetaData, parsedContigAlignments);

final SAMSequenceDictionary refSeqDictionary = svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue();
SVVCFWriter.writeVCF(annotatedVariants, vcfOutputPath, refSeqDictionary, localLogger);
SVVCFWriter.writeVCF(annotatedVariants, vcfOutputPath, refSeqDictionary, getDefaultToolVCFHeaderLines(), localLogger);
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.vcf.VCFHeaderLine;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
Expand Down Expand Up @@ -150,6 +151,7 @@ public List<ReadFilter> getDefaultReadFilters() {
protected void runTool(final JavaSparkContext ctx) {

validateParams();
final Set<VCFHeaderLine> defaultToolVCFHeaderLines = getDefaultToolVCFHeaderLines();

final Broadcast<SVIntervalTree<VariantContext>> cnvCallsBroadcast =
StructuralVariationDiscoveryPipelineSpark.broadcastCNVCalls(ctx, getHeaderForReads(),
Expand All @@ -159,7 +161,7 @@ protected void runTool(final JavaSparkContext ctx) {
new SvDiscoveryInputMetaData(ctx, discoverStageArgs, nonCanonicalChromosomeNamesFile, outputPrefixWithSampleName,
null, null, null,
cnvCallsBroadcast,
getHeaderForReads(), getReference(), localLogger);
getHeaderForReads(), getReference(), defaultToolVCFHeaderLines, localLogger);
final JavaRDD<GATKRead> assemblyRawAlignments = getReads();

final AssemblyContigsClassifiedByAlignmentSignatures contigsByPossibleRawTypes =
Expand Down Expand Up @@ -328,7 +330,7 @@ private static List<VariantContext> extractSimpleVariants(final JavaRDD<Assembly
final Logger logger = svDiscoveryInputMetaData.getDiscoverStageArgs().runInDebugMode ? svDiscoveryInputMetaData.getToolLogger() : null;
SVVCFWriter.writeVCF(simpleVariants, outputPrefixWithSampleName + SIMPLE_CHIMERA_VCF_FILE_NAME,
svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue(),
logger);
svDiscoveryInputMetaData.getDefaultToolVCFHeaderLines(), logger);
return simpleVariants;
}

Expand All @@ -348,20 +350,21 @@ private static CpxAndReInterpretedSimpleVariants extractCpxVariants(final JavaSp
final JavaRDD<GATKRead> assemblyRawAlignments,
final String outputPrefixWithSampleName) {
final Logger toolLogger = svDiscoveryInputMetaData.getDiscoverStageArgs().runInDebugMode ? svDiscoveryInputMetaData.getToolLogger() : null;
final Set<VCFHeaderLine> defaultToolVCFHeaderLines = svDiscoveryInputMetaData.getDefaultToolVCFHeaderLines();
final List<VariantContext> complexVariants =
CpxVariantInterpreter.makeInterpretation(contigsWithCpxAln, svDiscoveryInputMetaData);
SVVCFWriter.writeVCF(complexVariants, outputPrefixWithSampleName + COMPLEX_CHIMERA_VCF_FILE_NAME,
svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue(),
toolLogger);
defaultToolVCFHeaderLines, toolLogger);

final JavaRDD<VariantContext> complexVariantsRDD = ctx.parallelize(complexVariants);
final SegmentedCpxVariantSimpleVariantExtractor.ExtractedSimpleVariants reInterpretedSimple =
SegmentedCpxVariantSimpleVariantExtractor.extract(complexVariantsRDD, svDiscoveryInputMetaData, assemblyRawAlignments);
final SAMSequenceDictionary refSeqDict = svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue();
final String derivedOneSegmentSimpleVCF = outputPrefixWithSampleName + REINTERPRETED_1_SEG_CALL_VCF_FILE_NAME;
final String derivedMultiSegmentSimpleVCF = outputPrefixWithSampleName + REINTERPRETED_MULTI_SEG_CALL_VCF_FILE_NAME;
SVVCFWriter.writeVCF(reInterpretedSimple.getReInterpretZeroOrOneSegmentCalls(), derivedOneSegmentSimpleVCF, refSeqDict, toolLogger);
SVVCFWriter.writeVCF(reInterpretedSimple.getReInterpretMultiSegmentsCalls(), derivedMultiSegmentSimpleVCF, refSeqDict, toolLogger);
SVVCFWriter.writeVCF(reInterpretedSimple.getReInterpretZeroOrOneSegmentCalls(), derivedOneSegmentSimpleVCF, refSeqDict, defaultToolVCFHeaderLines, toolLogger);
SVVCFWriter.writeVCF(reInterpretedSimple.getReInterpretMultiSegmentsCalls(), derivedMultiSegmentSimpleVCF, refSeqDict, defaultToolVCFHeaderLines, toolLogger);

return new CpxAndReInterpretedSimpleVariants(complexVariants, reInterpretedSimple.getMergedReinterpretedCalls());
}
Expand Down Expand Up @@ -399,6 +402,7 @@ public static void filterAndWriteMergedVCF(final String outputPrefixWithSampleNa
final String out = outputPrefixWithSampleName + MERGED_VCF_FILE_NAME;
SVVCFWriter.writeVCF(variantsWithFilterApplied, out,
svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue(),
svDiscoveryInputMetaData.getDefaultToolVCFHeaderLines(),
svDiscoveryInputMetaData.getToolLogger());
}

Expand Down Expand Up @@ -454,7 +458,7 @@ public boolean test(final VariantContext variantContext) {
if ( !variantContext.hasAttribute(GATKSVVCFConstants.CONTIG_NAMES) )
return true;

final List<String> mapQuals = SvDiscoveryUtils.getAttributeAsStringList(variantContext, attributeKey);
final List<String> mapQuals = SVUtils.getAttributeAsStringList(variantContext, attributeKey);
int maxMQ = 0;
for (final String mapQual : mapQuals) {
Integer integer = Integer.valueOf(mapQual);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeaderLine;
import org.apache.logging.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
Expand Down Expand Up @@ -42,6 +43,10 @@ public String getOutputPath() {
return outputPath;
}

public Set<VCFHeaderLine> getDefaultToolVCFHeaderLines() {
return defaultToolVCFHeaderLines;
}

public static final class ReferenceData {
private final Broadcast<Set<String>> canonicalChromosomesBroadcast;
private final Broadcast<ReferenceMultiSource> referenceBroadcast;
Expand Down Expand Up @@ -121,6 +126,8 @@ public List<SVInterval> getAssembledIntervals() {

private final DiscoverVariantsFromContigAlignmentsSparkArgumentCollection discoverStageArgs;

private final Set<VCFHeaderLine> defaultToolVCFHeaderLines;

private final Logger toolLogger;

private String outputPath;
Expand All @@ -135,17 +142,19 @@ public SvDiscoveryInputMetaData(final JavaSparkContext ctx,
final Broadcast<SVIntervalTree<VariantContext>> cnvCallsBroadcast,
final SAMFileHeader headerForReads,
final ReferenceMultiSource reference,
final Set<VCFHeaderLine> defaultToolVCFHeaderLines,
final Logger toolLogger) {

final SAMSequenceDictionary sequenceDictionary = headerForReads.getSequenceDictionary();
final Broadcast<Set<String>> canonicalChromosomesBroadcast =
ctx.broadcast(SvDiscoveryUtils.getCanonicalChromosomes(nonCanonicalChromosomeNamesFile, sequenceDictionary));
ctx.broadcast(SVUtils.getCanonicalChromosomes(nonCanonicalChromosomeNamesFile, sequenceDictionary));
final String sampleId = SVUtils.getSampleId(headerForReads);

this.referenceData = new ReferenceData(canonicalChromosomesBroadcast, ctx.broadcast(reference), ctx.broadcast(sequenceDictionary));
this.sampleSpecificData = new SampleSpecificData(sampleId, cnvCallsBroadcast, assembledIntervals, evidenceTargetLinks, readMetadata, ctx.broadcast(headerForReads));
this.discoverStageArgs = discoverStageArgs;
this.outputPath = outputPath;
this.defaultToolVCFHeaderLines = defaultToolVCFHeaderLines;
this.toolLogger = toolLogger;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,33 +1,18 @@
package org.broadinstitute.hellbender.tools.spark.sv.discovery;

import htsjdk.samtools.*;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.samtools.SAMSequenceDictionary;
import org.apache.logging.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.BreakpointComplications;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.NovelAdjacencyAndAltHaplotype;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVFileUtils;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVIntervalTree;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVVCFReader;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;

import javax.annotation.Nonnull;
import java.io.IOException;
import java.nio.file.Files;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class SvDiscoveryUtils {

import java.util.List;

public final class SvDiscoveryUtils {

public static void evaluateIntervalsAndNarls(final List<SVInterval> assembledIntervals,
final List<NovelAdjacencyAndAltHaplotype> narls,
Expand Down Expand Up @@ -99,82 +84,4 @@ private static int getAmbiguity(final BreakpointComplications complications) {
return complications.getHomologyForwardStrandRep().length();
}

//==================================================================================================================

public static Set<String> getCanonicalChromosomes(final String nonCanonicalContigNamesFile,
@Nonnull final SAMSequenceDictionary dictionary) {
final LinkedHashSet<String> allContigs = Utils.nonNull(dictionary).getSequences().stream().map(SAMSequenceRecord::getSequenceName)
.collect(Collectors.toCollection(LinkedHashSet::new));
if (nonCanonicalContigNamesFile == null)
return allContigs;

try (final Stream<String> nonCanonical = Files.lines(IOUtils.getPath(( Utils.nonNull(nonCanonicalContigNamesFile) )))) {
nonCanonical.forEach(allContigs::remove);
return allContigs;
} catch ( final IOException ioe ) {
throw new UserException("Can't read nonCanonicalContigNamesFile file "+nonCanonicalContigNamesFile, ioe);
}
}

//==================================================================================================================

/**
* write SAM file for provided {@code filteredContigs}
* by extracting original alignments from {@code originalAlignments},
* to directory specified by {@code outputDir}.
*/
public static void writeSAMRecords(final JavaRDD<GATKRead> originalAlignments, final Set<String> readNameToInclude,
final String outputPath, final SAMFileHeader header) {
final List<GATKRead> reads = originalAlignments.collect();
writeSAMRecords(reads, readNameToInclude, outputPath, header);
}

public static void writeSAMRecords(final List<GATKRead> reads, final Set<String> readNameToInclude,
final String outputPath, final SAMFileHeader header) {
final SAMFileHeader cloneHeader = header.clone();
final SAMRecordComparator localComparator;
if (outputPath.toLowerCase().endsWith("bam")) {
cloneHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
localComparator = new SAMRecordCoordinateComparator();
} else if (outputPath.toLowerCase().endsWith("sam")) {
cloneHeader.setSortOrder(SAMFileHeader.SortOrder.queryname);
localComparator = new SAMRecordQueryNameComparator();
} else {
throw new IllegalArgumentException("Unsupported output format " + outputPath);
}

final List<SAMRecord> samRecords = new ArrayList<>();
reads.forEach(gatkRead -> {
if ( readNameToInclude.contains(gatkRead.getName()) ) {
samRecords.add(gatkRead.convertToSAMRecord(cloneHeader));
}
});

samRecords.sort(localComparator);
SVFileUtils.writeSAMFile( outputPath, samRecords.iterator(), cloneHeader, true);
}

/**
* todo: this should be fixed in a new major version of htsjdk
* this exist because for whatever reason,
* VC.getAttributeAsStringList() sometimes returns a giant single string, while using
* VC.getAttributeAsString() gives back an array.....
*/
public static List<String> getAttributeAsStringList(final VariantContext vc, final String attributeKey) {
if (vc.getAttribute(attributeKey) == null) return Collections.emptyList();
return vc.getAttributeAsStringList(attributeKey, "").stream()
.flatMap(s -> {
if ( s.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR) ) {
final String[] split = s.split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR);
return Arrays.stream(split);
} else {
return Stream.of(s);
}
})
.collect(Collectors.toList());
}

public static SimpleInterval makeOneBpInterval(final String chr, final int pos) {
return new SimpleInterval(chr, pos, pos);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import htsjdk.samtools.SAMFileHeader;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import org.apache.http.annotation.Experimental;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
Expand All @@ -25,6 +26,7 @@

import java.util.Collections;
import java.util.List;
import java.util.Set;

/**
* (Internal) Tries to extract simple variants from a provided GATK-SV CPX.vcf
Expand Down Expand Up @@ -82,11 +84,12 @@ protected void runTool(final JavaSparkContext ctx) {

// TODO: 5/9/18 getback sample name in output files
final SAMFileHeader headerForReads = getHeaderForReads();
final Set<VCFHeaderLine> defaultToolVCFHeaderLines = getDefaultToolVCFHeaderLines();
final SvDiscoveryInputMetaData svDiscoveryInputMetaData =
new SvDiscoveryInputMetaData(ctx, discoverStageArgs, nonCanonicalChromosomeNamesFile,
derivedSimpleVCFPrefix,
null, null, null, null,
headerForReads, getReference(), localLogger);
headerForReads, getReference(), defaultToolVCFHeaderLines, localLogger);

final JavaRDD<VariantContext> complexVariants = new VariantsSparkSource(ctx)
.getParallelVariantContexts(complexVCF, getIntervals());
Expand All @@ -98,7 +101,7 @@ protected void runTool(final JavaSparkContext ctx) {
final String derivedOneSegmentSimpleVCF = derivedSimpleVCFPrefix + "_1_seg.vcf";
final String derivedMultiSegmentSimpleVCF = derivedSimpleVCFPrefix + "_multi_seg.vcf";
final VCFHeader vcfHeader = VariantsSparkSource.getHeader(complexVCF);
SVVCFWriter.writeVCF(extract.getReInterpretZeroOrOneSegmentCalls(), derivedOneSegmentSimpleVCF, vcfHeader.getSequenceDictionary(), logger);
SVVCFWriter.writeVCF(extract.getReInterpretMultiSegmentsCalls(), derivedMultiSegmentSimpleVCF, vcfHeader.getSequenceDictionary(), logger);
SVVCFWriter.writeVCF(extract.getReInterpretZeroOrOneSegmentCalls(), derivedOneSegmentSimpleVCF, vcfHeader.getSequenceDictionary(), defaultToolVCFHeaderLines, logger);
SVVCFWriter.writeVCF(extract.getReInterpretMultiSegmentsCalls(), derivedMultiSegmentSimpleVCF, vcfHeader.getSequenceDictionary(), defaultToolVCFHeaderLines, logger);
}
}
Loading

0 comments on commit e64a3ea

Please sign in to comment.