Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SV test and utils cleanup #5116

Merged
merged 1 commit into from
Aug 27, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeaderLine;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
Expand Down Expand Up @@ -47,6 +48,7 @@
import java.io.IOException;
import java.nio.file.Paths;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

Expand Down Expand Up @@ -188,8 +190,9 @@ protected void runTool( final JavaSparkContext ctx ) {

final String outputPath = svDiscoveryInputMetaData.getOutputPath();
final SAMSequenceDictionary refSeqDictionary = svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue();
final Set<VCFHeaderLine> defaultToolVCFHeaderLines = svDiscoveryInputMetaData.getDefaultToolVCFHeaderLines();
final Logger toolLogger = svDiscoveryInputMetaData.getToolLogger();
SVVCFWriter.writeVCF(annotatedVariants, outputPath + "inv_del_ins.vcf", refSeqDictionary, toolLogger);
SVVCFWriter.writeVCF(annotatedVariants, outputPath + "inv_del_ins.vcf", refSeqDictionary, defaultToolVCFHeaderLines, toolLogger);

// TODO: 1/14/18 this is the next version of precise variant calling
if ( expInterpret != null ) {
Expand Down Expand Up @@ -231,7 +234,7 @@ private SvDiscoveryInputMetaData getSvDiscoveryInputData(final JavaSparkContext
outputPrefixWithSampleName,
assembledEvidenceResults.getReadMetadata(), assembledEvidenceResults.getAssembledIntervals(),
makeEvidenceLinkTree(assembledEvidenceResults.getEvidenceTargetLinks()),
cnvCallsBroadcast, getHeaderForReads(), getReference(), localLogger);
cnvCallsBroadcast, getHeaderForReads(), getReference(), getDefaultToolVCFHeaderLines(), localLogger);
}

public static Broadcast<SVIntervalTree<VariantContext>> broadcastCNVCalls(final JavaSparkContext ctx,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ protected void runTool(final JavaSparkContext ctx) {
new SvDiscoveryInputMetaData(ctx, discoverStageArgs, null, vcfOutputPath,
null, null, null,
cnvCallsBroadcast,
getHeaderForReads(), getReference(), localLogger);
getHeaderForReads(), getReference(), getDefaultToolVCFHeaderLines(), localLogger);

final JavaRDD<AlignedContig> parsedContigAlignments =
new SvDiscoverFromLocalAssemblyContigAlignmentsSpark
Expand All @@ -146,7 +146,7 @@ protected void runTool(final JavaSparkContext ctx) {
.discoverVariantsFromChimeras(svDiscoveryInputMetaData, parsedContigAlignments);

final SAMSequenceDictionary refSeqDictionary = svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue();
SVVCFWriter.writeVCF(annotatedVariants, vcfOutputPath, refSeqDictionary, localLogger);
SVVCFWriter.writeVCF(annotatedVariants, vcfOutputPath, refSeqDictionary, getDefaultToolVCFHeaderLines(), localLogger);
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.vcf.VCFHeaderLine;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
Expand Down Expand Up @@ -125,11 +126,11 @@ public final class SvDiscoverFromLocalAssemblyContigAlignmentsSpark extends GATK
fullName = "write-sam", optional = true)
private boolean writeSAMFiles;

static final String SIMPLE_CHIMERA_VCF_FILE_NAME = "NonComplex.vcf";
static final String COMPLEX_CHIMERA_VCF_FILE_NAME = "Complex.vcf";
static final String REINTERPRETED_1_SEG_CALL_VCF_FILE_NAME = "cpx_reinterpreted_simple_1_seg.vcf";
static final String REINTERPRETED_MULTI_SEG_CALL_VCF_FILE_NAME = "cpx_reinterpreted_simple_multi_seg.vcf";
static final String MERGED_VCF_FILE_NAME = "merged_simple.vcf";
public static final String SIMPLE_CHIMERA_VCF_FILE_NAME = "NonComplex.vcf";
public static final String COMPLEX_CHIMERA_VCF_FILE_NAME = "Complex.vcf";
public static final String REINTERPRETED_1_SEG_CALL_VCF_FILE_NAME = "cpx_reinterpreted_simple_1_seg.vcf";
public static final String REINTERPRETED_MULTI_SEG_CALL_VCF_FILE_NAME = "cpx_reinterpreted_simple_multi_seg.vcf";
public static final String MERGED_VCF_FILE_NAME = "merged_simple.vcf";

@Override
public boolean requiresReference() {
Expand Down Expand Up @@ -159,7 +160,7 @@ protected void runTool(final JavaSparkContext ctx) {
new SvDiscoveryInputMetaData(ctx, discoverStageArgs, nonCanonicalChromosomeNamesFile, outputPrefixWithSampleName,
null, null, null,
cnvCallsBroadcast,
getHeaderForReads(), getReference(), localLogger);
getHeaderForReads(), getReference(), getDefaultToolVCFHeaderLines(), localLogger);
final JavaRDD<GATKRead> assemblyRawAlignments = getReads();

final AssemblyContigsClassifiedByAlignmentSignatures contigsByPossibleRawTypes =
Expand Down Expand Up @@ -328,7 +329,7 @@ private static List<VariantContext> extractSimpleVariants(final JavaRDD<Assembly
final Logger logger = svDiscoveryInputMetaData.getDiscoverStageArgs().runInDebugMode ? svDiscoveryInputMetaData.getToolLogger() : null;
SVVCFWriter.writeVCF(simpleVariants, outputPrefixWithSampleName + SIMPLE_CHIMERA_VCF_FILE_NAME,
svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue(),
logger);
svDiscoveryInputMetaData.getDefaultToolVCFHeaderLines(), logger);
return simpleVariants;
}

Expand All @@ -348,20 +349,21 @@ private static CpxAndReInterpretedSimpleVariants extractCpxVariants(final JavaSp
final JavaRDD<GATKRead> assemblyRawAlignments,
final String outputPrefixWithSampleName) {
final Logger toolLogger = svDiscoveryInputMetaData.getDiscoverStageArgs().runInDebugMode ? svDiscoveryInputMetaData.getToolLogger() : null;
final Set<VCFHeaderLine> defaultToolVCFHeaderLines = svDiscoveryInputMetaData.getDefaultToolVCFHeaderLines();
final List<VariantContext> complexVariants =
CpxVariantInterpreter.makeInterpretation(contigsWithCpxAln, svDiscoveryInputMetaData);
SVVCFWriter.writeVCF(complexVariants, outputPrefixWithSampleName + COMPLEX_CHIMERA_VCF_FILE_NAME,
svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue(),
toolLogger);
defaultToolVCFHeaderLines, toolLogger);

final JavaRDD<VariantContext> complexVariantsRDD = ctx.parallelize(complexVariants);
final SegmentedCpxVariantSimpleVariantExtractor.ExtractedSimpleVariants reInterpretedSimple =
SegmentedCpxVariantSimpleVariantExtractor.extract(complexVariantsRDD, svDiscoveryInputMetaData, assemblyRawAlignments);
final SAMSequenceDictionary refSeqDict = svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue();
final String derivedOneSegmentSimpleVCF = outputPrefixWithSampleName + REINTERPRETED_1_SEG_CALL_VCF_FILE_NAME;
final String derivedMultiSegmentSimpleVCF = outputPrefixWithSampleName + REINTERPRETED_MULTI_SEG_CALL_VCF_FILE_NAME;
SVVCFWriter.writeVCF(reInterpretedSimple.getReInterpretZeroOrOneSegmentCalls(), derivedOneSegmentSimpleVCF, refSeqDict, toolLogger);
SVVCFWriter.writeVCF(reInterpretedSimple.getReInterpretMultiSegmentsCalls(), derivedMultiSegmentSimpleVCF, refSeqDict, toolLogger);
SVVCFWriter.writeVCF(reInterpretedSimple.getReInterpretZeroOrOneSegmentCalls(), derivedOneSegmentSimpleVCF, refSeqDict, defaultToolVCFHeaderLines, toolLogger);
SVVCFWriter.writeVCF(reInterpretedSimple.getReInterpretMultiSegmentsCalls(), derivedMultiSegmentSimpleVCF, refSeqDict, defaultToolVCFHeaderLines, toolLogger);

return new CpxAndReInterpretedSimpleVariants(complexVariants, reInterpretedSimple.getMergedReinterpretedCalls());
}
Expand Down Expand Up @@ -399,6 +401,7 @@ public static void filterAndWriteMergedVCF(final String outputPrefixWithSampleNa
final String out = outputPrefixWithSampleName + MERGED_VCF_FILE_NAME;
SVVCFWriter.writeVCF(variantsWithFilterApplied, out,
svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue(),
svDiscoveryInputMetaData.getDefaultToolVCFHeaderLines(),
svDiscoveryInputMetaData.getToolLogger());
}

Expand Down Expand Up @@ -454,7 +457,7 @@ public boolean test(final VariantContext variantContext) {
if ( !variantContext.hasAttribute(GATKSVVCFConstants.CONTIG_NAMES) )
return true;

final List<String> mapQuals = SvDiscoveryUtils.getAttributeAsStringList(variantContext, attributeKey);
final List<String> mapQuals = SVUtils.getAttributeAsStringList(variantContext, attributeKey);
int maxMQ = 0;
for (final String mapQual : mapQuals) {
Integer integer = Integer.valueOf(mapQual);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeaderLine;
import org.apache.logging.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
Expand Down Expand Up @@ -42,6 +43,10 @@ public String getOutputPath() {
return outputPath;
}

public Set<VCFHeaderLine> getDefaultToolVCFHeaderLines() {
return defaultToolVCFHeaderLines;
}

public static final class ReferenceData {
private final Broadcast<Set<String>> canonicalChromosomesBroadcast;
private final Broadcast<ReferenceMultiSource> referenceBroadcast;
Expand Down Expand Up @@ -121,6 +126,8 @@ public List<SVInterval> getAssembledIntervals() {

private final DiscoverVariantsFromContigAlignmentsSparkArgumentCollection discoverStageArgs;

private final Set<VCFHeaderLine> defaultToolVCFHeaderLines;

private final Logger toolLogger;

private String outputPath;
Expand All @@ -135,17 +142,19 @@ public SvDiscoveryInputMetaData(final JavaSparkContext ctx,
final Broadcast<SVIntervalTree<VariantContext>> cnvCallsBroadcast,
final SAMFileHeader headerForReads,
final ReferenceMultiSource reference,
final Set<VCFHeaderLine> defaultToolVCFHeaderLines,
final Logger toolLogger) {

final SAMSequenceDictionary sequenceDictionary = headerForReads.getSequenceDictionary();
final Broadcast<Set<String>> canonicalChromosomesBroadcast =
ctx.broadcast(SvDiscoveryUtils.getCanonicalChromosomes(nonCanonicalChromosomeNamesFile, sequenceDictionary));
ctx.broadcast(SVUtils.getCanonicalChromosomes(nonCanonicalChromosomeNamesFile, sequenceDictionary));
final String sampleId = SVUtils.getSampleId(headerForReads);

this.referenceData = new ReferenceData(canonicalChromosomesBroadcast, ctx.broadcast(reference), ctx.broadcast(sequenceDictionary));
this.sampleSpecificData = new SampleSpecificData(sampleId, cnvCallsBroadcast, assembledIntervals, evidenceTargetLinks, readMetadata, ctx.broadcast(headerForReads));
this.discoverStageArgs = discoverStageArgs;
this.outputPath = outputPath;
this.defaultToolVCFHeaderLines = defaultToolVCFHeaderLines;
this.toolLogger = toolLogger;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,33 +1,18 @@
package org.broadinstitute.hellbender.tools.spark.sv.discovery;

import htsjdk.samtools.*;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.samtools.SAMSequenceDictionary;
import org.apache.logging.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.BreakpointComplications;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.NovelAdjacencyAndAltHaplotype;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVFileUtils;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVIntervalTree;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVVCFReader;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;

import javax.annotation.Nonnull;
import java.io.IOException;
import java.nio.file.Files;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class SvDiscoveryUtils {

import java.util.List;

public final class SvDiscoveryUtils {

public static void evaluateIntervalsAndNarls(final List<SVInterval> assembledIntervals,
final List<NovelAdjacencyAndAltHaplotype> narls,
Expand Down Expand Up @@ -99,82 +84,4 @@ private static int getAmbiguity(final BreakpointComplications complications) {
return complications.getHomologyForwardStrandRep().length();
}

//==================================================================================================================

public static Set<String> getCanonicalChromosomes(final String nonCanonicalContigNamesFile,
@Nonnull final SAMSequenceDictionary dictionary) {
final LinkedHashSet<String> allContigs = Utils.nonNull(dictionary).getSequences().stream().map(SAMSequenceRecord::getSequenceName)
.collect(Collectors.toCollection(LinkedHashSet::new));
if (nonCanonicalContigNamesFile == null)
return allContigs;

try (final Stream<String> nonCanonical = Files.lines(IOUtils.getPath(( Utils.nonNull(nonCanonicalContigNamesFile) )))) {
nonCanonical.forEach(allContigs::remove);
return allContigs;
} catch ( final IOException ioe ) {
throw new UserException("Can't read nonCanonicalContigNamesFile file "+nonCanonicalContigNamesFile, ioe);
}
}

//==================================================================================================================

/**
* write SAM file for provided {@code filteredContigs}
* by extracting original alignments from {@code originalAlignments},
* to directory specified by {@code outputDir}.
*/
public static void writeSAMRecords(final JavaRDD<GATKRead> originalAlignments, final Set<String> readNameToInclude,
final String outputPath, final SAMFileHeader header) {
final List<GATKRead> reads = originalAlignments.collect();
writeSAMRecords(reads, readNameToInclude, outputPath, header);
}

public static void writeSAMRecords(final List<GATKRead> reads, final Set<String> readNameToInclude,
final String outputPath, final SAMFileHeader header) {
final SAMFileHeader cloneHeader = header.clone();
final SAMRecordComparator localComparator;
if (outputPath.toLowerCase().endsWith("bam")) {
cloneHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
localComparator = new SAMRecordCoordinateComparator();
} else if (outputPath.toLowerCase().endsWith("sam")) {
cloneHeader.setSortOrder(SAMFileHeader.SortOrder.queryname);
localComparator = new SAMRecordQueryNameComparator();
} else {
throw new IllegalArgumentException("Unsupported output format " + outputPath);
}

final List<SAMRecord> samRecords = new ArrayList<>();
reads.forEach(gatkRead -> {
if ( readNameToInclude.contains(gatkRead.getName()) ) {
samRecords.add(gatkRead.convertToSAMRecord(cloneHeader));
}
});

samRecords.sort(localComparator);
SVFileUtils.writeSAMFile( outputPath, samRecords.iterator(), cloneHeader, true);
}

/**
* todo: this should be fixed in a new major version of htsjdk
* this exist because for whatever reason,
* VC.getAttributeAsStringList() sometimes returns a giant single string, while using
* VC.getAttributeAsString() gives back an array.....
*/
public static List<String> getAttributeAsStringList(final VariantContext vc, final String attributeKey) {
if (vc.getAttribute(attributeKey) == null) return Collections.emptyList();
return vc.getAttributeAsStringList(attributeKey, "").stream()
.flatMap(s -> {
if ( s.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR) ) {
final String[] split = s.split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR);
return Arrays.stream(split);
} else {
return Stream.of(s);
}
})
.collect(Collectors.toList());
}

public static SimpleInterval makeOneBpInterval(final String chr, final int pos) {
return new SimpleInterval(chr, pos, pos);
}
}
Loading