From a29ab53fb56c21e77d8e3b16568b47f2d03ef038 Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Tue, 20 Mar 2018 15:57:16 -0400 Subject: [PATCH 1/7] adding --sort-order option to SortSamSpark adding a --sort-order option to SortSamSpark to let users specify what order to sort in enabling disabled tests fixing the tests which weren't actually asserting anything closes #1260 work in progress in progress refactoring using new SparkUtils method --- .../spark/datasources/ReadsSparkSink.java | 2 +- .../tools/spark/pipelines/SortSamSpark.java | 57 +++++++++++++------ .../SortSamSparkIntegrationTest.java | 55 ++++++++++++------ .../hellbender/tools/count_reads_sorted.sam | 2 +- 4 files changed, 80 insertions(+), 36 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSink.java b/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSink.java index c6d596a7262..b18f00a3c27 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSink.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSink.java @@ -243,7 +243,7 @@ private static void writeReadsADAM( private static void saveAsShardedHadoopFiles( final JavaSparkContext ctx, final String outputFile, final String referenceFile, final SAMFormat samOutputFormat, final JavaRDD reads, final SAMFileHeader header, - final boolean writeHeader) throws IOException { + final boolean writeHeader) { // Set the static header on the driver thread. 
if (samOutputFormat == SAMFormat.CRAM) { SparkCRAMOutputFormat.setHeader(header); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark.java index 33ec1273996..4e25d95c889 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark.java @@ -8,13 +8,12 @@ import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; import org.broadinstitute.barclay.help.DocumentedFeature; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.utils.spark.SparkUtils; import picard.cmdline.programgroups.ReadDataManipulationProgramGroup; import org.broadinstitute.hellbender.engine.filters.ReadFilter; import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary; import org.broadinstitute.hellbender.engine.spark.GATKSparkTool; import org.broadinstitute.hellbender.utils.read.GATKRead; -import org.broadinstitute.hellbender.utils.read.ReadCoordinateComparator; -import scala.Tuple2; import java.util.Collections; import java.util.List; @@ -27,35 +26,61 @@ public final class SortSamSpark extends GATKSparkTool { private static final long serialVersionUID = 1L; + public static final String SORT_ORDER_LONG_NAME = "sort-order"; + @Override public boolean requiresReads() { return true; } @Argument(doc="the output file path", shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, optional = false) - protected String outputFile; + private String outputFile; + + @Argument(doc="sort order of the output file", fullName = SORT_ORDER_LONG_NAME, optional = true) + private SparkSortOrder sortOrder = SparkSortOrder.coordinate; + + /** + * SortOrders that have corresponding implementations for spark. 
+ * These correspond to a subset of {@link SAMFileHeader.SortOrder}. + */ + private enum SparkSortOrder { + coordinate(SAMFileHeader.SortOrder.coordinate), + queryname(SAMFileHeader.SortOrder.queryname); + + private final SAMFileHeader.SortOrder order; + + SparkSortOrder(SAMFileHeader.SortOrder order) { + this.order = order; + } + + public SAMFileHeader.SortOrder getSamOrder() { + return order; + } + } @Override public List getDefaultReadFilters() { return Collections.singletonList(ReadFilterLibrary.ALLOW_ALL_READS); } + @Override + protected void onStartup() { + super.onStartup(); + } + @Override protected void runTool(final JavaSparkContext ctx) { - JavaRDD reads = getReads(); - int numReducers = getRecommendedNumReducers(); - logger.info("Using %s reducers", numReducers); + final JavaRDD reads = getReads(); + final int numReducers = getRecommendedNumReducers(); + logger.info("Using %d reducers", numReducers); + + final SAMFileHeader header = getHeaderForReads(); + header.setSortOrder(sortOrder.getSamOrder()); - final SAMFileHeader readsHeader = getHeaderForReads(); - ReadCoordinateComparator comparator = new ReadCoordinateComparator(readsHeader); - JavaRDD sortedReads; + final JavaRDD readsToWrite; if (shardedOutput) { - sortedReads = reads - .mapToPair(read -> new Tuple2<>(read, null)) - .sortByKey(comparator, true, numReducers) - .keys(); + readsToWrite = SparkUtils.sortReadsAccordingToHeader(reads, header, numReducers); } else { - sortedReads = reads; // sorting is done by writeReads below + readsToWrite = reads; } - readsHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate); - writeReads(ctx, outputFile, sortedReads); + writeReads(ctx, outputFile, readsToWrite, header); } } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSparkIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSparkIntegrationTest.java index b8e427e0817..283ac7f0421 100644 --- 
a/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSparkIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSparkIntegrationTest.java @@ -1,8 +1,12 @@ package org.broadinstitute.hellbender.tools.spark.pipelines; +import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.ValidationStringency; +import org.broadinstitute.barclay.argparser.CommandLineException; import org.broadinstitute.hellbender.CommandLineProgramTest; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.spark.GATKSparkTool; +import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.utils.test.ArgumentsBuilder; import org.broadinstitute.hellbender.utils.test.SamAssertionUtils; import org.testng.annotations.DataProvider; @@ -20,9 +24,8 @@ public Object[][] sortBAMData() { {"count_reads.cram", "count_reads_sorted.cram", "count_reads.fasta", ".cram", "coordinate"}, {"count_reads.bam", "count_reads_sorted.bam", "count_reads.fasta", ".cram", "coordinate"}, - //SortBamSpark is missing SORT_ORDER parameter https://github.com/broadinstitute/gatk/issues/1260 -// {"count_reads.bam", "count_reads.bam", null, ".bam", "queryname"}, -// {"count_reads.cram", "count_reads.cram", "count_reads.fasta", ".cram", "queryname"}, + {"count_reads.bam", "count_reads.bam", null, ".bam", "queryname"}, + {"count_reads.cram", "count_reads.cram", "count_reads.fasta", ".cram", "queryname"}, }; } @@ -38,21 +41,17 @@ public void testSortBAMs( final File actualOutputFile = createTempFile("sort_sam", outputExtension); File referenceFile = null == referenceFileName ? 
null : new File(getTestDataDir(), referenceFileName); ArgumentsBuilder args = new ArgumentsBuilder(); - args.add("--input"); args.add(inputFile.getCanonicalPath()); - args.add("--output"); args.add(actualOutputFile.getCanonicalPath()); + args.addInput(inputFile); + args.addOutput(actualOutputFile); if (null != referenceFile) { - args.add("--R"); - args.add(referenceFile.getAbsolutePath()); + args.addReference(referenceFile); } - args.add("--num-reducers"); args.add("1"); + args.addArgument(GATKSparkTool.NUM_REDUCERS_LONG_NAME, "1"); + args.addArgument(SortSamSpark.SORT_ORDER_LONG_NAME, sortOrderName); - //https://github.com/broadinstitute/gatk/issues/1260 -// args.add("--SORT_ORDER"); -// args.add(sortOrderName); + this.runCommandLine(args); - this.runCommandLine(args.getArgsArray()); - - SamAssertionUtils.samsEqualStringent(actualOutputFile, expectedOutputFile, ValidationStringency.DEFAULT_STRINGENCY, referenceFile); + SamAssertionUtils.assertSamsEqual(actualOutputFile, expectedOutputFile, ValidationStringency.DEFAULT_STRINGENCY, referenceFile); } @Test(groups = "spark") @@ -61,13 +60,33 @@ public void test() throws Exception { final File sortedBam = new File(getTestDataDir(), "count_reads_sorted.bam"); final File outputBam = createTempFile("sort_bam_spark", ".bam"); ArgumentsBuilder args = new ArgumentsBuilder(); - args.add("--"+ StandardArgumentDefinitions.INPUT_LONG_NAME); args.add(unsortedBam.getCanonicalPath()); - args.add("--"+StandardArgumentDefinitions.OUTPUT_LONG_NAME); args.add(outputBam.getCanonicalPath()); - args.add("--num-reducers"); args.add("1"); + args.addInput(unsortedBam); + args.addOutput(outputBam); + args.addArgument(GATKSparkTool.NUM_REDUCERS_LONG_NAME, "1"); - this.runCommandLine(args.getArgsArray()); + this.runCommandLine(args); SamAssertionUtils.assertSamsEqual(outputBam, sortedBam); } + + @DataProvider + public Object[][] getInvalidSortOrders(){ + return new Object[][]{ + {SAMFileHeader.SortOrder.unknown}, + 
{SAMFileHeader.SortOrder.unsorted}, + {SAMFileHeader.SortOrder.duplicate} + }; + } + + @Test(expectedExceptions = CommandLineException.BadArgumentValue.class, dataProvider = "getInvalidSortOrders") + public void testBadSortOrders(SAMFileHeader.SortOrder badOrder){ + final File unsortedBam = new File(getTestDataDir(), "count_reads.bam"); + ArgumentsBuilder args = new ArgumentsBuilder(); + args.addInput(unsortedBam); + args.addOutput(createTempFile("sort_bam_spark", ".bam")); + args.addArgument(SortSamSpark.SORT_ORDER_LONG_NAME, badOrder.toString()); + + this.runCommandLine(args); + } } diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/count_reads_sorted.sam b/src/test/resources/org/broadinstitute/hellbender/tools/count_reads_sorted.sam index 4ab8112f1ce..b6aaeed2efb 100644 --- a/src/test/resources/org/broadinstitute/hellbender/tools/count_reads_sorted.sam +++ b/src/test/resources/org/broadinstitute/hellbender/tools/count_reads_sorted.sam @@ -7,7 +7,7 @@ @SQ SN:chr6 LN:101 @SQ SN:chr7 LN:404 @SQ SN:chr8 LN:202 -@RG ID:0 SM:Hi,Mom! +@RG ID:0 SM:Hi,Mom! PL:ILLUMINA @PG ID:1 PN:Hey! 
VN:2.0 both_reads_align_clip_marked 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 both_reads_present_only_first_aligns 89 chr7 1 255 101M * 0 0 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 From d20579b6aa5539e3f9f654b1a7a78762d3404232 Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Fri, 25 May 2018 18:07:24 -0400 Subject: [PATCH 2/7] fix tests --- .../SortSamSparkIntegrationTest.java | 102 +++++++++++++----- .../pipelines/SortSamSpark/count_reads.bam | Bin 0 -> 688 bytes .../pipelines/SortSamSpark/count_reads.cram | Bin 0 -> 13266 bytes .../SortSamSpark/count_reads.cram.crai | Bin 0 -> 68 bytes .../pipelines/SortSamSpark/count_reads.dict | 9 ++ .../pipelines/SortSamSpark/count_reads.fasta | 40 +++++++ .../SortSamSpark/count_reads.fasta.fai | 8 ++ .../pipelines/SortSamSpark/count_reads.sam | 19 ++++ .../SortSamSpark/count_reads_sorted.bam | Bin 0 -> 698 bytes .../SortSamSpark/count_reads_sorted.bam.bai | Bin 0 -> 152 bytes .../SortSamSpark/count_reads_sorted.cram | Bin 0 -> 12816 bytes .../SortSamSpark/count_reads_sorted.cram.crai | Bin 0 -> 57 bytes .../SortSamSpark/count_reads_sorted.sam | 19 ++++ 13 files changed, 169 insertions(+), 28 deletions(-) create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.bam create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.cram create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.cram.crai create mode 100644 
src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.dict create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.fasta create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.fasta.fai create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.sam create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.bam create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.bam.bai create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.cram create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.cram.crai create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.sam diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSparkIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSparkIntegrationTest.java index 283ac7f0421..11b98206b78 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSparkIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSparkIntegrationTest.java @@ -1,13 +1,20 @@ package org.broadinstitute.hellbender.tools.spark.pipelines; import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.ValidationStringency; +import org.apache.spark.api.java.JavaRDD; import org.broadinstitute.barclay.argparser.CommandLineException; import org.broadinstitute.hellbender.CommandLineProgramTest; -import 
org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.ReadsDataSource; import org.broadinstitute.hellbender.engine.spark.GATKSparkTool; -import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.engine.spark.SparkContextFactory; +import org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource; +import org.broadinstitute.hellbender.tools.spark.pipelines.SortSamSpark; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.read.GATKRead; import org.broadinstitute.hellbender.utils.test.ArgumentsBuilder; +import org.broadinstitute.hellbender.utils.test.BaseTest; import org.broadinstitute.hellbender.utils.test.SamAssertionUtils; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -15,17 +22,32 @@ import java.io.File; public final class SortSamSparkIntegrationTest extends CommandLineProgramTest { + + public static final String COUNT_READS_SAM = "count_reads.sam"; + public static final String COORDINATE_SAM = "count_reads_sorted.sam"; + public static final String QUERY_NAME_BAM = "count_reads.bam"; + public static final String COORDINATE_BAM = "count_reads_sorted.bam"; + public static final String COORDINATE_CRAM = "count_reads_sorted.cram"; + public static final String QUERY_NAME_CRAM = "count_reads.cram"; + public static final String REF = "count_reads.fasta"; + public static final String CRAM = ".cram"; + public static final String BAM = ".bam"; + public static final String SAM = ".sam"; + @DataProvider(name="sortbams") public Object[][] sortBAMData() { return new Object[][] { - {"count_reads.sam", "count_reads_sorted.sam", null, ".sam", "coordinate"}, - {"count_reads.bam", "count_reads_sorted.bam", null, ".bam", "coordinate"}, - {"count_reads.cram", "count_reads_sorted.cram", "count_reads.fasta", ".bam", "coordinate"}, - {"count_reads.cram", "count_reads_sorted.cram", 
"count_reads.fasta", ".cram", "coordinate"}, - {"count_reads.bam", "count_reads_sorted.bam", "count_reads.fasta", ".cram", "coordinate"}, - - {"count_reads.bam", "count_reads.bam", null, ".bam", "queryname"}, - {"count_reads.cram", "count_reads.cram", "count_reads.fasta", ".cram", "queryname"}, + {COUNT_READS_SAM, COORDINATE_SAM, null, SAM, SAMFileHeader.SortOrder.coordinate}, + {QUERY_NAME_BAM, COORDINATE_BAM, null, BAM, SAMFileHeader.SortOrder.coordinate}, + {QUERY_NAME_CRAM, COORDINATE_CRAM, REF, BAM, SAMFileHeader.SortOrder.coordinate}, + {QUERY_NAME_CRAM, COORDINATE_CRAM, REF, CRAM, SAMFileHeader.SortOrder.coordinate}, + {QUERY_NAME_BAM, COORDINATE_BAM, REF, CRAM, SAMFileHeader.SortOrder.coordinate}, + + {COORDINATE_SAM, COUNT_READS_SAM, null, SAM, SAMFileHeader.SortOrder.queryname}, + {COORDINATE_BAM, QUERY_NAME_BAM, null, BAM, SAMFileHeader.SortOrder.queryname}, + {COORDINATE_CRAM, QUERY_NAME_CRAM, REF, BAM, SAMFileHeader.SortOrder.queryname}, + {COORDINATE_CRAM, QUERY_NAME_CRAM, REF, CRAM, SAMFileHeader.SortOrder.queryname}, + {COORDINATE_BAM, QUERY_NAME_BAM, REF, CRAM, SAMFileHeader.SortOrder.queryname}, }; } @@ -35,40 +57,64 @@ public void testSortBAMs( final String expectedOutputFileName, final String referenceFileName, final String outputExtension, - final String sortOrderName) throws Exception { - final File inputFile = new File(getTestDataDir(), inputFileName); - final File expectedOutputFile = new File(getTestDataDir(), expectedOutputFileName); + final SAMFileHeader.SortOrder sortOrder) throws Exception { + final File inputFile = getTestFile(inputFileName); + final File expectedOutputFile = getTestFile(expectedOutputFileName); final File actualOutputFile = createTempFile("sort_sam", outputExtension); - File referenceFile = null == referenceFileName ? null : new File(getTestDataDir(), referenceFileName); + File referenceFile = null == referenceFileName ? 
null : getTestFile(referenceFileName); + + final SamReaderFactory factory = SamReaderFactory.makeDefault(); + ArgumentsBuilder args = new ArgumentsBuilder(); args.addInput(inputFile); args.addOutput(actualOutputFile); if (null != referenceFile) { args.addReference(referenceFile); + factory.referenceSequence(referenceFile); } - args.addArgument(GATKSparkTool.NUM_REDUCERS_LONG_NAME, "1"); - args.addArgument(SortSamSpark.SORT_ORDER_LONG_NAME, sortOrderName); + args.addArgument(SortSamSpark.SORT_ORDER_LONG_NAME, sortOrder.name()); this.runCommandLine(args); + //test files are exactly equal SamAssertionUtils.assertSamsEqual(actualOutputFile, expectedOutputFile, ValidationStringency.DEFAULT_STRINGENCY, referenceFile); + + //test sorting matches htsjdk + try(ReadsDataSource in = new ReadsDataSource(actualOutputFile.toPath(), factory )) { + BaseTest.assertSorted(Utils.stream(in).map(read -> read.convertToSAMRecord(in.getHeader())).iterator(), sortOrder.getComparatorInstance()); + } } - @Test(groups = "spark") - public void test() throws Exception { - final File unsortedBam = new File(getTestDataDir(), "count_reads.bam"); - final File sortedBam = new File(getTestDataDir(), "count_reads_sorted.bam"); - final File outputBam = createTempFile("sort_bam_spark", ".bam"); + @Test(dataProvider="sortbams", groups="spark") + public void testSortBAMsSharded( + final String inputFileName, + final String unused, + final String referenceFileName, + final String outputExtension, + final SAMFileHeader.SortOrder sortOrder) { + final File inputFile = getTestFile(inputFileName); + final File actualOutputFile = createTempFile("sort_sam", outputExtension); + File referenceFile = null == referenceFileName ? 
null : getTestFile(referenceFileName); ArgumentsBuilder args = new ArgumentsBuilder(); - args.addInput(unsortedBam); - args.addOutput(outputBam); - args.addArgument(GATKSparkTool.NUM_REDUCERS_LONG_NAME, "1"); + args.addInput(inputFile); + args.addOutput(actualOutputFile); + if (null != referenceFile) { + args.addReference(referenceFile); + } + args.addArgument(SortSamSpark.SORT_ORDER_LONG_NAME, sortOrder.name()); + args.addBooleanArgument(GATKSparkTool.SHARDED_OUTPUT_LONG_NAME,true); + args.addArgument(GATKSparkTool.NUM_REDUCERS_LONG_NAME, "2"); this.runCommandLine(args); - SamAssertionUtils.assertSamsEqual(outputBam, sortedBam); - } + final ReadsSparkSource source = new ReadsSparkSource(SparkContextFactory.getTestSparkContext()); + final JavaRDD reads = source.getParallelReads(actualOutputFile.getAbsolutePath(), referenceFile == null ? null : referenceFile.getAbsolutePath()); + final SAMFileHeader header = source.getHeader(actualOutputFile.getAbsolutePath(), + referenceFile == null ? null : referenceFile.getAbsolutePath()); + + BaseTest.assertSorted(reads.collect().stream().map(read -> read.convertToSAMRecord(header)).iterator(), sortOrder.getComparatorInstance()); + } @DataProvider public Object[][] getInvalidSortOrders(){ @@ -81,10 +127,10 @@ public Object[][] getInvalidSortOrders(){ @Test(expectedExceptions = CommandLineException.BadArgumentValue.class, dataProvider = "getInvalidSortOrders") public void testBadSortOrders(SAMFileHeader.SortOrder badOrder){ - final File unsortedBam = new File(getTestDataDir(), "count_reads.bam"); + final File unsortedBam = new File(getTestDataDir(), QUERY_NAME_BAM); ArgumentsBuilder args = new ArgumentsBuilder(); args.addInput(unsortedBam); - args.addOutput(createTempFile("sort_bam_spark", ".bam")); + args.addOutput(createTempFile("sort_bam_spark", BAM)); args.addArgument(SortSamSpark.SORT_ORDER_LONG_NAME, badOrder.toString()); this.runCommandLine(args); diff --git 
a/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.bam b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.bam new file mode 100644 index 0000000000000000000000000000000000000000..fc8db70dfc54bcf3f034dfd37878c1f027a8d239 GIT binary patch literal 688 zcmV;h0#E%PiwFb&00000{{{d;LjnMk0@af}Z__{+$6bj#-Cg2~eL1UHKuY9F8K5L- z`l3v!6{)U7Un3#GVoYn=h`iy1A!Dr$-r)mOLVOQa_y9}@B!1f@ z{rBDff41bGAD^|lG4oAPl)BxZPd97j`9<2?s=ayZ`J=Ae@mRgNLz|nmy+glDpTK&l zoQNvZ(ZY08osJf#qowI+If|;KY9d;RqDra4>bq-ny-_RC=0?qKJ>2MZ@~plcXJxv* zS+l)SJ_4%Dmly#)1d1RO@1Ln8%|g;tlV&k#mXc;!c|S8TE6Nv``~YE)1U(q9iy37v zej>^;pckN*z2Twbd+z?gaoeqfuCv!}^&NNL?GHU)*@j#5Lh9Da4w8lW0s@cwdbV?VrGK)Zh{zLpTO(g zam4SJI8oh?5t|dlN6_-mH+He(`meqHyF(b!To8Vk5Jt6pV-pjE=T;*hoC8-Fn0f0+ z6bq)1In$D+8Q8Y8kHTEzC?ABjd2Cu|t5!~Eh-)&Nl?Ig5csv?X3Z`={%Mq7Qn$84} zW1vlA$_SwtYk~=aIcE%GjtSv{Au1Uo2N>Ezm8rEXM(*(_2O!zD&9AB1l zwTFG*8+hHJ)9bcJ&a0L`7+ydRZhtR-CaC6O-j7#$`QN0sx}RSYM8*w!G6Hle&al^m z-?Te&4&447=-+9AA|gZA?8R%zPGC`hqHzcG>E12hlYapnA<9Q?2mk;diwFb&00000 W{{{d;LjnLB00RI3000000000+nm}d% literal 0 HcmV?d00001 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.cram b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.cram new file mode 100644 index 0000000000000000000000000000000000000000..2397cd63c2554b98527b700aef26a0ab33c4a33a GIT binary patch literal 13266 zcmeI2Yg82H8OPtb?2VgTgp*Tg5Gbi4vNJooJG;b$Yq;;S?4kujO=f0ifeNfFi^Ljr zF!qG27sSSfrnv~_B%tZVwt}XD3AS7}At#a6Ypb55mZ~SQT4MrzXOTeUOFxymV0NqHdlGlAPE0CJ?RRR-$rDG=cbK!9*wI3J7`q#?j&S|(mrGm>lp!obp$bSyx|3HyaZ zwjvpdq8OH%j^PM~0BbJ%<`Ix=k)uFm&44W!$&`*M;UJ13C~CH0KIk;OmdH^AF33XO`it$LS?^ z&U`Z2@~V*t*S?o|W0`6_A_I~B5lu>Oa&e4XfA1-BQaIMNVLQO-;_J`C$T|Kqs3pnt4Mryjpz1SMM`~$y*ID?Z~*&xYw6c7Zdm%} zt&-b&+lmT5s`P8=zFV>t6r3!sZTCj?6WZ$T&4s<->8}E$67N52_1*bImZIif$@M+l zt@rjPo&8{F)mRkrLX#@*jn?JI+mb?ae%pAhF9pd0d#=zI4*hGg_R*y19mN@+o)~|{ 
z`DER+yl(R;r)T8$I?qV%-IhaN`&_?q@%u*)9!`5%@ps4lGo4vM4vm$*{N3aU)vj=r zVP9v({XFw_aHG5Rt$UCif`>D85wps0Y+L__s?(OGNu;Y%_N(~6PUU=gOg6A2BqI8; zQ=dn_dA04Ot2sYCc_7A_I;Cv*(|Bk5mQwwdO_$zbPVQ>_d82EBFc-D182(^%jkdk& z_|3DwXlkuXj___1Q8IrUzQ5nUj{v)f%XcmrkF-5gp1kusG29&&JGyK1<2R1{u)g4E zf&HLQ2<~bb*&detvaG$U^XbW`w)HFjBJ?Ph#gZ#+`~KZg+L0Mnv@|Te=z8Gz6B9^N z;~h-g>{a;jdH^7ZjEDZ|5>G*g(6R6oAVNzJJOyx0Cju6lf5xiw?2jYxr~T0SUIRkY zd)v_KUP~7|u=b2~Q<5=1S!zZ>GIXwZw4jx}v%c+QCE`{3EjLF+eWM;jef?${4;3Y7 zXEJ;)ZCg8fjynf@n2-Gf!aiN!(zd12@uKPRPYxJ9+%eY80cU=HbNGzCjWM&U0G2bP z9;{bqn{rp~R*sAJ8)!Y%#4i-_}SGs7TTX2 z@P3jK`=u?aexjcFZ2DC15&ydvPk1(-yZ`*=j6ZJkzuC3tOuqZ|gN>QL&ImacGv0hl z*<%$l_J+vpuWhfq6Y+XW!C4*#Hp8MNK4kzr%W`2zjtG-t#;yt17G?|xW1%o(3e+YX zf`w;c&K@dQu!)ETYA$CA6se1cT`+CFVg>s`Aq<+y(QNvYG6siP4}}{>uUu9sSMoV;ZMbGGOD2m$l?uDk|j=rM6<-4_Yn&Qu3()J2Ur7}?2__N@k z$mL-|eem}H9vd8F+1ntzfCF%mDm#%lV~n`i+0X!61wGL~H>w9cdpxF*uI9YFxD}2S zryo;-@~?iE5If@qpVl literal 0 HcmV?d00001 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.cram.crai b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.cram.crai new file mode 100644 index 0000000000000000000000000000000000000000..84e99c3def4726a6185a0767ee02c0d45cdccc83 GIT binary patch literal 68 zcmb2|=3oE=X5Lc=1sN1Lm=Bh}I4UWu3@chr1 +TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC +TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA +A +>chr2 +CATCTCTACAAGCGCGTCCTACCAGACGCGCTTCCGATCTGAGAGCATAC +TTTTCATTGGATTCCAGCACAACTCCATTTTTGATCCACTTGACACCTTT +T +>chr3 +CGTATGCGCTTTTTATGTCGCCCACAGTGCCTAGTATAGCCCCTGCTAAT +AAAAAGAGATGAATACGTTTACTTAAAAAACTGAAACTAGGAATGTGCAA +A +>chr4 +CGTGATACCAACTCATGTTCACAGCCAAAGCCTGAAGCTGTCTATTATAT +TTCTCAACCATAAACTTTTGCCTCAGGCATCCGCAGAATGGTTTGCAGCC +C +>chr5 +NTCTCATTTAAAAATGGTTATAAAAACATTTATGCTGAAAAGGTGAAGTT +CATTAATGAACAGGCTGACTGTCTCACTATCGCGTTCGCAAGACGTTATC +T +>chr6 +NAATTGTTCTTAGTTTCTCGGTTTATGTGCTCTTCCAGGTGGGTAACACA 
+ATAATGGCCTTCCAGATCGTAAGAGCGACGTGTGTTGCACCAGTGTCGAT +C +>chr7 +CAACAGAAGGGGGGATCTGTGTTTGTGTTTCGGATTTCCTGCTGAAAAGG +TTTTCGGGTCCCCCCCCCATCCCGATTTCCTTCCGCAGCTTACCTCCCGA +AACGCGGCATCCCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCA +GCGCGTCGTGTAGGTCACTATGGTACATCTTGTCGTGCGGCCAGAGCATA +CACAACAGAAGGGGGGATCTGTGTTTGTGTTTCGGATTTCCTGCTGAAAA +GGTTTTCGGGTCCCCCCCCCATCCCGATTTCCTTCCGCAGCTTACCTCCC +GAAACGCGGCATCCCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGG +CAGCGCGTCGTGTAGGTCACTATGGTACATCTTGTCGTGCGGCCAGAGCA +TACA +>chr8 +CACATCGTGAATCTTACAATCTGCGGTTTCAGATGTGGAGCGATGTGTGA +GAGATTGAGCAACTGATCTGAAAAGCAGACACAGCTATTCCTAAGATGAC +CCCAGGTTCAAATGTGCAGCCCCTTTTGAGAGATTTTTTTTTTGGGCTGG +AAAAAAGACACAGCTATTCCTAAGATGACAAGATCAGAAAAAAAGTCAAG +CA diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.fasta.fai b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.fasta.fai new file mode 100644 index 00000000000..d5e1a06c3e1 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.fasta.fai @@ -0,0 +1,8 @@ +chr1 101 6 50 51 +chr2 101 116 50 51 +chr3 101 226 50 51 +chr4 101 336 50 51 +chr5 101 446 50 51 +chr6 101 556 50 51 +chr7 404 666 50 51 +chr8 202 1085 50 51 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.sam b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.sam new file mode 100644 index 00000000000..e23b33980c0 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.sam @@ -0,0 +1,19 @@ +@HD VN:1.5 SO:queryname +@SQ SN:chr1 LN:101 +@SQ SN:chr2 LN:101 +@SQ SN:chr3 LN:101 +@SQ SN:chr4 LN:101 +@SQ SN:chr5 LN:101 +@SQ SN:chr6 LN:101 +@SQ SN:chr7 LN:404 +@SQ SN:chr8 LN:202 +@RG ID:0 SM:Hi,Mom! PL:ILLUMINA +@PG ID:1 PN:Hey! 
VN:2.0 +both_reads_align_clip_adapter 99 chr7 21 255 101M = 16 96 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 +both_reads_align_clip_adapter 147 chr7 16 255 101M = 21 -96 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 +both_reads_align_clip_marked 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 +both_reads_align_clip_marked 163 chr7 302 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 RG:Z:0 +both_reads_present_only_first_aligns 89 chr7 1 255 101M * 0 0 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 +both_reads_present_only_first_aligns 165 * 0 0 * chr7 1 0 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 RG:Z:0 +read_2_too_many_gaps 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 +read_2_too_many_gaps 163 chr7 302 255 10M1D10M5I76M = 1 -201 
NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 RG:Z:0 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.bam b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.bam new file mode 100644 index 0000000000000000000000000000000000000000..35b66990a70878c1da9d9450a373960bb5a9fb25 GIT binary patch literal 698 zcmV;r0!95FiwFb&00000{{{d;LjnMu0@ai~Yui8=$7Se=yR&tc&L`rf1serCw5jbl zFL<(rAqq6FhC;ijimfK5iDhIj86z@iZ$Chw^m}w|KR~y((4lh^I&^a%$*H5@1Ue>< z^yu#YzZ2-EyL-&$M(V31Ne!z-o^RGG3rnQERquG7-|h9CkxLux9n#*ccMkjtc?J{8 zm4sEDwH9Zs+N`xSYc0=OE0I+z*AmuhWL3*m+Spwq>#cg3v^VNj@9~CrRG^LR*sGB3 z&AR0t7b2qSLYZO!M351LRq<5*QSd~6Q=NqIfP2%Brou{OB z{6^$Gfytks-Sb8Vw(mOKq3s;@_WO3{us5)e9RH2mm9}AKfeX$2y_GLnRtUGz413FckN|a`k5e{j|pGIgyYMQ zt`iP?cj)#ReR)luz>wzd*2z{1G4Xz6j{he zG1=QGnSeZl&%1e-I$dWla{c>*D5i*=DPn~E>ehpJZxH|0nrJUhYR&s8ViYU4u41Cr zT$rswZ~RE=^Vmw`4T6N)jBWtCHoWyUC#WhP_HI7LKIijk~QL>WQwYY0Xeg{iW7 zt=2?@(FsI15Z-tK&tQ;88yRv3^5cU$PWnT-s-W{R+lOnUQT$g`z0;I`0TQvB32z7h g03VA81ONa4009360763o02=@U00000000000F1Rfp#T5? 
literal 0 HcmV?d00001 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.bam.bai b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.bam.bai new file mode 100644 index 0000000000000000000000000000000000000000..7d5017616ed98ebe9c4eb3a726d48407c82052e0 GIT binary patch literal 152 zcmZ>A^kn2r literal 0 HcmV?d00001 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.cram b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.cram new file mode 100644 index 0000000000000000000000000000000000000000..ea0c00decea339a217707ebfe3c668394e5f6ca3 GIT binary patch literal 12816 zcmeI2dvFuS9mj8Vk}X-1c^G%7?Z#j*LxQpHp}W&5ZDZs2L$WP7V3JUCj}-M; zv`2t6&C#4J^DU~gB{ka~= zACTfCGI0`FT47np&>SbTD&!!;aaNfZS;Z=#2u)3qL^e($&(a(c#H%VpS#1)u*;GY= zjI44Lr!pcpMG~!X5*40OWRdaE5OT6$qc~O(kwDoa(V`@K=qZxO#Yvgmx1qfLY@|46Ytjx*0WTk1rhUR~YBnnZ9R>~T6 zv?^I`HjiRu8AX*mjKWZg$fKaZN`lO@Q)L%hR3bw$F^McI@`4R|6vzp@iu}wY%g9y{ zNsns$-%@nN_Djm(cGyc-=QzA;=O#^Tx%i3+U;>x`CV&ZG0+;|MfC*p%m;fe#319-4 z049J5U;>x`CV&ZG0+;|MfC*p%m;fe#319-4049J5U;>x`CV&b2KNHwR5<~(B?@ATU!VSwo-83yg)vgj0MqN6JNj=KakU*nxy2&KXeN?6;XaI0llV%-A4d)Wm z$9t@Glc_pXdn*y0{z-(+6Dz-~hz^1JDt3}4U1~**t4;?`H-4KZ)tID^Clqsg3}k%@-B zKkfFpE2EwJMJmHOR63S!cns)>Ii}2ZeQdh5WXekeomc<$*Ot5W&fGL2XXO1f6RH7P zQ}q6pV8iCT?w)YDv$gN~_VQo( zo;YKCr~fy$7rVkgJNA#4-zw_PTDqa}@O-lEWF%5nvGe4Kdp#qEM&2;~{FBecv$4TU zp*^iblcRkd8SH#7Pp>__>|jE{CXdN?uY2r4d4?(mKy!JpsOnUm9@Yc!t_Srg>+^eTn) zf;DHVr{@h`ysGc{#hJVH{f{hobL9b6xb;!fxxcJC6=bU-o{R_XG)@FOd$TR@{P2#a zpD9$o+*Lns!TxXNzIwfD#W%|6k|Jl3*fjgqoU$&cSMH=9Z2$K1hV+W$<-W9G;lue$ z*N+RxYzc3flnDn1&8I2i8S2{YKZ|``^-XtHjtzdi#h@7|_B>+DjwNrYi>F|5Kl|`TJ?%A2OT&<~_4UZJOe_-bB4o&6s zoz)*&hZb7ru#tcsB+mRwlC~fQ`_U0wj|2dT5Uu88QIGb9dVk2HoimU}QO+Si{D3uM z{)zKs%p%B-F*_h9#-5P@V|lgT`EA@#OY)KH3QB78k&mLrN5&fO?R&n`gP79qc`GaH 
zKk7-1(KP-L^U&l8S&Pn?q`awj;BD{OB+IUt?D@6h;VXM=A3S~WxI%bq z20JohYse^OPqZS)ir-t#Z?lv;Dj(c#Sy;Z<8CldeGBSLmqiQ(kq1?8r3+F3G7NHeb zKK(*#cC59j7EZL*lBt9F$;dk7>+Y;=f7~X0asSlGKJ%SV-x+DVdiSYd$?sase>?Wv zxthR%7oIKcFG=}r_N9)S`THq*EdliBJ literal 0 HcmV?d00001 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.sam b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.sam new file mode 100644 index 00000000000..b6aaeed2efb --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.sam @@ -0,0 +1,19 @@ +@HD VN:1.5 SO:coordinate +@SQ SN:chr1 LN:101 +@SQ SN:chr2 LN:101 +@SQ SN:chr3 LN:101 +@SQ SN:chr4 LN:101 +@SQ SN:chr5 LN:101 +@SQ SN:chr6 LN:101 +@SQ SN:chr7 LN:404 +@SQ SN:chr8 LN:202 +@RG ID:0 SM:Hi,Mom! PL:ILLUMINA +@PG ID:1 PN:Hey! VN:2.0 +both_reads_align_clip_marked 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 +both_reads_present_only_first_aligns 89 chr7 1 255 101M * 0 0 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 +read_2_too_many_gaps 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 +both_reads_align_clip_adapter 147 chr7 16 255 101M = 21 -96 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 
+both_reads_align_clip_adapter 99 chr7 21 255 101M = 16 96 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 +both_reads_align_clip_marked 163 chr7 302 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 RG:Z:0 +read_2_too_many_gaps 163 chr7 302 255 10M1D10M5I76M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 RG:Z:0 +both_reads_present_only_first_aligns 165 * 0 0 * chr7 1 0 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 RG:Z:0 From 4d4265bd9137dc30c15bebc40a3aa992ed43a641 Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Thu, 31 May 2018 11:30:30 -0400 Subject: [PATCH 3/7] more test output to try to figure out what's going on in travis --- .../hellbender/utils/test/BaseTest.java | 18 +++++++++++++++++- .../pipelines/SortSamSparkIntegrationTest.java | 6 +++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/utils/test/BaseTest.java b/src/main/java/org/broadinstitute/hellbender/utils/test/BaseTest.java index 596e24991a6..165c7040b30 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/test/BaseTest.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/test/BaseTest.java @@ -411,11 +411,27 @@ public static void assertSorted(Iterable iterable, Comparator comparat * assert that the iterator is sorted according to the comparator */ public static void 
assertSorted(Iterator iterator, Comparator comparator){ + assertSorted(iterator, comparator, null); + } + + + /** + * assert that the iterator is sorted according to the comparator + */ + public static void assertSorted(Iterable iterable, Comparator comparator, String message){ + assertSorted(iterable.iterator(), comparator, message); + } + + + /** + * assert that the iterator is sorted according to the comparator + */ + public static void assertSorted(Iterator iterator, Comparator comparator, String message){ T previous = null; while(iterator.hasNext()){ T current = iterator.next(); if( previous != null) { - Assert.assertTrue(comparator.compare(previous, current) <= 0, "Expected " + previous + " to be <= " + current); + Assert.assertTrue(comparator.compare(previous, current) <= 0, "Expected " + previous + " to be <= " + current + (message == null ? "" : "\n"+message)); } previous = current; } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSparkIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSparkIntegrationTest.java index 11b98206b78..5ba8c7505f7 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSparkIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSparkIntegrationTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.hellbender.tools.spark.pipelines; import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.ValidationStringency; import org.apache.spark.api.java.JavaRDD; @@ -20,6 +21,8 @@ import org.testng.annotations.Test; import java.io.File; +import java.util.List; +import java.util.stream.Collectors; public final class SortSamSparkIntegrationTest extends CommandLineProgramTest { @@ -113,7 +116,8 @@ public void testSortBAMsSharded( final SAMFileHeader header = source.getHeader(actualOutputFile.getAbsolutePath(), 
referenceFile == null ? null : referenceFile.getAbsolutePath()); - BaseTest.assertSorted(reads.collect().stream().map(read -> read.convertToSAMRecord(header)).iterator(), sortOrder.getComparatorInstance()); + final List reloadedReads = reads.collect().stream().map(read -> read.convertToSAMRecord(header)).collect(Collectors.toList()); + BaseTest.assertSorted(reloadedReads.iterator(), sortOrder.getComparatorInstance(), reloadedReads.stream().map(SAMRecord::getSAMString).collect(Collectors.joining("\n"))); } @DataProvider From 8855b4a8f8472a968e1ece03502616b009494dba Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Tue, 5 Jun 2018 15:54:40 -0400 Subject: [PATCH 4/7] trying to sort splits --- .../spark/datasources/ReadsSparkSource.java | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSource.java b/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSource.java index 46d1f71bb93..246908ed618 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSource.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSource.java @@ -7,7 +7,10 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.parquet.avro.AvroParquetInputFormat; @@ -33,6 +36,8 @@ import java.io.File; import java.io.IOException; import java.io.Serializable; +import java.util.Collections; +import java.util.Comparator; import java.util.List; import java.util.Objects; @@ -69,6 +74,21 @@ public JavaRDD getParallelReads(final String readFileName, 
final Strin return getParallelReads(readFileName, referencePath, traversalParameters, 0); } + + public static class SplitSortingSamInputFormat extends AnySAMInputFormat{ + + @Override + public List getSplits(JobContext job) throws IOException { + final List splits = super.getSplits(job); + splits.sort((a, b) -> { + FileSplit fa = (FileSplit) a, fb = (FileSplit) b; + return fa.getPath().compareTo(fb.getPath()); + }); + return splits; + } + } + + /** * Loads Reads using Hadoop-BAM. For local files, bam must have the fully-qualified path, * i.e., file:///path/to/bam.bam. @@ -102,7 +122,7 @@ public JavaRDD getParallelReads(final String readFileName, final Strin } rdd2 = ctx.newAPIHadoopFile( - readFileName, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, + readFileName, SplitSortingSamInputFormat.class, LongWritable.class, SAMRecordWritable.class, conf); JavaRDD reads= rdd2.map(v1 -> { From 2408189762992247e05bbfe967af2958a0d777c9 Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Wed, 6 Jun 2018 11:23:37 -0400 Subject: [PATCH 5/7] trying again with better casts, but still bad --- .../spark/datasources/ReadsSparkSource.java | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSource.java b/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSource.java index 246908ed618..5bcc52bdce7 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSource.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSource.java @@ -27,10 +27,7 @@ import org.broadinstitute.hellbender.utils.io.IOUtils; import org.broadinstitute.hellbender.utils.read.*; import org.broadinstitute.hellbender.utils.spark.SparkUtils; -import org.seqdoop.hadoop_bam.AnySAMInputFormat; -import org.seqdoop.hadoop_bam.BAMInputFormat; -import org.seqdoop.hadoop_bam.CRAMInputFormat; 
-import org.seqdoop.hadoop_bam.SAMRecordWritable; +import org.seqdoop.hadoop_bam.*; import org.seqdoop.hadoop_bam.util.SAMHeaderReader; import java.io.File; @@ -40,6 +37,7 @@ import java.util.Comparator; import java.util.List; import java.util.Objects; +import java.util.stream.Stream; /** Loads the reads from disk either serially (using samReaderFactory) or in parallel using Hadoop-BAM. * The parallel code is a modified version of the example writing code from Hadoop-BAM. @@ -76,14 +74,22 @@ public JavaRDD getParallelReads(final String readFileName, final Strin public static class SplitSortingSamInputFormat extends AnySAMInputFormat{ - + @SuppressWarnings("unchecked") @Override public List getSplits(JobContext job) throws IOException { final List splits = super.getSplits(job); - splits.sort((a, b) -> { - FileSplit fa = (FileSplit) a, fb = (FileSplit) b; - return fa.getPath().compareTo(fb.getPath()); - }); + + + if( splits.stream().allMatch(split -> split instanceof FileVirtualSplit || split instanceof FileSplit)) { + splits.sort(Comparator.comparing(split -> { + if (split instanceof FileVirtualSplit) { + return ((FileVirtualSplit) split).getPath(); + } else { + return ((FileSplit) split).getPath(); + } + })); + } + return splits; } } From 558a13f3ad490e85ecd9c283c8e9cef31ded6e18 Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Wed, 6 Jun 2018 15:47:01 -0400 Subject: [PATCH 6/7] adding doc to SplitSortingSamInputFormat --- .../engine/spark/datasources/ReadsSparkSource.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSource.java b/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSource.java index 5bcc52bdce7..03f94af2e95 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSource.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSource.java @@ -73,13 
+73,17 @@ public JavaRDD getParallelReads(final String readFileName, final Strin } + /** + * this is a hack to work around https://github.com/HadoopGenomics/Hadoop-BAM/issues/199 + * + * fix the problem by explicitly sorting the input file splits + */ public static class SplitSortingSamInputFormat extends AnySAMInputFormat{ @SuppressWarnings("unchecked") @Override public List getSplits(JobContext job) throws IOException { final List splits = super.getSplits(job); - if( splits.stream().allMatch(split -> split instanceof FileVirtualSplit || split instanceof FileSplit)) { splits.sort(Comparator.comparing(split -> { if (split instanceof FileVirtualSplit) { From 72997c548336f181a5e84993d6954da2b5a28a8f Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Mon, 11 Jun 2018 13:46:49 -0400 Subject: [PATCH 7/7] responding to james --- .../hellbender/engine/spark/datasources/ReadsSparkSource.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSource.java b/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSource.java index 03f94af2e95..2e268129deb 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSource.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSource.java @@ -87,9 +87,9 @@ public List getSplits(JobContext job) throws IOException { if( splits.stream().allMatch(split -> split instanceof FileVirtualSplit || split instanceof FileSplit)) { splits.sort(Comparator.comparing(split -> { if (split instanceof FileVirtualSplit) { - return ((FileVirtualSplit) split).getPath(); + return ((FileVirtualSplit) split).getPath().getName(); } else { - return ((FileSplit) split).getPath(); + return ((FileSplit) split).getPath().getName(); } })); }