diff --git a/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSink.java b/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSink.java index c6d596a7262..b18f00a3c27 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSink.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSink.java @@ -243,7 +243,7 @@ private static void writeReadsADAM( private static void saveAsShardedHadoopFiles( final JavaSparkContext ctx, final String outputFile, final String referenceFile, final SAMFormat samOutputFormat, final JavaRDD reads, final SAMFileHeader header, - final boolean writeHeader) throws IOException { + final boolean writeHeader) { // Set the static header on the driver thread. if (samOutputFormat == SAMFormat.CRAM) { SparkCRAMOutputFormat.setHeader(header); diff --git a/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSource.java b/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSource.java index 46d1f71bb93..2e268129deb 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSource.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSource.java @@ -7,7 +7,10 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.parquet.avro.AvroParquetInputFormat; @@ -24,17 +27,17 @@ import org.broadinstitute.hellbender.utils.io.IOUtils; import org.broadinstitute.hellbender.utils.read.*; import org.broadinstitute.hellbender.utils.spark.SparkUtils; -import 
org.seqdoop.hadoop_bam.AnySAMInputFormat; -import org.seqdoop.hadoop_bam.BAMInputFormat; -import org.seqdoop.hadoop_bam.CRAMInputFormat; -import org.seqdoop.hadoop_bam.SAMRecordWritable; +import org.seqdoop.hadoop_bam.*; import org.seqdoop.hadoop_bam.util.SAMHeaderReader; import java.io.File; import java.io.IOException; import java.io.Serializable; +import java.util.Collections; +import java.util.Comparator; import java.util.List; import java.util.Objects; +import java.util.stream.Stream; /** Loads the reads from disk either serially (using samReaderFactory) or in parallel using Hadoop-BAM. * The parallel code is a modified version of the example writing code from Hadoop-BAM. @@ -69,6 +72,33 @@ public JavaRDD getParallelReads(final String readFileName, final Strin return getParallelReads(readFileName, referencePath, traversalParameters, 0); } + + /** + * This is a hack to work around https://github.com/HadoopGenomics/Hadoop-BAM/issues/199 + * + * It fixes the problem by explicitly sorting the input file splits by file name, but only when every split is a FileVirtualSplit or a FileSplit; otherwise the splits are returned in their original order. + */ + public static class SplitSortingSamInputFormat extends AnySAMInputFormat{ + @SuppressWarnings("unchecked") + @Override + public List getSplits(JobContext job) throws IOException { + final List splits = super.getSplits(job); + + if( splits.stream().allMatch(split -> split instanceof FileVirtualSplit || split instanceof FileSplit)) { + splits.sort(Comparator.comparing(split -> { + if (split instanceof FileVirtualSplit) { + return ((FileVirtualSplit) split).getPath().getName(); + } else { + return ((FileSplit) split).getPath().getName(); + } + })); + } + + return splits; + } + } + + /** * Loads Reads using Hadoop-BAM. For local files, bam must have the fully-qualified path, * i.e., file:///path/to/bam.bam.
@@ -102,7 +132,7 @@ public JavaRDD getParallelReads(final String readFileName, final Strin } rdd2 = ctx.newAPIHadoopFile( - readFileName, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, + readFileName, SplitSortingSamInputFormat.class, LongWritable.class, SAMRecordWritable.class, conf); JavaRDD reads= rdd2.map(v1 -> { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark.java index 33ec1273996..4e25d95c889 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark.java @@ -8,13 +8,12 @@ import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; import org.broadinstitute.barclay.help.DocumentedFeature; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.utils.spark.SparkUtils; import picard.cmdline.programgroups.ReadDataManipulationProgramGroup; import org.broadinstitute.hellbender.engine.filters.ReadFilter; import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary; import org.broadinstitute.hellbender.engine.spark.GATKSparkTool; import org.broadinstitute.hellbender.utils.read.GATKRead; -import org.broadinstitute.hellbender.utils.read.ReadCoordinateComparator; -import scala.Tuple2; import java.util.Collections; import java.util.List; @@ -27,35 +26,61 @@ public final class SortSamSpark extends GATKSparkTool { private static final long serialVersionUID = 1L; + public static final String SORT_ORDER_LONG_NAME = "sort-order"; + @Override public boolean requiresReads() { return true; } @Argument(doc="the output file path", shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, optional = false) - protected String outputFile; + private String outputFile; + + 
@Argument(doc="sort order of the output file", fullName = SORT_ORDER_LONG_NAME, optional = true) + private SparkSortOrder sortOrder = SparkSortOrder.coordinate; + + /** + * SortOrders that have corresponding implementations for spark. + * These correspond to a subset of {@link SAMFileHeader.SortOrder}. + */ + private enum SparkSortOrder { + coordinate(SAMFileHeader.SortOrder.coordinate), + queryname(SAMFileHeader.SortOrder.queryname); + + private final SAMFileHeader.SortOrder order; + + SparkSortOrder(SAMFileHeader.SortOrder order) { + this.order = order; + } + + public SAMFileHeader.SortOrder getSamOrder() { + return order; + } + } @Override public List getDefaultReadFilters() { return Collections.singletonList(ReadFilterLibrary.ALLOW_ALL_READS); } + @Override + protected void onStartup() { + super.onStartup(); + } + @Override protected void runTool(final JavaSparkContext ctx) { - JavaRDD reads = getReads(); - int numReducers = getRecommendedNumReducers(); - logger.info("Using %s reducers", numReducers); + final JavaRDD reads = getReads(); + final int numReducers = getRecommendedNumReducers(); + logger.info("Using {} reducers", numReducers); // {} is the log4j2 parameter placeholder; a printf-style %d would be logged literally (assumes a non-formatter log4j2 logger, TODO confirm) + + final SAMFileHeader header = getHeaderForReads(); + header.setSortOrder(sortOrder.getSamOrder()); - final SAMFileHeader readsHeader = getHeaderForReads(); - ReadCoordinateComparator comparator = new ReadCoordinateComparator(readsHeader); - JavaRDD sortedReads; + final JavaRDD readsToWrite; if (shardedOutput) { - sortedReads = reads - .mapToPair(read -> new Tuple2<>(read, null)) - .sortByKey(comparator, true, numReducers) - .keys(); + readsToWrite = SparkUtils.sortReadsAccordingToHeader(reads, header, numReducers); } else { - sortedReads = reads; // sorting is done by writeReads below + readsToWrite = reads; // NOTE(review): not sorted here; presumably writeReads sorts according to the header as before, confirm } - readsHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate); - writeReads(ctx, outputFile, sortedReads); + writeReads(ctx, outputFile, readsToWrite, header); } } diff --git
a/src/main/java/org/broadinstitute/hellbender/utils/test/BaseTest.java b/src/main/java/org/broadinstitute/hellbender/utils/test/BaseTest.java index 596e24991a6..165c7040b30 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/test/BaseTest.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/test/BaseTest.java @@ -411,11 +411,27 @@ public static void assertSorted(Iterable iterable, Comparator comparat * assert that the iterator is sorted according to the comparator */ public static void assertSorted(Iterator iterator, Comparator comparator){ + assertSorted(iterator, comparator, null); + } + + + /** + * assert that the iterable is sorted according to the comparator, appending message (if non-null) to the failure text + */ + public static void assertSorted(Iterable iterable, Comparator comparator, String message){ + assertSorted(iterable.iterator(), comparator, message); + } + + + /** + * assert that the iterator is sorted according to the comparator, appending message (if non-null) to the failure text + */ + public static void assertSorted(Iterator iterator, Comparator comparator, String message){ T previous = null; while(iterator.hasNext()){ T current = iterator.next(); if( previous != null) { - Assert.assertTrue(comparator.compare(previous, current) <= 0, "Expected " + previous + " to be <= " + current); + Assert.assertTrue(comparator.compare(previous, current) <= 0, "Expected " + previous + " to be <= " + current + (message == null ?
"" : "\n"+message)); } previous = current; } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSparkIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSparkIntegrationTest.java index b8e427e0817..5ba8c7505f7 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSparkIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSparkIntegrationTest.java @@ -1,28 +1,56 @@ package org.broadinstitute.hellbender.tools.spark.pipelines; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.ValidationStringency; +import org.apache.spark.api.java.JavaRDD; +import org.broadinstitute.barclay.argparser.CommandLineException; import org.broadinstitute.hellbender.CommandLineProgramTest; -import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.ReadsDataSource; +import org.broadinstitute.hellbender.engine.spark.GATKSparkTool; +import org.broadinstitute.hellbender.engine.spark.SparkContextFactory; +import org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource; +import org.broadinstitute.hellbender.tools.spark.pipelines.SortSamSpark; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.read.GATKRead; import org.broadinstitute.hellbender.utils.test.ArgumentsBuilder; +import org.broadinstitute.hellbender.utils.test.BaseTest; import org.broadinstitute.hellbender.utils.test.SamAssertionUtils; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; +import java.util.List; +import java.util.stream.Collectors; public final class SortSamSparkIntegrationTest extends CommandLineProgramTest { + + public static final String COUNT_READS_SAM = "count_reads.sam"; + public static final String 
COORDINATE_SAM = "count_reads_sorted.sam"; + public static final String QUERY_NAME_BAM = "count_reads.bam"; + public static final String COORDINATE_BAM = "count_reads_sorted.bam"; + public static final String COORDINATE_CRAM = "count_reads_sorted.cram"; + public static final String QUERY_NAME_CRAM = "count_reads.cram"; + public static final String REF = "count_reads.fasta"; + public static final String CRAM = ".cram"; + public static final String BAM = ".bam"; + public static final String SAM = ".sam"; + @DataProvider(name="sortbams") public Object[][] sortBAMData() { return new Object[][] { - {"count_reads.sam", "count_reads_sorted.sam", null, ".sam", "coordinate"}, - {"count_reads.bam", "count_reads_sorted.bam", null, ".bam", "coordinate"}, - {"count_reads.cram", "count_reads_sorted.cram", "count_reads.fasta", ".bam", "coordinate"}, - {"count_reads.cram", "count_reads_sorted.cram", "count_reads.fasta", ".cram", "coordinate"}, - {"count_reads.bam", "count_reads_sorted.bam", "count_reads.fasta", ".cram", "coordinate"}, - - //SortBamSpark is missing SORT_ORDER parameter https://github.com/broadinstitute/gatk/issues/1260 -// {"count_reads.bam", "count_reads.bam", null, ".bam", "queryname"}, -// {"count_reads.cram", "count_reads.cram", "count_reads.fasta", ".cram", "queryname"}, + {COUNT_READS_SAM, COORDINATE_SAM, null, SAM, SAMFileHeader.SortOrder.coordinate}, + {QUERY_NAME_BAM, COORDINATE_BAM, null, BAM, SAMFileHeader.SortOrder.coordinate}, + {QUERY_NAME_CRAM, COORDINATE_CRAM, REF, BAM, SAMFileHeader.SortOrder.coordinate}, + {QUERY_NAME_CRAM, COORDINATE_CRAM, REF, CRAM, SAMFileHeader.SortOrder.coordinate}, + {QUERY_NAME_BAM, COORDINATE_BAM, REF, CRAM, SAMFileHeader.SortOrder.coordinate}, + + {COORDINATE_SAM, COUNT_READS_SAM, null, SAM, SAMFileHeader.SortOrder.queryname}, + {COORDINATE_BAM, QUERY_NAME_BAM, null, BAM, SAMFileHeader.SortOrder.queryname}, + {COORDINATE_CRAM, QUERY_NAME_CRAM, REF, BAM, SAMFileHeader.SortOrder.queryname}, + {COORDINATE_CRAM, 
QUERY_NAME_CRAM, REF, CRAM, SAMFileHeader.SortOrder.queryname}, + {COORDINATE_BAM, QUERY_NAME_BAM, REF, CRAM, SAMFileHeader.SortOrder.queryname}, }; } @@ -32,42 +60,83 @@ public void testSortBAMs( final String expectedOutputFileName, final String referenceFileName, final String outputExtension, - final String sortOrderName) throws Exception { - final File inputFile = new File(getTestDataDir(), inputFileName); - final File expectedOutputFile = new File(getTestDataDir(), expectedOutputFileName); + final SAMFileHeader.SortOrder sortOrder) throws Exception { + final File inputFile = getTestFile(inputFileName); + final File expectedOutputFile = getTestFile(expectedOutputFileName); final File actualOutputFile = createTempFile("sort_sam", outputExtension); - File referenceFile = null == referenceFileName ? null : new File(getTestDataDir(), referenceFileName); + File referenceFile = null == referenceFileName ? null : getTestFile(referenceFileName); + + final SamReaderFactory factory = SamReaderFactory.makeDefault(); + ArgumentsBuilder args = new ArgumentsBuilder(); - args.add("--input"); args.add(inputFile.getCanonicalPath()); - args.add("--output"); args.add(actualOutputFile.getCanonicalPath()); + args.addInput(inputFile); + args.addOutput(actualOutputFile); if (null != referenceFile) { - args.add("--R"); - args.add(referenceFile.getAbsolutePath()); + args.addReference(referenceFile); + factory.referenceSequence(referenceFile); } - args.add("--num-reducers"); args.add("1"); + args.addArgument(SortSamSpark.SORT_ORDER_LONG_NAME, sortOrder.name()); - //https://github.com/broadinstitute/gatk/issues/1260 -// args.add("--SORT_ORDER"); -// args.add(sortOrderName); + this.runCommandLine(args); - this.runCommandLine(args.getArgsArray()); + //test files are exactly equal + SamAssertionUtils.assertSamsEqual(actualOutputFile, expectedOutputFile, ValidationStringency.DEFAULT_STRINGENCY, referenceFile); - SamAssertionUtils.samsEqualStringent(actualOutputFile, expectedOutputFile, 
ValidationStringency.DEFAULT_STRINGENCY, referenceFile); + //test sorting matches htsjdk + try(ReadsDataSource in = new ReadsDataSource(actualOutputFile.toPath(), factory )) { + BaseTest.assertSorted(Utils.stream(in).map(read -> read.convertToSAMRecord(in.getHeader())).iterator(), sortOrder.getComparatorInstance()); + } } - @Test(groups = "spark") - public void test() throws Exception { - final File unsortedBam = new File(getTestDataDir(), "count_reads.bam"); - final File sortedBam = new File(getTestDataDir(), "count_reads_sorted.bam"); - final File outputBam = createTempFile("sort_bam_spark", ".bam"); + @Test(dataProvider="sortbams", groups="spark") + public void testSortBAMsSharded( + final String inputFileName, + final String unused, + final String referenceFileName, + final String outputExtension, + final SAMFileHeader.SortOrder sortOrder) { + final File inputFile = getTestFile(inputFileName); + final File actualOutputFile = createTempFile("sort_sam", outputExtension); + File referenceFile = null == referenceFileName ? null : getTestFile(referenceFileName); ArgumentsBuilder args = new ArgumentsBuilder(); - args.add("--"+ StandardArgumentDefinitions.INPUT_LONG_NAME); args.add(unsortedBam.getCanonicalPath()); - args.add("--"+StandardArgumentDefinitions.OUTPUT_LONG_NAME); args.add(outputBam.getCanonicalPath()); - args.add("--num-reducers"); args.add("1"); + args.addInput(inputFile); + args.addOutput(actualOutputFile); + if (null != referenceFile) { + args.addReference(referenceFile); + } + args.addArgument(SortSamSpark.SORT_ORDER_LONG_NAME, sortOrder.name()); + args.addBooleanArgument(GATKSparkTool.SHARDED_OUTPUT_LONG_NAME,true); + args.addArgument(GATKSparkTool.NUM_REDUCERS_LONG_NAME, "2"); + + this.runCommandLine(args); + + final ReadsSparkSource source = new ReadsSparkSource(SparkContextFactory.getTestSparkContext()); + final JavaRDD reads = source.getParallelReads(actualOutputFile.getAbsolutePath(), referenceFile == null ? 
null : referenceFile.getAbsolutePath()); + + final SAMFileHeader header = source.getHeader(actualOutputFile.getAbsolutePath(), + referenceFile == null ? null : referenceFile.getAbsolutePath()); - this.runCommandLine(args.getArgsArray()); + final List reloadedReads = reads.collect().stream().map(read -> read.convertToSAMRecord(header)).collect(Collectors.toList()); + BaseTest.assertSorted(reloadedReads.iterator(), sortOrder.getComparatorInstance(), reloadedReads.stream().map(SAMRecord::getSAMString).collect(Collectors.joining("\n"))); + } - SamAssertionUtils.assertSamsEqual(outputBam, sortedBam); + @DataProvider + public Object[][] getInvalidSortOrders(){ + return new Object[][]{ + {SAMFileHeader.SortOrder.unknown}, + {SAMFileHeader.SortOrder.unsorted}, + {SAMFileHeader.SortOrder.duplicate} + }; } + @Test(expectedExceptions = CommandLineException.BadArgumentValue.class, dataProvider = "getInvalidSortOrders") + public void testBadSortOrders(SAMFileHeader.SortOrder badOrder){ + final File unsortedBam = new File(getTestDataDir(), QUERY_NAME_BAM); + ArgumentsBuilder args = new ArgumentsBuilder(); + args.addInput(unsortedBam); + args.addOutput(createTempFile("sort_bam_spark", BAM)); + args.addArgument(SortSamSpark.SORT_ORDER_LONG_NAME, badOrder.toString()); + + this.runCommandLine(args); + } } diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/count_reads_sorted.sam b/src/test/resources/org/broadinstitute/hellbender/tools/count_reads_sorted.sam index 4ab8112f1ce..b6aaeed2efb 100644 --- a/src/test/resources/org/broadinstitute/hellbender/tools/count_reads_sorted.sam +++ b/src/test/resources/org/broadinstitute/hellbender/tools/count_reads_sorted.sam @@ -7,7 +7,7 @@ @SQ SN:chr6 LN:101 @SQ SN:chr7 LN:404 @SQ SN:chr8 LN:202 -@RG ID:0 SM:Hi,Mom! +@RG ID:0 SM:Hi,Mom! PL:ILLUMINA @PG ID:1 PN:Hey! 
VN:2.0 both_reads_align_clip_marked 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 both_reads_present_only_first_aligns 89 chr7 1 255 101M * 0 0 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.bam b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.bam new file mode 100644 index 00000000000..fc8db70dfc5 Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.bam differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.cram b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.cram new file mode 100644 index 00000000000..2397cd63c25 Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.cram differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.cram.crai b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.cram.crai new file mode 100644 index 00000000000..84e99c3def4 Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.cram.crai differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.dict b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.dict new 
file mode 100644 index 00000000000..f4de06bc6fc --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.dict @@ -0,0 +1,9 @@ +@HD VN:1.5 SO:unsorted +@SQ SN:chr1 LN:101 M5:bd01f7e11515bb6beda8f7257902aa67 UR:file:/home/chris/projects/hellbender/src/test/resources/org/broadinstitute/hellbender/tools/count_reads.fasta +@SQ SN:chr2 LN:101 M5:31c33e2155b3de5e2554b693c475b310 UR:file:/home/chris/projects/hellbender/src/test/resources/org/broadinstitute/hellbender/tools/count_reads.fasta +@SQ SN:chr3 LN:101 M5:631593c6dd2048ae88dcce2bd505d295 UR:file:/home/chris/projects/hellbender/src/test/resources/org/broadinstitute/hellbender/tools/count_reads.fasta +@SQ SN:chr4 LN:101 M5:c60cb92f1ee5b78053c92bdbfa19abf1 UR:file:/home/chris/projects/hellbender/src/test/resources/org/broadinstitute/hellbender/tools/count_reads.fasta +@SQ SN:chr5 LN:101 M5:07ebc213c7611db0eacbb1590c3e9bda UR:file:/home/chris/projects/hellbender/src/test/resources/org/broadinstitute/hellbender/tools/count_reads.fasta +@SQ SN:chr6 LN:101 M5:7be2f5e7ee39e60a6c3b5b6a41178c6d UR:file:/home/chris/projects/hellbender/src/test/resources/org/broadinstitute/hellbender/tools/count_reads.fasta +@SQ SN:chr7 LN:404 M5:da488fc432cdaf2c20c96da473a7b630 UR:file:/home/chris/projects/hellbender/src/test/resources/org/broadinstitute/hellbender/tools/count_reads.fasta +@SQ SN:chr8 LN:202 M5:d339678efce576d5546e88b49a487b63 UR:file:/home/chris/projects/hellbender/src/test/resources/org/broadinstitute/hellbender/tools/count_reads.fasta diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.fasta b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.fasta new file mode 100644 index 00000000000..95611510399 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.fasta @@ -0,0 +1,40 @@ +>chr1 
+TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC +TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA +A +>chr2 +CATCTCTACAAGCGCGTCCTACCAGACGCGCTTCCGATCTGAGAGCATAC +TTTTCATTGGATTCCAGCACAACTCCATTTTTGATCCACTTGACACCTTT +T +>chr3 +CGTATGCGCTTTTTATGTCGCCCACAGTGCCTAGTATAGCCCCTGCTAAT +AAAAAGAGATGAATACGTTTACTTAAAAAACTGAAACTAGGAATGTGCAA +A +>chr4 +CGTGATACCAACTCATGTTCACAGCCAAAGCCTGAAGCTGTCTATTATAT +TTCTCAACCATAAACTTTTGCCTCAGGCATCCGCAGAATGGTTTGCAGCC +C +>chr5 +NTCTCATTTAAAAATGGTTATAAAAACATTTATGCTGAAAAGGTGAAGTT +CATTAATGAACAGGCTGACTGTCTCACTATCGCGTTCGCAAGACGTTATC +T +>chr6 +NAATTGTTCTTAGTTTCTCGGTTTATGTGCTCTTCCAGGTGGGTAACACA +ATAATGGCCTTCCAGATCGTAAGAGCGACGTGTGTTGCACCAGTGTCGAT +C +>chr7 +CAACAGAAGGGGGGATCTGTGTTTGTGTTTCGGATTTCCTGCTGAAAAGG +TTTTCGGGTCCCCCCCCCATCCCGATTTCCTTCCGCAGCTTACCTCCCGA +AACGCGGCATCCCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCA +GCGCGTCGTGTAGGTCACTATGGTACATCTTGTCGTGCGGCCAGAGCATA +CACAACAGAAGGGGGGATCTGTGTTTGTGTTTCGGATTTCCTGCTGAAAA +GGTTTTCGGGTCCCCCCCCCATCCCGATTTCCTTCCGCAGCTTACCTCCC +GAAACGCGGCATCCCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGG +CAGCGCGTCGTGTAGGTCACTATGGTACATCTTGTCGTGCGGCCAGAGCA +TACA +>chr8 +CACATCGTGAATCTTACAATCTGCGGTTTCAGATGTGGAGCGATGTGTGA +GAGATTGAGCAACTGATCTGAAAAGCAGACACAGCTATTCCTAAGATGAC +CCCAGGTTCAAATGTGCAGCCCCTTTTGAGAGATTTTTTTTTTGGGCTGG +AAAAAAGACACAGCTATTCCTAAGATGACAAGATCAGAAAAAAAGTCAAG +CA diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.fasta.fai b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.fasta.fai new file mode 100644 index 00000000000..d5e1a06c3e1 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.fasta.fai @@ -0,0 +1,8 @@ +chr1 101 6 50 51 +chr2 101 116 50 51 +chr3 101 226 50 51 +chr4 101 336 50 51 +chr5 101 446 50 51 +chr6 101 556 50 51 +chr7 404 666 50 51 +chr8 202 1085 50 51 diff --git 
a/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.sam b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.sam new file mode 100644 index 00000000000..e23b33980c0 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads.sam @@ -0,0 +1,19 @@ +@HD VN:1.5 SO:queryname +@SQ SN:chr1 LN:101 +@SQ SN:chr2 LN:101 +@SQ SN:chr3 LN:101 +@SQ SN:chr4 LN:101 +@SQ SN:chr5 LN:101 +@SQ SN:chr6 LN:101 +@SQ SN:chr7 LN:404 +@SQ SN:chr8 LN:202 +@RG ID:0 SM:Hi,Mom! PL:ILLUMINA +@PG ID:1 PN:Hey! VN:2.0 +both_reads_align_clip_adapter 99 chr7 21 255 101M = 16 96 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 +both_reads_align_clip_adapter 147 chr7 16 255 101M = 21 -96 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 +both_reads_align_clip_marked 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 +both_reads_align_clip_marked 163 chr7 302 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 RG:Z:0 +both_reads_present_only_first_aligns 89 chr7 1 255 101M * 0 0 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 
+both_reads_present_only_first_aligns 165 * 0 0 * chr7 1 0 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 RG:Z:0 +read_2_too_many_gaps 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 +read_2_too_many_gaps 163 chr7 302 255 10M1D10M5I76M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 RG:Z:0 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.bam b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.bam new file mode 100644 index 00000000000..35b66990a70 Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.bam differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.bam.bai b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.bam.bai new file mode 100644 index 00000000000..7d5017616ed Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.bam.bai differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.cram b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.cram new file mode 100644 index 00000000000..ea0c00decea Binary files /dev/null and 
b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.cram differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.cram.crai b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.cram.crai new file mode 100644 index 00000000000..b697713bdbb Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.cram.crai differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.sam b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.sam new file mode 100644 index 00000000000..b6aaeed2efb --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark/count_reads_sorted.sam @@ -0,0 +1,19 @@ +@HD VN:1.5 SO:coordinate +@SQ SN:chr1 LN:101 +@SQ SN:chr2 LN:101 +@SQ SN:chr3 LN:101 +@SQ SN:chr4 LN:101 +@SQ SN:chr5 LN:101 +@SQ SN:chr6 LN:101 +@SQ SN:chr7 LN:404 +@SQ SN:chr8 LN:202 +@RG ID:0 SM:Hi,Mom! PL:ILLUMINA +@PG ID:1 PN:Hey! 
VN:2.0 +both_reads_align_clip_marked 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 +both_reads_present_only_first_aligns 89 chr7 1 255 101M * 0 0 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 +read_2_too_many_gaps 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 +both_reads_align_clip_adapter 147 chr7 16 255 101M = 21 -96 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 +both_reads_align_clip_adapter 99 chr7 21 255 101M = 16 96 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 +both_reads_align_clip_marked 163 chr7 302 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 RG:Z:0 +read_2_too_many_gaps 163 chr7 302 255 10M1D10M5I76M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 RG:Z:0 +both_reads_present_only_first_aligns 165 * 0 0 * chr7 1 0 
NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 RG:Z:0