diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSpark.java index 93936fd95dd..1696b05f813 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSpark.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSpark.java @@ -6,6 +6,7 @@ import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.broadinstitute.barclay.argparser.Advanced; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.ArgumentCollection; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; @@ -127,6 +128,8 @@ programGroup = ReadDataManipulationProgramGroup.class) public final class MarkDuplicatesSpark extends GATKSparkTool { private static final long serialVersionUID = 1L; + public static final String ALLOW_MULTIPLE_SORT_ORDERS_IN_INPUT_ARG = "allow-multiple-sort-orders-in-input"; + public static final String TREAT_UNSORTED_AS_ORDERED = "treat-unsorted-as-querygroup-ordered"; @Override public boolean requiresReads() { return true; } @@ -140,6 +143,16 @@ public final class MarkDuplicatesSpark extends GATKSparkTool { fullName = StandardArgumentDefinitions.METRICS_FILE_LONG_NAME) protected String metricsFile; + @Advanced + @Argument(doc = "Allow non-queryname sorted inputs when specifying multiple input bams.", optional=true, + fullName = ALLOW_MULTIPLE_SORT_ORDERS_IN_INPUT_ARG) + protected boolean allowMultipleSortOrders = false; + + @Advanced + @Argument(doc = "Treat unsorted files as query-group orderd files. WARNING: This option disables a basic safety check and may result in unexpected behavior if the file is truly unordered", optional=true, + fullName = TREAT_UNSORTED_AS_ORDERED) + protected boolean treatUnsortedAsOrdered = false; + @ArgumentCollection protected MarkDuplicatesSparkArgumentCollection markDuplicatesSparkArgumentCollection = new MarkDuplicatesSparkArgumentCollection(); @@ -298,10 +311,35 @@ protected void runTool(final JavaSparkContext ctx) { // Check if we are using multiple inputs that the headers are all in the correct querygrouped ordering, if so set the aggregate header to reflect this Map headerMap = getReadSourceHeaderMap(); if (headerMap.size() > 1) { - headerMap.entrySet().stream().forEach(h -> {if(!ReadUtils.isReadNameGroupedBam(h.getValue())) { - throw new UserException("Multiple inputs to MarkDuplicatesSpark detected. MarkDuplicatesSpark requires all inputs to be queryname sorted or querygroup-sorted for multi-input processing but input "+h.getKey()+" was sorted in "+h.getValue().getSortOrder()+" order"); - }}); - mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.query); + final Optional> badlySorted = headerMap.entrySet() + .stream() + .filter(h -> !treatAsReadGroupOrdered(h.getValue(), treatUnsortedAsOrdered)) + .findFirst(); + + if(badlySorted.isPresent()) { + if (allowMultipleSortOrders) { + //don't set an ordering, the files will all be sorted downstream + logger.info("Input files are not all grouped by read name so they will be sorted together."); + } else { + final Map.Entry badPair = badlySorted.get(); + throw new UserException( + "Multiple inputs to MarkDuplicatesSpark detected. MarkDuplicatesSpark requires all inputs to be queryname sorted " + + "or querygroup-sorted for multi-input processing but input " + badPair.getKey() + " was sorted in " + badPair + .getValue().getSortOrder() + " order"); + } + } else { + // The default sort order for merged input files is unsorted, so this will be fed to the tool to be sorted + if (!allowMultipleSortOrders) { + mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.query); + } + } + + // If there is only one file and we are in treatUnsortedAsOrdered mode than set its group order accordingly. + } else { + if (treatUnsortedAsOrdered && (mergedHeader.getSortOrder().equals(SAMFileHeader.SortOrder.unknown) || mergedHeader.getSortOrder().equals(SAMFileHeader.SortOrder.unsorted))) { + logger.warn("Input bam was marked as " + mergedHeader.getSortOrder().toString() + " but " + TREAT_UNSORTED_AS_ORDERED + " is specified so it's being treated as read name grouped"); + mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.query); + } } JavaRDD reads = getReads(); @@ -332,4 +370,16 @@ protected void runTool(final JavaSparkContext ctx) { writeReads(ctx, output, readsForWriting, mergedHeader, true); } + // helper method to determin if an input header is to be treated as a query group sorted file. + private boolean treatAsReadGroupOrdered(SAMFileHeader header, boolean treatUnsortedAsReadGrouped) { + final SAMFileHeader.SortOrder sortOrder = header.getSortOrder(); + if( ReadUtils.isReadNameGroupedBam(header) ){ + return true; + } else if ( treatUnsortedAsReadGrouped && (sortOrder.equals(SAMFileHeader.SortOrder.unknown) || sortOrder.equals(SAMFileHeader.SortOrder.unsorted))) { + logger.warn("Input bam was marked as " + sortOrder.toString() + " but " + TREAT_UNSORTED_AS_ORDERED + " is specified so it's being treated as read name grouped"); + return true; + } + return false; + } + } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/MarkDuplicatesSparkIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/MarkDuplicatesSparkIntegrationTest.java index a2e12cc37ff..30d4ff1fca2 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/MarkDuplicatesSparkIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/MarkDuplicatesSparkIntegrationTest.java @@ -482,4 +482,26 @@ public void testAssertCorrectSortOrderMultipleBams() { args.addInput(new File(TEST_DATA_DIR,"example.chr1.1-1K.unmarkedDups.noDups.bam")); runCommandLine(args); } + + @Test + public void testAssertCorrectSortOrderMultipleBamsOverriding() { + final File output = createTempFile("supplementaryReadUnmappedMate", "bam"); + final ArgumentsBuilder args = new ArgumentsBuilder(); + args.addOutput(output); + args.addInput(new File(TEST_DATA_DIR,"optical_dupes.bam")); + args.addInput(new File(TEST_DATA_DIR,"example.chr1.1-1K.unmarkedDups.noDups.bam")); + args.addArgument(MarkDuplicatesSpark.ALLOW_MULTIPLE_SORT_ORDERS_IN_INPUT_ARG); + runCommandLine(args); + } + + @Test + public void testAssertAssumeUnsortedFilesAreQueryGroupedFiles() { + final File output = createTempFile("supplementaryReadUnmappedMate", "bam"); + final ArgumentsBuilder args = new ArgumentsBuilder(); + args.addOutput(output); + args.addInput(new File(TEST_DATA_DIR,"optical_dupes.queryname.bam")); + args.addInput(new File(TEST_DATA_DIR,"optical_dupes.unsorted.querygrouped.sam")); + args.addArgument(MarkDuplicatesSpark.TREAT_UNSORTED_AS_ORDERED); + runCommandLine(args); + } } diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/MarkDuplicatesGATK/optical_dupes.unsorted.querygrouped.sam b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/MarkDuplicatesGATK/optical_dupes.unsorted.querygrouped.sam new file mode 100644 index 00000000000..fc15f4cdd12 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/MarkDuplicatesGATK/optical_dupes.unsorted.querygrouped.sam @@ -0,0 +1,16 @@ +@HD VN:1.5 SO:unsorted +@SQ SN:chr1 LN:101 +@SQ SN:chr2 LN:101 +@SQ SN:chr3 LN:101 +@SQ SN:chr4 LN:101 +@SQ SN:chr5 LN:101 +@SQ SN:chr6 LN:101 +@SQ SN:chr7 LN:404 +@SQ SN:chr8 LN:202 +@RG ID:1AAXX.1 SM:Hi,Mom! LB:mylib PL:ILLUMINA +@PG ID:MarkDuplicates PN:MarkDuplicates VN:1 CL:MarkDuplicates merge1.sam PP:bwa +@PG ID:bwa PN:bwa VN:1 CL:bwa aln +C4N4WACXX140821:8:1112:2344:1984 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& PG:Z:MarkDuplicates RG:Z:1AAXX.1 +C4N4WACXX140821:8:1112:2344:1984 163 chr7 302 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 PG:Z:MarkDuplicates RG:Z:1AAXX.1 +C4N4WACXX140821:8:1112:2344:1985 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& PG:Z:MarkDuplicates RG:Z:1AAXX.1 +C4N4WACXX140821:8:1112:2344:1985 163 chr7 302 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 PG:Z:MarkDuplicates RG:Z:1AAXX.1