Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added an option to opt out of the protections to sorting multiple bam inputs #5974

Merged
merged 6 commits into from
May 31, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
Expand Down Expand Up @@ -127,6 +128,8 @@
programGroup = ReadDataManipulationProgramGroup.class)
public final class MarkDuplicatesSpark extends GATKSparkTool {
private static final long serialVersionUID = 1L;
public static final String ALLOW_MULTIPLE_SORT_ORDERS_IN_INPUT_ARG = "allow-multiple-sort-orders-in-input";
public static final String TREAT_UNSORTED_AS_ORDERED = "treat-unsorted-as-querygroup-ordered";

@Override
public boolean requiresReads() { return true; }
Expand All @@ -140,6 +143,16 @@ public final class MarkDuplicatesSpark extends GATKSparkTool {
fullName = StandardArgumentDefinitions.METRICS_FILE_LONG_NAME)
protected String metricsFile;

@Advanced
@Argument(doc = "Allow non-queryname sorted inputs when specifying multiple input bams.", optional=true,
fullName = ALLOW_MULTIPLE_SORT_ORDERS_IN_INPUT_ARG)
protected boolean allowMultipleSortOrders = false;

@Advanced
@Argument(doc = "Treat unsorted files as query-group orderd files. WARNING: This option disables a basic safety check and may result in unexpected behavior if the file is truly unordered", optional=true,
fullName = TREAT_UNSORTED_AS_ORDERED)
protected boolean treatUnsortedAsOrdered = false;

@ArgumentCollection
protected MarkDuplicatesSparkArgumentCollection markDuplicatesSparkArgumentCollection = new MarkDuplicatesSparkArgumentCollection();

Expand Down Expand Up @@ -298,10 +311,35 @@ protected void runTool(final JavaSparkContext ctx) {
// Check if we are using multiple inputs that the headers are all in the correct querygrouped ordering, if so set the aggregate header to reflect this
Map<String, SAMFileHeader> headerMap = getReadSourceHeaderMap();
if (headerMap.size() > 1) {
headerMap.entrySet().stream().forEach(h -> {if(!ReadUtils.isReadNameGroupedBam(h.getValue())) {
throw new UserException("Multiple inputs to MarkDuplicatesSpark detected. MarkDuplicatesSpark requires all inputs to be queryname sorted or querygroup-sorted for multi-input processing but input "+h.getKey()+" was sorted in "+h.getValue().getSortOrder()+" order");
}});
mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.query);
final Optional<Map.Entry<String, SAMFileHeader>> badlySorted = headerMap.entrySet()
.stream()
.filter(h -> !treatAsReadGroupOrdered(h.getValue(), treatUnsortedAsOrdered))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

woops, sorry

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

whats worse is that I actually caught this yesterday but forgot to push the branch until today... sigh...

.findFirst();

if(badlySorted.isPresent()) {
if (allowMultipleSortOrders) {
//don't set an ordering, the files will all be sorted downstream
logger.info("Input files are not all grouped by read name so they will be sorted together.");
} else {
final Map.Entry<String, SAMFileHeader> badPair = badlySorted.get();
throw new UserException(
"Multiple inputs to MarkDuplicatesSpark detected. MarkDuplicatesSpark requires all inputs to be queryname sorted " +
"or querygroup-sorted for multi-input processing but input " + badPair.getKey() + " was sorted in " + badPair
.getValue().getSortOrder() + " order");
}
} else {
// The default sort order for merged input files is unsorted, so this will be fed to the tool to be sorted
if (!allowMultipleSortOrders) {
mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.query);
}
}

// If there is only one file and we are in treatUnsortedAsOrdered mode than set its group order accordingly.
} else {
if (treatUnsortedAsOrdered && (mergedHeader.getSortOrder().equals(SAMFileHeader.SortOrder.unknown) || mergedHeader.getSortOrder().equals(SAMFileHeader.SortOrder.unsorted))) {
logger.warn("Input bam was marked as " + mergedHeader.getSortOrder().toString() + " but " + TREAT_UNSORTED_AS_ORDERED + " is specified so it's being treated as read name grouped");
mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.query);
}
}

JavaRDD<GATKRead> reads = getReads();
Expand Down Expand Up @@ -332,4 +370,16 @@ protected void runTool(final JavaSparkContext ctx) {
writeReads(ctx, output, readsForWriting, mergedHeader, true);
}

// helper method to determin if an input header is to be treated as a query group sorted file.
private boolean treatAsReadGroupOrdered(SAMFileHeader header, boolean treatUnsortedAsReadGrouped) {
final SAMFileHeader.SortOrder sortOrder = header.getSortOrder();
if( ReadUtils.isReadNameGroupedBam(header) ){
return true;
} else if ( treatUnsortedAsReadGrouped && (sortOrder.equals(SAMFileHeader.SortOrder.unknown) || sortOrder.equals(SAMFileHeader.SortOrder.unsorted))) {
logger.warn("Input bam was marked as " + sortOrder.toString() + " but " + TREAT_UNSORTED_AS_ORDERED + " is specified so it's being treated as read name grouped");
return true;
}
return false;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -482,4 +482,26 @@ public void testAssertCorrectSortOrderMultipleBams() {
args.addInput(new File(TEST_DATA_DIR,"example.chr1.1-1K.unmarkedDups.noDups.bam"));
runCommandLine(args);
}

@Test
public void testAssertCorrectSortOrderMultipleBamsOverriding() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a test case for a single unsorted bam, and one that rejects when there's a coordinate sorted bam along side an unsorted bam

final File output = createTempFile("supplementaryReadUnmappedMate", "bam");
final ArgumentsBuilder args = new ArgumentsBuilder();
args.addOutput(output);
args.addInput(new File(TEST_DATA_DIR,"optical_dupes.bam"));
args.addInput(new File(TEST_DATA_DIR,"example.chr1.1-1K.unmarkedDups.noDups.bam"));
args.addArgument(MarkDuplicatesSpark.ALLOW_MULTIPLE_SORT_ORDERS_IN_INPUT_ARG);
runCommandLine(args);
}

@Test
public void testAssertAssumeUnsortedFilesAreQueryGroupedFiles() {
final File output = createTempFile("supplementaryReadUnmappedMate", "bam");
final ArgumentsBuilder args = new ArgumentsBuilder();
args.addOutput(output);
args.addInput(new File(TEST_DATA_DIR,"optical_dupes.queryname.bam"));
args.addInput(new File(TEST_DATA_DIR,"optical_dupes.unsorted.querygrouped.sam"));
args.addArgument(MarkDuplicatesSpark.TREAT_UNSORTED_AS_ORDERED);
runCommandLine(args);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
@HD VN:1.5 SO:unsorted
@SQ SN:chr1 LN:101
@SQ SN:chr2 LN:101
@SQ SN:chr3 LN:101
@SQ SN:chr4 LN:101
@SQ SN:chr5 LN:101
@SQ SN:chr6 LN:101
@SQ SN:chr7 LN:404
@SQ SN:chr8 LN:202
@RG ID:1AAXX.1 SM:Hi,Mom! LB:mylib PL:ILLUMINA
@PG ID:MarkDuplicates PN:MarkDuplicates VN:1 CL:MarkDuplicates merge1.sam PP:bwa
@PG ID:bwa PN:bwa VN:1 CL:bwa aln
C4N4WACXX140821:8:1112:2344:1984 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& PG:Z:MarkDuplicates RG:Z:1AAXX.1
C4N4WACXX140821:8:1112:2344:1984 163 chr7 302 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 PG:Z:MarkDuplicates RG:Z:1AAXX.1
C4N4WACXX140821:8:1112:2344:1985 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& PG:Z:MarkDuplicates RG:Z:1AAXX.1
C4N4WACXX140821:8:1112:2344:1985 163 chr7 302 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 PG:Z:MarkDuplicates RG:Z:1AAXX.1