-
Notifications
You must be signed in to change notification settings - Fork 589
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
adding --sort-order option to SortSamSpark #4545
Changes from all commits
a29ab53
d20579b
4d4265b
8855b4a
2408189
558a13f
72997c5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,13 +8,12 @@ | |
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; | ||
import org.broadinstitute.barclay.help.DocumentedFeature; | ||
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; | ||
import org.broadinstitute.hellbender.utils.spark.SparkUtils; | ||
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup; | ||
import org.broadinstitute.hellbender.engine.filters.ReadFilter; | ||
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary; | ||
import org.broadinstitute.hellbender.engine.spark.GATKSparkTool; | ||
import org.broadinstitute.hellbender.utils.read.GATKRead; | ||
import org.broadinstitute.hellbender.utils.read.ReadCoordinateComparator; | ||
import scala.Tuple2; | ||
|
||
import java.util.Collections; | ||
import java.util.List; | ||
|
@@ -27,35 +26,61 @@ | |
public final class SortSamSpark extends GATKSparkTool { | ||
private static final long serialVersionUID = 1L; | ||
|
||
public static final String SORT_ORDER_LONG_NAME = "sort-order"; | ||
|
||
@Override | ||
public boolean requiresReads() { return true; } | ||
|
||
@Argument(doc="the output file path", shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, optional = false) | ||
protected String outputFile; | ||
private String outputFile; | ||
|
||
@Argument(doc="sort order of the output file", fullName = SORT_ORDER_LONG_NAME, optional = true) | ||
private SparkSortOrder sortOrder = SparkSortOrder.coordinate; | ||
|
||
/** | ||
* SortOrders that have corresponding implementations for spark. | ||
* These correspond to a subset of {@link SAMFileHeader.SortOrder}. | ||
*/ | ||
private enum SparkSortOrder { | ||
coordinate(SAMFileHeader.SortOrder.coordinate), | ||
queryname(SAMFileHeader.SortOrder.queryname); | ||
|
||
private final SAMFileHeader.SortOrder order; | ||
|
||
SparkSortOrder(SAMFileHeader.SortOrder order) { | ||
this.order = order; | ||
} | ||
|
||
public SAMFileHeader.SortOrder getSamOrder() { | ||
return order; | ||
} | ||
} | ||
|
||
@Override | ||
public List<ReadFilter> getDefaultReadFilters() { | ||
return Collections.singletonList(ReadFilterLibrary.ALLOW_ALL_READS); | ||
} | ||
|
||
@Override | ||
protected void onStartup() { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Call `super.onStartup()`. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. `onStartup` is documented as having an empty default implementation. I can add the call anyway, though, to future-proof it. |
||
super.onStartup(); | ||
} | ||
|
||
@Override | ||
protected void runTool(final JavaSparkContext ctx) { | ||
JavaRDD<GATKRead> reads = getReads(); | ||
int numReducers = getRecommendedNumReducers(); | ||
logger.info("Using %s reducers", numReducers); | ||
final JavaRDD<GATKRead> reads = getReads(); | ||
final int numReducers = getRecommendedNumReducers(); | ||
logger.info("Using %d reducers", numReducers); | ||
|
||
final SAMFileHeader header = getHeaderForReads(); | ||
header.setSortOrder(sortOrder.getSamOrder()); | ||
|
||
final SAMFileHeader readsHeader = getHeaderForReads(); | ||
ReadCoordinateComparator comparator = new ReadCoordinateComparator(readsHeader); | ||
JavaRDD<GATKRead> sortedReads; | ||
final JavaRDD<GATKRead> readsToWrite; | ||
if (shardedOutput) { | ||
sortedReads = reads | ||
.mapToPair(read -> new Tuple2<>(read, null)) | ||
.sortByKey(comparator, true, numReducers) | ||
.keys(); | ||
readsToWrite = SparkUtils.sortReadsAccordingToHeader(reads, header, numReducers); | ||
} else { | ||
sortedReads = reads; // sorting is done by writeReads below | ||
readsToWrite = reads; | ||
} | ||
readsHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate); | ||
writeReads(ctx, outputFile, sortedReads); | ||
writeReads(ctx, outputFile, readsToWrite, header); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,7 +7,7 @@ | |
@SQ SN:chr6 LN:101 | ||
@SQ SN:chr7 LN:404 | ||
@SQ SN:chr8 LN:202 | ||
@RG ID:0 SM:Hi,Mom! | ||
@RG ID:0 SM:Hi,Mom! PL:ILLUMINA | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why did you need to make this change? |
||
@PG ID:1 PN:Hey! VN:2.0 | ||
both_reads_align_clip_marked 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 | ||
both_reads_present_only_first_aligns 89 chr7 1 255 101M * 0 0 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add a blank line after this declaration
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done