-
Notifications
You must be signed in to change notification settings - Fork 589
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Bypass FeatureReader for GenomicsDBImport #7393
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -203,6 +203,11 @@ public void testGenomicsDBImportFileInputs() throws IOException { | |
testGenomicsDBImporter(LOCAL_GVCFS, INTERVAL, COMBINED, b38_reference_20_21, true, 1); | ||
} | ||
|
||
// Variant of testGenomicsDBImportFileInputs: identical arguments plus a trailing `true`. | ||
// NOTE(review): the trailing flag is presumably useNativeReader (bypass the feature reader) - confirm against the helper overload. | ||
@Test | ||
public void testGenomicsDBImportFileInputsNativeReader() throws IOException { | ||
testGenomicsDBImporter(LOCAL_GVCFS, INTERVAL, COMBINED, b38_reference_20_21, true, 1, true); | ||
} | ||
|
||
@Test | ||
public void testGenomicsDBImportFileInputs_newMQ() throws IOException { | ||
testGenomicsDBImporter_newMQ(GVCFS_WITH_NEW_MQ, INTERVAL2, COMBINED_WITH_NEW_MQ, b37_reference_20_21, true, Collections.emptyList()); | ||
|
@@ -213,6 +218,11 @@ public void testGenomicsDBImportFileInputsWithMultipleIntervals() throws IOExcep | |
testGenomicsDBImporter(LOCAL_GVCFS, MULTIPLE_INTERVALS, COMBINED_MULTI_INTERVAL, b38_reference_20_21, true, 1); | ||
} | ||
|
||
// Multi-interval import checked against COMBINED_MULTI_INTERVAL, with a trailing `true` appended to the usual arguments. | ||
// NOTE(review): the trailing flag is presumably useNativeReader - confirm against the helper overload. | ||
@Test | ||
public void testGenomicsDBImportFileInputsWithMultipleIntervalsNativeReader() throws IOException { | ||
testGenomicsDBImporter(LOCAL_GVCFS, MULTIPLE_INTERVALS, COMBINED_MULTI_INTERVAL, b38_reference_20_21, true, 1, true); | ||
} | ||
|
||
@Test(timeOut = 1000000) | ||
public void testGenomicsDBImportWith1000IntervalsToBeMerged() throws IOException { | ||
final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace"; | ||
|
@@ -235,6 +245,11 @@ public void testGenomicsDBImportFileInputsAgainstCombineGVCFMergeContigsToSingle | |
testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, INTERVAL_20_21, b38_reference_20_21, new String[0], 1, 1, false); | ||
} | ||
|
||
// Calls the GenomicsDB-vs-CombineGVCFs helper on LOCAL_GVCFS over INTERVAL with no extra arguments (empty String[]). | ||
// NOTE(review): `1, 0, true` are presumably thread count, merge-contigs partition count and useNativeReader - confirm against the helper signature. | ||
@Test | ||
public void testGenomicsDBImportFileInputsAgainstCombineGVCFNativeReader() throws IOException { | ||
testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, INTERVAL, b38_reference_20_21, new String[0], 1, 0, true); | ||
} | ||
|
||
@Test | ||
public void testGenomicsDBImportMergeContigsManyNonAdjacentContigsToSeveralContigs() throws IOException { | ||
List<SimpleInterval> manyContigs = MANY_CONTIGS_NON_ADJACENT_INTERVALS.stream().map(SimpleInterval::new).collect(Collectors.toList()); | ||
|
@@ -256,18 +271,35 @@ public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleInterval | |
testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, MULTIPLE_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, b38_reference_20_21, new String[0]); | ||
} | ||
|
||
// Same inputs and intervals as testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleIntervals, but calls the 7-argument helper overload. | ||
// NOTE(review): `1, 0, true` are presumably thread count, merge-contigs partition count and useNativeReader - confirm against the helper signature. | ||
@Test | ||
public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleIntervalsNativeReader() throws IOException { | ||
testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, MULTIPLE_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, b38_reference_20_21, new String[0], 1, 0, true); | ||
} | ||
|
||
// Calls the GenomicsDB-vs-CombineGVCFs helper over multiple intervals, passing 4 as the fifth argument. | ||
// NOTE(review): per the test name, 4 is presumably the number of threads - confirm against the helper signature. | ||
@Test | ||
public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleIntervalsWithMultipleThreads() throws IOException { | ||
testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, MULTIPLE_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, b38_reference_20_21, | ||
new String[0], 4); | ||
} | ||
|
||
// Same call as the WithMultipleThreads test (fifth argument 4), extended with `0, true`. | ||
// NOTE(review): `4, 0, true` are presumably thread count, merge-contigs partition count and useNativeReader - confirm against the helper signature. | ||
@Test | ||
public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleIntervalsWithMultipleThreadsNativeReader() throws IOException { | ||
testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, MULTIPLE_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, b38_reference_20_21, | ||
new String[0], 4, 0, true); | ||
} | ||
|
||
// Calls the GenomicsDB-vs-CombineGVCFs helper on LOCAL_GVCFS over the non-adjacent interval list, | ||
// using the 4-argument overload with no extra arguments (empty String[]). | ||
@Test | ||
public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleNonAdjacentIntervals() throws IOException { | ||
testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, | ||
b38_reference_20_21, new String[0]); | ||
} | ||
|
||
// Same inputs and non-adjacent intervals as the test without the NativeReader suffix, but calls the 7-argument helper overload. | ||
// NOTE(review): `1, 0, true` are presumably thread count, merge-contigs partition count and useNativeReader - confirm against the helper signature. | ||
@Test | ||
public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleNonAdjacentIntervalsNativeReader() throws IOException { | ||
testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, | ||
b38_reference_20_21, new String[0], 1, 0, true); | ||
} | ||
|
||
@Test | ||
public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleNonAdjacentIntervalsForFilesProducedAfterCombineGVCFs() | ||
throws IOException { | ||
|
@@ -276,6 +308,14 @@ public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleNonAdjac | |
b38_reference_20_21, new String[0]); | ||
} | ||
|
||
// Calls the GenomicsDB-vs-CombineGVCFs helper on LOCAL_GVCFS_AFTER_COMBINE_GVCFS over the non-adjacent interval list. | ||
// NOTE(review): `1, 0, true` are presumably thread count, merge-contigs partition count and useNativeReader - confirm against the helper signature. | ||
@Test | ||
public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleNonAdjacentIntervalsForFilesProducedAfterCombineGVCFsNativeReader() | ||
throws IOException { | ||
//this test covers the scenario where the input vcfs have spanning deletions | ||
testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS_AFTER_COMBINE_GVCFS, MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, | ||
b38_reference_20_21, new String[0], 1, 0, true); | ||
} | ||
|
||
@Test | ||
public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithNonDiploidData() throws IOException { | ||
testGenomicsDBImporterWithGenotypes(Arrays.asList(NA12878_HG37, MULTIPLOID_DATA_HG37), INTERVAL_NONDIPLOID, | ||
|
@@ -307,6 +347,12 @@ public void testGenomicsDBThreeLargeSamplesWithGenotypes() throws IOException { | |
testGenomicsDBImporterWithGenotypes(LOCAL_GVCFS, intervals, COMBINED_WITH_GENOTYPES, b38_reference_20_21, true, true, false); | ||
} | ||
|
||
// Imports LOCAL_GVCFS over the single interval chr20:1-64444167 and checks the result against COMBINED_WITH_GENOTYPES. | ||
// Mirrors testGenomicsDBThreeLargeSamplesWithGenotypes, with one extra trailing `true`. | ||
// NOTE(review): the trailing flag is presumably useNativeReader - confirm against the helper overload. | ||
@Test | ||
public void testGenomicsDBThreeLargeSamplesWithGenotypesNativeReader() throws IOException { | ||
ArrayList<SimpleInterval> intervals = new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("chr20", 1, 64444167))); | ||
testGenomicsDBImporterWithGenotypes(LOCAL_GVCFS, intervals, COMBINED_WITH_GENOTYPES, b38_reference_20_21, true, true, false, true); | ||
} | ||
|
||
@Test | ||
public void testGenomicsDBThreeLargeSamplesSitesOnlyQuery() throws IOException { | ||
ArrayList<SimpleInterval> intervals = new ArrayList<SimpleInterval>(Arrays.asList( | ||
|
@@ -504,11 +550,21 @@ public void testGenomicsDBImportFileInputsInBatchesWithMultipleIntervals(final i | |
testGenomicsDBImporterWithBatchSize(LOCAL_GVCFS, MULTIPLE_INTERVALS, COMBINED_MULTI_INTERVAL, batchSize); | ||
} | ||
|
||
// Runs once per value supplied by the "batchSizes" data provider. | ||
// The trailing `true` is the helper's useNativeReader parameter, which writeToGenomicsDB passes on as GenomicsDBImport.BYPASS_FEATURE_READER. | ||
@Test(dataProvider = "batchSizes") | ||
public void testGenomicsDBImportFileInputsInBatchesWithMultipleIntervalsNativeReader(final int batchSize) throws IOException { | ||
testGenomicsDBImporterWithBatchSize(LOCAL_GVCFS, MULTIPLE_INTERVALS, COMBINED_MULTI_INTERVAL, batchSize, true); | ||
} | ||
|
||
// Belongs to the "bucket" test group and runs once per value supplied by the "batchSizes" data provider. | ||
// The LOCAL_GVCFS paths are first converted by resolveLargeFilesAsCloudURIs, then imported over INTERVAL and checked against COMBINED. | ||
@Test(groups = {"bucket"}, dataProvider = "batchSizes") | ||
public void testGenomicsDBImportGCSInputsInBatches(final int batchSize) throws IOException { | ||
testGenomicsDBImporterWithBatchSize(resolveLargeFilesAsCloudURIs(LOCAL_GVCFS), INTERVAL, COMBINED, batchSize); | ||
} | ||
|
||
@Test(groups = {"bucket"}, dataProvider = "batchSizes") | ||
public void testGenomicsDBImportGCSInputsInBatchesNativeReader(final int batchSize) throws IOException { | ||
testGenomicsDBImporterWithBatchSize(resolveLargeFilesAsCloudURIs(LOCAL_GVCFS), INTERVAL, COMBINED, batchSize, true); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed. |
||
} | ||
|
||
@DataProvider | ||
public Object[][] getThreads(){ | ||
return new Object[][] { | ||
|
@@ -581,7 +637,7 @@ private void testGenomicsDBImporterWithBatchSize(final List<String> vcfInputs, f | |
final boolean useNativeReader) throws IOException { | ||
final String workspace = createTempDir("genomicsdb-batchsize-tests-").getAbsolutePath() + "/workspace-" + batchSize; | ||
|
||
writeToGenomicsDB(vcfInputs, intervals, workspace, batchSize, false, 0, 1, false, false, false, 0, true); | ||
writeToGenomicsDB(vcfInputs, intervals, workspace, batchSize, false, 0, 1, false, false, false, 0, useNativeReader); | ||
checkJSONFilesAreWritten(workspace); | ||
checkGenomicsDBAgainstExpected(workspace, intervals, expectedCombinedVCF, b38_reference_20_21, true, ATTRIBUTES_TO_IGNORE); | ||
} | ||
|
@@ -632,6 +688,7 @@ private void writeToGenomicsDB(final List<String> vcfInputs, final List<SimpleIn | |
if (chrsToPartitions != 0) { | ||
args.add(GenomicsDBImport.MERGE_CONTIGS_INTO_NUM_PARTITIONS, String.valueOf(chrsToPartitions)); | ||
} | ||
args.add(GenomicsDBImport.BYPASS_FEATURE_READER, useNativeReader); | ||
if (useBufferSize) { | ||
args.add("genomicsdb-vcf-buffer-size", String.valueOf(bufferSizePerSample)); | ||
} | ||
|
@@ -1091,10 +1148,18 @@ public void testIncrementalMustHaveExistingWorkspace() { | |
// Convenience overload: delegates to the 10-argument testIncrementalImport with useNativeReaderInitial = false. | ||
// With that value the first import (i == 0) never uses the native reader; later increments use it only when useNativeReader is true. | ||
private void testIncrementalImport(final int stepSize, final List<SimpleInterval> intervals, final String workspace, | ||
final int batchSize, final boolean produceGTField, final boolean useVCFCodec, final String expected, | ||
final int chrsToPartitions, final boolean useNativeReader) throws IOException { | ||
testIncrementalImport(stepSize, intervals, workspace, batchSize, produceGTField, useVCFCodec, expected, | ||
chrsToPartitions, useNativeReader, false); | ||
} | ||
|
||
private void testIncrementalImport(final int stepSize, final List<SimpleInterval> intervals, final String workspace, | ||
final int batchSize, final boolean produceGTField, final boolean useVCFCodec, final String expected, | ||
final int chrsToPartitions, final boolean useNativeReader, final boolean useNativeReaderInitial) | ||
throws IOException { | ||
for(int i=0; i<LOCAL_GVCFS.size(); i+=stepSize) { | ||
int upper = Math.min(i+stepSize, LOCAL_GVCFS.size()); | ||
writeToGenomicsDB(LOCAL_GVCFS.subList(i, upper), intervals, workspace, batchSize, false, 0, 1, false, false, i!=0, | ||
chrsToPartitions, i!=0 && useNativeReader); | ||
chrsToPartitions, (i == 0 && useNativeReaderInitial) || (i > 0 && useNativeReader)); | ||
checkJSONFilesAreWritten(workspace); | ||
} | ||
for(SimpleInterval currInterval : intervals) { | ||
|
@@ -1120,13 +1185,33 @@ public void testGenomicsDBBasicIncremental() throws IOException { | |
createAndCheckIntervalListFromExistingWorkspace(workspace, INTERVAL_PICARD_STYLE_EXPECTED); | ||
} | ||
|
||
// Incremental import into a fresh temp workspace in steps of 2 inputs, checked against COMBINED_WITH_GENOTYPES. | ||
// The last two arguments (useNativeReader, useNativeReaderInitial) are both true, so the first import and every later increment use the native reader. | ||
// Afterwards the interval list generated from the workspace is checked against INTERVAL_PICARD_STYLE_EXPECTED. | ||
@Test | ||
public void testGenomicsDBBasicIncrementalAllNativeReader() throws IOException { | ||
final String workspace = createTempDir("genomicsdb-incremental-tests").getAbsolutePath() + "/workspace"; | ||
testIncrementalImport(2, INTERVAL, workspace, 0, true, true, COMBINED_WITH_GENOTYPES, 0, true, true); | ||
createAndCheckIntervalListFromExistingWorkspace(workspace, INTERVAL_PICARD_STYLE_EXPECTED); | ||
} | ||
|
||
// Incremental import over non-adjacent intervals with stepSize 2 and batchSize 1; useNativeReader is false, so no step uses the native reader. | ||
// The expected-output argument is the empty string. NOTE(review): presumably that skips the comparison against a combined VCF - confirm in testIncrementalImport. | ||
// Afterwards the interval list generated from the workspace is checked against the expected Picard-style list. | ||
@Test | ||
public void testGenomicsDBIncrementalAndBatchSize1WithNonAdjacentIntervals() throws IOException { | ||
final String workspace = createTempDir("genomicsdb-incremental-tests").getAbsolutePath() + "/workspace"; | ||
testIncrementalImport(2, MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, workspace, 1, false, true, "", 0, false); | ||
createAndCheckIntervalListFromExistingWorkspace(workspace, MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS_PICARD_STYLE_EXPECTED); | ||
} | ||
|
||
@Test | ||
public void testGenomicsDBIncrementalAndBatchSize1WithNonAdjacentIntervalsNativeReader() throws IOException { | ||
final String workspace = createTempDir("genomicsdb-incremental-tests").getAbsolutePath() + "/workspace"; | ||
testIncrementalImport(2, MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, workspace, 1, false, true, "", 0, true); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah - not entirely sure anymore; I think I wanted to check that a given workspace could be imported to using both the feature reader and htslib. Refactored a bit to make that clearer, and added a test that does an all-htslib/native incremental import. |
||
createAndCheckIntervalListFromExistingWorkspace(workspace, MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS_PICARD_STYLE_EXPECTED); | ||
} | ||
|
||
// Negative test: passes only if a UserException is thrown whose message matches ".*must be block compressed.*". | ||
// NOTE(review): NA_12878_PHASED is presumably a VCF that is not block compressed, and the trailing `true` presumably enables the native reader - confirm both. | ||
@Test(expectedExceptions = {UserException.class}, expectedExceptionsMessageRegExp=".*must be block compressed.*") | ||
public void testGenomicsDBImportNativeReaderNoCompressedVcf() throws IOException { | ||
testGenomicsDBImporterWithGenotypes(Arrays.asList(NA_12878_PHASED), MULTIPLE_INTERVALS, NA_12878_PHASED, b37_reference_20_21, | ||
false, true, false, true); | ||
} | ||
|
||
@Test | ||
public void testGenomicsDBIncrementalAndBatchSize1WithNonAdjacentIntervalsMergeContigsIntoPartitions() throws IOException { | ||
final String workspace = createTempDir("genomicsdb-incremental-tests").getAbsolutePath() + "/workspace"; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In your testing, did you find that these extra checks for whether the inputs are block-compressed and indexed added significantly to the runtime when dealing with remote files?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We haven't done a lot of remote testing -- just sanity tests to ensure that they work. In the small remote cases we've tried the native reader is actually slower, but I haven't dug into it to see where the bottleneck is (potentially tweaking buffer sizes, etc). As I mentioned in the PR, that is something we were hoping to explore with Broad.