Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Funcotator - VCF ID for VCF Data Sources #5327

Merged
merged 3 commits into from
Oct 25, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,12 @@ public class VcfFuncotationFactory extends DataSourceFuncotationFactory {
*/
private final static String DUPLICATE_RECORD_DELIMITER = "|";

/**
* The name of the additional ID field to add to VCF annotations to preserve the ID of the original (data source)
* variant.
*/
private static final String ID_FIELD_NAME = "ID";

@VisibleForTesting
int cacheHits = 0;
@VisibleForTesting
Expand Down Expand Up @@ -147,12 +153,20 @@ private FuncotationMetadata createFuncotationMetadata(final Path sourceFilePath)
/**
 * Builds the VCF INFO header lines describing the annotations this factory emits.
 *
 * The INFO header lines of {@code vcfHeader} are filtered down to those whose final field name
 * (as produced by {@code createFinalFieldName(name, id)}) is contained in {@code supportedFieldNames}.
 * Each surviving line is renamed exactly once via {@code copyWithRename}, and one extra header line
 * describing the ID of the data source variant is appended at the end.
 *
 * @param vcfHeader header of the data source VCF whose INFO lines are inspected.
 * @return a list holding the renamed supported INFO header lines followed by the ID header line.
 */
List<VCFInfoHeaderLine> createFuncotationVcfInfoHeaderLines(final VCFHeader vcfHeader) {
    // Rename the input VCF field names to the output funcotation field names while collecting.
    // The result is gathered into an explicit ArrayList because an element is appended below and
    // Collectors.toList() makes no guarantee that the returned list is mutable.
    // (ArrayList is fully qualified so that this method does not rely on an import that may be absent.)
    final List<VCFInfoHeaderLine> supportedVcfInfoHeaderLines = vcfHeader.getInfoHeaderLines().stream()
            .filter(vcfInfoHeaderLine -> supportedFieldNames.contains(createFinalFieldName(name, vcfInfoHeaderLine.getID())))
            .map(vcfInfoHeaderLine -> copyWithRename(vcfInfoHeaderLine, name))
            .collect(Collectors.toCollection(java.util.ArrayList::new));

    // Add the ID field to the metadata.
    // This line is created with its final field name already, so it must not be passed through
    // copyWithRename a second time.
    final VCFInfoHeaderLine idHeaderLine = new VCFInfoHeaderLine(
            createFinalFieldName(name, ID_FIELD_NAME),
            VCFHeaderLineCount.A,
            VCFHeaderLineType.String,
            "ID of the variant from the data source creating this annotation."
    );
    supportedVcfInfoHeaderLines.add(idHeaderLine);

    return supportedVcfInfoHeaderLines;
}

private static VCFInfoHeaderLine copyWithRename(final VCFInfoHeaderLine vcfInfoHeaderLine, final String name) {
Expand Down Expand Up @@ -223,21 +237,21 @@ protected List<Funcotation> createFuncotationsOnVariant(final VariantContext var

// Get rid of any null features.
// By this point we know the feature type is correct, so we cast it:
final List<VariantContext> funcotationFactoryVariants = featureList.stream().filter(f -> f != null)
final List<VariantContext> featureVariantList = featureList.stream().filter(f -> f != null)
.map(f -> (VariantContext) f).collect(Collectors.toList());

// Create a map that will keep the final outputs. Default it to default funcotations for each alt allele in the
// query variant.
final Map<Allele, Funcotation> outputOrderedMap = new LinkedHashMap<>();

for ( final VariantContext funcotationFactoryVariant : funcotationFactoryVariants ) {
for ( final VariantContext featureVariant : featureVariantList ) {

// The funcotationFactoryVariants already overlap the query variant in position, now get which
// The featureVariantList already overlaps the query variant in position, now get which
// match in ref/alt as well. And make sure to handle multiallelics in both the query variant and the
// funcotation factory variant.
// matchIndices length will always be the same as the number of alt alleles in the variant (first parameter)
// Note that this is not the same length as the funcotationFactoryVariant.
final int[] matchIndices = GATKVariantContextUtils.matchAllelesOnly(variant, funcotationFactoryVariant);
// Note that this is not the same length as the featureVariant.
final int[] matchIndices = GATKVariantContextUtils.matchAllelesOnly(variant, featureVariant);

for (int i = 0; i < matchIndices.length; i++) {
final int matchIndex = matchIndices[i];
Expand All @@ -246,10 +260,13 @@ protected List<Funcotation> createFuncotationsOnVariant(final VariantContext var

final LinkedHashMap<String, Object> annotations = new LinkedHashMap<>(supportedFieldNamesAndDefaults);

for (final Map.Entry<String, Object> entry : funcotationFactoryVariant.getAttributes().entrySet()) {
populateAnnotationMap(funcotationFactoryVariant, variant, matchIndex, annotations, entry);
for (final Map.Entry<String, Object> entry : featureVariant.getAttributes().entrySet()) {
populateAnnotationMap(featureVariant, variant, matchIndex, annotations, entry);
}

// Add the ID of the variant:
jonn-smith marked this conversation as resolved.
Show resolved Hide resolved
annotations.put(createFinalFieldName(name, ID_FIELD_NAME), featureVariant.getID());

final TableFuncotation newFuncotation = TableFuncotation.create(annotations, queryAltAllele, name, supportedFieldMetadata);
outputOrderedMap.merge(queryAltAllele, newFuncotation, VcfFuncotationFactory::mergeDuplicateFuncotationFactoryVariant);
}
Expand Down Expand Up @@ -439,6 +456,10 @@ private void populateSupportedFieldNamesFromVcfFile() {
}
supportedFieldNames.add(createFinalFieldName(name, key));
}

// Add our ID to the supported fields:
supportedFieldNamesAndDefaults.put(createFinalFieldName(name, ID_FIELD_NAME), "" );
supportedFieldNames.add(createFinalFieldName(name, ID_FIELD_NAME));
}

@VisibleForTesting
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ public class VcfFuncotationFactoryUnitTest extends GATKBaseTest {
FIELD_DEFAULT_MAP.put("WGT", "");
FIELD_DEFAULT_MAP.put("WTD", "false");
FIELD_DEFAULT_MAP.put("dbSNPBuildID", "");
FIELD_DEFAULT_MAP.put("ID", "");
}

//==================================================================================================================
Expand Down Expand Up @@ -155,14 +156,14 @@ private Object[][] provideForTestCreateFuncotationsOnVariant() {
helpProvideForTestCreateFuncotations("3", 61662, 61662, "T", "C",
Collections.singletonList(
TableFuncotation.create(FIELD_DEFAULT_MAP.keySet().stream().map(s -> FACTORY_NAME + "_" + s).collect(Collectors.toList()),
Arrays.asList("true", "false", "0.9744,0.02556", "false", "false", "1", "false", "true", "false", "", "false", "true", "false", "true", "true", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "73009205", "61662", "false", "false", "0", "true", "0", "false", "0.954392,0.0456075", "false", "false", "false", "SNV", "true", "0x05010000000515043e000100", "1", "false", "130"),
Arrays.asList("true", "false", "0.9744,0.02556", "false", "false", "1", "false", "true", "false", "", "false", "true", "false", "true", "true", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "73009205", "61662", "false", "false", "0", "true", "0", "false", "0.954392,0.0456075", "false", "false", "false", "SNV", "true", "0x05010000000515043e000100", "1", "false", "130", "rs73009205"),
Allele.create("C"), FACTORY_NAME, null)
)
),
// No matching VCF features (three overlap by position only), since there are no indels in dbSNP (the test datasource), so the ground truth should be a default entry, which was constructed here manually:
helpProvideForTestCreateFuncotations("3", 64157, 64166, "AGAAAGGTCA", "TCTTTCCAGT",
Collections.singletonList(TableFuncotation.create(FIELD_DEFAULT_MAP.keySet().stream().map(s -> FACTORY_NAME + "_" + s).collect(Collectors.toList()),
Arrays.asList("false", "false", "", "false", "false", "", "false", "false", "false", "", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "", "", "false", "false", "", "false", "", "false", "", "false", "false", "false", "", "false", "", "", "false", ""),
Arrays.asList("false", "false", "", "false", "false", "", "false", "false", "false", "", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "", "", "false", "false", "", "false", "", "false", "", "false", "false", "false", "", "false", "", "", "false", "", ""),
Allele.create("TCTTTCCAGT"), FACTORY_NAME, null))
),
};
Expand Down
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Loading