Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[RFC 65] Add custom namespace columns to CNA #9878

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 9 additions & 58 deletions core/src/main/java/org/mskcc/cbio/maf/MafUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,6 @@

package org.mskcc.cbio.maf;

import static org.mskcc.cbio.maf.ValueTypeUtil.isDouble;
import static org.mskcc.cbio.maf.ValueTypeUtil.isFloat;
import static org.mskcc.cbio.maf.ValueTypeUtil.isInt;


import java.util.*;
import java.util.regex.Pattern;
Expand All @@ -49,8 +45,6 @@
*/
public class MafUtil {
private static final Pattern validNucleotidesPattern = Pattern.compile("^([ATGC]*)$");
public static final String NAMESPACE_DELIMITER = ".";
public static final String NAMESPACE_DELIMITER_REGEX = "\\.";
// standard header column names
public static final String HUGO_SYMBOL = "Hugo_Symbol";
public static final String ENTREZ_GENE_ID = "Entrez_Gene_Id";
Expand Down Expand Up @@ -215,7 +209,7 @@ public class MafUtil {

// mapping for all column names (both standard and custom columns)
private HashMap<String, Integer> columnIndexMap;
private Map<String, Map<String, Integer>> namespaceIndexMap;
private final NamespaceColumnParser namespaceColumnParser;

public MafUtil(String headerLine) {
this(headerLine, null);
Expand All @@ -226,9 +220,9 @@ public MafUtil(String headerLine) {
* @param headerLine Header Line.
*/
public MafUtil(String headerLine, Set<String> namespaces) {

// init column index map
this.columnIndexMap = new HashMap<String, Integer>();
this.namespaceIndexMap = new HashMap<String, Map<String, Integer>>();

// split header names
String parts[] = headerLine.split("\t");
Expand Down Expand Up @@ -374,26 +368,10 @@ public MafUtil(String headerLine, Set<String> namespaces) {
driverTiersIndex = i;
} else if(header.equalsIgnoreCase(DRIVER_TIERS_FILTER_ANNOTATION)) {
driverTiersAnnIndex = i;
} else if (namespaces != null && !namespaces.isEmpty()) {
int columnIndex = i;
namespaces.stream()
// Perform a case-insensitive match of namespace in meta file with the column name.
.filter(namespace -> header.toLowerCase().startsWith(namespace.toLowerCase(
Locale.ROOT) + NAMESPACE_DELIMITER))
.findFirst()
.ifPresent(namespace -> {
String columnName = header.split(NAMESPACE_DELIMITER_REGEX)[1];
// For legacy reasons perform lower-case transformation for ASCN column names.
if (namespace.equalsIgnoreCase("ascn")) {
columnName = columnName.toLowerCase();
}
// Key the namespaces with the format (upper-/lowercase) specified in the meta file.
Map<String, Integer> nsKeyIndexMap = this.namespaceIndexMap.getOrDefault(namespace, new HashMap<>());
nsKeyIndexMap.put(columnName, columnIndex);
this.namespaceIndexMap.put(namespace, nsKeyIndexMap);
});
}
}

this.namespaceColumnParser = new NamespaceColumnParser(namespaces, parts);
}

public MafRecord parseRecord(String line) {
Expand Down Expand Up @@ -485,30 +463,15 @@ public MafRecord parseRecord(String line) {

fixEndPointForInsertion(record);

// extract namespace key-value pairs for json annotation support
Map<String, Map<String, Object>> recordNamespaceAnnotationJsonMap = new HashMap<>();
if (!namespaceIndexMap.isEmpty()) {
for (Map.Entry<String, Map<String, Integer>> nsKeyIndexMap : namespaceIndexMap.entrySet()) {
String namespace = nsKeyIndexMap.getKey();
// construct map of the key-value pairs from the record
Map<String, Object> namespaceKeyValueMappings = new HashMap<>();
for (Map.Entry<String, Integer> nsKeyIndexPairs : nsKeyIndexMap.getValue().entrySet()) {
String keyName = nsKeyIndexPairs.getKey();
Integer keyIndex = nsKeyIndexPairs.getValue();
String stringValue = TabDelimitedFileUtil.getPartStringAllowEmptyAndNA(keyIndex, parts);
namespaceKeyValueMappings.put(keyName, parseNamespaceValue(stringValue));
}
// update namespace map with the key-value pairs extracted from record
recordNamespaceAnnotationJsonMap.put(namespace, namespaceKeyValueMappings);
}
record.setNamespacesMap(recordNamespaceAnnotationJsonMap);
}
record.setNamespacesMap(
this.namespaceColumnParser.parseCustomNamespaces(parts)
);

return record;
}

public Map<String, Map<String, Integer>> getNamespaceIndexMap() {
return namespaceIndexMap;
return this.namespaceColumnParser.getNamespaceColumnIndexMap();
}

private void fixEndPointForInsertion(MafRecord record) {
Expand Down Expand Up @@ -845,17 +808,5 @@ public static boolean variantContainsAmbiguousTumorSeqAllele(String referenceAll
(validNucleotidesPattern.matcher(tumorSeqAllele1.toUpperCase()).matches() || validNucleotidesPattern.matcher(tumorSeqAllele2.toUpperCase()).matches()));
}

private static Object parseNamespaceValue(String stringValue) {
if (stringValue == null || stringValue.isEmpty()) {
return null;
} else if (isInt(stringValue)) {
return Integer.parseInt(stringValue);
} else if (isFloat(stringValue)) {
return Float.parseFloat(stringValue);
} else if (isDouble(stringValue)) {
return Double.parseDouble(stringValue);
}
return stringValue;
}

}

97 changes: 97 additions & 0 deletions core/src/main/java/org/mskcc/cbio/maf/NamespaceColumnParser.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
package org.mskcc.cbio.maf;

import com.fasterxml.jackson.databind.*;
import org.mskcc.cbio.maf.*;

import java.io.*;
import java.util.*;

import static org.mskcc.cbio.maf.ValueTypeUtil.isDouble;
import static org.mskcc.cbio.maf.ValueTypeUtil.isFloat;
import static org.mskcc.cbio.maf.ValueTypeUtil.isInt;

/**
 * Finds namespace-prefixed columns ({@code <namespace>.<column>}) in a
 * tab-delimited header and extracts their values from data rows.
 *
 * <p>The header is scanned once, in the constructor. Afterwards
 * {@link #parseCustomNamespaces(String[])} can be called for every data row
 * to obtain a {@code namespace -> (column name -> value)} map.
 */
public class NamespaceColumnParser {

    public static final String NAMESPACE_DELIMITER = ".";
    public static final String NAMESPACE_DELIMITER_REGEX = "\\.";

    // namespace (spelled as passed in by the caller) -> (column name -> column index)
    private final Map<String, Map<String, Integer>> namespaceIndexMap;

    /**
     * Builds the column index for the given header.
     *
     * @param namespaces namespaces to look for; {@code null} or empty means
     *                   no column is treated as a namespace column
     * @param parts      the header line split into column names
     */
    public NamespaceColumnParser(Set<String> namespaces, String[] parts) {
        this.namespaceIndexMap = new HashMap<>();
        findNamespaceHeaders(namespaces, parts);
    }

    /**
     * @return the {@code namespace -> (column name -> column index)} map built
     *         from the header; this is the parser's internal map, not a copy
     */
    public Map<String, Map<String, Integer>> getNamespaceColumnIndexMap() {
        return namespaceIndexMap;
    }

    /**
     * Records the index of every header column that starts with one of the
     * requested namespaces followed by {@link #NAMESPACE_DELIMITER}.
     */
    private void findNamespaceHeaders(
        Set<String> namespaces,
        String[] parts
    ) {
        // Nothing to look for: checked once instead of once per column.
        if (namespaces == null || namespaces.isEmpty()) {
            return;
        }
        for (int i = 0; i < parts.length; i++) {
            String header = parts[i];
            int columnIndex = i;
            namespaces
                .stream()
                // Perform a case-insensitive match of namespace in meta file with the column name.
                .filter(namespace -> hasNamespacePrefix(header, namespace))
                .findFirst()
                .ifPresent(namespace -> registerColumn(namespace, header, columnIndex));
        }
    }

    /**
     * Tells whether {@code header} starts with {@code namespace + "."},
     * ignoring case.
     *
     * <p>{@code regionMatches} compares character by character and does not
     * depend on the default locale, unlike lower-casing both strings first.
     */
    private static boolean hasNamespacePrefix(String header, String namespace) {
        if (header == null || namespace == null) {
            return false;
        }
        String prefix = namespace + NAMESPACE_DELIMITER;
        return header.regionMatches(true, 0, prefix, 0, prefix.length());
    }

    /**
     * Stores the index of a matched column under its namespace.
     *
     * <p>The column name is everything after the namespace prefix, so names
     * that themselves contain the delimiter (e.g. {@code ns.a.b}) are kept
     * whole. A header with nothing after the delimiter (e.g. {@code ns.}) has
     * no usable column name and is ignored.
     */
    private void registerColumn(String namespace, String header, int columnIndex) {
        String columnName = header.substring(namespace.length() + NAMESPACE_DELIMITER.length());
        if (columnName.isEmpty()) {
            return;
        }
        // For legacy reasons perform lower-case transformation for ASCN column names.
        if (namespace.equalsIgnoreCase("ascn")) {
            columnName = columnName.toLowerCase(Locale.ROOT);
        }
        // Key the namespaces with the format (upper-/lowercase) specified in the meta file.
        this.namespaceIndexMap
            .computeIfAbsent(namespace, key -> new HashMap<>())
            .put(columnName, columnIndex);
    }

    /**
     * Extracts the namespace key-value pairs of one data row (used for json
     * annotation support).
     *
     * @param parts the data row split into column values
     * @return {@code namespace -> (column name -> parsed value)}; empty when
     *         the header contained no namespace columns. Values are converted
     *         by {@link #parseNamespaceValue(String)} and may be {@code null}.
     */
    public Map<String, Map<String, Object>> parseCustomNamespaces(String[] parts) {
        Map<String, Map<String, Object>> recordNamespaceAnnotationJsonMap = new HashMap<>();
        for (Map.Entry<String, Map<String, Integer>> nsKeyIndexMap : namespaceIndexMap.entrySet()) {
            // construct map of the key-value pairs from the record
            Map<String, Object> namespaceKeyValueMappings = new HashMap<>();
            for (Map.Entry<String, Integer> nsKeyIndexPairs : nsKeyIndexMap.getValue().entrySet()) {
                String stringValue = TabDelimitedFileUtil.getPartStringAllowEmptyAndNA(
                    nsKeyIndexPairs.getValue(), parts);
                namespaceKeyValueMappings.put(nsKeyIndexPairs.getKey(), parseNamespaceValue(stringValue));
            }
            // update namespace map with the key-value pairs extracted from record
            recordNamespaceAnnotationJsonMap.put(nsKeyIndexMap.getKey(), namespaceKeyValueMappings);
        }
        return recordNamespaceAnnotationJsonMap;
    }

    /**
     * Converts a raw cell value to the most specific type the
     * {@code ValueTypeUtil} checks accept, tried in the order int, float,
     * double.
     *
     * @param stringValue raw cell text; may be {@code null}
     * @return {@code null} for a {@code null} or empty input; otherwise an
     *         {@code Integer}, {@code Float} or {@code Double} when the
     *         matching check succeeds, else the unchanged string
     */
    public static Object parseNamespaceValue(String stringValue) {
        if (stringValue == null || stringValue.isEmpty()) {
            return null;
        } else if (isInt(stringValue)) {
            return Integer.parseInt(stringValue);
        } else if (isFloat(stringValue)) {
            return Float.parseFloat(stringValue);
        } else if (isDouble(stringValue)) {
            return Double.parseDouble(stringValue);
        }
        return stringValue;
    }

}
27 changes: 14 additions & 13 deletions core/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,9 @@ public static void addCaseCnaEvent(CnaEvent cnaEvent, boolean newCnaEvent) throw
MySQLbulkLoader.getMySQLbulkLoader("sample_cna_event").insertRecord(
Long.toString(eventId),
Integer.toString(cnaEvent.getSampleId()),
Integer.toString(cnaEvent.getCnaProfileId())
);
Integer.toString(cnaEvent.getCnaProfileId()),
cnaEvent.getAnnotationJson()
);

if ((cnaEvent.getDriverFilter() != null
&& !cnaEvent.getDriverFilter().isEmpty()
Expand All @@ -72,17 +73,17 @@ public static void addCaseCnaEvent(CnaEvent cnaEvent, boolean newCnaEvent) throw
&& !cnaEvent.getDriverTiersFilter().isEmpty()
&& !cnaEvent.getDriverTiersFilter().toLowerCase().equals("na"))
) {
MySQLbulkLoader
.getMySQLbulkLoader("alteration_driver_annotation")
.insertRecord(
Long.toString(eventId),
Integer.toString(cnaEvent.getCnaProfileId()),
Integer.toString(cnaEvent.getSampleId()),
cnaEvent.getDriverFilter(),
cnaEvent.getDriverFilterAnnotation(),
cnaEvent.getDriverTiersFilter(),
cnaEvent.getDriverTiersFilterAnnotation()
);
MySQLbulkLoader
.getMySQLbulkLoader("alteration_driver_annotation")
.insertRecord(
Long.toString(eventId),
Integer.toString(cnaEvent.getCnaProfileId()),
Integer.toString(cnaEvent.getSampleId()),
cnaEvent.getDriverFilter(),
cnaEvent.getDriverFilterAnnotation(),
cnaEvent.getDriverTiersFilter(),
cnaEvent.getDriverTiersFilterAnnotation()
);
}
}
}
Expand Down
11 changes: 10 additions & 1 deletion core/src/main/java/org/mskcc/cbio/portal/model/CnaEvent.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ public class CnaEvent {
private String driverFilterAnnotation;
private String driverTiersFilter;
private String driverTiersFilterAnnotation;

private String annotationJson;

public static class Event {
private long eventId;
private CanonicalGene gene;
Expand Down Expand Up @@ -221,6 +222,14 @@ public String getDriverTiersFilterAnnotation() {
public void setDriverTiersFilterAnnotation(String driverTiersFilterAnnotation) {
this.driverTiersFilterAnnotation = driverTiersFilterAnnotation;
}

/**
 * Returns the annotation JSON string held by this CNA event.
 *
 * @return the value last passed to {@link #setAnnotationJson(String)};
 *         {@code null} if it was never set
 */
public String getAnnotationJson() {
return annotationJson;
}

/**
 * Sets the annotation JSON string held by this CNA event.
 * The value is stored as given; no validation is performed here.
 *
 * @param annotationJson the annotation JSON string to store
 */
public void setAnnotationJson(String annotationJson) {
this.annotationJson = annotationJson;
}

@Override
public int hashCode() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,16 +47,19 @@ public class ImportCnaDiscreteLongData {
private CnaUtil cnaUtil;
private Set<CnaEvent.Event> existingCnaEvents = new HashSet<>();
private int samplesSkipped = 0;

private Set<String> namespaces;

private final ArrayList<SampleIdGeneticProfileId> sampleIdGeneticProfileIds = new ArrayList<>();

public ImportCnaDiscreteLongData(
File cnaFile,
int geneticProfileId,
String genePanel,
DaoGeneOptimized daoGene,
DaoGeneticAlteration daoGeneticAlteration
DaoGeneticAlteration daoGeneticAlteration,
Set<String> namespaces
) {
this.namespaces = namespaces;
this.cnaFile = cnaFile;
this.geneticProfileId = geneticProfileId;
this.genePanel = genePanel;
Expand All @@ -71,7 +74,8 @@ public void importData() throws Exception {
// Pass first line with headers to util:
String line = buf.readLine();
int lineIndex = 1;
this.cnaUtil = new CnaUtil(line);
String[] headerParts = line.split("\t", -1);
this.cnaUtil = new CnaUtil(headerParts, this.namespaces);

GeneticProfile geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId);

Expand Down Expand Up @@ -147,6 +151,8 @@ public void extractDataToImport(
long entrezId = gene.getEntrezGeneId();
int sampleId = sample.getInternalId();
CnaEventImportData eventContainer = new CnaEventImportData();
eventContainer.cnaEvent = cnaUtil.createEvent(geneticProfile, sample.getInternalId(), lineParts);

Table<Long, Integer, CnaEventImportData> geneBySampleEventTable = importContainer.eventsTable;

if (!geneBySampleEventTable.contains(entrezId, sample.getInternalId())) {
Expand All @@ -155,7 +161,6 @@ public void extractDataToImport(
ProgressMonitor.logWarning(format("Skipping line %d with duplicate gene %d and sample %d", lineIndex, entrezId, sampleId));
}

eventContainer.geneticEvent = cnaUtil.createEvent(geneticProfile, sample.getInternalId(), lineParts);
}

/**
Expand All @@ -166,8 +171,8 @@ private void storeCnaEvents(CnaImportData toImport, Long entrezId) throws DaoExc
.row(entrezId)
.values()
.stream()
.filter(v -> v.geneticEvent != null)
.map(v -> v.geneticEvent)
.filter(v -> v.cnaEvent != null)
.map(v -> v.cnaEvent)
.collect(Collectors.toList());
CnaUtil.storeCnaEvents(existingCnaEvents, events);
}
Expand All @@ -180,9 +185,9 @@ private boolean storeGeneticAlterations(CnaImportData toImport, Long entrezId) t
.row(entrezId)
.values()
.stream()
.filter(v -> v.geneticEvent != null)
.filter(v -> v.cnaEvent != null)
.map(v -> "" + v
.geneticEvent
.cnaEvent
.getAlteration()
.getCode()
)
Expand Down Expand Up @@ -324,7 +329,7 @@ private class CnaImportData {

private class CnaEventImportData {
public int line;
public CnaEvent geneticEvent;
public CnaEvent cnaEvent;
public String geneSymbol;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,12 +133,14 @@ public void run() {
geneticProfile.getGeneticAlterationType() == GeneticAlterationType.COPY_NUMBER_ALTERATION
&& DISCRETE_LONG.name().equals(geneticProfile.getDatatype())
) {
Set<String> namespaces = GeneticProfileReader.getNamespaces(descriptorFile);
ImportCnaDiscreteLongData importer = new ImportCnaDiscreteLongData(
dataFile,
geneticProfile.getGeneticProfileId(),
genePanel,
daoGene,
daoGeneticAlteration
daoGeneticAlteration,
namespaces
);
importer.importData();
} else {
Expand Down
Loading