Skip to content

Commit

Permalink
Add custom namespace columns to CNA
Browse files Browse the repository at this point in the history
  • Loading branch information
Bas Leenknegt committed Nov 3, 2022
1 parent b90f830 commit ff39c51
Show file tree
Hide file tree
Showing 27 changed files with 6,889 additions and 6,466 deletions.
68 changes: 10 additions & 58 deletions core/src/main/java/org/mskcc/cbio/maf/MafUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,12 @@

package org.mskcc.cbio.maf;

import static org.mskcc.cbio.maf.ValueTypeUtil.isDouble;
import static org.mskcc.cbio.maf.ValueTypeUtil.isFloat;
import static org.mskcc.cbio.maf.ValueTypeUtil.isInt;


import java.util.*;
import java.util.regex.Pattern;

import joptsimple.internal.Strings;
import org.mskcc.cbio.portal.scripts.*;

/**
* Utility Class for Parsing MAF Files.
Expand All @@ -49,8 +46,6 @@
*/
public class MafUtil {
private static final Pattern validNucleotidesPattern = Pattern.compile("^([ATGC]*)$");
public static final String NAMESPACE_DELIMITER = ".";
public static final String NAMESPACE_DELIMITER_REGEX = "\\.";
// standard header column names
public static final String HUGO_SYMBOL = "Hugo_Symbol";
public static final String ENTREZ_GENE_ID = "Entrez_Gene_Id";
Expand Down Expand Up @@ -215,7 +210,7 @@ public class MafUtil {

// mapping for all column names (both standard and custom columns)
private HashMap<String, Integer> columnIndexMap;
private Map<String, Map<String, Integer>> namespaceIndexMap;
private final NamespaceColumnParser namespaceColumnParser;

public MafUtil(String headerLine) {
this(headerLine, null);
Expand All @@ -226,9 +221,9 @@ public MafUtil(String headerLine) {
* @param headerLine Header Line.
*/
public MafUtil(String headerLine, Set<String> namespaces) {

// init column index map
this.columnIndexMap = new HashMap<String, Integer>();
this.namespaceIndexMap = new HashMap<String, Map<String, Integer>>();

// split header names
String parts[] = headerLine.split("\t");
Expand Down Expand Up @@ -374,26 +369,10 @@ public MafUtil(String headerLine, Set<String> namespaces) {
driverTiersIndex = i;
} else if(header.equalsIgnoreCase(DRIVER_TIERS_FILTER_ANNOTATION)) {
driverTiersAnnIndex = i;
} else if (namespaces != null && !namespaces.isEmpty()) {
int columnIndex = i;
namespaces.stream()
// Perform a case-insensitive match of namespace in meta file with the column name.
.filter(namespace -> header.toLowerCase().startsWith(namespace.toLowerCase(
Locale.ROOT) + NAMESPACE_DELIMITER))
.findFirst()
.ifPresent(namespace -> {
String columnName = header.split(NAMESPACE_DELIMITER_REGEX)[1];
// For legacy reasons perform lower-case transformation for ASCN column names.
if (namespace.equalsIgnoreCase("ascn")) {
columnName = columnName.toLowerCase();
}
// Key the namespaces with the format (upper-/lowercase) specified in the meta file.
Map<String, Integer> nsKeyIndexMap = this.namespaceIndexMap.getOrDefault(namespace, new HashMap<>());
nsKeyIndexMap.put(columnName, columnIndex);
this.namespaceIndexMap.put(namespace, nsKeyIndexMap);
});
}
}

this.namespaceColumnParser = new NamespaceColumnParser(namespaces, parts);
}

public MafRecord parseRecord(String line) {
Expand Down Expand Up @@ -485,30 +464,15 @@ public MafRecord parseRecord(String line) {

fixEndPointForInsertion(record);

// extract namespace key-value pairs for json annotation support
Map<String, Map<String, Object>> recordNamespaceAnnotationJsonMap = new HashMap<>();
if (!namespaceIndexMap.isEmpty()) {
for (Map.Entry<String, Map<String, Integer>> nsKeyIndexMap : namespaceIndexMap.entrySet()) {
String namespace = nsKeyIndexMap.getKey();
// construct map of the key-value pairs from the record
Map<String, Object> namespaceKeyValueMappings = new HashMap<>();
for (Map.Entry<String, Integer> nsKeyIndexPairs : nsKeyIndexMap.getValue().entrySet()) {
String keyName = nsKeyIndexPairs.getKey();
Integer keyIndex = nsKeyIndexPairs.getValue();
String stringValue = TabDelimitedFileUtil.getPartStringAllowEmptyAndNA(keyIndex, parts);
namespaceKeyValueMappings.put(keyName, parseNamespaceValue(stringValue));
}
// update namespace map with the key-value pairs extracted from record
recordNamespaceAnnotationJsonMap.put(namespace, namespaceKeyValueMappings);
}
record.setNamespacesMap(recordNamespaceAnnotationJsonMap);
}
record.setNamespacesMap(
this.namespaceColumnParser.parseCustomNamespaces(parts)
);

return record;
}

public Map<String, Map<String, Integer>> getNamespaceIndexMap() {
return namespaceIndexMap;
return this.namespaceColumnParser.getNamespaceColumnIndexMap();
}

private void fixEndPointForInsertion(MafRecord record) {
Expand Down Expand Up @@ -845,17 +809,5 @@ public static boolean variantContainsAmbiguousTumorSeqAllele(String referenceAll
(validNucleotidesPattern.matcher(tumorSeqAllele1.toUpperCase()).matches() || validNucleotidesPattern.matcher(tumorSeqAllele2.toUpperCase()).matches()));
}

/**
* Converts the raw text of a namespace column cell into a typed value.
* The type checks run in a fixed order (int, then float, then double)
* using the predicates statically imported from ValueTypeUtil; the first
* one that accepts the text decides the returned type.
*
* @param stringValue raw cell text; may be null or empty
* @return null for a null or empty input, an Integer, Float or Double when
*         the matching predicate accepts the text, otherwise the unchanged
*         input string
*/
private static Object parseNamespaceValue(String stringValue) {
    // A missing or empty cell carries no value.
    if (stringValue == null || stringValue.isEmpty()) {
        return null;
    }
    // Order matters: the narrowest numeric type is tried first.
    if (isInt(stringValue)) {
        return Integer.parseInt(stringValue);
    }
    if (isFloat(stringValue)) {
        return Float.parseFloat(stringValue);
    }
    if (isDouble(stringValue)) {
        return Double.parseDouble(stringValue);
    }
    // Not recognised as a number: keep the text as-is.
    return stringValue;
}

}

27 changes: 14 additions & 13 deletions core/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,9 @@ public static void addCaseCnaEvent(CnaEvent cnaEvent, boolean newCnaEvent) throw
MySQLbulkLoader.getMySQLbulkLoader("sample_cna_event").insertRecord(
Long.toString(eventId),
Integer.toString(cnaEvent.getSampleId()),
Integer.toString(cnaEvent.getCnaProfileId())
);
Integer.toString(cnaEvent.getCnaProfileId()),
cnaEvent.getAnnotationJson()
);

if ((cnaEvent.getDriverFilter() != null
&& !cnaEvent.getDriverFilter().isEmpty()
Expand All @@ -72,17 +73,17 @@ public static void addCaseCnaEvent(CnaEvent cnaEvent, boolean newCnaEvent) throw
&& !cnaEvent.getDriverTiersFilter().isEmpty()
&& !cnaEvent.getDriverTiersFilter().toLowerCase().equals("na"))
) {
MySQLbulkLoader
.getMySQLbulkLoader("alteration_driver_annotation")
.insertRecord(
Long.toString(eventId),
Integer.toString(cnaEvent.getCnaProfileId()),
Integer.toString(cnaEvent.getSampleId()),
cnaEvent.getDriverFilter(),
cnaEvent.getDriverFilterAnnotation(),
cnaEvent.getDriverTiersFilter(),
cnaEvent.getDriverTiersFilterAnnotation()
);
MySQLbulkLoader
.getMySQLbulkLoader("alteration_driver_annotation")
.insertRecord(
Long.toString(eventId),
Integer.toString(cnaEvent.getCnaProfileId()),
Integer.toString(cnaEvent.getSampleId()),
cnaEvent.getDriverFilter(),
cnaEvent.getDriverFilterAnnotation(),
cnaEvent.getDriverTiersFilter(),
cnaEvent.getDriverTiersFilterAnnotation()
);
}
}
}
Expand Down
11 changes: 10 additions & 1 deletion core/src/main/java/org/mskcc/cbio/portal/model/CnaEvent.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ public class CnaEvent {
private String driverFilterAnnotation;
private String driverTiersFilter;
private String driverTiersFilterAnnotation;

private String annotationJson;

public static class Event {
private long eventId;
private CanonicalGene gene;
Expand Down Expand Up @@ -221,6 +222,14 @@ public String getDriverTiersFilterAnnotation() {
public void setDriverTiersFilterAnnotation(String driverTiersFilterAnnotation) {
this.driverTiersFilterAnnotation = driverTiersFilterAnnotation;
}

/**
* Returns the annotation JSON string held by this CNA event.
*
* @return the stored annotationJson value, exactly as last assigned
*/
public String getAnnotationJson() {
    return this.annotationJson;
}

/**
* Stores the annotation JSON string on this CNA event.
* The value is assigned as given; no validation or parsing happens here.
*
* @param annotationJson the annotation JSON string to keep on this event
*/
public void setAnnotationJson(String annotationJson) {
this.annotationJson = annotationJson;
}

@Override
public int hashCode() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,19 @@ public class ImportCnaDiscreteLongData {
private CnaUtil cnaUtil;
private Set<CnaEvent.Event> existingCnaEvents = new HashSet<>();
private int samplesSkipped = 0;

private Set<String> namespaces;

private final ArrayList<SampleIdGeneticProfileId> sampleIdGeneticProfileIds = new ArrayList<>();

public ImportCnaDiscreteLongData(
File cnaFile,
int geneticProfileId,
String genePanel,
DaoGeneOptimized daoGene,
DaoGeneticAlteration daoGeneticAlteration
DaoGeneticAlteration daoGeneticAlteration,
Set<String> namespaces
) {
this.namespaces = namespaces;
this.cnaFile = cnaFile;
this.geneticProfileId = geneticProfileId;
this.genePanel = genePanel;
Expand All @@ -70,7 +73,8 @@ public void importData() throws Exception {
// Pass first line with headers to util:
String line = buf.readLine();
int lineIndex = 1;
this.cnaUtil = new CnaUtil(line);
String[] headerParts = line.split("\t", -1);
this.cnaUtil = new CnaUtil(headerParts, this.namespaces);

GeneticProfile geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId);

Expand Down Expand Up @@ -106,8 +110,7 @@ public void importData() throws Exception {
}
}

ProgressMonitor.setCurrentMessage(" --> total number of samples skipped (normal samples): " + getSamplesSkipped()
);
ProgressMonitor.setCurrentMessage(" --> total number of samples skipped (normal samples): " + getSamplesSkipped());
buf.close();
MySQLbulkLoader.flushAll();
}
Expand Down Expand Up @@ -143,15 +146,15 @@ public void extractDataToImport(
long entrezId = gene.getEntrezGeneId();
int sampleId = sample.getInternalId();
CnaEventImportData eventContainer = new CnaEventImportData();
eventContainer.cnaEvent = cnaUtil.createEvent(geneticProfile, sample.getInternalId(), lineParts);

Table<Long, Integer, CnaEventImportData> geneBySampleEventTable = importContainer.eventsTable;

if (!geneBySampleEventTable.contains(entrezId, sample.getInternalId())) {
geneBySampleEventTable.put(entrezId, sampleId, eventContainer);
} else {
ProgressMonitor.logWarning(format("Skipping line %d with duplicate gene %d and sample %d", lineIndex, entrezId, sampleId));
}

eventContainer.geneticEvent = cnaUtil.createEvent(geneticProfile, sample.getInternalId(), lineParts);
}

/**
Expand All @@ -162,8 +165,8 @@ private void storeCnaEvents(CnaImportData toImport, Long entrezId) throws DaoExc
.row(entrezId)
.values()
.stream()
.filter(v -> v.geneticEvent != null)
.map(v -> v.geneticEvent)
.filter(v -> v.cnaEvent != null)
.map(v -> v.cnaEvent)
.collect(Collectors.toList());
CnaUtil.storeCnaEvents(existingCnaEvents, events);
}
Expand All @@ -176,9 +179,9 @@ private boolean storeGeneticAlterations(CnaImportData toImport, Long entrezId) t
.row(entrezId)
.values()
.stream()
.filter(v -> v.geneticEvent != null)
.filter(v -> v.cnaEvent != null)
.map(v -> "" + v
.geneticEvent
.cnaEvent
.getAlteration()
.getCode()
)
Expand Down Expand Up @@ -320,7 +323,7 @@ private class CnaImportData {

private class CnaEventImportData {
public int line;
public CnaEvent geneticEvent;
public CnaEvent cnaEvent;
public String geneSymbol;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -421,7 +421,7 @@ public void importData() throws IOException, DaoException {
ascn = new AlleleSpecificCopyNumber(ascnData);
}
if (record.getNamespacesMap() != null && !record.getNamespacesMap().isEmpty()) {
mutation.setAnnotationJson(convertMapToJsonString(record.getNamespacesMap()));
mutation.setAnnotationJson(this.convertMapToJsonString(record.getNamespacesMap()));
}

sequencedCaseSet.add(sample.getStableId());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,12 +131,14 @@ public void run() {
geneticProfile.getGeneticAlterationType() == GeneticAlterationType.COPY_NUMBER_ALTERATION
&& geneticProfile.getDatatype().equals("DISCRETE_LONG")
) {
Set<String> namespaces = GeneticProfileReader.getNamespaces(descriptorFile);
ImportCnaDiscreteLongData importer = new ImportCnaDiscreteLongData(
dataFile,
geneticProfile.getGeneticProfileId(),
genePanel,
daoGene,
daoGeneticAlteration
daoGeneticAlteration,
namespaces
);
importer.importData();
} else {
Expand Down
Loading

0 comments on commit ff39c51

Please sign in to comment.