From ba92baa59a78cedbd6a2b9ba90390360601544e8 Mon Sep 17 00:00:00 2001 From: Robert Sheridan Date: Fri, 15 Mar 2024 12:12:40 -0400 Subject: [PATCH] Speed up CnaEvent lookup during import - store existing events from the database in a HashMap instead of HashSet - retrieval from HashMap uses CnaEvent.Event equals() and hashCode() semantics - retrieval is necessary in order to obtain the associated event_id from the db record - this avoids a linear search through the set of all CnaEvent.Events in the database --- .../scripts/ImportCnaDiscreteLongData.java | 6 ++-- .../portal/scripts/ImportTabDelimData.java | 8 +++-- .../org/mskcc/cbio/portal/util/CnaUtil.java | 31 ++++++++++--------- 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java index f03e5b45..6e82bd6c 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java @@ -45,7 +45,7 @@ public class ImportCnaDiscreteLongData { private String genePanel; private final DaoGeneOptimized daoGene; private CnaUtil cnaUtil; - private Set existingCnaEvents = new HashSet<>(); + private Map existingCnaEvents = new HashMap<>(); private int samplesSkipped = 0; private Set namespaces; @@ -84,7 +84,9 @@ public void importData() throws Exception { && geneticProfile.showProfileInAnalysisTab(); if (isDiscretizedCnaProfile) { - existingCnaEvents.addAll(DaoCnaEvent.getAllCnaEvents()); + for (CnaEvent.Event event : DaoCnaEvent.getAllCnaEvents()) { + existingCnaEvents.put(event, event); + } MySQLbulkLoader.bulkLoadOn(); } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index b984abf4..3787975b 100644 --- 
a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -265,9 +265,11 @@ public void importData(int numLines) throws IOException, DaoException { DaoGeneticAlteration daoGeneticAlteration = DaoGeneticAlteration.getInstance(); //cache for data found in cna_event' table: - Set existingCnaEvents = new HashSet<>(); + Map existingCnaEvents = new HashMap<>(); if (isDiscretizedCnaProfile) { - existingCnaEvents.addAll(DaoCnaEvent.getAllCnaEvents()); + for (CnaEvent.Event event : DaoCnaEvent.getAllCnaEvents()) { + existingCnaEvents.put(event, event); + } MySQLbulkLoader.bulkLoadOn(); } @@ -502,7 +504,7 @@ private boolean parseLine(String line, int nrColumns, int sampleStartIndex, boolean isRppaProfile, boolean isDiscretizedCnaProfile, DaoGeneOptimized daoGene, List filteredSampleIndices, List orderedSampleList, - Set existingCnaEvents + Map existingCnaEvents ) throws DaoException { //TODO: refactor this entire function - split functionality into smaller units / subroutines diff --git a/src/main/java/org/mskcc/cbio/portal/util/CnaUtil.java b/src/main/java/org/mskcc/cbio/portal/util/CnaUtil.java index 3cc6fd71..2540ae2a 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/CnaUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/CnaUtil.java @@ -46,25 +46,28 @@ public CnaUtil(String[] headerParts, Set namespaces) { } public static void storeCnaEvents( - Set existingCnaEvents, + Map existingCnaEvents, List cnaEventsToAdd ) throws DaoException { for (CnaEvent cnaEvent : cnaEventsToAdd) { if (!CNA.AMP.equals(cnaEvent.getAlteration()) && !CNA.HOMDEL.equals(cnaEvent.getAlteration())) { continue; } - - // Revert PR https://github.com/cBioPortal/cbioportal-core/pull/1 breaks importer - Optional existingCnaEvent = existingCnaEvents - .stream() - .filter(e -> e.equals(cnaEvent.getEvent())) - .findFirst(); - if (existingCnaEvent.isPresent()) { - 
cnaEvent.setEventId(existingCnaEvent.get().getEventId()); + CnaEvent.Event event = cnaEvent.getEvent(); + CnaEvent.Event existingEvent = existingCnaEvents.get(event); + // Caution : + // existingEvent (if found) was retrieved from the database and has a populated event_id field. + // event is constructed while parsing the CNA file and does not have a populated event_id field. + // The type CnaEvent.Event, and contained types, have overridden hashCode() and equals() functions + // which allow successful comparison so that an Event with a non-populated event_id field will + // match an Event with a populated / discrepant event_id field. That is to allow this hashmap lookup + // of the previously existing event from the database in order to obtain the event_id (see below). + if (existingEvent != null) { + cnaEvent.setEventId(existingEvent.getEventId()); DaoCnaEvent.addCaseCnaEvent(cnaEvent, false); } else { DaoCnaEvent.addCaseCnaEvent(cnaEvent, true); - existingCnaEvents.add(cnaEvent.getEvent()); + existingCnaEvents.put(event, event); } } } @@ -72,7 +75,7 @@ public static void storeCnaEvents( public CnaEvent createEvent( GeneticProfile geneticProfile, int sampleId, - long entrezId, + long entrezId, String[] parts ) throws IOException { int cnaProfileId = geneticProfile.getGeneticProfileId(); @@ -88,11 +91,11 @@ public CnaEvent createEvent( ); return cna; } - + private String convertMapToJsonString(Map> map) throws JsonProcessingException { return this.objectMapper.writeValueAsString(map); } - + public long getEntrezSymbol(String[] parts) { String entrezAsString = TabDelimitedFileUtil.getPartString(getColumnIndex(CnaUtil.ENTREZ_GENE_ID), parts); if (entrezAsString.isEmpty()) { @@ -123,7 +126,7 @@ private short createAlteration(String[] parts) { */ public int getColumnIndex(String colName) { return this.columnIndexMap.getOrDefault( - colName.toLowerCase(), + colName.toLowerCase(), -1 ); }