From 2192f2af118ce884a9ed62238795634f4054650f Mon Sep 17 00:00:00 2001
From: Tamara El Naboulsi <ten@ebi.ac.uk>
Date: Tue, 1 Oct 2024 16:29:13 +0100
Subject: [PATCH] Split checksum file into chunks

---
 src/python/ensembl/production/xrefs/Base.py | 90 ++++++++++++++-------
 1 file changed, 61 insertions(+), 29 deletions(-)

diff --git a/src/python/ensembl/production/xrefs/Base.py b/src/python/ensembl/production/xrefs/Base.py
index fcf94dc42..1a6a1ea23 100644
--- a/src/python/ensembl/production/xrefs/Base.py
+++ b/src/python/ensembl/production/xrefs/Base.py
@@ -379,44 +379,76 @@ def load_checksum(self, path: str, url: str):
         The path where the checksum files can be found
     url: str
         The database URL to load the checksum data into
+
+    Raises
+    ------
+    LookupError
+        If no source_id is found for a source name.
     """
     checksum_dir = os.path.join(path, 'Checksum')
     if not os.path.exists(checksum_dir): os.makedirs(checksum_dir, exist_ok = True)
 
+    output_files = []
+    threshold = 50000000
+    counter = 1
+    output_fh = None
+
     # Connect to db
     url = url + "?local_infile=1"
     db_engine = self.get_db_engine(url)
     with db_engine.connect() as dbi:
-      counter = 1
-      source_id = 1
-
-      # Open the checksum output file
+      # Get all checksum files
       files = os.listdir(checksum_dir)
-      checksum_file = os.path.join(checksum_dir, 'checksum.txt')
-      with open(checksum_file, 'w') as output_fh:
-        # Go through all available checksum files
-        for file in files:
-          if re.search("checksum", file): continue
-
-          input_file = os.path.join(checksum_dir, file)
-          match = re.search(r"\/([A-Za-z]*)-.*$", input_file)
-          source_name = match.group(1)
-          source_id = self.get_source_id_from_name(dbi, source_name)
-
-          input_fh = self.get_filehandle(input_file)
-          for line in input_fh:
-            line = line.rstrip()
-            (id, checksum) = re.split(r"\s+", line)
-
-            counter += 1
-            output = [str(counter), str(source_id), id, checksum]
-            output_str = "\t".join(output)
-            output_fh.write(f'{output_str}\n')
-
-          input_fh.close()
-
-      query = f'load data local infile \'{checksum_file}\' into table checksum_xref'
-      dbi.execute(text(query))
+
+      # Go through all available checksum files
+      index = 0
+      for checksum_file in files:
+        if re.search("checksum", checksum_file): continue
+
+        # Get the source name and ID
+        input_file = os.path.join(checksum_dir, checksum_file)
+        match = re.search(r"\/([A-Za-z]*)-.*$", input_file)
+        source_name = match.group(1)
+        source_id = self.get_source_id_from_name(dbi, source_name)
+
+        if not source_id:
+          raise LookupError(f'No source_id found for source name {source_name}')
+
+        # Open the input file
+        input_fh = self.get_filehandle(input_file)
+        for line in input_fh:
+          # Start a new chunk file when none is open or counter hits a multiple of threshold
+          if not output_fh or (counter % threshold) == 0:
+            if output_fh: output_fh.close()
+            index += 1
+            output_file = os.path.join(checksum_dir, f'checksum_{index}.txt')
+            output_files.append(output_file)
+            output_fh = open(output_file, 'w')
+
+          line = line.rstrip()
+          (checksum_id, checksum) = re.split(r"\s+", line)
+
+          output = [str(counter), str(source_id), checksum_id, checksum]
+          output_str = "\t".join(output)
+          output_fh.write(f'{output_str}\n')
+
+          counter += 1
+
+        input_fh.close()
+
+      if output_fh: output_fh.close()
+
+      # Add the data in the files to the db
+      for output_file in output_files:
+        dbi.execute(text(f'load data local infile \'{output_file}\' into table checksum_xref'))
+
+      # Merge the created files into checksum.txt, deleting each chunk once copied
+      merged_file = os.path.join(checksum_dir, f'checksum.txt')
+      with open(merged_file,'w') as output_fh:
+        for output_file in output_files:
+          with open(output_file,'r') as input_fh:
+            shutil.copyfileobj(input_fh, output_fh)
+          os.remove(output_file)
 
   def get_filehandle(self, filename: str):
     """ Opens an appropriate read filehandle for a file based on its type.