Fix indentation for the check of the last line in the files to be han…

…dled outside the for-loop through the files
NBISweden · Nov 12, 2024 · 25acad5 · 25acad5
1 parent f210da0
commit 25acad5
Showing 1 changed file with 19 additions and 22 deletions.
diff --git a/workflow/scripts/gerp_derived_alleles.py b/workflow/scripts/gerp_derived_alleles.py
@@ -52,22 +52,21 @@ def read_gerp_windows(gerpFile, chrom, start, end):
                     break
                 elif currentGerpChrom == chrom and nextGerpChrom != chrom and currentGerpPos >= int(start) and currentGerpPos > int(end):
                     break
-            if len(lineDeque) == 1: # handle the last line in the file
-                lastGerpChrom = lineDeque[0].strip().split('\t')[0] # get the chromosome name of the line in the deque
-                lastGerpPos = int(lineDeque[0].strip().split('\t')[1]) # get the position of the line
-                if lastGerpChrom == chrom and lastGerpPos >= int(start) and lastGerpPos == int(end):
-                    n_rows += 1
-                    break
+        if len(lineDeque) == 1: # handle the last line in the file
+            last_line = lineDeque[0]
+            lastGerpChrom = last_line.strip().split('\t')[0]  # get the chromosome name of the line in the deque
+            lastGerpPos = int(last_line.strip().split('\t')[1])  # get the position of the line
+            if lastGerpChrom == chrom and lastGerpPos >= int(start) and lastGerpPos <= int(end):
+                n_rows += 1
     if n_rows > 0:
         gerpDF = pd.read_csv(gerpFile, sep='\t', skiprows=skip_rows, nrows=n_rows, 
                                 names=['#CHROM', 'POS', 'ancestral_state', 'gerp_score'], 
                                 dtype={'#CHROM': 'string', 'POS': int, 'ancestral_state': 'string', 'gerp_score': float}).set_index(['#CHROM', 'POS'])
         print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "Chromosome", chrom, "from position", start, "to position", end, "from GERP file read into memory")
     else: # Add a row with "NaN" in case the window is missing from the file
         gerpDF = pd.DataFrame(columns=['#CHROM', 'POS', 'ancestral_state', 'gerp_score']).set_index(['#CHROM', 'POS'])
-        gerpDF.loc[(chrom,start),:] = (np.nan)
+        gerpDF.loc[(chrom, start), :] = np.nan
         print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "Chromosome", chrom, "from position", start, "to position", end, "not present in GERP file, will be set to 'NaN'")
-    #print(gerpDF.info(memory_usage="deep"))
     return gerpDF
 
 # Function to read the VCF file per window for further processing
@@ -101,28 +100,26 @@ def read_vcf_windows(vcfFile, chrom, start, end):
                         break
                     elif currentVcfChrom == chrom and nextVcfChrom != chrom and currentVcfPos >= int(start) and currentVcfPos > int(end):
                         break
-                if len(lineDeque) == 1:  # handle the last line in the file
-                    last_line = lineDeque[0]
-                    if isinstance(last_line, bytes):
-                        last_line = last_line.decode('utf8').strip()
-                    else:
-                        last_line = last_line.strip()
-                    lastVcfChrom = last_line.split('\t')[0]  # get the chromosome name of the line in the deque
-                    lastVcfPos = int(last_line.split('\t')[1])  # get the position of the line
-                    if lastVcfChrom == chrom and lastVcfPos >= int(start) and lastVcfPos <= int(end):
-                        n_rows += 1
-                        break
-    usecols_list = [0,1,3,4] + [*range(9,len(header))]
+        if len(lineDeque) == 1: # handle the last line in the file
+            last_line = lineDeque[0]
+            if isinstance(last_line, bytes):
+                last_line = last_line.decode('utf8').strip()
+            else:
+                last_line = last_line.strip()
+            lastVcfChrom = last_line.split('\t')[0]  # get the chromosome name of the line in the deque
+            lastVcfPos = int(last_line.split('\t')[1])  # get the position of the line
+            if lastVcfChrom == chrom and lastVcfPos >= int(start) and lastVcfPos <= int(end):
+                n_rows += 1
+    usecols_list = [0, 1, 3, 4] + [*range(9, len(header))]
     usecols_header = [header[i] for i in usecols_list]
     if n_rows > 0:
         vcfDF = pd.read_csv(vcfFile, sep='\t', skiprows=skip_rows, nrows=n_rows, usecols=usecols_list, 
                               names=usecols_header, dtype = 'string', converters = {'POS': int}).set_index(['#CHROM', 'POS'])
         print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "Chromosome", chrom, "from position", start, "to position", end, "from VCF file read into memory")
     else: # Add a row with "NaN" in case the window is missing from the file
         vcfDF = pd.DataFrame(columns=usecols_header).set_index(['#CHROM', 'POS'])
-        vcfDF.loc[(chrom,start),:] = (np.nan)
+        vcfDF.loc[(chrom, start), :] = np.nan
         print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "Chromosome", chrom, "from position", start, "to position", end, "not present in VCF file, will be set to 'NaN'")
-    #print(vcfDF.info(memory_usage="deep"))
     return vcfDF
 
 # Join GERP and VCFs for each window