Skip to content

Commit

Permalink
Fix indentation for the check of the last line in the files to be han…
Browse files Browse the repository at this point in the history
…dled outside the for-loop through the files
  • Loading branch information
verku committed Nov 12, 2024
1 parent f210da0 commit 25acad5
Showing 1 changed file with 19 additions and 22 deletions.
41 changes: 19 additions & 22 deletions workflow/scripts/gerp_derived_alleles.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,22 +52,21 @@ def read_gerp_windows(gerpFile, chrom, start, end):
break
elif currentGerpChrom == chrom and nextGerpChrom != chrom and currentGerpPos >= int(start) and currentGerpPos > int(end):
break
if len(lineDeque) == 1: # handle the last line in the file
lastGerpChrom = lineDeque[0].strip().split('\t')[0] # get the chromosome name of the line in the deque
lastGerpPos = int(lineDeque[0].strip().split('\t')[1]) # get the position of the line
if lastGerpChrom == chrom and lastGerpPos >= int(start) and lastGerpPos == int(end):
n_rows += 1
break
if len(lineDeque) == 1: # handle the last line in the file
last_line = lineDeque[0]
lastGerpChrom = last_line.strip().split('\t')[0] # get the chromosome name of the line in the deque
lastGerpPos = int(last_line.strip().split('\t')[1]) # get the position of the line
if lastGerpChrom == chrom and lastGerpPos >= int(start) and lastGerpPos <= int(end):
n_rows += 1
if n_rows > 0:
gerpDF = pd.read_csv(gerpFile, sep='\t', skiprows=skip_rows, nrows=n_rows,
names=['#CHROM', 'POS', 'ancestral_state', 'gerp_score'],
dtype={'#CHROM': 'string', 'POS': int, 'ancestral_state': 'string', 'gerp_score': float}).set_index(['#CHROM', 'POS'])
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "Chromosome", chrom, "from position", start, "to position", end, "from GERP file read into memory")
else: # Add a row with "NaN" in case the window is missing from the file
gerpDF = pd.DataFrame(columns=['#CHROM', 'POS', 'ancestral_state', 'gerp_score']).set_index(['#CHROM', 'POS'])
gerpDF.loc[(chrom,start),:] = (np.nan)
gerpDF.loc[(chrom, start), :] = np.nan
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "Chromosome", chrom, "from position", start, "to position", end, "not present in GERP file, will be set to 'NaN'")
#print(gerpDF.info(memory_usage="deep"))
return gerpDF

# Function to read the VCF file per window for further processing
Expand Down Expand Up @@ -101,28 +100,26 @@ def read_vcf_windows(vcfFile, chrom, start, end):
break
elif currentVcfChrom == chrom and nextVcfChrom != chrom and currentVcfPos >= int(start) and currentVcfPos > int(end):
break
if len(lineDeque) == 1: # handle the last line in the file
last_line = lineDeque[0]
if isinstance(last_line, bytes):
last_line = last_line.decode('utf8').strip()
else:
last_line = last_line.strip()
lastVcfChrom = last_line.split('\t')[0] # get the chromosome name of the line in the deque
lastVcfPos = int(last_line.split('\t')[1]) # get the position of the line
if lastVcfChrom == chrom and lastVcfPos >= int(start) and lastVcfPos <= int(end):
n_rows += 1
break
usecols_list = [0,1,3,4] + [*range(9,len(header))]
if len(lineDeque) == 1: # handle the last line in the file
last_line = lineDeque[0]
if isinstance(last_line, bytes):
last_line = last_line.decode('utf8').strip()
else:
last_line = last_line.strip()
lastVcfChrom = last_line.split('\t')[0] # get the chromosome name of the line in the deque
lastVcfPos = int(last_line.split('\t')[1]) # get the position of the line
if lastVcfChrom == chrom and lastVcfPos >= int(start) and lastVcfPos <= int(end):
n_rows += 1
usecols_list = [0, 1, 3, 4] + [*range(9, len(header))]
usecols_header = [header[i] for i in usecols_list]
if n_rows > 0:
vcfDF = pd.read_csv(vcfFile, sep='\t', skiprows=skip_rows, nrows=n_rows, usecols=usecols_list,
names=usecols_header, dtype = 'string', converters = {'POS': int}).set_index(['#CHROM', 'POS'])
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "Chromosome", chrom, "from position", start, "to position", end, "from VCF file read into memory")
else: # Add a row with "NaN" in case the window is missing from the file
vcfDF = pd.DataFrame(columns=usecols_header).set_index(['#CHROM', 'POS'])
vcfDF.loc[(chrom,start),:] = (np.nan)
vcfDF.loc[(chrom, start), :] = np.nan
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "Chromosome", chrom, "from position", start, "to position", end, "not present in VCF file, will be set to 'NaN'")
#print(vcfDF.info(memory_usage="deep"))
return vcfDF

# Join GERP and VCFs for each window
Expand Down

0 comments on commit 25acad5

Please sign in to comment.