Commit

Merge pull request #4 from davidkastner/scripts
Job manager scripts
davidkastner authored Sep 30, 2023
2 parents 8517f8e + 8038c7f commit 4a31761
Showing 6 changed files with 261 additions and 4 deletions.
2 changes: 1 addition & 1 deletion qp/_version.py
@@ -1 +1 @@
-__version__ = "1.0.0+55.g715c855.dirty"
+__version__ = "1.0.0+57.g38387ef.dirty"
9 changes: 6 additions & 3 deletions qp/cluster/coordination_spheres.py
@@ -58,7 +58,8 @@ def voronoi(model):
    return neighbors


-def get_next_neighbors(start, neighbors, limit, ligands):
+def get_next_neighbors(start, neighbors, limit, ligands, include_waters=False):

    """
    Iteratively determines spheres around a given starting atom
@@ -90,7 +91,7 @@ def get_next_neighbors(start, neighbors, limit, ligands):
                par = n.get_parent()
                if par not in seen and (
                    Polypeptide.is_aa(par)
-                    or par.get_resname() == "HOH"
+                    or (include_waters and par.get_resname() == "HOH")
                    or par.get_resname() in ligands
                ):
                    nxt.add(par)
@@ -371,6 +372,7 @@ def extract_clusters(
    charge=False,
    count=False,
    xyz=False,
+    include_waters=False,
):
    """
    Extract active site coordination spheres. Neighboring residues determined by
@@ -410,8 +412,9 @@ def extract_clusters(
    for res in model.get_residues():
        if res.get_resname() in metals:
            metal_id, residues, spheres = get_next_neighbors(
-                res, neighbors, limit, ligands
+                res, neighbors, limit, ligands, include_waters
            )

            os.makedirs(f"{out}/{metal_id}", exist_ok=True)

            if charge:
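
Behavior note implied by these hunks: waters (HOH) were previously always pulled into the coordination spheres; after this change they are kept only when include_waters=True is threaded through get_next_neighbors. A minimal usage sketch, assuming extract_clusters is importable from qp.cluster.coordination_spheres and that the leading positional arguments (not shown in the hunks) are the input structure and output directory:

from qp.cluster.coordination_spheres import extract_clusters

# Hypothetical call; only the keyword arguments below appear in this diff.
extract_clusters(
    "1abc.pdb",           # assumed: input structure
    "clusters/1abc",      # assumed: output directory
    charge=True,          # also write charge information, per the signature above
    include_waters=True,  # opt back in to the pre-PR behavior of keeping waters
)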
49 changes: 49 additions & 0 deletions scripts/cluster_sizes.py
@@ -0,0 +1,49 @@
import os
import re
import csv

def count_atoms_in_pdb(pdb_file):
    """Count ATOM records in a PDB file (HETATM records are not counted)."""
    count = 0
    with open(pdb_file, 'r') as file:
        for line in file:
            if line.startswith("ATOM"):
                count += 1
    return count

def get_earliest_dir(dirs):
    """Get the earliest directory whose name is a letter followed by numbers."""
    valid_dirs = sorted([d for d in dirs if re.match(r"^[a-zA-Z][0-9]+", d)])
    return valid_dirs[0] if valid_dirs else None

def main():
    results = []
    cwd = os.getcwd()

    # Iterate over PDB directories
    for pdb_dir in os.listdir(cwd):
        pdb_path = os.path.join(cwd, pdb_dir)
        if os.path.isdir(pdb_path) and len(pdb_dir) == 4:  # PDB codes are 4 characters
            subdirs = os.listdir(pdb_path)
            # Ignore the "Protoss" directory
            subdirs = [sd for sd in subdirs if sd != "Protoss"]

            # Get the earliest directory
            earliest_dir = get_earliest_dir(subdirs)
            if earliest_dir:
                atom_count = 0
                subdir_path = os.path.join(pdb_path, earliest_dir)
                for file in sorted(os.listdir(subdir_path)):
                    if re.match(r"^[0-9]+\.pdb$", file):  # Matches 0.pdb, 1.pdb, etc.
                        pdb_file_path = os.path.join(subdir_path, file)
                        atom_count += count_atoms_in_pdb(pdb_file_path)
                results.append((pdb_dir, atom_count))

    # Store results in cluster_sizes.csv
    with open("cluster_sizes.csv", "w", newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["PDB", "ATOMS"])
        writer.writerows(results)

if __name__ == "__main__":
    main()
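
A quick sanity check of the counting helper on a throwaway file (records are illustrative, not from the PR); note that only ATOM records are tallied, so HETATM lines such as metal sites do not contribute:

import os
from cluster_sizes import count_atoms_in_pdb  # assumes the script is importable as a module

# Hypothetical two-record PDB fragment for testing
with open("test.pdb", "w") as f:
    f.write("ATOM      1  N   HIS A   1       0.000   0.000   0.000\n")
    f.write("HETATM    2 FE   FE2 A 201       1.000   1.000   1.000\n")

print(count_atoms_in_pdb("test.pdb"))  # -> 1; the HETATM iron is not counted
os.remove("test.pdb")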
60 changes: 60 additions & 0 deletions scripts/doi.py
@@ -0,0 +1,60 @@
import pandas as pd
import requests

def get_doi_url(pdb_code):
    """Fetch the DOI for a given PDB code using the PDB REST API."""
    url = f'https://data.rcsb.org/rest/v1/core/entry/{pdb_code}'
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Error: Failed to fetch data for PDB: {pdb_code}. Status code: {response.status_code}")
        return None

    response_data = response.json()

    # Extract the DOI from the response data
    doi = response_data.get('pdbx_database_id_doi')
    if doi:
        return doi
    else:
        print(f"No DOI found for PDB: {pdb_code}")
        return None

def main():
    # Read the CSV file
    df = pd.read_csv('protein_master_list_charge_multiplicity.csv')

    # Total number of PDBs
    total_pdbs = len(df)

    # Counter for API calls since the last save
    api_calls_since_last_save = 0

    # For each PDB code, if there isn't already a DOI URL, fetch the DOI and set the DOI URL in column 6
    for index, row in df.iterrows():
        pdb_code = row.iloc[0]
        existing_doi_url = row.iloc[5]

        # Check if the DOI URL is already present
        if pd.isnull(existing_doi_url):
            doi = get_doi_url(pdb_code)
            if doi:
                doi_url = f'https://doi.org/{doi}'
                df.at[index, df.columns[5]] = doi_url

            # Increment the API call counter
            api_calls_since_last_save += 1

            # Print the progress update
            print(f"> Finished PDB: {pdb_code} ({index + 1}/{total_pdbs})")

            # If 5 API calls have been made since the last save, save the dataframe and reset the counter
            if api_calls_since_last_save == 5:
                df.to_csv('doi_charge_multiplicity.csv', index=False)
                api_calls_since_last_save = 0

    # Save one final time after the loop is done to catch any remaining changes
    df.to_csv('doi_charge_multiplicity.csv', index=False)

if __name__ == "__main__":
    main()
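
main() assumes the first CSV column holds the PDB code and the sixth column holds the DOI URL, and get_doi_url reads the top-level pdbx_database_id_doi key of the response. A hypothetical input file matching those positional assumptions (column names are illustrative; only the positions matter):

import pandas as pd

# Column 1 = PDB code, column 6 = DOI URL (left empty so the script fills it)
df = pd.DataFrame(
    [["1abc", "Fe", -2, 5, "holo", None]],
    columns=["PDB", "Metal", "Charge", "Multiplicity", "Set", "DOI_URL"],
)
df.to_csv("protein_master_list_charge_multiplicity.csv", index=False)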
20 changes: 20 additions & 0 deletions scripts/new_master.py
@@ -0,0 +1,20 @@
import os

def main():
    # Read PDB ID codes from the file
    with open('protein_master_list.txt', 'r') as f:
        master_list = [line.strip() for line in f]

    # Get the list of directories in the current working directory
    current_dirs = [name for name in os.listdir() if os.path.isdir(name)]

    # Find the PDB IDs which do not have corresponding folders
    missing_dirs = [pdb_id for pdb_id in master_list if pdb_id not in current_dirs]

    # Save the list of missing IDs to a new file
    with open('missing_ids.txt', 'w') as out_file:
        for pdb_id in missing_dirs:
            out_file.write(pdb_id + '\n')

if __name__ == "__main__":
    main()
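
One small design note: the membership test scans current_dirs once per PDB ID, so the loop is quadratic in the worst case. An equivalent set-based sketch that keeps the whole pass linear for long master lists:

current_dirs = {name for name in os.listdir() if os.path.isdir(name)}
missing_dirs = [pdb_id for pdb_id in master_list if pdb_id not in current_dirs]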
125 changes: 125 additions & 0 deletions scripts/qp_job_manager.py
@@ -0,0 +1,125 @@
import os
import csv
import sys
import time

PDB_PATH = "/home/kastner/projects/quantumPDB/dataset/holo"

def read_charge(pdb_dir):
    charge_file = os.path.join(pdb_dir, 'charge.csv')
    charges = {}
    with open(charge_file, 'r') as file:
        reader = csv.reader(file)
        next(reader)  # Skip the header
        for row in reader:
            # Stop processing at an empty line or a row with fewer than 2 columns
            if len(row) < 2:
                break
            charges[row[0]] = row[1:]
    return charges

def check_and_submit_jobs(pdb_dir, max_jobs, current_count):
    charges = read_charge(pdb_dir)

    for structure_dir in sorted([d for d in os.listdir(pdb_dir) if os.path.isdir(os.path.join(pdb_dir, d)) and not d == 'Protoss']):
        structure_path = os.path.join(pdb_dir, structure_dir)
        if len([f for f in os.listdir(structure_path) if os.path.isfile(os.path.join(structure_path, f))]) != 4:
            print(f"> Directory {pdb_dir.split('/')[-1]} is in progress with quantumPDB.")
            continue

        xyz_file = next((f for f in os.listdir(structure_path) if f.endswith('.xyz')), None)
        if not xyz_file:
            continue
        with open(os.path.join(structure_path, xyz_file), 'r') as file:
            coords = file.read().replace('FE', 'Fe')

        for folder in ['Fe2', 'Fe3']:
            folder_path = os.path.join(structure_path, 'QM', folder)
            os.makedirs(folder_path, exist_ok=True)
            with open(os.path.join(folder_path, xyz_file), 'w') as file:
                file.write(coords)

            qmscript_content = f"""coordinates {xyz_file}
basis lacvps_ecp
method ub3lyp
charge {charges[structure_dir][0 if folder == 'Fe2' else 1]}
spinmult {2 if folder == 'Fe2' else 3}
maxit 1000
dftd d3
scrdir ./scr
pcm cosmo
epsilon 10
scf diis+a
pcm_radii read
pcm_radii_file /home/kastner/reference/pcm_radii
ml_prop yes
end
"""
            with open(os.path.join(folder_path, 'qmscript.in'), 'w') as file:
                file.write(qmscript_content)

            jobscript_content = f"""#! /bin/bash
#$ -N {structure_dir}_{pdb_dir.split('/')[-1]}
#$ -cwd
#$ -l h_rt=300:00:00
#$ -l h_rss=8G
#$ -q (gpusnew|gpus|gpusbig)
#$ -l gpus=1
#$ -pe smp 2
# -fin qmscript.in
# -fin {xyz_file}
# -fout scr/
module load cuda
module load terachem
export OMP_NUM_THREADS=2
terachem qmscript.in > $SGE_O_WORKDIR/qmscript.out
"""
            with open(os.path.join(folder_path, 'jobscript.sh'), 'w') as file:
                file.write(jobscript_content)

            # Change to the directory of the job file
            os.chdir(folder_path)

            # Submit the job to the scheduler
            os.system("qsub jobscript.sh")

            # Change back to the original directory
            os.chdir(PDB_PATH)

        current_count[0] += 1
        if current_count[0] >= max_jobs:
            return True  # indicate that the limit has been reached

        # Check if there's a stop signal file to stop the script
        if os.path.exists('stop_script.signal'):
            print("Stop signal detected. Exiting script.")
            sys.exit(0)

        # Introduce a delay of 2 seconds between submissions
        time.sleep(2)
    return False  # indicate that the limit hasn't been reached yet


def cleanup(pdb_dir):
    for structure_dir in [d for d in os.listdir(pdb_dir) if os.path.isdir(os.path.join(pdb_dir, d)) and not d == 'Protoss']:
        for folder in ['Fe2', 'Fe3']:
            folder_path = os.path.join(pdb_dir, structure_dir, 'QM', folder)
            if os.path.exists(folder_path):
                for file in os.listdir(folder_path):
                    if file.startswith('PDB'):
                        os.remove(os.path.join(folder_path, file))

if __name__ == "__main__":
    # Get the number of jobs to submit from the user
    max_jobs = int(input("Enter the number of jobs to submit: "))

    job_count = [0]  # a mutable list so the count is passed by reference
    limit_reached = False

    for pdb_dir in sorted([d for d in os.listdir(PDB_PATH) if os.path.isdir(os.path.join(PDB_PATH, d))]):
        if limit_reached:
            break
        cleanup(os.path.join(PDB_PATH, pdb_dir))
        limit_reached = check_and_submit_jobs(os.path.join(PDB_PATH, pdb_dir), max_jobs, job_count)
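
read_charge expects charge.csv to carry a header row followed by one row per structure directory, with the directory name in column 1 and the charges used for the Fe2 and Fe3 jobs in the next two columns. A hypothetical file matching that shape (names and values are illustrative, not from the PR):

import csv

# charge.csv as parsed by read_charge(): row[0] keys the dict, row[1:] holds the charges
rows = [
    ["structure", "Fe2_charge", "Fe3_charge"],  # header row is skipped
    ["A290", "-2", "-1"],
    ["B310", "0", "1"],
]
with open("charge.csv", "w", newline="") as f:
    csv.writer(f).writerows(rows)

To pause a long run, create a file named stop_script.signal in the working directory (for example with touch stop_script.signal); the script exits cleanly before moving on to the next structure.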
