-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4 from davidkastner/scripts
Job manager scripts
- Loading branch information
Showing
6 changed files
with
261 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
__version__ = "1.0.0+55.g715c855.dirty" | ||
__version__ = "1.0.0+57.g38387ef.dirty" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import os | ||
import re | ||
import csv | ||
|
||
def count_atoms_in_pdb(pdb_file):
    """Return the number of ATOM records in a PDB file.

    Only lines beginning with "ATOM" are counted; HETATM and other
    record types are ignored.
    """
    with open(pdb_file, 'r') as handle:
        return sum(1 for record in handle if record.startswith("ATOM"))
|
||
def get_earliest_dir(dirs):
    """Return the lexicographically earliest directory name that starts
    with a letter followed by digits, or None when no name qualifies."""
    pattern = re.compile(r"^[a-zA-Z][0-9]+")
    candidates = [name for name in dirs if pattern.match(name)]
    if not candidates:
        return None
    return min(candidates)
|
||
def main():
    """Tally ATOM counts per 4-character PDB directory in the current
    working directory and write the totals to cluster_sizes.csv."""
    cwd = os.getcwd()
    results = []

    for pdb_dir in os.listdir(cwd):
        pdb_path = os.path.join(cwd, pdb_dir)
        # PDB accession codes are conventionally 4 characters long.
        if not os.path.isdir(pdb_path) or len(pdb_dir) != 4:
            continue

        # The "Protoss" directory holds protonation output, not a cluster.
        subdirs = [entry for entry in os.listdir(pdb_path) if entry != "Protoss"]

        earliest_dir = get_earliest_dir(subdirs)
        if not earliest_dir:
            continue

        subdir_path = os.path.join(pdb_path, earliest_dir)
        atom_count = 0
        for entry in sorted(os.listdir(subdir_path)):
            # Matches numbered cluster files: 0.pdb, 1.pdb, etc.
            if re.match(r"^[0-9]+\.pdb$", entry):
                atom_count += count_atoms_in_pdb(os.path.join(subdir_path, entry))
        results.append((pdb_dir, atom_count))

    # Persist the per-PDB atom totals.
    with open("cluster_sizes.csv", "w", newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["PDB", "ATOMS"])
        writer.writerows(results)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
import pandas as pd | ||
import requests | ||
|
||
def get_doi_url(pdb_code):
    """Look up the DOI recorded for a PDB entry via the RCSB data API.

    Returns the DOI string on success, or None when the request fails or
    the entry carries no 'pdbx_database_id_doi' field.
    """
    response = requests.get(f'https://data.rcsb.org/rest/v1/core/entry/{pdb_code}')

    if response.status_code != 200:
        print(f"Error: Failed to fetch data for PDB: {pdb_code}. Status code: {response.status_code}")
        return None

    # NOTE(review): assumes the DOI lives at the top level of the entry
    # JSON under 'pdbx_database_id_doi' — confirm against the API schema.
    doi = response.json().get('pdbx_database_id_doi')
    if not doi:
        print(f"No DOI found for PDB: {pdb_code}")
        return None
    return doi
|
||
def main():
    """Fill in missing DOI URLs for the PDB master list, checkpointing
    the dataframe to disk every five API calls."""
    df = pd.read_csv('protein_master_list_charge_multiplicity.csv')
    total_pdbs = len(df)

    # API calls made since the last checkpoint write.
    unsaved_calls = 0

    # Column 1 holds the PDB code; column 6 holds the DOI URL.
    for index, row in df.iterrows():
        pdb_code = row.iloc[0]

        # Only fetch when the DOI URL is still blank.
        if pd.isnull(row.iloc[5]):
            doi = get_doi_url(pdb_code)
            if doi:
                df.at[index, df.columns[5]] = f'https://doi.org/{doi}'

            unsaved_calls += 1

            print(f"> Finished PDB: {pdb_code} ({index + 1}/{total_pdbs})")

            # Checkpoint every 5 API calls so progress survives interruptions.
            if unsaved_calls == 5:
                df.to_csv('doi_charge_multiplicity.csv', index=False)
                unsaved_calls = 0

    # Final save to capture any rows changed since the last checkpoint.
    df.to_csv('doi_charge_multiplicity.csv', index=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
import os | ||
|
||
def main():
    """Write to missing_ids.txt the PDB IDs from the master list that
    have no matching directory in the current working directory."""
    # Load the master list of PDB ID codes, one per line.
    with open('protein_master_list.txt', 'r') as f:
        master_list = [line.strip() for line in f]

    # Directories already present in the working directory.
    existing = {name for name in os.listdir() if os.path.isdir(name)}

    # Save the IDs that still need folders, preserving master-list order.
    with open('missing_ids.txt', 'w') as out_file:
        for pdb_id in master_list:
            if pdb_id not in existing:
                out_file.write(pdb_id + '\n')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
import os | ||
import csv | ||
import sys | ||
import time | ||
|
||
# Root directory containing one subdirectory per PDB entry (holo dataset).
PDB_PATH = "/home/kastner/projects/quantumPDB/dataset/holo"
|
||
def read_charge(pdb_dir):
    """Parse <pdb_dir>/charge.csv into {structure_name: [remaining columns]}.

    The header row is skipped, and parsing stops at the first blank or
    single-column row.
    """
    charges = {}
    charge_file = os.path.join(pdb_dir, 'charge.csv')
    with open(charge_file, 'r') as handle:
        rows = csv.reader(handle)
        next(rows)  # Skip the header
        for row in rows:
            # A blank line or short row marks the end of the table.
            if len(row) < 2:
                break
            charges[row[0]] = row[1:]
    return charges
|
||
def check_and_submit_jobs(pdb_dir, max_jobs, current_count):
    """Generate TeraChem QM inputs for each finished structure under pdb_dir and submit them via qsub.

    For every structure directory (Protoss excluded) that already contains
    exactly 4 files, this writes QM/Fe2 and QM/Fe3 subfolders, each holding a
    copy of the structure's .xyz file, a qmscript.in, and an SGE jobscript.sh,
    then submits the job from inside that folder.

    Args:
        pdb_dir: Absolute path to one PDB entry's directory.
        max_jobs: Submission budget for the whole run.
        current_count: Single-element list used as a mutable counter so the
            running total is shared across calls.

    Returns:
        True once current_count[0] reaches max_jobs, otherwise False.
    """
    charges = read_charge(pdb_dir)

    # Process structure subdirectories in sorted order, skipping Protoss output.
    for structure_dir in sorted([d for d in os.listdir(pdb_dir) if os.path.isdir(os.path.join(pdb_dir, d)) and not d == 'Protoss']):
        structure_path = os.path.join(pdb_dir, structure_dir)
        # Exactly 4 files signals that quantumPDB has finished this structure;
        # anything else means it is still being generated.
        if len([f for f in os.listdir(structure_path) if os.path.isfile(os.path.join(structure_path, f))]) != 4:
            print(f"> Directory {pdb_dir.split('/')[-1]} is in progress with quantumPDB.")
            continue

        # Take the first .xyz file found (assumes one per structure directory).
        xyz_file = next((f for f in os.listdir(structure_path) if f.endswith('.xyz')), None)
        if not xyz_file:
            continue
        # Normalize the element symbol capitalization expected by TeraChem.
        with open(os.path.join(structure_path, xyz_file), 'r') as file:
            coords = file.read().replace('FE', 'Fe')

        # Prepare one job per oxidation state: Fe2 -> Fe(II), Fe3 -> Fe(III).
        for folder in ['Fe2', 'Fe3']:
            folder_path = os.path.join(structure_path, 'QM', folder)
            os.makedirs(folder_path, exist_ok=True)
            with open(os.path.join(folder_path, xyz_file), 'w') as file:
                file.write(coords)

            # TeraChem input. Charge comes from charge.csv column 1 (Fe2) or
            # column 2 (Fe3) for this structure.
            # NOTE(review): charges[structure_dir] raises KeyError if the
            # structure is missing from charge.csv — confirm that is intended.
            # NOTE(review): spinmult 2/3 for Fe(II)/Fe(III) looks low for
            # high-spin iron — confirm the intended spin states.
            qmscript_content = f"""coordinates {xyz_file}
basis lacvps_ecp
method ub3lyp
charge {charges[structure_dir][0 if folder == 'Fe2' else 1]}
spinmult {2 if folder == 'Fe2' else 3}
maxit 1000
dftd d3
scrdir ./scr
pcm cosmo
epsilon 10
scf diis+a
pcm_radii read
pcm_radii_file /home/kastner/reference/pcm_radii
ml_prop yes
end
"""
            with open(os.path.join(folder_path, 'qmscript.in'), 'w') as file:
                file.write(qmscript_content)

            # SGE submission script; job name combines structure and PDB code.
            jobscript_content = f"""#! /bin/bash
#$ -N {structure_dir}_{pdb_dir.split('/')[-1]}
#$ -cwd
#$ -l h_rt=300:00:00
#$ -l h_rss=8G
#$ -q (gpusnew|gpus|gpusbig)
#$ -l gpus=1
#$ -pe smp 2
# -fin qmscript.in
# -fin {xyz_file}
# -fout scr/
module load cuda
module load terachem
export OMP_NUM_THREADS=2
terachem qmscript.in > $SGE_O_WORKDIR/qmscript.out
"""
            with open(os.path.join(folder_path, 'jobscript.sh'), 'w') as file:
                file.write(jobscript_content)

            # Change to the directory of the job file
            os.chdir(folder_path)

            # Submit the job to the scheduler
            os.system(f"qsub jobscript.sh")

            # Change back to the original directory
            os.chdir(PDB_PATH)

            current_count[0] += 1
            if current_count[0] >= max_jobs:
                return True  # indicate that the limit has been reached

            # Check if there's a stop signal file to stop the script
            # (relative path: checked inside PDB_PATH after the chdir above).
            if os.path.exists('stop_script.signal'):
                print("Stop signal detected. Exiting script.")
                sys.exit(0)

            # Introducing a delay of 2 seconds between submissions
            time.sleep(2)
    return False  # indicate that the limit hasn't been reached yet
|
||
|
||
def cleanup(pdb_dir):
    """Delete leftover files whose names start with 'PDB' from every
    QM/Fe2 and QM/Fe3 folder under pdb_dir (Protoss excluded)."""
    structure_dirs = [
        entry for entry in os.listdir(pdb_dir)
        if os.path.isdir(os.path.join(pdb_dir, entry)) and entry != 'Protoss'
    ]
    for structure_dir in structure_dirs:
        for oxidation_state in ('Fe2', 'Fe3'):
            folder_path = os.path.join(pdb_dir, structure_dir, 'QM', oxidation_state)
            if not os.path.exists(folder_path):
                continue
            for file_name in os.listdir(folder_path):
                if file_name.startswith('PDB'):
                    os.remove(os.path.join(folder_path, file_name))
|
||
if __name__ == "__main__":
    # Ask the user for this run's submission budget.
    max_jobs = int(input("Enter the number of jobs to submit: "))

    # Single-element list so check_and_submit_jobs can mutate the shared total.
    job_count = [0]

    pdb_dirs = sorted(
        entry for entry in os.listdir(PDB_PATH)
        if os.path.isdir(os.path.join(PDB_PATH, entry))
    )
    for pdb_dir in pdb_dirs:
        full_path = os.path.join(PDB_PATH, pdb_dir)
        cleanup(full_path)
        # Stop as soon as the submission budget is exhausted.
        if check_and_submit_jobs(full_path, max_jobs, job_count):
            break