Commit

Merge pull request #4 from davidkastner/scripts
Job manager scripts
davidkastner authored Sep 30, 2023
2 parents 8517f8e + 8038c7f commit 4a31761
Showing 6 changed files with 261 additions and 4 deletions.
2 changes: 1 addition & 1 deletion qp/_version.py
@@ -1 +1 @@
-__version__ = "1.0.0+55.g715c855.dirty"
+__version__ = "1.0.0+57.g38387ef.dirty"
9 changes: 6 additions & 3 deletions qp/cluster/coordination_spheres.py
@@ -58,7 +58,8 @@ def voronoi(model):
    return neighbors


-def get_next_neighbors(start, neighbors, limit, ligands):
+def get_next_neighbors(start, neighbors, limit, ligands, include_waters=False):

    """
    Iteratively determines spheres around a given starting atom
@@ -90,7 +91,7 @@ def get_next_neighbors(start, neighbors, limit, ligands):
                par = n.get_parent()
                if par not in seen and (
                    Polypeptide.is_aa(par)
-                    or par.get_resname() == "HOH"
+                    or (include_waters and par.get_resname() == "HOH")
                    or par.get_resname() in ligands
                ):
                    nxt.add(par)
@@ -371,6 +372,7 @@ def extract_clusters(
    charge=False,
    count=False,
    xyz=False,
+    include_waters=False,
):
    """
    Extract active site coordination spheres. Neighboring residues determined by
@@ -410,8 +412,9 @@ def extract_clusters(
    for res in model.get_residues():
        if res.get_resname() in metals:
            metal_id, residues, spheres = get_next_neighbors(
-                res, neighbors, limit, ligands
+                res, neighbors, limit, ligands, include_waters
            )

            os.makedirs(f"{out}/{metal_id}", exist_ok=True)

            if charge:
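
Behavior note implied by these hunks: waters (HOH) were previously always pulled into the coordination spheres; after this change they are kept only when include_waters=True is threaded through get_next_neighbors. A minimal usage sketch, assuming extract_clusters is importable from qp.cluster.coordination_spheres and that the leading positional arguments (not shown in the hunks) are the input structure and output directory:

from qp.cluster.coordination_spheres import extract_clusters

# Hypothetical call; only the keyword arguments below appear in this diff.
extract_clusters(
    "1abc.pdb",           # assumed: input structure
    "clusters/1abc",      # assumed: output directory
    charge=True,          # also write charge information, per the signature above
    include_waters=True,  # opt back in to the pre-PR behavior of keeping waters
)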
49 changes: 49 additions & 0 deletions scripts/cluster_sizes.py
@@ -0,0 +1,49 @@
import os
import re
import csv

def count_atoms_in_pdb(pdb_file):
    """Count ATOM records in a PDB file (HETATM records are not counted)."""
    count = 0
    with open(pdb_file, 'r') as file:
        for line in file:
            if line.startswith("ATOM"):
                count += 1
    return count

def get_earliest_dir(dirs):
    """Get the earliest directory whose name is a letter followed by numbers."""
    valid_dirs = sorted([d for d in dirs if re.match(r"^[a-zA-Z][0-9]+", d)])
    return valid_dirs[0] if valid_dirs else None

def main():
    results = []
    cwd = os.getcwd()

    # Iterate over PDB directories
    for pdb_dir in os.listdir(cwd):
        pdb_path = os.path.join(cwd, pdb_dir)
        if os.path.isdir(pdb_path) and len(pdb_dir) == 4:  # PDB codes are 4 characters
            subdirs = os.listdir(pdb_path)
            # Ignore the "Protoss" directory
            subdirs = [sd for sd in subdirs if sd != "Protoss"]

            # Get the earliest directory
            earliest_dir = get_earliest_dir(subdirs)
            if earliest_dir:
                atom_count = 0
                subdir_path = os.path.join(pdb_path, earliest_dir)
                for file in sorted(os.listdir(subdir_path)):
                    if re.match(r"^[0-9]+\.pdb$", file):  # Matches 0.pdb, 1.pdb, etc.
                        pdb_file_path = os.path.join(subdir_path, file)
                        atom_count += count_atoms_in_pdb(pdb_file_path)
                results.append((pdb_dir, atom_count))

    # Store results in cluster_sizes.csv
    with open("cluster_sizes.csv", "w", newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["PDB", "ATOMS"])
        writer.writerows(results)

if __name__ == "__main__":
    main()
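
A quick sanity check of the counting helper on a throwaway file (records are illustrative, not from the PR); note that only ATOM records are tallied, so HETATM lines such as metal sites do not contribute:

import os
from cluster_sizes import count_atoms_in_pdb  # assumes the script is importable as a module

# Hypothetical two-record PDB fragment for testing
with open("test.pdb", "w") as f:
    f.write("ATOM      1  N   HIS A   1       0.000   0.000   0.000\n")
    f.write("HETATM    2 FE   FE2 A 201       1.000   1.000   1.000\n")

print(count_atoms_in_pdb("test.pdb"))  # -> 1; the HETATM iron is not counted
os.remove("test.pdb")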
60 changes: 60 additions & 0 deletions scripts/doi.py
@@ -0,0 +1,60 @@
import pandas as pd
import requests

def get_doi_url(pdb_code):
    """Fetch the DOI for a given PDB code using the PDB REST API."""
    url = f'https://data.rcsb.org/rest/v1/core/entry/{pdb_code}'
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Error: Failed to fetch data for PDB: {pdb_code}. Status code: {response.status_code}")
        return None

    response_data = response.json()

    # Extract the DOI from the response data
    doi = response_data.get('pdbx_database_id_doi')
    if doi:
        return doi
    else:
        print(f"No DOI found for PDB: {pdb_code}")
        return None

def main():
    # Read the CSV file
    df = pd.read_csv('protein_master_list_charge_multiplicity.csv')

    # Total number of PDBs
    total_pdbs = len(df)

    # Counter for API calls since the last save
    api_calls_since_last_save = 0

    # For each PDB code, if there isn't already a DOI URL, fetch the DOI and set the DOI URL in column 6
    for index, row in df.iterrows():
        pdb_code = row.iloc[0]
        existing_doi_url = row.iloc[5]

        # Check if the DOI URL is already present
        if pd.isnull(existing_doi_url):
            doi = get_doi_url(pdb_code)
            if doi:
                doi_url = f'https://doi.org/{doi}'
                df.at[index, df.columns[5]] = doi_url

            # Increment the API call counter
            api_calls_since_last_save += 1

            # Print the progress update
            print(f"> Finished PDB: {pdb_code} ({index + 1}/{total_pdbs})")

            # If 5 API calls have been made since the last save, save the dataframe and reset the counter
            if api_calls_since_last_save == 5:
                df.to_csv('doi_charge_multiplicity.csv', index=False)
                api_calls_since_last_save = 0

    # Save one final time after the loop is done to catch any remaining changes
    df.to_csv('doi_charge_multiplicity.csv', index=False)

if __name__ == "__main__":
    main()
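
main() assumes the first CSV column holds the PDB code and the sixth column holds the DOI URL, and get_doi_url reads the top-level pdbx_database_id_doi key of the response. A hypothetical input file matching those positional assumptions (column names are illustrative; only the positions matter):

import pandas as pd

# Column 1 = PDB code, column 6 = DOI URL (left empty so the script fills it)
df = pd.DataFrame(
    [["1abc", "Fe", -2, 5, "holo", None]],
    columns=["PDB", "Metal", "Charge", "Multiplicity", "Set", "DOI_URL"],
)
df.to_csv("protein_master_list_charge_multiplicity.csv", index=False)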
20 changes: 20 additions & 0 deletions scripts/new_master.py
@@ -0,0 +1,20 @@
import os

def main():
    # Read PDB ID codes from the file
    with open('protein_master_list.txt', 'r') as f:
        master_list = [line.strip() for line in f]

    # Get the list of directories in the current working directory
    current_dirs = [name for name in os.listdir() if os.path.isdir(name)]

    # Find the PDB IDs which do not have corresponding folders
    missing_dirs = [pdb_id for pdb_id in master_list if pdb_id not in current_dirs]

    # Save the list of missing IDs to a new file
    with open('missing_ids.txt', 'w') as out_file:
        for pdb_id in missing_dirs:
            out_file.write(pdb_id + '\n')

if __name__ == "__main__":
    main()
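
One small design note: the membership test scans current_dirs once per PDB ID, so the loop is quadratic in the worst case. An equivalent set-based sketch that keeps the whole pass linear for long master lists:

current_dirs = {name for name in os.listdir() if os.path.isdir(name)}
missing_dirs = [pdb_id for pdb_id in master_list if pdb_id not in current_dirs]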
125 changes: 125 additions & 0 deletions scripts/qp_job_manager.py
@@ -0,0 +1,125 @@
import os
import csv
import sys
import time

PDB_PATH = "/home/kastner/projects/quantumPDB/dataset/holo"

def read_charge(pdb_dir):
    charge_file = os.path.join(pdb_dir, 'charge.csv')
    charges = {}
    with open(charge_file, 'r') as file:
        reader = csv.reader(file)
        next(reader)  # Skip the header
        for row in reader:
            # Stop processing at an empty line or a row with fewer than 2 columns
            if len(row) < 2:
                break
            charges[row[0]] = row[1:]
    return charges

def check_and_submit_jobs(pdb_dir, max_jobs, current_count):
    charges = read_charge(pdb_dir)

    for structure_dir in sorted([d for d in os.listdir(pdb_dir) if os.path.isdir(os.path.join(pdb_dir, d)) and not d == 'Protoss']):
        structure_path = os.path.join(pdb_dir, structure_dir)
        if len([f for f in os.listdir(structure_path) if os.path.isfile(os.path.join(structure_path, f))]) != 4:
            print(f"> Directory {pdb_dir.split('/')[-1]} is in progress with quantumPDB.")
            continue

        xyz_file = next((f for f in os.listdir(structure_path) if f.endswith('.xyz')), None)
        if not xyz_file:
            continue
        with open(os.path.join(structure_path, xyz_file), 'r') as file:
            coords = file.read().replace('FE', 'Fe')

        for folder in ['Fe2', 'Fe3']:
            folder_path = os.path.join(structure_path, 'QM', folder)
            os.makedirs(folder_path, exist_ok=True)
            with open(os.path.join(folder_path, xyz_file), 'w') as file:
                file.write(coords)

            qmscript_content = f"""coordinates {xyz_file}
basis lacvps_ecp
method ub3lyp
charge {charges[structure_dir][0 if folder == 'Fe2' else 1]}
spinmult {2 if folder == 'Fe2' else 3}
maxit 1000
dftd d3
scrdir ./scr
pcm cosmo
epsilon 10
scf diis+a
pcm_radii read
pcm_radii_file /home/kastner/reference/pcm_radii
ml_prop yes
end
"""
            with open(os.path.join(folder_path, 'qmscript.in'), 'w') as file:
                file.write(qmscript_content)

            jobscript_content = f"""#! /bin/bash
#$ -N {structure_dir}_{pdb_dir.split('/')[-1]}
#$ -cwd
#$ -l h_rt=300:00:00
#$ -l h_rss=8G
#$ -q (gpusnew|gpus|gpusbig)
#$ -l gpus=1
#$ -pe smp 2
# -fin qmscript.in
# -fin {xyz_file}
# -fout scr/
module load cuda
module load terachem
export OMP_NUM_THREADS=2
terachem qmscript.in > $SGE_O_WORKDIR/qmscript.out
"""
            with open(os.path.join(folder_path, 'jobscript.sh'), 'w') as file:
                file.write(jobscript_content)

            # Change to the directory of the job file
            os.chdir(folder_path)

            # Submit the job to the scheduler
            os.system("qsub jobscript.sh")

            # Change back to the original directory
            os.chdir(PDB_PATH)

        current_count[0] += 1
        if current_count[0] >= max_jobs:
            return True  # indicate that the limit has been reached

        # Check if there's a stop signal file to stop the script
        if os.path.exists('stop_script.signal'):
            print("Stop signal detected. Exiting script.")
            sys.exit(0)

        # Introduce a delay of 2 seconds between submissions
        time.sleep(2)
    return False  # indicate that the limit hasn't been reached yet


def cleanup(pdb_dir):
    for structure_dir in [d for d in os.listdir(pdb_dir) if os.path.isdir(os.path.join(pdb_dir, d)) and not d == 'Protoss']:
        for folder in ['Fe2', 'Fe3']:
            folder_path = os.path.join(pdb_dir, structure_dir, 'QM', folder)
            if os.path.exists(folder_path):
                for file in os.listdir(folder_path):
                    if file.startswith('PDB'):
                        os.remove(os.path.join(folder_path, file))

if __name__ == "__main__":
    # Get the number of jobs to submit from the user
    max_jobs = int(input("Enter the number of jobs to submit: "))

    job_count = [0]  # a mutable list so the count is passed by reference
    limit_reached = False

    for pdb_dir in sorted([d for d in os.listdir(PDB_PATH) if os.path.isdir(os.path.join(PDB_PATH, d))]):
        if limit_reached:
            break
        cleanup(os.path.join(PDB_PATH, pdb_dir))
        limit_reached = check_and_submit_jobs(os.path.join(PDB_PATH, pdb_dir), max_jobs, job_count)
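
read_charge expects charge.csv to carry a header row followed by one row per structure directory, with the directory name in column 1 and the charges used for the Fe2 and Fe3 jobs in the next two columns. A hypothetical file matching that shape (names and values are illustrative, not from the PR):

import csv

# charge.csv as parsed by read_charge(): row[0] keys the dict, row[1:] holds the charges
rows = [
    ["structure", "Fe2_charge", "Fe3_charge"],  # header row is skipped
    ["A290", "-2", "-1"],
    ["B310", "0", "1"],
]
with open("charge.csv", "w", newline="") as f:
    csv.writer(f).writerows(rows)

To pause a long run, create a file named stop_script.signal in the working directory (for example with touch stop_script.signal); the script exits cleanly before moving on to the next structure.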
