Merge pull request #71 from davidkastner/failure-checkup
Updated the failure checkup module
davidkastner authored Sep 6, 2024
2 parents c304bb0 + 4aaa38c commit a0e2061
Showing 3 changed files with 112 additions and 68 deletions.
2 changes: 1 addition & 1 deletion config.yaml
@@ -40,4 +40,4 @@ pcm_radii_file: /path/to/pcm_radii # The path to the pcm_radii file if using
create_jobs: false # Create QM job input files
submit_jobs: false # Submit the created jobs
job_count: 10 # The number of jobs to maintain on the scheduler
checkup: false # Check how status of qm jobs
job_checkup: false # Check the status of QM jobs
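
For reference, the renamed job_checkup key is read back through the package's read_config helper, which is not part of this diff. A minimal sketch of such a loader, assuming a plain PyYAML-based reader (illustrative only, not the project's actual implementation):

import yaml

def read_config(config_path):
    # Illustrative stand-in for qp's read_config: load the YAML config into a dict.
    with open(config_path, "r") as f:
        return yaml.safe_load(f)

config_data = read_config("config.yaml")
# The analyze command below falls back to True when the key is absent.
job_checkup = config_data.get("job_checkup", True)
print(f"job_checkup = {job_checkup}")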
82 changes: 45 additions & 37 deletions qp/cli.py
@@ -311,49 +311,57 @@ def run(config):

@cli.command()
@click.option("--config", "-c", required=True, type=click.Path(exists=True), help="Path to the configuration YAML file")
@click.option("--failure_checkup", "-f", is_flag=True, help="Find failed structures")
def submit(config, failure_checkup):
"""Handles the submission of jobs for the quantumPDB."""

from qp.job_manager import create
from qp.job_manager import submit

if config:
# Parse configuration parameters
config_data = read_config(config)
optimization = config_data.get('optimization', False)
method = config_data.get('method', 'wpbeh')
basis = config_data.get('basis', 'lacvps_ecp')
guess = config_data.get('guess', 'generate')
gpus = config_data.get('gpus', 1)
memory = config_data.get('memory', '8G')
scheduler = config_data.get('scheduler', 'slurm')
pcm_radii_file = config_data.get('pcm_radii_file', 'pcm_radii')
job_count = config_data.get('job_count', 80)
charge_embedding = config_data.get('charge_embedding', False)
charge_embedding_cutoff = config_data.get('charge_embedding_cutoff', 20)
dielectric = config_data.get('dielectric', 10)
create_jobs = config_data.get('create_jobs', False)
submit_jobs = config_data.get('submit_jobs', False)
input = config_data.get('input', [])
output = config_data.get('output_dir', '')

if not os.path.exists(input):
raise FileNotFoundError(f"Could not find input file named {input}.")
input = os.path.abspath(input)

if create_jobs:
click.echo("> Creating job files for QM calculations")
create.create_jobs(input, output, optimization, basis, method, guess, charge_embedding, charge_embedding_cutoff, gpus, memory, scheduler, pcm_radii_file, dielectric)
if submit_jobs:
click.echo("\n> Submitting QM calculations")
submit.manage_jobs(output, job_count, method, scheduler)


if failure_checkup:
from qp.job_manager import failure_checkup
qm_job_dir = input("> What is the name of your QM job directory? ")
failure_counts = failure_checkup.check_all_jobs(qm_job_dir)
# Parse configuration parameters
config_data = read_config(config)
optimization = config_data.get('optimization', False)
method = config_data.get('method', 'wpbeh')
basis = config_data.get('basis', 'lacvps_ecp')
guess = config_data.get('guess', 'generate')
gpus = config_data.get('gpus', 1)
memory = config_data.get('memory', '8G')
scheduler = config_data.get('scheduler', 'slurm')
pcm_radii_file = config_data.get('pcm_radii_file', 'pcm_radii')
job_count = config_data.get('job_count', 80)
charge_embedding = config_data.get('charge_embedding', False)
charge_embedding_cutoff = config_data.get('charge_embedding_cutoff', 20)
dielectric = config_data.get('dielectric', 10)
create_jobs = config_data.get('create_jobs', False)
submit_jobs = config_data.get('submit_jobs', False)
input = config_data.get('input', [])
output = config_data.get('output_dir', '')

if not os.path.exists(input):
raise FileNotFoundError(f"Could not find input file named {input}.")
input = os.path.abspath(input)

if create_jobs:
click.echo("> Creating job files for QM calculations")
create.create_jobs(input, output, optimization, basis, method, guess, charge_embedding, charge_embedding_cutoff, gpus, memory, scheduler, pcm_radii_file, dielectric)
if submit_jobs:
click.echo("\n> Submitting QM calculations")
submit.manage_jobs(output, job_count, method, scheduler)


@cli.command()
@click.option("--config", "-c", required=True, type=click.Path(exists=True), help="Path to the configuration YAML file")
def analyze(config):
"""Functionality for analyzing complete jobs."""

from qp.job_manager import failure_checkup

config_data = read_config(config)
method = config_data.get('method', 'wpbeh')
job_checkup = config_data.get('job_checkup', True)
output = config_data.get('output_dir', 'dataset/v1')

if job_checkup:
failure_counts = failure_checkup.check_all_jobs(method, output)
failure_checkup.plot_failures(failure_counts)
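
Because the new analyze command needs nothing beyond the configuration file, it is easy to exercise in isolation. A minimal sketch using click's CliRunner, assuming the package is installed, the click group above is importable as qp.cli.cli, and config.yaml is a valid configuration file (both names are assumptions, not taken from this diff):

from click.testing import CliRunner

from qp.cli import cli  # assumes the click group decorating these commands is named `cli`

runner = CliRunner()
result = runner.invoke(cli, ["analyze", "--config", "config.yaml"])
print(result.exit_code)  # 0 on success
print(result.output)     # includes the checkup messages printed by failure_checkup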


96 changes: 66 additions & 30 deletions qp/job_manager/failure_checkup.py
Expand Up @@ -7,7 +7,6 @@

def format_plot() -> None:
"""General plotting parameters for the Kulik Lab."""

font = {"family": "sans-serif", "weight": "bold", "size": 10}
plt.rc("font", **font)
plt.rcParams["xtick.major.pad"] = 5
@@ -28,7 +27,7 @@ def check_failure_mode(filepath):
"""Checks for specific failure mode keywords in generated output."""
with open(filepath, 'r') as f:
content = f.read()

if "Incorrect molecular charge or spin multiplicity" in content:
return "charge"
elif "In Alloc2D: malloc failed" in content:
@@ -41,54 +40,91 @@
return "running"


def check_submit_record(submit_record_path):
"""Check the .submit_record file for backlog, queue, running, or done status."""
with open(submit_record_path, 'r') as f:
content = f.read()

queue_time = "Queue Time:" in content
run_start_time = "Run Start Time:" in content
run_end_time = "Run End Time:" in content

if queue_time and not run_start_time:
return "queue"
elif run_start_time and not run_end_time:
return "running"
elif run_end_time:
return "done"

return "backlog"


def classify_job(qm_dir_path):
"""Classify the job status based on the presence of .submit_record and qmscript.out."""
submit_record_path = os.path.join(qm_dir_path, ".submit_record")
qmscript_path = os.path.join(qm_dir_path, "qmscript.out")

# Check if there's no .submit_record -> backlog
if not os.path.exists(submit_record_path):
return "backlog"

# Use the .submit_record file to classify queue, running, or done
submit_status = check_submit_record(submit_record_path)

# If it's classified as done, check for failure modes
if submit_status == "done" and os.path.exists(qmscript_path):
return check_failure_mode(qmscript_path)

return submit_status


def plot_failures(failure_counts):
"""Create a bar plot for the failure modes."""
"""Create a bar plot for the failure modes in a specific order."""
format_plot()

labels = failure_counts.keys()
counts = failure_counts.values()
# Ensure that the statuses are ordered as desired
ordered_labels = ["done", "backlog", "queue", "running", "charge", "memory", "unknown"]
counts = [failure_counts[status] for status in ordered_labels]

plt.bar(labels, counts, color="silver")
plt.xlabel('job status', fontsize=12, fontweight='bold')
plt.ylabel('job count', fontsize=12, fontweight='bold')
plt.figure(figsize=(7, 4))
plt.bar(ordered_labels, counts, color="silver")
plt.xlabel('job status', fontsize=10, fontweight='bold')
plt.ylabel('job count', fontsize=10, fontweight='bold')
plt.savefig('job_status.png', bbox_inches="tight", dpi=600)


def check_all_jobs(qm_job_dir):
def check_all_jobs(qm_job_dir, output):
"""Loop over all jobs and check if they failed or are still queued."""

print(f" > Checking for failed QM jobs in the {qm_job_dir} directory.")
print(f"> Checking for failed QM jobs in the {output} directory.")
output_name = "failure_modes.txt"
failure_counts = {"done": 0, "charge": 0, "memory": 0, "unknown": 0, "running": 0, "queue": 0}
failure_counts = {"done": 0, "backlog": 0, "queue": 0, "running": 0,
"charge": 0, "memory": 0, "unknown": 0}

with open(output_name, "w") as output_file:
for pdb_dir in sorted(glob.glob('[0-9]*')): # Loop over PDB directories
base_dir = os.getcwd()
os.chdir(output)

all_pdb_dirs = sorted(glob.glob('[0-9]*'))
for pdb_dir in all_pdb_dirs: # Loop over PDB directories
for chain_dir in os.listdir(pdb_dir): # Loop over chain subdirectories
if chain_dir == "Protoss":
continue
chain_dir_path = os.path.join(pdb_dir, chain_dir)

if os.path.isdir(chain_dir_path) and chain_dir_path != "Protoss":
# Check each chain sub-subdirectory (e.g., A208, C208)
if qm_job_dir in os.listdir(chain_dir_path):
qm_dir_path = os.path.join(chain_dir_path, qm_job_dir)
qmscript_path = os.path.join(qm_dir_path, "qmscript.out")

if os.path.exists(qmscript_path):
failure_mode = check_failure_mode(qmscript_path)
failure_counts[failure_mode] += 1
if failure_mode not in ["done", "running"]:
output_file.write(f"{chain_dir_path} - {failure_mode}\n")
if os.path.isdir(chain_dir_path):
qm_dir_path = os.path.join(chain_dir_path, qm_job_dir)

if os.path.exists(qm_dir_path):
job_status = classify_job(qm_dir_path)
failure_counts[job_status] += 1
if job_status not in ["done", "running", "queue"]:
output_file.write(f"{chain_dir_path} - {job_status}\n")
else:
# No QM directory has been generated yet
failure_counts["queue"] += 1

print(f" > Saving checkup results in {output_name}\n")
os.chdir(base_dir)
print(f"> Saving checkup results in {output_name}\n")

return failure_counts


if __name__ == '__main__':
qm_job_dir = input("What is the name of your QM job directory? ")
failure_counts = check_all_jobs(qm_job_dir)
plot_failures(failure_counts)
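
Since classify_job only inspects the files inside a single QM job directory, its decision logic can be sanity-checked against a throwaway directory. A small sketch, assuming qp.job_manager.failure_checkup is importable as in cli.py above; the .submit_record contents are fabricated placeholders:

import os
import tempfile

from qp.job_manager import failure_checkup

with tempfile.TemporaryDirectory() as qm_dir:
    # No .submit_record yet, so the job has never been submitted
    print(failure_checkup.classify_job(qm_dir))  # expected: "backlog"

    # A record with a start time but no end time means the job is still running
    with open(os.path.join(qm_dir, ".submit_record"), "w") as f:
        f.write("Queue Time: placeholder\nRun Start Time: placeholder\n")
    print(failure_checkup.classify_job(qm_dir))  # expected: "running"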
