diff --git a/Makefile b/Makefile
index 6b8827d2..948e15f4 100644
--- a/Makefile
+++ b/Makefile
@@ -10,6 +10,7 @@ pytest:
 
 test-docs:
 	$(DOCTEST) src/malco/*.py src/malco/*/*.py
+	$(DOCTEST) src/malco/runner.py src/malco/run/*.py src/malco/prepare/*.py src/malco/post_process/*.py
 
 %-doctest: %
 	$(DOCTEST) $<
diff --git a/src/malco/analysis/disease_avail_knowledge.py b/src/malco/analysis/disease_avail_knowledge.py
index f6f33a41..8e32d707 100644
--- a/src/malco/analysis/disease_avail_knowledge.py
+++ b/src/malco/analysis/disease_avail_knowledge.py
@@ -4,6 +4,7 @@
 # (1) Parse out disease genes discovered after 2008/9 (First thing in HPOA)
 # Look for a correlation between date annotated and disease correctly diagnosed.
 # Hypothesis: the older the easier to diagnose
+# PNR suggests: for each ppkt we have a date
 # (2) To start, looking at the two broad categories found/not-found, count average number of all links
 # After that, count average number of links of some kind
 # Then, something more graphy, such as, centrality? Maybe need to project out something first to find signal in the noise...
diff --git a/src/malco/prepare/setup_run_pars.py b/src/malco/prepare/setup_run_pars.py
index d27dd0b6..6944ee90 100644
--- a/src/malco/prepare/setup_run_pars.py
+++ b/src/malco/prepare/setup_run_pars.py
@@ -2,8 +2,9 @@
 import csv
 import sys
 
-def import_inputdata(self):
-    """Example inputfile is located in input_dir and named run_parameters.csv
+def import_inputdata(self) -> None:
+    """
+    Example input file is located in ``self.input_dir`` and named run_parameters.csv.
     It should contain something like:
     ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
     "en"
diff --git a/src/malco/run/run.py b/src/malco/run/run.py
index 61870d5f..1ad3e416 100644
--- a/src/malco/run/run.py
+++ b/src/malco/run/run.py
@@ -8,9 +8,25 @@ from malco.run.search_ppkts import search_ppkts
 
 
 def call_ontogpt(
-    lang, raw_results_dir, input_dir, model,
+    lang: str,
+    raw_results_dir: Path,
+    input_dir: Path,
+    model: str,
     modality: typing.Literal['several_languages', 'several_models'],
-):
+) -> None:
+    """
+    Wrapper used for parallel execution of ontogpt.
+
+    Args:
+        lang (str): Two-letter language code, for example "en" for English.
+        raw_results_dir (Path): Path to the raw results directory.
+        input_dir (Path): Path to the input directory containing the prompts.
+        model (str): Name of the model to be run, e.g. "gpt-4-turbo".
+        modality (str): Whether to run several models in English ('several_models') or gpt-4o in several languages ('several_languages').
+
+    Returns:
+        None
+    """
     prompt_dir = f'{input_dir}/prompts/'
     if modality == 'several_languages':
         lang_or_model_dir = lang
diff --git a/src/malco/run/search_ppkts.py b/src/malco/run/search_ppkts.py
index 61496a62..d0cfd560 100644
--- a/src/malco/run/search_ppkts.py
+++ b/src/malco/run/search_ppkts.py
@@ -7,8 +7,12 @@ def search_ppkts(input_dir, prompt_dir, raw_results_dir, lang_or_model):
     """
     Check what ppkts have already been computed in current output dir, for current run parameters.
+
     ontogpt will run every .txt that is in inputdir, we need a tmp inputdir
-    excluding already run cases.
+    excluding already run cases. The source of truth is the results.yaml output by ontogpt:
+    only an extracted_object containing terms is considered successfully run.
+
+    Note that rerunning
     """
     # List of "labels" that are already present in results.yaml iff terms is not None
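For orientation, a minimal sketch of how the newly typed call_ontogpt wrapper might be fanned out in parallel. This is not part of the diff: the language list, placeholder paths, model name, and the use of multiprocessing.Pool are illustrative assumptions; only the call_ontogpt signature is taken from the change above, and the real orchestration lives in src/malco/run/run.py.

import multiprocessing
from pathlib import Path

from malco.run.run import call_ontogpt

# Hypothetical run parameters; in malco these would come from run_parameters.csv.
languages = ["en", "es", "de"]
raw_results_dir = Path("path/to/raw_results")
input_dir = Path("path/to/input_dir")

if __name__ == "__main__":
    # One worker per language, each running ontogpt on that language's prompts.
    with multiprocessing.Pool(processes=len(languages)) as pool:
        pool.starmap(
            call_ontogpt,
            [
                (lang, raw_results_dir, input_dir, "gpt-4o", "several_languages")
                for lang in languages
            ],
        )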