From fa219fdfd6edddda4524c19de104d2d84718d23a Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Tue, 28 Mar 2023 17:05:08 +0200 Subject: [PATCH 01/24] Add data_manager_mode argument for IDC use Idea is that we build bundles against the existing servers, then just pull the bundles into an instance that can write to cvmfs. --- src/ephemeris/run_data_managers.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/ephemeris/run_data_managers.py b/src/ephemeris/run_data_managers.py index d89268f..dff6498 100644 --- a/src/ephemeris/run_data_managers.py +++ b/src/ephemeris/run_data_managers.py @@ -28,7 +28,9 @@ import logging import time from collections import namedtuple +from typing import Literal +from bioblend.galaxy import GalaxyInstance from bioblend.galaxy.tool_data import ToolDataClient from bioblend.galaxy.tools import ToolClient from jinja2 import Template @@ -49,6 +51,7 @@ DEFAULT_URL = "http://localhost" DEFAULT_SOURCE_TABLES = ["all_fasta"] +DATA_MANAGER_MODES = Literal["dry_run", "populate", "bundle"] def wait(gi, job_list, log): @@ -104,7 +107,7 @@ def get_first_valid_entry(input_dict, key_list): class DataManagers: - def __init__(self, galaxy_instance, configuration): + def __init__(self, galaxy_instance: GalaxyInstance, configuration): """ :param galaxy_instance: A GalaxyInstance object (import from bioblend.galaxy) :param configuration: A dictionary. Examples in the ephemeris documentation. @@ -236,7 +239,7 @@ def parse_items(self, items): items = json.loads(rendered_items) return items - def run(self, log=None, ignore_errors=False, overwrite=False): + def run(self, log=None, ignore_errors=False, overwrite=False, data_manager_mode: DATA_MANAGER_MODES = "populate"): """ Runs the data managers. :param log: The log to be used. 
@@ -266,7 +269,7 @@ def run_jobs(jobs, skipped_jobs): all_skipped_jobs.append(skipped_job) for job in jobs: started_job = self.tool_client.run_tool( - history_id=None, tool_id=job["tool_id"], tool_inputs=job["inputs"] + history_id=None, tool_id=job["tool_id"], tool_inputs=job["inputs"], data_manager_mode=data_manager_mode ) log.info( 'Dispatched job %i. Running DM: "%s" with parameters: %s' @@ -327,6 +330,7 @@ def _parser(): action="store_true", help="Do not stop running when jobs have failed.", ) + parser.add_argument("--data_manager_mode", choices=["bundle", "populate", "dry_run"], default="populate") return parser @@ -342,7 +346,7 @@ def main(argv=None): gi = get_galaxy_connection(args, file=args.config, log=log, login_required=True) config = load_yaml_file(args.config) data_managers = DataManagers(gi, config) - data_managers.run(log, args.ignore_errors, args.overwrite) + data_managers.run(log, args.ignore_errors, args.overwrite, data_manager_mode=args.data_manager_mode) if __name__ == "__main__": From ac2103b41e7bc6439050ac2a11f03a9d5d04dcfe Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Tue, 18 Apr 2023 12:56:37 +0200 Subject: [PATCH 02/24] Not sure if that was necessary --- src/ephemeris/shed_tools.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/ephemeris/shed_tools.py b/src/ephemeris/shed_tools.py index 8807fc0..a54e5e1 100644 --- a/src/ephemeris/shed_tools.py +++ b/src/ephemeris/shed_tools.py @@ -35,6 +35,7 @@ """ import datetime as dt import json +import logging import os import re import time @@ -94,6 +95,7 @@ "Loading proprietary datatypes", } +log = logging.getLogger(__name__) class InstallRepoDict(TypedDict): name: str @@ -164,7 +166,7 @@ def filter_installed_repos(self, repos: Iterable[InstallRepoDict], check_revisio def install_repositories( self, repositories: List[InstallRepoDict], - log=None, + log=log, force_latest_revision: bool = False, default_toolshed: str = "https://toolshed.g2.bx.psu.edu/", 
default_install_tool_dependencies: bool = False, @@ -265,7 +267,7 @@ def install_repositories( errored_repositories=errored_repositories, ) - def update_repositories(self, repositories=None, log=None, **kwargs): + def update_repositories(self, repositories=None, log=log, **kwargs): if not repositories: # Repositories None or empty list repositories = self.installed_repositories() else: @@ -284,7 +286,7 @@ def test_tools( self, test_json, repositories=None, - log=None, + log=log, test_user_api_key=None, test_user="ephemeris@galaxyproject.org", test_history_name=None, @@ -535,7 +537,7 @@ def install_repository_revision(self, repository: InstallRepoDict, log): ) return "error" - def wait_for_install(self, repository, log=None, timeout=3600): + def wait_for_install(self, repository, log=log, timeout=3600): """ If nginx times out, we look into the list of installed repositories and try to determine if a repository of the same namer/owner is still installing. From 0b60c003fd92d48697ad27db31acdb6dedfad7d9 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Thu, 29 Jun 2023 12:25:18 +0200 Subject: [PATCH 03/24] Allow setting history name in run_data_managers.py --- src/ephemeris/__init__.py | 8 ++++++++ src/ephemeris/run_data_managers.py | 24 +++++++++++++++++++----- src/ephemeris/shed_tools.py | 1 + 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/src/ephemeris/__init__.py b/src/ephemeris/__init__.py index 2a1d461..42f42b8 100644 --- a/src/ephemeris/__init__.py +++ b/src/ephemeris/__init__.py @@ -11,6 +11,14 @@ RAW_CONTENT_URL = f"https://raw.github.com/{PROJECT_USERAME}/{PROJECT_NAME}/master/" +def get_or_create_history(history_name: str, gi: galaxy.GalaxyInstance): + histories = gi.histories.get_histories(name=history_name) + if histories: + return histories[0] + else: + return gi.histories.create_history(name=history_name) + + def check_url(url, log=None): if not url.startswith("http"): if log: diff --git a/src/ephemeris/run_data_managers.py 
b/src/ephemeris/run_data_managers.py index dff6498..5e0a056 100644 --- a/src/ephemeris/run_data_managers.py +++ b/src/ephemeris/run_data_managers.py @@ -28,15 +28,17 @@ import logging import time from collections import namedtuple -from typing import Literal +from typing import Optional from bioblend.galaxy import GalaxyInstance from bioblend.galaxy.tool_data import ToolDataClient from bioblend.galaxy.tools import ToolClient from jinja2 import Template +from typing_extensions import Literal from . import ( get_galaxy_connection, + get_or_create_history, load_yaml_file, ) from .common_parser import ( @@ -239,7 +241,14 @@ def parse_items(self, items): items = json.loads(rendered_items) return items - def run(self, log=None, ignore_errors=False, overwrite=False, data_manager_mode: DATA_MANAGER_MODES = "populate"): + def run( + self, + log=None, + ignore_errors=False, + overwrite=False, + data_manager_mode: DATA_MANAGER_MODES = "populate", + history_name: Optional[str] = None, + ): """ Runs the data managers. :param log: The log to be used. @@ -254,6 +263,10 @@ def run(self, log=None, ignore_errors=False, overwrite=False, data_manager_mode: if not log: log = logging.getLogger() + history_id: Optional[str] = None + if history_name is not None: + history_id = get_or_create_history()["id"] + def run_jobs(jobs, skipped_jobs): job_list = [] for skipped_job in skipped_jobs: @@ -269,7 +282,7 @@ def run_jobs(jobs, skipped_jobs): all_skipped_jobs.append(skipped_job) for job in jobs: started_job = self.tool_client.run_tool( - history_id=None, tool_id=job["tool_id"], tool_inputs=job["inputs"], data_manager_mode=data_manager_mode + history_id=history_id, tool_id=job["tool_id"], tool_inputs=job["inputs"], data_manager_mode=data_manager_mode ) log.info( 'Dispatched job %i. 
Running DM: "%s" with parameters: %s' @@ -330,7 +343,8 @@ def _parser(): action="store_true", help="Do not stop running when jobs have failed.", ) - parser.add_argument("--data_manager_mode", choices=["bundle", "populate", "dry_run"], default="populate") + parser.add_argument("--data-manager-mode", "--data_manager_mode", choices=["bundle", "populate", "dry_run"], default="populate") + parser.add_argument("--history-name", default=None) return parser @@ -346,7 +360,7 @@ def main(argv=None): gi = get_galaxy_connection(args, file=args.config, log=log, login_required=True) config = load_yaml_file(args.config) data_managers = DataManagers(gi, config) - data_managers.run(log, args.ignore_errors, args.overwrite, data_manager_mode=args.data_manager_mode) + data_managers.run(log, args.ignore_errors, args.overwrite, data_manager_mode=args.data_manager_mode, history_name=args.history_name) if __name__ == "__main__": diff --git a/src/ephemeris/shed_tools.py b/src/ephemeris/shed_tools.py index a54e5e1..e68f1b3 100644 --- a/src/ephemeris/shed_tools.py +++ b/src/ephemeris/shed_tools.py @@ -97,6 +97,7 @@ log = logging.getLogger(__name__) + class InstallRepoDict(TypedDict): name: str owner: str From 272d798abbffa8ac5c91559e39bb38d06f2706bc Mon Sep 17 00:00:00 2001 From: Nate Coraor Date: Thu, 29 Jun 2023 12:09:47 -0400 Subject: [PATCH 04/24] Pass args to get_or_create_history() --- src/ephemeris/run_data_managers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ephemeris/run_data_managers.py b/src/ephemeris/run_data_managers.py index 5e0a056..5d0a569 100644 --- a/src/ephemeris/run_data_managers.py +++ b/src/ephemeris/run_data_managers.py @@ -265,7 +265,7 @@ def run( history_id: Optional[str] = None if history_name is not None: - history_id = get_or_create_history()["id"] + history_id = get_or_create_history(history_name, self.gi)["id"] def run_jobs(jobs, skipped_jobs): job_list = [] From 1f93beb892ed907b105248fd98ce85db1d71f442 Mon Sep 17 00:00:00 2001 From: 
John Chilton Date: Thu, 29 Jun 2023 16:37:06 -0400 Subject: [PATCH 05/24] split genomes for IDC --- setup.py | 6 +- src/ephemeris/_config_models.py | 60 +++++ src/ephemeris/_idc_data_managers_to_tools.py | 97 ++++++++ src/ephemeris/_idc_lint.py | 31 +++ .../_idc_split_data_manager_genomes.py | 223 ++++++++++++++++++ src/ephemeris/common_parser.py | 24 +- tests/__init__.py | 0 tests/test_idc_lint.py | 9 + tests/test_split_genomes.py | 83 +++++++ 9 files changed, 524 insertions(+), 9 deletions(-) create mode 100644 src/ephemeris/_config_models.py create mode 100644 src/ephemeris/_idc_data_managers_to_tools.py create mode 100644 src/ephemeris/_idc_lint.py create mode 100644 src/ephemeris/_idc_split_data_manager_genomes.py create mode 100644 tests/__init__.py create mode 100644 tests/test_idc_lint.py create mode 100644 tests/test_split_genomes.py diff --git a/setup.py b/setup.py index 5b1f7bc..a95c996 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,11 @@ def get_var(var_name): install_tool_deps=ephemeris.install_tool_deps:main install-tool-deps=ephemeris.install_tool_deps:main set-library-permissions=ephemeris.set_library_permissions:main -""" + _idc-lint=ephemeris._idc_lint:main + _idc-split-data-manager-genomes=ephemeris._idc_split_data_manager_genomes:main + _idc-data-managers-to-tools=ephemeris._idc_data_managers_to_tools:main + """ + PACKAGE_DATA = { # Be sure to update MANIFEST.in for source dist. 
} diff --git a/src/ephemeris/_config_models.py b/src/ephemeris/_config_models.py new file mode 100644 index 0000000..3b84925 --- /dev/null +++ b/src/ephemeris/_config_models.py @@ -0,0 +1,60 @@ +from pathlib import Path +from typing import ( + Dict, + List, + Optional, + Union, +) + +import yaml +from pydantic import ( + BaseModel, + Extra, +) + + +StrOrPath = Union[Path, str] + + +class DataManager(BaseModel, extra=Extra.forbid): + tags: List[str] + tool_id: str + + +class DataManagers(BaseModel, extra=Extra.forbid): + __root__: Dict[str, DataManager] + + +class Genome(BaseModel): + id: str # The unique id of the data in Galaxy + description: str # The description of the data, including its taxonomy, version and date + dbkey: Optional[str] + source: Optional[str] # The source of the data. Can be: 'ucsc', an NCBI accession number or a URL to a fasta file. + + # The following fields are currently purely for human consumption and unused by + # IDC infrastructure. + doi: Optional[str] # Any DOI associated with the data + blob: Optional[str] # A blob for any other pertinent information + checksum: Optional[str] # A SHA256 checksum of the original + version: Optional[str] # Any version information associated with the data + + # Description of actions (data managers) to run on target genome. 
+ indexers: Optional[List[str]] # indexers to run - keyed on repository name - see data_managers.yml for how to resolve these to tools + skiplist: Optional[List[str]] # unimplemented: but if we implement classes of indexers, these will be ones to skip + + +class Genomes(BaseModel): + genomes: List[Genome] + + +def _read_yaml(path: StrOrPath): + with open(path, "r") as f: + return yaml.safe_load(f) + + +def read_data_managers(path: StrOrPath) -> DataManagers: + return DataManagers(__root__=_read_yaml(path)) + + +def read_genomes(path: StrOrPath) -> Genomes: + return Genomes(**_read_yaml(path)) diff --git a/src/ephemeris/_idc_data_managers_to_tools.py b/src/ephemeris/_idc_data_managers_to_tools.py new file mode 100644 index 0000000..fdb39f4 --- /dev/null +++ b/src/ephemeris/_idc_data_managers_to_tools.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python +"""Helper script for IDC - not yet meant for public consumption. + +This script takes a data_managers.yml configuration describing the +set of data managers the IDC configuration targets and builds a +a tools.yml file from it for use with shed_tools. 
+""" +import argparse +import logging +from typing import ( + Dict, + List, + NamedTuple, +) + +import yaml + +from ._config_models import read_data_managers +from .common_parser import ( + add_log_file_argument, + add_verbosity_argument, +) +from .ephemeris_log import ( + disable_external_library_logging, + setup_global_logger, +) + + +class DataManager(NamedTuple): + tool_id: str + repository_name: str + tags: List[str] + + +def read_data_managers_configuration(path: str) -> Dict[str, DataManager]: + raw_data_managers = read_data_managers(path) + data_managers: Dict[str, DataManager] = {} + for repository_name, data_manager_configuration in raw_data_managers.__root__.items(): + data_manager = DataManager( + tool_id=data_manager_configuration.tool_id, + repository_name=repository_name, + tags=data_manager_configuration.tags or [], + ) + data_managers[repository_name] = data_manager + return data_managers + + +def build_shed_install_conf(path: str) -> dict: + data_managers = read_data_managers_configuration(path) + tools = [] + for data_manager in data_managers.values(): + tool_id = data_manager.tool_id + tool_id_parts = tool_id.split("/") + repo_owner = tool_id_parts[2] + repo_name = tool_id_parts[3] + entry = { + "name": repo_name, + "owner": repo_owner, + "tool_panel_section_label": None, + "tool_shed_url": "toolshed.g2.bx.psu.edu", + } + tools.append(entry) + tools_yaml = {"tools": tools} + return tools_yaml + + +def write_shed_install_conf(data_manager_conf_path: str, output_path: str) -> None: + tools_yaml = build_shed_install_conf(data_manager_conf_path) + with open(output_path, "w") as f: + yaml.safe_dump(tools_yaml, f) + + +def _parser(): + """returns the parser object.""" + + parser = argparse.ArgumentParser(add_help=False) + general_group = parser.add_argument_group("General options") + add_verbosity_argument(general_group) + add_log_file_argument(general_group) + parser.add_argument('--data-managers-conf', default="data_managers.yml") + 
parser.add_argument('--shed-install-output-conf', default="tools.yml") + + +def main(): + disable_external_library_logging() + parser = _parser() + args = parser.parse_args() + log = setup_global_logger(name=__name__, log_file=args.log_file) + if args.verbose: + log.setLevel(logging.DEBUG) + else: + log.setLevel(logging.INFO) + write_shed_install_conf(args.data_managers_conf, args.shed_install_output_conf) + + +if __name__ == "__main__": + main() diff --git a/src/ephemeris/_idc_lint.py b/src/ephemeris/_idc_lint.py new file mode 100644 index 0000000..859f7a8 --- /dev/null +++ b/src/ephemeris/_idc_lint.py @@ -0,0 +1,31 @@ +import os +from pathlib import Path + +import yaml + +from ._config_models import ( + read_data_managers, + read_genomes, +) + + +def read_yaml(path: Path): + with open(path, "r") as f: + return yaml.safe_load(f) + + +def lint_idc_directory(directory: Path): + genomes_path = directory / "genomes.yml" + data_managers_path = directory / "data_managers.yml" + assert genomes_path.exists() + assert data_managers_path.exists() + read_data_managers(data_managers_path) + read_genomes(genomes_path) + + +def main(): + lint_idc_directory(Path(os.curdir)) + + +if __name__ == "__main__": + main() diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py new file mode 100644 index 0000000..7740ef2 --- /dev/null +++ b/src/ephemeris/_idc_split_data_manager_genomes.py @@ -0,0 +1,223 @@ +#!/usr/bin/env python +"""Helper script for IDC - not yet meant for public consumption. + +This script splits genomes.yml into tasks that are meant to be sent to +run_data_managers.py - while excluding data managers executions specified +by genomes.yml that have already been executed and appear in the target +installed data table configuration. 
+""" +import logging +import os +import re +from copy import deepcopy +from typing import ( + Any, + Callable, + Dict, + List, + Optional, +) + +import yaml +from galaxy.util import safe_makedirs +from pydantic import ( + BaseModel, + Extra, +) + +from . import get_galaxy_connection +from .common_parser import ( + get_common_args, +) +from ._idc_data_managers_to_tools import ( + DataManager, + read_data_managers_configuration, +) +from .ephemeris_log import ( + disable_external_library_logging, + setup_global_logger, +) + +IsBuildComplete = Callable[[str, str], bool] +TASK_FILE_NAME = "run_data_managers.yaml" + + +class SplitOptions: + merged_genomes_path: str + split_genomes_path: str + data_managers_path: str + is_build_complete: IsBuildComplete + + +def tool_id_for(indexer: str, data_managers: Dict[str, DataManager]) -> str: + data_manager = data_managers[indexer] + assert data_manager, f"Could not find a target data manager for indexer name {indexer}" + return data_manager.tool_id + + +class RunDataManager(BaseModel): + id: str + items: Optional[List[Any]] = None + params: Optional[List[Any]] = None + data_table_reload: Optional[List[str]] = None + + +class RunDataManagers(BaseModel): + data_managers: List[RunDataManager] + + +class DataManager(BaseModel, extra=Extra.forbid): + tags: List[str] + tool_id: str + + +class DataManagers(BaseModel, extra=Extra.forbid): + __root__: Dict[str, DataManager] + + +class Genome(BaseModel): + pass + + +class Genomes(BaseModel): + genomes: List[Genome] + + +def write_run_data_manager_to_file(run_data_manager: RunDataManager, path: str): + parent, _ = os.path.split(path) + if not os.path.exists(parent): + safe_makedirs(parent) + run_data_managers = RunDataManagers(data_managers=[run_data_manager]) + with open(path, "w") as of: + yaml.safe_dump(run_data_managers.dict(), of) + + +def split_genomes(split_options: SplitOptions) -> None: + + def write_task_file(run_data_manager: RunDataManager, build_id: str, indexer: str): + 
split_genomes_path = split_options.split_genomes_path + if not os.path.exists(split_options.split_genomes_path): + safe_makedirs(split_genomes_path) + + task_file_dir = os.path.join(split_genomes_path, build_id, indexer) + task_file = os.path.join(task_file_dir, TASK_FILE_NAME) + write_run_data_manager_to_file(run_data_manager, task_file) + + data_managers = read_data_managers_configuration(split_options.data_managers_path) + with open(split_options.merged_genomes_path) as f: + genomes_all = yaml.safe_load(f) + genomes = genomes_all["genomes"] + for genome in genomes: + build_id = genome["id"] + + fetch_indexer = "data_manager_fetch_genome_dbkeys_all_fasta" + if not split_options.is_build_complete(build_id, fetch_indexer): + fetch_tool_id = tool_id_for(fetch_indexer, data_managers) + fetch_params = [] + fetch_params.append({"dbkey_source|dbkey": genome["id"]}) + source = genome.get("source") + if source is None: + continue + elif source == "ucsc": + fetch_params.append({"reference_source|reference_source_selector": "ucsc"}) + fetch_params.append({"reference_source|requested_dbkey": genome["id"]}) + elif re.match("^[A-Z_]+[0-9.]+", source): + fetch_params.append({"dbkey_source|dbkey_source_selector": "new"}) + fetch_params.append({"reference_source|reference_source_selector": "ncbi"}) + fetch_params.append( + {"reference_source|requested_identifier": source} + ) + fetch_params.append({"sequence_name": genome["description"]}) + fetch_params.append({"sequence.id": genome["id"]}) + elif re.match("^http", source): + fetch_params.append({"dbkey_source|dbkey_source_selector": "new"}) + fetch_params.append({"reference_source|reference_source_selector": "url"}) + fetch_params.append({"reference_source|user_url": source}) + fetch_params.append({"sequence_name": genome["description"]}) + fetch_params.append({"sequence.id": genome["id"]}) + + fetch_run_data_manager = RunDataManager( + id=fetch_tool_id, + params=fetch_params, + # Not needed according to Marius + # 
data_table_reload=["all_fasta", "__dbkeys__"], + ) + write_task_file(fetch_run_data_manager, build_id, fetch_indexer) + + indexers = genome.get("indexers", []) + for indexer in indexers: + if split_options.is_build_complete(build_id, indexer): + continue + + data_manager = {} + tool_id = tool_id_for(indexer, data_managers) + params = [ + {"all_fasta_source": "{{ item.id }}"}, + {"sequence_name": "{{ item.name }}"}, + {"sequence_id": "{{ item.id }}"}, + ] + if re.search("bwa", tool_id): + data_manager["params"].append({"index_algorithm": "bwtsw"}) + if re.search("color_space", tool_id): + continue + + item = deepcopy(genome) + item.pop("indexers", None) + item.pop("blacklist", None) + + run_data_manager = RunDataManager( + id=tool_id, + params=params, + items=[item], + ) + write_task_file(run_data_manager, build_id, indexer) + + +class GalaxyHistoryIsBuildComplete: + + def __init__(self, history_names: List[str]): + self._history_names = history_names + + def __call__(self, build_id: str, indexer_name: str) -> bool: + target_history_name = f"idc-{build_id}-{indexer_name}" + return target_history_name in self._history_names + + +def _parser(): + """returns the parser object.""" + # login required to check history... 
+ parser = get_common_args(login_required=True) + + parser.add_argument('--merged-genomes-path', '-m', default="genomes.yml") + parser.add_argument('--split-genomes-path', '-s', default="data_manager_tasks") + parser.add_argument('--data-managers-path', default="data_managers.yml") + + +def get_galaxy_history_names(args) -> List[str]: + gi = get_galaxy_connection(args, login_required=True) + return [h["name"] for h in gi.histories.get_histories()] + + +def main(): + disable_external_library_logging() + parser = _parser() + args = parser.parse_args() + log = setup_global_logger(name=__name__, log_file=args.log_file) + if args.verbose: + log.setLevel(logging.DEBUG) + else: + log.setLevel(logging.INFO) + + is_build_complete = GalaxyHistoryIsBuildComplete(get_galaxy_history_names(args)) + + split_options = SplitOptions() + split_options.data_managers_path = args.data_managers_path + split_options.merged_genomes_path = args.merged_genomes_path + split_options.split_genomes_path = args.split_genomes_path + split_options.is_build_complete = is_build_complete + + split_genomes(split_options) + + +if __name__ == "__main__": + main() diff --git a/src/ephemeris/common_parser.py b/src/ephemeris/common_parser.py index cf56a75..f94c7ae 100644 --- a/src/ephemeris/common_parser.py +++ b/src/ephemeris/common_parser.py @@ -22,18 +22,26 @@ class ArgumentDefaultsHideUnderscoresHelpFormatter( pass +def add_verbosity_argument(parser_or_group): + parser_or_group.add_argument("-v", "--verbose", help="Increase output verbosity.", action="store_true") + + +def add_log_file_argument(parser_or_group): + parser_or_group.add_argument( + "--log-file", + "--log_file", + dest="log_file", + help="Where the log file should be stored. 
" "Default is a file in your system's temp folder", + default=None, + ) + + def get_common_args(login_required=True, log_file=False): parser = argparse.ArgumentParser(add_help=False) general_group = parser.add_argument_group("General options") - general_group.add_argument("-v", "--verbose", help="Increase output verbosity.", action="store_true") + add_verbosity_argument(general_group) if log_file: - general_group.add_argument( - "--log-file", - "--log_file", - dest="log_file", - help="Where the log file should be stored. " "Default is a file in your system's temp folder", - default=None, - ) + add_log_file_argument(general_group) con_group = parser.add_argument_group("Galaxy connection") con_group.add_argument( diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_idc_lint.py b/tests/test_idc_lint.py new file mode 100644 index 0000000..50de40c --- /dev/null +++ b/tests/test_idc_lint.py @@ -0,0 +1,9 @@ +from pathlib import Path + +from ephemeris._idc_lint import lint_idc_directory +from .test_split_genomes import setup_mock_idc_dir + + +def test_idc_lint(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + lint_idc_directory(tmp_path) diff --git a/tests/test_split_genomes.py b/tests/test_split_genomes.py new file mode 100644 index 0000000..e623321 --- /dev/null +++ b/tests/test_split_genomes.py @@ -0,0 +1,83 @@ +from pathlib import Path + +import yaml + +from ephemeris._idc_split_data_manager_genomes import ( + GalaxyHistoryIsBuildComplete, + RunDataManagers, + split_genomes, + SplitOptions, +) + +MERGED_YAML_STR = """ +genomes: + - dbkey: hg19_rCRS_pUC18_phiX174 + description: Homo sapiens (hg19 with mtDNA replaced with rCRS, and containing pUC18 + and phiX174) + source: http://datacache.galaxyproject.org/managed/seq/hg19_rCRS_pUC18_phiX174.fa + id: hg19_rCRS_pUC18_phiX174 + indexers: + - data_manager_twobit_builder + - data_manager_star_index_builder + + - dbkey: rn6 + description: Rat Jul. 
2014 (RGSC 6.0/rn6) (rn6) + id: rn6 + source: ucsc + indexers: + - data_manager_twobit_builder + - data_manager_picard_index_builder +""" + +DATA_MANAGER_YAML_STR = """ +data_manager_fetch_genome_dbkeys_all_fasta: + tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_fetch_genome_dbkeys_all_fasta/data_manager_fetch_genome_all_fasta_dbkey/0.0.3' + tags: + - fetch_source +data_manager_twobit_builder: + tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_twobit_builder/twobit_builder_data_manager/0.0.2' + tags: + - genome +data_manager_picard_index_builder: + tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_picard_index_builder/data_manager/picard_index_builder/0.0.1' + tags: + - genome +data_manager_star_index_builder: + tool_id: 'toolshed.g2.bx.psu.edu/repos/iuc/data_manager_star_index_builder/rna_star_index_builder_data_manager/0.0.5' + tags: + - genome +""" + + +def setup_mock_idc_dir(directory: Path): + merged = directory / "genomes.yml" + merged.write_text(MERGED_YAML_STR) + + data_managers = directory / "data_managers.yml" + data_managers.write_text(DATA_MANAGER_YAML_STR) + + +def read_and_validate_run_data_manager_yaml(path): + with open(path, "r") as f: + return RunDataManagers(**yaml.safe_load(f)) + + +def test_split_genomes(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + + split_path = tmp_path / "split" + + history_names = ["idc-hg19_rCRS_pUC18_phiX174-data_manager_star_index_builder"] + is_build_complete = GalaxyHistoryIsBuildComplete(history_names) + + split_options = SplitOptions() + split_options.merged_genomes_path = tmp_path / "genomes.yml" + split_options.split_genomes_path = str(split_path) + split_options.data_managers_path = tmp_path / "data_managers.yml" + split_options.is_build_complete = is_build_complete + split_genomes(split_options) + new_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_twobit_builder" + complete_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_star_index_builder" + 
assert new_task.exists() + assert not complete_task.exists() + read_and_validate_run_data_manager_yaml(new_task / "run_data_managers.yaml") From 2f8e3bb3cd52eba6cebe4aa9ffd9e2396dfc99e4 Mon Sep 17 00:00:00 2001 From: Nate Coraor Date: Fri, 30 Jun 2023 18:47:22 -0400 Subject: [PATCH 06/24] Actually return the parser --- src/ephemeris/_idc_data_managers_to_tools.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ephemeris/_idc_data_managers_to_tools.py b/src/ephemeris/_idc_data_managers_to_tools.py index fdb39f4..ca8127b 100644 --- a/src/ephemeris/_idc_data_managers_to_tools.py +++ b/src/ephemeris/_idc_data_managers_to_tools.py @@ -79,6 +79,7 @@ def _parser(): add_log_file_argument(general_group) parser.add_argument('--data-managers-conf', default="data_managers.yml") parser.add_argument('--shed-install-output-conf', default="tools.yml") + return parser def main(): From 7f115d5626ae855ef8b3d9dda1e65afe1ea24b5d Mon Sep 17 00:00:00 2001 From: Nate Coraor Date: Fri, 30 Jun 2023 20:39:01 -0400 Subject: [PATCH 07/24] Fix _idc_split_data_manager_genomes parser --- src/ephemeris/_idc_split_data_manager_genomes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py index 7740ef2..a1f82b7 100644 --- a/src/ephemeris/_idc_split_data_manager_genomes.py +++ b/src/ephemeris/_idc_split_data_manager_genomes.py @@ -186,11 +186,11 @@ def __call__(self, build_id: str, indexer_name: str) -> bool: def _parser(): """returns the parser object.""" # login required to check history... 
- parser = get_common_args(login_required=True) - + parser = get_common_args(login_required=True, log_file=True) parser.add_argument('--merged-genomes-path', '-m', default="genomes.yml") parser.add_argument('--split-genomes-path', '-s', default="data_manager_tasks") parser.add_argument('--data-managers-path', default="data_managers.yml") + return parser def get_galaxy_history_names(args) -> List[str]: From 07e050497af017476959593ff68095f52aef7643 Mon Sep 17 00:00:00 2001 From: Nate Coraor Date: Fri, 30 Jun 2023 21:12:51 -0400 Subject: [PATCH 08/24] Include description in fetch --- src/ephemeris/_idc_split_data_manager_genomes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py index a1f82b7..5de770b 100644 --- a/src/ephemeris/_idc_split_data_manager_genomes.py +++ b/src/ephemeris/_idc_split_data_manager_genomes.py @@ -121,6 +121,7 @@ def write_task_file(run_data_manager: RunDataManager, build_id: str, indexer: st elif source == "ucsc": fetch_params.append({"reference_source|reference_source_selector": "ucsc"}) fetch_params.append({"reference_source|requested_dbkey": genome["id"]}) + fetch_params.append({"sequence_name": genome["description"]}) elif re.match("^[A-Z_]+[0-9.]+", source): fetch_params.append({"dbkey_source|dbkey_source_selector": "new"}) fetch_params.append({"reference_source|reference_source_selector": "ncbi"}) From 8f5352712734ba216a81158db3f5a06953a070e7 Mon Sep 17 00:00:00 2001 From: Nate Coraor Date: Fri, 30 Jun 2023 21:13:03 -0400 Subject: [PATCH 09/24] Add logging --- src/ephemeris/_idc_split_data_manager_genomes.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py index 5de770b..ad0ac8f 100644 --- a/src/ephemeris/_idc_split_data_manager_genomes.py +++ b/src/ephemeris/_idc_split_data_manager_genomes.py @@ -41,6 +41,8 
@@ IsBuildComplete = Callable[[str, str], bool] TASK_FILE_NAME = "run_data_managers.yaml" +log = logging.getLogger(__name__) + class SplitOptions: merged_genomes_path: str @@ -112,6 +114,7 @@ def write_task_file(run_data_manager: RunDataManager, build_id: str, indexer: st fetch_indexer = "data_manager_fetch_genome_dbkeys_all_fasta" if not split_options.is_build_complete(build_id, fetch_indexer): + log.info(f"Fetching: {build_id}") fetch_tool_id = tool_id_for(fetch_indexer, data_managers) fetch_params = [] fetch_params.append({"dbkey_source|dbkey": genome["id"]}) @@ -144,21 +147,26 @@ def write_task_file(run_data_manager: RunDataManager, build_id: str, indexer: st # data_table_reload=["all_fasta", "__dbkeys__"], ) write_task_file(fetch_run_data_manager, build_id, fetch_indexer) + else: + log.debug(f"Fetch is already completed: {build_id}") indexers = genome.get("indexers", []) for indexer in indexers: if split_options.is_build_complete(build_id, indexer): + log.debug(f"Build is already completed: {build_id} {indexer}") continue - data_manager = {} + log.info(f"Building: {build_id} {indexer}") + tool_id = tool_id_for(indexer, data_managers) params = [ {"all_fasta_source": "{{ item.id }}"}, {"sequence_name": "{{ item.name }}"}, {"sequence_id": "{{ item.id }}"}, ] + # why is this not pulled from the data managers conf? 
-nate if re.search("bwa", tool_id): - data_manager["params"].append({"index_algorithm": "bwtsw"}) + params.append({"index_algorithm": "bwtsw"}) if re.search("color_space", tool_id): continue From c83ec6897244d7aa170d05fc1dc40ecab39506cd Mon Sep 17 00:00:00 2001 From: Nate Coraor Date: Fri, 30 Jun 2023 22:13:08 -0400 Subject: [PATCH 10/24] Set defaults compatible with run-data-managers --- src/ephemeris/_idc_split_data_manager_genomes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py index ad0ac8f..e371497 100644 --- a/src/ephemeris/_idc_split_data_manager_genomes.py +++ b/src/ephemeris/_idc_split_data_manager_genomes.py @@ -59,9 +59,9 @@ def tool_id_for(indexer: str, data_managers: Dict[str, DataManager]) -> str: class RunDataManager(BaseModel): id: str - items: Optional[List[Any]] = None + items: Optional[List[Any]] = [] params: Optional[List[Any]] = None - data_table_reload: Optional[List[str]] = None + data_table_reload: Optional[List[str]] = [] class RunDataManagers(BaseModel): @@ -172,7 +172,7 @@ def write_task_file(run_data_manager: RunDataManager, build_id: str, indexer: st item = deepcopy(genome) item.pop("indexers", None) - item.pop("blacklist", None) + item.pop("skiplist", None) run_data_manager = RunDataManager( id=tool_id, From 230be451421f2fd1fb4e43ad72552fcf3458bff8 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Sat, 1 Jul 2023 12:19:08 -0400 Subject: [PATCH 11/24] Handle empty items and data reload fields in run_data_managers. 
--- src/ephemeris/run_data_managers.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/ephemeris/run_data_managers.py b/src/ephemeris/run_data_managers.py index 5d0a569..1dc4898 100644 --- a/src/ephemeris/run_data_managers.py +++ b/src/ephemeris/run_data_managers.py @@ -156,8 +156,8 @@ def get_dm_jobs(self, dm): :returns job_list, skipped_job_list""" job_list = [] skipped_job_list = [] - items = self.parse_items(dm.get("items", [""])) - for item in items: + + def handle_item(item: str): dm_id = dm["id"] params = dm["params"] inputs = dict() @@ -171,11 +171,20 @@ def get_dm_jobs(self, dm): job = dict(tool_id=dm_id, inputs=inputs) - data_tables = dm.get("data_table_reload", []) + data_tables = dm.get("data_table_reload") or [] if self.input_entries_exist_in_data_tables(data_tables, inputs): skipped_job_list.append(job) else: job_list.append(job) + + raw_items = dm.get("items") or None + if raw_items: + items = self.parse_items(raw_items) + for item in items: + handle_item(item) + else: + handle_item("") + return job_list, skipped_job_list def dm_is_fetcher(self, dm): From ae3a75af17de1de9104760b4c6bcf8a4defb9f3e Mon Sep 17 00:00:00 2001 From: John Chilton Date: Sat, 1 Jul 2023 12:32:07 -0400 Subject: [PATCH 12/24] Don't serialize unset fields when splitting genomes.yml into run tasks. 
--- src/ephemeris/_idc_split_data_manager_genomes.py | 6 +++--- tests/test_split_genomes.py | 10 +++++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py index e371497..038865e 100644 --- a/src/ephemeris/_idc_split_data_manager_genomes.py +++ b/src/ephemeris/_idc_split_data_manager_genomes.py @@ -59,9 +59,9 @@ def tool_id_for(indexer: str, data_managers: Dict[str, DataManager]) -> str: class RunDataManager(BaseModel): id: str - items: Optional[List[Any]] = [] + items: Optional[List[Any]] = None params: Optional[List[Any]] = None - data_table_reload: Optional[List[str]] = [] + data_table_reload: Optional[List[str]] = None class RunDataManagers(BaseModel): @@ -91,7 +91,7 @@ def write_run_data_manager_to_file(run_data_manager: RunDataManager, path: str): safe_makedirs(parent) run_data_managers = RunDataManagers(data_managers=[run_data_manager]) with open(path, "w") as of: - yaml.safe_dump(run_data_managers.dict(), of) + yaml.safe_dump(run_data_managers.dict(exclude_unset=True), of) def split_genomes(split_options: SplitOptions) -> None: diff --git a/tests/test_split_genomes.py b/tests/test_split_genomes.py index e623321..5dc7c5a 100644 --- a/tests/test_split_genomes.py +++ b/tests/test_split_genomes.py @@ -80,4 +80,12 @@ def test_split_genomes(tmp_path: Path): complete_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_star_index_builder" assert new_task.exists() assert not complete_task.exists() - read_and_validate_run_data_manager_yaml(new_task / "run_data_managers.yaml") + new_task_run_yaml = new_task / "run_data_managers.yaml" + # ensure we don't serialize unset fields + assert "data_table_reload" not in new_task_run_yaml.read_text() + run = read_and_validate_run_data_manager_yaml(new_task_run_yaml) + assert len(run.data_managers) == 1 + data_manager = run.data_managers[0] + assert data_manager.id == 
"toolshed.g2.bx.psu.edu/repos/devteam/data_manager_twobit_builder/twobit_builder_data_manager/0.0.2" + assert data_manager.items[0]["id"] == "hg19_rCRS_pUC18_phiX174" + assert data_manager.items[0]["dbkey"] == "hg19_rCRS_pUC18_phiX174" From 45fe43c8f52fbfd373ad5371a67fdae3de4bd5be Mon Sep 17 00:00:00 2001 From: John Chilton Date: Sat, 1 Jul 2023 12:38:29 -0400 Subject: [PATCH 13/24] Lint fixes & fix for adding __init__ in tests. --- src/ephemeris/_config_models.py | 4 ++-- src/ephemeris/_idc_split_data_manager_genomes.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ephemeris/_config_models.py b/src/ephemeris/_config_models.py index 3b84925..7eff971 100644 --- a/src/ephemeris/_config_models.py +++ b/src/ephemeris/_config_models.py @@ -34,8 +34,8 @@ class Genome(BaseModel): # The following fields are currently purely for human consumption and unused by # IDC infrastructure. doi: Optional[str] # Any DOI associated with the data - blob: Optional[str] # A blob for any other pertinent information - checksum: Optional[str] # A SHA256 checksum of the original + blob: Optional[str] # A blob for any other pertinent information + checksum: Optional[str] # A SHA256 checksum of the original version: Optional[str] # Any version information associated with the data # Description of actions (data managers) to run on target genome. diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py index 038865e..dfcc770 100644 --- a/src/ephemeris/_idc_split_data_manager_genomes.py +++ b/src/ephemeris/_idc_split_data_manager_genomes.py @@ -26,13 +26,13 @@ ) from . 
import get_galaxy_connection -from .common_parser import ( - get_common_args, -) from ._idc_data_managers_to_tools import ( DataManager, read_data_managers_configuration, ) +from .common_parser import ( + get_common_args, +) from .ephemeris_log import ( disable_external_library_logging, setup_global_logger, From 76b4519e6ae299462723a3c8684d061378c94713 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Sat, 1 Jul 2023 12:55:28 -0400 Subject: [PATCH 14/24] Improved IDC linting... --- src/ephemeris/_idc_lint.py | 10 ++++++++-- tests/test_idc_lint.py | 33 ++++++++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/src/ephemeris/_idc_lint.py b/src/ephemeris/_idc_lint.py index 859f7a8..bf02745 100644 --- a/src/ephemeris/_idc_lint.py +++ b/src/ephemeris/_idc_lint.py @@ -19,8 +19,14 @@ def lint_idc_directory(directory: Path): data_managers_path = directory / "data_managers.yml" assert genomes_path.exists() assert data_managers_path.exists() - read_data_managers(data_managers_path) - read_genomes(genomes_path) + data_managers = read_data_managers(data_managers_path).__root__ + genomes = read_genomes(genomes_path) + print(genomes) + for genome in genomes.genomes: + print(genome) + for indexer in (genome.indexers or []): + if indexer not in data_managers: + raise Exception(f"Failed to find data manager {indexer} referenced for genome {genome}") def main(): diff --git a/tests/test_idc_lint.py b/tests/test_idc_lint.py index 50de40c..5ec997a 100644 --- a/tests/test_idc_lint.py +++ b/tests/test_idc_lint.py @@ -1,9 +1,40 @@ from pathlib import Path +import pytest + from ephemeris._idc_lint import lint_idc_directory from .test_split_genomes import setup_mock_idc_dir -def test_idc_lint(tmp_path: Path): +MISSPELLED_DATA_MANAGER_YAML_STR = """ +data_manager_fetch_genome_dbkeys_all_fasta: + tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_fetch_genome_dbkeys_all_fasta/data_manager_fetch_genome_all_fasta_dbkey/0.0.3' + tags: + - 
fetch_source +data_manager_two_bit_builder: + tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_twobit_builder/twobit_builder_data_manager/0.0.2' + tags: + - genome +data_manager_picard_index_builder: + tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_picard_index_builder/data_manager/picard_index_builder/0.0.1' + tags: + - genome +data_manager_star_index_builder: + tool_id: 'toolshed.g2.bx.psu.edu/repos/iuc/data_manager_star_index_builder/rna_star_index_builder_data_manager/0.0.5' + tags: + - genome +""" + + +def test_idc_lint_valid(tmp_path: Path): setup_mock_idc_dir(tmp_path) lint_idc_directory(tmp_path) + + +def test_idc_lint_misspelled_dm(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + (tmp_path / "data_managers.yml").write_text(MISSPELLED_DATA_MANAGER_YAML_STR) + with pytest.raises(Exception) as exc_info: + lint_idc_directory(tmp_path) + # misspelled two_bit in data managers so data_manager_twobit_builder is missing + assert "data_manager_twobit_builder" in str(exc_info.value) From c0b4cd49252452e07c088570386de411cc189d12 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Sat, 1 Jul 2023 14:42:15 -0400 Subject: [PATCH 15/24] More shed linting... 
--- src/ephemeris/_idc_lint.py | 7 ++++++- tests/test_idc_lint.py | 24 ++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/ephemeris/_idc_lint.py b/src/ephemeris/_idc_lint.py index bf02745..e8949ea 100644 --- a/src/ephemeris/_idc_lint.py +++ b/src/ephemeris/_idc_lint.py @@ -21,7 +21,12 @@ def lint_idc_directory(directory: Path): assert data_managers_path.exists() data_managers = read_data_managers(data_managers_path).__root__ genomes = read_genomes(genomes_path) - print(genomes) + + for data_manager in data_managers.values(): + data_manager_tool_id = data_manager.tool_id + if not data_manager_tool_id.startswith("toolshed.g2.bx.psu.edu/"): + raise Exception(f"Expected a data manager repository from main Galaxy tool shed but discovered tool ID {data_manager_tool_id}") + for genome in genomes.genomes: print(genome) for indexer in (genome.indexers or []): diff --git a/tests/test_idc_lint.py b/tests/test_idc_lint.py index 5ec997a..3db0e74 100644 --- a/tests/test_idc_lint.py +++ b/tests/test_idc_lint.py @@ -25,6 +25,25 @@ - genome """ +TESTTOOLSHED_DATA_MANAGER_YAML_STR = """ +data_manager_fetch_genome_dbkeys_all_fasta: + tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_fetch_genome_dbkeys_all_fasta/data_manager_fetch_genome_all_fasta_dbkey/0.0.3' + tags: + - fetch_source +data_manager_twobit_builder: + tool_id: 'testtoolshed.g2.bx.psu.edu/repos/devteam/data_manager_twobit_builder/twobit_builder_data_manager/0.0.2' + tags: + - genome +data_manager_picard_index_builder: + tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_picard_index_builder/data_manager/picard_index_builder/0.0.1' + tags: + - genome +data_manager_star_index_builder: + tool_id: 'toolshed.g2.bx.psu.edu/repos/iuc/data_manager_star_index_builder/rna_star_index_builder_data_manager/0.0.5' + tags: + - genome +""" + def test_idc_lint_valid(tmp_path: Path): setup_mock_idc_dir(tmp_path) @@ -38,3 +57,8 @@ def test_idc_lint_misspelled_dm(tmp_path: Path): 
lint_idc_directory(tmp_path) # misspelled two_bit in data managers so data_manager_twobit_builder is missing assert "data_manager_twobit_builder" in str(exc_info.value) + + (tmp_path / "data_managers.yml").write_text(TESTTOOLSHED_DATA_MANAGER_YAML_STR) + with pytest.raises(Exception) as exc_info: + lint_idc_directory(tmp_path) + assert "testtoolshed" in str(exc_info.value) From ff2d58f44a8eb1ae87283920801e8d48cbc5988f Mon Sep 17 00:00:00 2001 From: John Chilton Date: Sat, 1 Jul 2023 14:47:54 -0400 Subject: [PATCH 16/24] Test case for data manager tools YAML generation. --- src/ephemeris/_config_models.py | 23 ++++++++++++++++++++ src/ephemeris/_idc_data_managers_to_tools.py | 9 +++++++- tests/test_idc_data_managers_to_tools.py | 13 +++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 tests/test_idc_data_managers_to_tools.py diff --git a/src/ephemeris/_config_models.py b/src/ephemeris/_config_models.py index 7eff971..ce74804 100644 --- a/src/ephemeris/_config_models.py +++ b/src/ephemeris/_config_models.py @@ -16,6 +16,25 @@ StrOrPath = Union[Path, str] +class RepositoryInstallTarget(BaseModel): + name: str + owner: str + tool_shed_url: Optional[str] + tool_panel_section_id: Optional[str] + tool_panel_section_label: Optional[str] + revisions: Optional[List[str]] + install_tool_dependencies: Optional[bool] + install_repository_dependencies: Optional[bool] + install_resolver_dependencies: Optional[bool] + + +class RepositoryInstallTargets(BaseModel): + """ """ + api_key: Optional[str] + galaxy_instance: Optional[str] + tools: List[RepositoryInstallTarget] + + class DataManager(BaseModel, extra=Extra.forbid): tags: List[str] tool_id: str @@ -58,3 +77,7 @@ def read_data_managers(path: StrOrPath) -> DataManagers: def read_genomes(path: StrOrPath) -> Genomes: return Genomes(**_read_yaml(path)) + + +def read_tools(path: StrOrPath) -> RepositoryInstallTargets: + return RepositoryInstallTargets(**_read_yaml(path)) diff --git 
a/src/ephemeris/_idc_data_managers_to_tools.py b/src/ephemeris/_idc_data_managers_to_tools.py index ca8127b..20f7b68 100644 --- a/src/ephemeris/_idc_data_managers_to_tools.py +++ b/src/ephemeris/_idc_data_managers_to_tools.py @@ -15,7 +15,10 @@ import yaml -from ._config_models import read_data_managers +from ._config_models import ( + read_data_managers, + RepositoryInstallTargets, +) from .common_parser import ( add_log_file_argument, add_verbosity_argument, @@ -66,6 +69,10 @@ def build_shed_install_conf(path: str) -> dict: def write_shed_install_conf(data_manager_conf_path: str, output_path: str) -> None: tools_yaml = build_shed_install_conf(data_manager_conf_path) + + # validate generated dict to ensure we're writing out valid file + RepositoryInstallTargets(**tools_yaml) + with open(output_path, "w") as f: yaml.safe_dump(tools_yaml, f) diff --git a/tests/test_idc_data_managers_to_tools.py b/tests/test_idc_data_managers_to_tools.py new file mode 100644 index 0000000..f00a647 --- /dev/null +++ b/tests/test_idc_data_managers_to_tools.py @@ -0,0 +1,13 @@ +from pathlib import Path + +from ephemeris._config_models import read_tools +from ephemeris._idc_data_managers_to_tools import write_shed_install_conf +from .test_split_genomes import setup_mock_idc_dir + + +def test_idc_lint_valid(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + output_path = tmp_path / "output.yaml" + write_shed_install_conf(tmp_path / "data_managers.yml", output_path) + # validate the generated tools file... + read_tools(output_path) From 2ff111dbb1c89fd303f1074688f5c7a0aff91ad9 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Sat, 1 Jul 2023 15:30:17 -0400 Subject: [PATCH 17/24] Implement stages and filtering in IDC split script. 
--- .../_idc_split_data_manager_genomes.py | 80 ++++++++++++++---- tests/test_split_genomes.py | 81 +++++++++++++++++-- 2 files changed, 138 insertions(+), 23 deletions(-) diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py index dfcc770..ee68441 100644 --- a/src/ephemeris/_idc_split_data_manager_genomes.py +++ b/src/ephemeris/_idc_split_data_manager_genomes.py @@ -44,11 +44,27 @@ log = logging.getLogger(__name__) +class Filters: + stage: Optional[int] = None + data_manager: Optional[str] = None + build_id: Optional[str] = None + + def filter_out_data_manager(self, data_manager: str) -> bool: + return bool(self.data_manager and data_manager != self.data_manager) + + def filter_out_build_id(self, build_id: str) -> bool: + return bool(self.build_id and build_id != self.build_id) + + def filter_out_stage(self, stage: int) -> bool: + return bool(self.stage is not None and self.stage != stage) + + class SplitOptions: merged_genomes_path: str split_genomes_path: str data_managers_path: str is_build_complete: IsBuildComplete + filters: Filters = Filters() def tool_id_for(indexer: str, data_managers: Dict[str, DataManager]) -> str: @@ -94,34 +110,31 @@ def write_run_data_manager_to_file(run_data_manager: RunDataManager, path: str): yaml.safe_dump(run_data_managers.dict(exclude_unset=True), of) -def split_genomes(split_options: SplitOptions) -> None: - - def write_task_file(run_data_manager: RunDataManager, build_id: str, indexer: str): - split_genomes_path = split_options.split_genomes_path - if not os.path.exists(split_options.split_genomes_path): - safe_makedirs(split_genomes_path) - - task_file_dir = os.path.join(split_genomes_path, build_id, indexer) - task_file = os.path.join(task_file_dir, TASK_FILE_NAME) - write_run_data_manager_to_file(run_data_manager, task_file) - +def walk_over_incomplete_runs(split_options: SplitOptions): data_managers = 
read_data_managers_configuration(split_options.data_managers_path) with open(split_options.merged_genomes_path) as f: genomes_all = yaml.safe_load(f) genomes = genomes_all["genomes"] for genome in genomes: build_id = genome["id"] + if split_options.filters.filter_out_build_id(build_id): + continue fetch_indexer = "data_manager_fetch_genome_dbkeys_all_fasta" - if not split_options.is_build_complete(build_id, fetch_indexer): + do_fetch = not split_options.filters.filter_out_data_manager(fetch_indexer) + source = genome.get("source") + if source is None: + do_fetch = False + if do_fetch and split_options.filters.filter_out_stage(0): + do_fetch = False + + if do_fetch and not split_options.is_build_complete(build_id, fetch_indexer): log.info(f"Fetching: {build_id}") fetch_tool_id = tool_id_for(fetch_indexer, data_managers) fetch_params = [] fetch_params.append({"dbkey_source|dbkey": genome["id"]}) source = genome.get("source") - if source is None: - continue - elif source == "ucsc": + if source == "ucsc": fetch_params.append({"reference_source|reference_source_selector": "ucsc"}) fetch_params.append({"reference_source|requested_dbkey": genome["id"]}) fetch_params.append({"sequence_name": genome["description"]}) @@ -146,12 +159,18 @@ def write_task_file(run_data_manager: RunDataManager, build_id: str, indexer: st # Not needed according to Marius # data_table_reload=["all_fasta", "__dbkeys__"], ) - write_task_file(fetch_run_data_manager, build_id, fetch_indexer) + yield (build_id, fetch_indexer, fetch_run_data_manager) else: log.debug(f"Fetch is already completed: {build_id}") indexers = genome.get("indexers", []) for indexer in indexers: + if split_options.filters.filter_out_data_manager(indexer): + continue + + if split_options.filters.filter_out_stage(1): + continue + if split_options.is_build_complete(build_id, indexer): log.debug(f"Build is already completed: {build_id} {indexer}") continue @@ -179,7 +198,22 @@ def write_task_file(run_data_manager: RunDataManager, 
build_id: str, indexer: st params=params, items=[item], ) - write_task_file(run_data_manager, build_id, indexer) + yield (build_id, indexer, run_data_manager) + + +def split_genomes(split_options: SplitOptions) -> None: + + def write_task_file(build_id: str, indexer: str, run_data_manager: RunDataManager): + split_genomes_path = split_options.split_genomes_path + if not os.path.exists(split_options.split_genomes_path): + safe_makedirs(split_genomes_path) + + task_file_dir = os.path.join(split_genomes_path, build_id, indexer) + task_file = os.path.join(task_file_dir, TASK_FILE_NAME) + write_run_data_manager_to_file(run_data_manager, task_file) + + for build_id, indexer, run_data_manager in walk_over_incomplete_runs(split_options): + write_task_file(build_id, indexer, run_data_manager) class GalaxyHistoryIsBuildComplete: @@ -199,6 +233,12 @@ def _parser(): parser.add_argument('--merged-genomes-path', '-m', default="genomes.yml") parser.add_argument('--split-genomes-path', '-s', default="data_manager_tasks") parser.add_argument('--data-managers-path', default="data_managers.yml") + + # filters + parser.add_argument('--filter-stage', default=None) + parser.add_argument('--filter-data-manager', default=None) + parser.add_argument('--filter-build-id', default=None) + return parser @@ -225,6 +265,12 @@ def main(): split_options.split_genomes_path = args.split_genomes_path split_options.is_build_complete = is_build_complete + filters = Filters() + filters.build_id = args.filter_build_id + filters.data_manager = args.filter_data_manager + filters.stage = args.filter_stage + split_options.filters = filters + split_genomes(split_options) diff --git a/tests/test_split_genomes.py b/tests/test_split_genomes.py index 5dc7c5a..fbddcef 100644 --- a/tests/test_split_genomes.py +++ b/tests/test_split_genomes.py @@ -3,6 +3,7 @@ import yaml from ephemeris._idc_split_data_manager_genomes import ( + Filters, GalaxyHistoryIsBuildComplete, RunDataManagers, split_genomes, @@ -62,19 +63,22 
@@ def read_and_validate_run_data_manager_yaml(path): return RunDataManagers(**yaml.safe_load(f)) -def test_split_genomes(tmp_path: Path): - setup_mock_idc_dir(tmp_path) - - split_path = tmp_path / "split" - +def split_options_for(tmp_path: Path) -> SplitOptions: history_names = ["idc-hg19_rCRS_pUC18_phiX174-data_manager_star_index_builder"] is_build_complete = GalaxyHistoryIsBuildComplete(history_names) split_options = SplitOptions() split_options.merged_genomes_path = tmp_path / "genomes.yml" - split_options.split_genomes_path = str(split_path) + split_options.split_genomes_path = str(tmp_path / "split") split_options.data_managers_path = tmp_path / "data_managers.yml" split_options.is_build_complete = is_build_complete + return split_options + + +def test_split_genomes(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + split_path = tmp_path / "split" + split_options = split_options_for(tmp_path) split_genomes(split_options) new_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_twobit_builder" complete_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_star_index_builder" @@ -89,3 +93,68 @@ def test_split_genomes(tmp_path: Path): assert data_manager.id == "toolshed.g2.bx.psu.edu/repos/devteam/data_manager_twobit_builder/twobit_builder_data_manager/0.0.2" assert data_manager.items[0]["id"] == "hg19_rCRS_pUC18_phiX174" assert data_manager.items[0]["dbkey"] == "hg19_rCRS_pUC18_phiX174" + + +def test_split_genomes_filter_on_data_manager(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + split_path = tmp_path / "split" + split_options = split_options_for(tmp_path) + filters = Filters() + filters.data_manager = "data_manager_star_index_builder" + split_options.filters = filters + + split_genomes(split_options) + new_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_twobit_builder" + assert not new_task.exists() + + filters.data_manager = "data_manager_twobit_builder" + split_genomes(split_options) + assert new_task.exists() + + +def 
test_split_genomes_filter_on_build_id(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + split_path = tmp_path / "split" + split_options = split_options_for(tmp_path) + filters = Filters() + filters.build_id = "rn6" + split_options.filters = filters + + split_genomes(split_options) + filtered_out_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_twobit_builder" + assert not filtered_out_task.exists() + + filtered_in_task = split_path / "rn6" / "data_manager_twobit_builder" + assert filtered_in_task.exists() + + +def test_split_genomes_filter_on_stage_0(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + split_path = tmp_path / "split" + split_options = split_options_for(tmp_path) + filters = Filters() + filters.stage = 0 + split_options.filters = filters + + split_genomes(split_options) + filtered_out_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_twobit_builder" + assert not filtered_out_task.exists() + + filtered_in_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_fetch_genome_dbkeys_all_fasta" + assert filtered_in_task.exists() + + +def test_split_genomes_filter_on_stage_1(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + split_path = tmp_path / "split" + split_options = split_options_for(tmp_path) + filters = Filters() + filters.stage = 1 + split_options.filters = filters + + split_genomes(split_options) + filtered_out_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_fetch_genome_dbkeys_all_fasta" + assert not filtered_out_task.exists() + + filtered_in_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_twobit_builder" + assert filtered_in_task.exists() From 2d9e757bba77a3490bbd05d10d0976f5e9826c3e Mon Sep 17 00:00:00 2001 From: John Chilton Date: Sat, 1 Jul 2023 16:22:15 -0400 Subject: [PATCH 18/24] New defaults... 
EPHEMERIS_GALAXY and EPHEMERIS_API_KEY --- src/ephemeris/__init__.py | 4 +++- src/ephemeris/common_parser.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/ephemeris/__init__.py b/src/ephemeris/__init__.py index 42f42b8..60d0847 100644 --- a/src/ephemeris/__init__.py +++ b/src/ephemeris/__init__.py @@ -1,3 +1,5 @@ +import os + import yaml from bioblend import galaxy @@ -40,7 +42,7 @@ def get_galaxy_connection(args, file=None, log=None, login_required=True): url = args.galaxy or file_content.get("galaxy_instance") galaxy_url = check_url(url, log) - api_key = args.api_key or file_content.get("api_key") + api_key = args.api_key or file_content.get("api_key") or os.environ.get("EPHEMERIS_API_KEY") if args.user and args.password: return galaxy.GalaxyInstance(url=galaxy_url, email=args.user, password=args.password) diff --git a/src/ephemeris/common_parser.py b/src/ephemeris/common_parser.py index f94c7ae..288c9c4 100644 --- a/src/ephemeris/common_parser.py +++ b/src/ephemeris/common_parser.py @@ -1,6 +1,7 @@ #!/usr/bin/env python import argparse +import os DEFAULT_JOB_SLEEP = 3 @@ -44,11 +45,12 @@ def get_common_args(login_required=True, log_file=False): add_log_file_argument(general_group) con_group = parser.add_argument_group("Galaxy connection") + default_galaxy = os.environ.get("EPHEMERIS_GALAXY") or "http://localhost:8080" con_group.add_argument( "-g", "--galaxy", help="Target Galaxy instance URL/IP address", - default="http://localhost:8080", + default=default_galaxy, ) if login_required: From 5356077e1e61fe24f291d4f0a425f2521e56d079 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Sat, 1 Jul 2023 16:28:52 -0400 Subject: [PATCH 19/24] Skip broken data table checking logic if no data table reloads are found... 
--- src/ephemeris/run_data_managers.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/ephemeris/run_data_managers.py b/src/ephemeris/run_data_managers.py index 1dc4898..377249b 100644 --- a/src/ephemeris/run_data_managers.py +++ b/src/ephemeris/run_data_managers.py @@ -217,6 +217,11 @@ def data_table_entry_exists(self, data_table_name, entry, column="value"): def input_entries_exist_in_data_tables(self, data_tables, input_dict): """Checks whether name and value entries from the input are already present in the data tables. If an entry is missing in of the tables, this function returns False""" + if data_tables is None or len(data_tables) == 0: + # this logic is all broken I (@jmchilton) think, but lets just skip it all + # if we know we don't have data tables to check + return False + value_entry = get_first_valid_entry(input_dict, self.possible_value_keys) name_entry = get_first_valid_entry(input_dict, self.possible_name_keys) From 7b2773c3fb98d6112061b8200bc8d36069abf0d8 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Sat, 1 Jul 2023 17:06:42 -0400 Subject: [PATCH 20/24] Implement --tool-id-mode=short option to get short IDs for run-data-managers --- .../_idc_split_data_manager_genomes.py | 22 +++++++++++++++---- tests/test_split_genomes.py | 15 +++++++++++++ 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py index ee68441..be5c253 100644 --- a/src/ephemeris/_idc_split_data_manager_genomes.py +++ b/src/ephemeris/_idc_split_data_manager_genomes.py @@ -40,6 +40,7 @@ IsBuildComplete = Callable[[str, str], bool] TASK_FILE_NAME = "run_data_managers.yaml" +DEFAULT_TOOL_ID_MODE = "tool_shed_guid" log = logging.getLogger(__name__) @@ -64,13 +65,23 @@ class SplitOptions: split_genomes_path: str data_managers_path: str is_build_complete: IsBuildComplete + tool_id_mode: str = DEFAULT_TOOL_ID_MODE filters: Filters = Filters() -def tool_id_for(indexer: 
str, data_managers: Dict[str, DataManager]) -> str: +def tool_id_for(indexer: str, data_managers: Dict[str, DataManager], mode: str) -> str: data_manager = data_managers[indexer] assert data_manager, f"Could not find a target data manager for indexer name {indexer}" - return data_manager.tool_id + tool_shed_guid = data_manager.tool_id + if mode == "short": + _ts, _, _owner, _repo_name, rest = tool_shed_guid.split("/", 4) + if "/" in rest: + print(rest) + return rest.split("/")[0] + else: + return rest + else: + return tool_shed_guid class RunDataManager(BaseModel): @@ -130,7 +141,7 @@ def walk_over_incomplete_runs(split_options: SplitOptions): if do_fetch and not split_options.is_build_complete(build_id, fetch_indexer): log.info(f"Fetching: {build_id}") - fetch_tool_id = tool_id_for(fetch_indexer, data_managers) + fetch_tool_id = tool_id_for(fetch_indexer, data_managers, split_options.tool_id_mode) fetch_params = [] fetch_params.append({"dbkey_source|dbkey": genome["id"]}) source = genome.get("source") @@ -177,7 +188,7 @@ def walk_over_incomplete_runs(split_options: SplitOptions): log.info(f"Building: {build_id} {indexer}") - tool_id = tool_id_for(indexer, data_managers) + tool_id = tool_id_for(indexer, data_managers, split_options.tool_id_mode) params = [ {"all_fasta_source": "{{ item.id }}"}, {"sequence_name": "{{ item.name }}"}, @@ -234,6 +245,8 @@ def _parser(): parser.add_argument('--split-genomes-path', '-s', default="data_manager_tasks") parser.add_argument('--data-managers-path', default="data_managers.yml") + parser.add_argument("--tool-id-mode", choices=["tool_shed_guid", "short"], default=DEFAULT_TOOL_ID_MODE) + # filters parser.add_argument('--filter-stage', default=None) parser.add_argument('--filter-data-manager', default=None) @@ -264,6 +277,7 @@ def main(): split_options.merged_genomes_path = args.merged_genomes_path split_options.split_genomes_path = args.split_genomes_path split_options.is_build_complete = is_build_complete + 
split_options.tool_id_mode = args.tool_id_mode filters = Filters() filters.build_id = args.filter_build_id diff --git a/tests/test_split_genomes.py b/tests/test_split_genomes.py index fbddcef..2ebb39f 100644 --- a/tests/test_split_genomes.py +++ b/tests/test_split_genomes.py @@ -95,6 +95,21 @@ def test_split_genomes(tmp_path: Path): assert data_manager.items[0]["dbkey"] == "hg19_rCRS_pUC18_phiX174" +def test_split_genomes_short_ids(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + split_path = tmp_path / "split" + split_options = split_options_for(tmp_path) + split_options.tool_id_mode = "short" + split_genomes(split_options) + + new_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_twobit_builder" + new_task_run_yaml = new_task / "run_data_managers.yaml" + run = read_and_validate_run_data_manager_yaml(new_task_run_yaml) + assert len(run.data_managers) == 1 + data_manager = run.data_managers[0] + assert data_manager.id == "twobit_builder_data_manager" + + def test_split_genomes_filter_on_data_manager(tmp_path: Path): setup_mock_idc_dir(tmp_path) split_path = tmp_path / "split" From f7ac1245e8336cec32055f1a6a107a5f00c1e7cd Mon Sep 17 00:00:00 2001 From: Nate Coraor Date: Sun, 9 Jul 2023 11:48:27 +1000 Subject: [PATCH 21/24] Always use "New" dbkeys (even with UCSC builds) in IDC scripts, and automatically fetch UCSC description if unset. 
--- .../_idc_split_data_manager_genomes.py | 40 +++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py index be5c253..0fd5103 100644 --- a/src/ephemeris/_idc_split_data_manager_genomes.py +++ b/src/ephemeris/_idc_split_data_manager_genomes.py @@ -17,7 +17,9 @@ List, Optional, ) +import xml.etree.ElementTree as ElementTree +import requests import yaml from galaxy.util import safe_makedirs from pydantic import ( @@ -41,6 +43,7 @@ IsBuildComplete = Callable[[str, str], bool] TASK_FILE_NAME = "run_data_managers.yaml" DEFAULT_TOOL_ID_MODE = "tool_shed_guid" +UCSC_DSN_URL = "http://genome.cse.ucsc.edu/cgi-bin/das/dsn" log = logging.getLogger(__name__) @@ -112,6 +115,32 @@ class Genomes(BaseModel): genomes: List[Genome] +def ucsc_description_for_build(requested_build: str) -> str: + # from galaxy/cron/parse_builds.py + url = UCSC_DSN_URL + text = requests.get(url).text + tree = ElementTree.fromstring(text) + + for dsn in tree: + build = dsn.find("SOURCE").attrib["id"] + if build != requested_build: + continue + + description = dsn.find("DESCRIPTION").text.replace(" - Genome at UCSC", "").replace(" Genome at UCSC", "") + + fields = description.split(" ") + temp = fields[0] + for i in range(len(fields) - 1): + if temp == fields[i + 1]: + fields.pop(i + 1) + else: + temp = fields[i + 1] + description = " ".join(fields) + return description + + raise Exception(f"Could not fetch UCSC description for build, a description must be provided: {requested_build}") + + def write_run_data_manager_to_file(run_data_manager: RunDataManager, path: str): parent, _ = os.path.split(path) if not os.path.exists(parent): @@ -143,14 +172,17 @@ def walk_over_incomplete_runs(split_options: SplitOptions): log.info(f"Fetching: {build_id}") fetch_tool_id = tool_id_for(fetch_indexer, data_managers, split_options.tool_id_mode) fetch_params = [] + 
fetch_params.append({"dbkey_source|dbkey_source_selector": "new"}) fetch_params.append({"dbkey_source|dbkey": genome["id"]}) + description = genome.get("description") source = genome.get("source") if source == "ucsc": + if not description: + description = ucsc_description_for_build(genome["id"]) fetch_params.append({"reference_source|reference_source_selector": "ucsc"}) fetch_params.append({"reference_source|requested_dbkey": genome["id"]}) - fetch_params.append({"sequence_name": genome["description"]}) + fetch_params.append({"sequence_name": description}) elif re.match("^[A-Z_]+[0-9.]+", source): - fetch_params.append({"dbkey_source|dbkey_source_selector": "new"}) fetch_params.append({"reference_source|reference_source_selector": "ncbi"}) fetch_params.append( {"reference_source|requested_identifier": source} @@ -158,12 +190,14 @@ def walk_over_incomplete_runs(split_options: SplitOptions): fetch_params.append({"sequence_name": genome["description"]}) fetch_params.append({"sequence.id": genome["id"]}) elif re.match("^http", source): - fetch_params.append({"dbkey_source|dbkey_source_selector": "new"}) fetch_params.append({"reference_source|reference_source_selector": "url"}) fetch_params.append({"reference_source|user_url": source}) fetch_params.append({"sequence_name": genome["description"]}) fetch_params.append({"sequence.id": genome["id"]}) + if description: + fetch_params.append({"dbkey_source|dbkey_name": description}) + fetch_run_data_manager = RunDataManager( id=fetch_tool_id, params=fetch_params, From d5b92a74b8291622489383e547dead906e275a93 Mon Sep 17 00:00:00 2001 From: Nate Coraor Date: Sun, 16 Jul 2023 13:43:38 +1000 Subject: [PATCH 22/24] Support checking what needs to be imported/published to CVMFS --- .../_idc_split_data_manager_genomes.py | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py index 0fd5103..2392c11 100644 
--- a/src/ephemeris/_idc_split_data_manager_genomes.py +++ b/src/ephemeris/_idc_split_data_manager_genomes.py @@ -271,6 +271,15 @@ def __call__(self, build_id: str, indexer_name: str) -> bool: return target_history_name in self._history_names +class CVMFSPublishIsComplete: + + def __init__(self, records: Dict[str, List[str]]): + self.records = records + + def __call__(self, build_id: str, indexer_name: str) -> bool: + return indexer_name in self.records.get(build_id, []) + + def _parser(): """returns the parser object.""" # login required to check history... @@ -278,6 +287,8 @@ def _parser(): parser.add_argument('--merged-genomes-path', '-m', default="genomes.yml") parser.add_argument('--split-genomes-path', '-s', default="data_manager_tasks") parser.add_argument('--data-managers-path', default="data_managers.yml") + parser.add_argument('--complete-check-cvmfs', default=False, action="store_true") + parser.add_argument('--cvmfs-root', default="/cvmfs/idc.galaxyproject.org") parser.add_argument("--tool-id-mode", choices=["tool_shed_guid", "short"], default=DEFAULT_TOOL_ID_MODE) @@ -294,6 +305,18 @@ def get_galaxy_history_names(args) -> List[str]: return [h["name"] for h in gi.histories.get_histories()] +def get_regular_files(dirname: str) -> List[str]: + return [f for f in os.listdir(dirname) if not f.startswith(".")] + + +def get_cvmfs_publish_records(args) -> Dict[str, List[str]]: + records = {} + records_dir = os.path.join(args.cvmfs_root, "record") + for build_id in get_regular_files(records_dir): + records[build_id] = get_regular_files(os.path.join(records_dir, build_id)) + return records + + def main(): disable_external_library_logging() parser = _parser() @@ -304,7 +327,10 @@ def main(): else: log.setLevel(logging.INFO) - is_build_complete = GalaxyHistoryIsBuildComplete(get_galaxy_history_names(args)) + if args.complete_check_cvmfs: + is_build_complete = CVMFSPublishIsComplete(get_cvmfs_publish_records(args)) + else: + is_build_complete = 
GalaxyHistoryIsBuildComplete(get_galaxy_history_names(args)) split_options = SplitOptions() split_options.data_managers_path = args.data_managers_path From a42771d2f679d9f469ba4b7cda7c2d58012ee0a4 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Wed, 31 Jan 2024 16:59:03 +0100 Subject: [PATCH 23/24] Run black --- src/ephemeris/_config_models.py | 8 +++-- src/ephemeris/_idc_data_managers_to_tools.py | 4 +-- src/ephemeris/_idc_lint.py | 8 +++-- .../_idc_split_data_manager_genomes.py | 29 +++++++------------ src/ephemeris/run_data_managers.py | 17 +++++++++-- src/ephemeris/shed_tools.py | 1 + tests/test_idc_lint.py | 1 - tests/test_split_genomes.py | 7 +++-- 8 files changed, 43 insertions(+), 32 deletions(-) diff --git a/src/ephemeris/_config_models.py b/src/ephemeris/_config_models.py index ce74804..75daa24 100644 --- a/src/ephemeris/_config_models.py +++ b/src/ephemeris/_config_models.py @@ -12,7 +12,6 @@ Extra, ) - StrOrPath = Union[Path, str] @@ -30,6 +29,7 @@ class RepositoryInstallTarget(BaseModel): class RepositoryInstallTargets(BaseModel): """ """ + api_key: Optional[str] galaxy_instance: Optional[str] tools: List[RepositoryInstallTarget] @@ -58,7 +58,9 @@ class Genome(BaseModel): version: Optional[str] # Any version information associated with the data # Description of actions (data managers) to run on target genome. 
- indexers: Optional[List[str]] # indexers to run - keyed on repository name - see data_managers.yml for how to resolve these to tools + indexers: Optional[ + List[str] + ] # indexers to run - keyed on repository name - see data_managers.yml for how to resolve these to tools skiplist: Optional[List[str]] # unimplemented: but if we implement classes of indexers, these will be ones to skip @@ -67,7 +69,7 @@ class Genomes(BaseModel): def _read_yaml(path: StrOrPath): - with open(path, "r") as f: + with open(path) as f: return yaml.safe_load(f) diff --git a/src/ephemeris/_idc_data_managers_to_tools.py b/src/ephemeris/_idc_data_managers_to_tools.py index 20f7b68..dd14ead 100644 --- a/src/ephemeris/_idc_data_managers_to_tools.py +++ b/src/ephemeris/_idc_data_managers_to_tools.py @@ -84,8 +84,8 @@ def _parser(): general_group = parser.add_argument_group("General options") add_verbosity_argument(general_group) add_log_file_argument(general_group) - parser.add_argument('--data-managers-conf', default="data_managers.yml") - parser.add_argument('--shed-install-output-conf', default="tools.yml") + parser.add_argument("--data-managers-conf", default="data_managers.yml") + parser.add_argument("--shed-install-output-conf", default="tools.yml") return parser diff --git a/src/ephemeris/_idc_lint.py b/src/ephemeris/_idc_lint.py index e8949ea..b1fecc3 100644 --- a/src/ephemeris/_idc_lint.py +++ b/src/ephemeris/_idc_lint.py @@ -10,7 +10,7 @@ def read_yaml(path: Path): - with open(path, "r") as f: + with open(path) as f: return yaml.safe_load(f) @@ -25,11 +25,13 @@ def lint_idc_directory(directory: Path): for data_manager in data_managers.values(): data_manager_tool_id = data_manager.tool_id if not data_manager_tool_id.startswith("toolshed.g2.bx.psu.edu/"): - raise Exception(f"Expected a data manager repository from main Galaxy tool shed but discovered tool ID {data_manager_tool_id}") + raise Exception( + f"Expected a data manager repository from main Galaxy tool shed but discovered 
tool ID {data_manager_tool_id}" + ) for genome in genomes.genomes: print(genome) - for indexer in (genome.indexers or []): + for indexer in genome.indexers or []: if indexer not in data_managers: raise Exception(f"Failed to find data manager {indexer} referenced for genome {genome}") diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py index 2392c11..da737b7 100644 --- a/src/ephemeris/_idc_split_data_manager_genomes.py +++ b/src/ephemeris/_idc_split_data_manager_genomes.py @@ -9,6 +9,7 @@ import logging import os import re +import xml.etree.ElementTree as ElementTree from copy import deepcopy from typing import ( Any, @@ -17,7 +18,6 @@ List, Optional, ) -import xml.etree.ElementTree as ElementTree import requests import yaml @@ -32,9 +32,7 @@ DataManager, read_data_managers_configuration, ) -from .common_parser import ( - get_common_args, -) +from .common_parser import get_common_args from .ephemeris_log import ( disable_external_library_logging, setup_global_logger, @@ -184,9 +182,7 @@ def walk_over_incomplete_runs(split_options: SplitOptions): fetch_params.append({"sequence_name": description}) elif re.match("^[A-Z_]+[0-9.]+", source): fetch_params.append({"reference_source|reference_source_selector": "ncbi"}) - fetch_params.append( - {"reference_source|requested_identifier": source} - ) + fetch_params.append({"reference_source|requested_identifier": source}) fetch_params.append({"sequence_name": genome["description"]}) fetch_params.append({"sequence.id": genome["id"]}) elif re.match("^http", source): @@ -247,7 +243,6 @@ def walk_over_incomplete_runs(split_options: SplitOptions): def split_genomes(split_options: SplitOptions) -> None: - def write_task_file(build_id: str, indexer: str, run_data_manager: RunDataManager): split_genomes_path = split_options.split_genomes_path if not os.path.exists(split_options.split_genomes_path): @@ -262,7 +257,6 @@ def write_task_file(build_id: str, indexer: str, 
run_data_manager: RunDataManage class GalaxyHistoryIsBuildComplete: - def __init__(self, history_names: List[str]): self._history_names = history_names @@ -272,7 +266,6 @@ def __call__(self, build_id: str, indexer_name: str) -> bool: class CVMFSPublishIsComplete: - def __init__(self, records: Dict[str, List[str]]): self.records = records @@ -284,18 +277,18 @@ def _parser(): """returns the parser object.""" # login required to check history... parser = get_common_args(login_required=True, log_file=True) - parser.add_argument('--merged-genomes-path', '-m', default="genomes.yml") - parser.add_argument('--split-genomes-path', '-s', default="data_manager_tasks") - parser.add_argument('--data-managers-path', default="data_managers.yml") - parser.add_argument('--complete-check-cvmfs', default=False, action="store_true") - parser.add_argument('--cvmfs-root', default="/cvmfs/idc.galaxyproject.org") + parser.add_argument("--merged-genomes-path", "-m", default="genomes.yml") + parser.add_argument("--split-genomes-path", "-s", default="data_manager_tasks") + parser.add_argument("--data-managers-path", default="data_managers.yml") + parser.add_argument("--complete-check-cvmfs", default=False, action="store_true") + parser.add_argument("--cvmfs-root", default="/cvmfs/idc.galaxyproject.org") parser.add_argument("--tool-id-mode", choices=["tool_shed_guid", "short"], default=DEFAULT_TOOL_ID_MODE) # filters - parser.add_argument('--filter-stage', default=None) - parser.add_argument('--filter-data-manager', default=None) - parser.add_argument('--filter-build-id', default=None) + parser.add_argument("--filter-stage", default=None) + parser.add_argument("--filter-data-manager", default=None) + parser.add_argument("--filter-build-id", default=None) return parser diff --git a/src/ephemeris/run_data_managers.py b/src/ephemeris/run_data_managers.py index 377249b..c109429 100644 --- a/src/ephemeris/run_data_managers.py +++ b/src/ephemeris/run_data_managers.py @@ -296,7 +296,10 @@ def 
run_jobs(jobs, skipped_jobs): all_skipped_jobs.append(skipped_job) for job in jobs: started_job = self.tool_client.run_tool( - history_id=history_id, tool_id=job["tool_id"], tool_inputs=job["inputs"], data_manager_mode=data_manager_mode + history_id=history_id, + tool_id=job["tool_id"], + tool_inputs=job["inputs"], + data_manager_mode=data_manager_mode, ) log.info( 'Dispatched job %i. Running DM: "%s" with parameters: %s' @@ -357,7 +360,9 @@ def _parser(): action="store_true", help="Do not stop running when jobs have failed.", ) - parser.add_argument("--data-manager-mode", "--data_manager_mode", choices=["bundle", "populate", "dry_run"], default="populate") + parser.add_argument( + "--data-manager-mode", "--data_manager_mode", choices=["bundle", "populate", "dry_run"], default="populate" + ) parser.add_argument("--history-name", default=None) return parser @@ -374,7 +379,13 @@ def main(argv=None): gi = get_galaxy_connection(args, file=args.config, log=log, login_required=True) config = load_yaml_file(args.config) data_managers = DataManagers(gi, config) - data_managers.run(log, args.ignore_errors, args.overwrite, data_manager_mode=args.data_manager_mode, history_name=args.history_name) + data_managers.run( + log, + args.ignore_errors, + args.overwrite, + data_manager_mode=args.data_manager_mode, + history_name=args.history_name, + ) if __name__ == "__main__": diff --git a/src/ephemeris/shed_tools.py b/src/ephemeris/shed_tools.py index e68f1b3..76f0019 100644 --- a/src/ephemeris/shed_tools.py +++ b/src/ephemeris/shed_tools.py @@ -33,6 +33,7 @@ Galaxy's configuration directory and set Galaxy configuration option `tool_config_file` to include it. 
""" + import datetime as dt import json import logging diff --git a/tests/test_idc_lint.py b/tests/test_idc_lint.py index 3db0e74..c3dbe67 100644 --- a/tests/test_idc_lint.py +++ b/tests/test_idc_lint.py @@ -5,7 +5,6 @@ from ephemeris._idc_lint import lint_idc_directory from .test_split_genomes import setup_mock_idc_dir - MISSPELLED_DATA_MANAGER_YAML_STR = """ data_manager_fetch_genome_dbkeys_all_fasta: tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_fetch_genome_dbkeys_all_fasta/data_manager_fetch_genome_all_fasta_dbkey/0.0.3' diff --git a/tests/test_split_genomes.py b/tests/test_split_genomes.py index 2ebb39f..8e186f3 100644 --- a/tests/test_split_genomes.py +++ b/tests/test_split_genomes.py @@ -59,7 +59,7 @@ def setup_mock_idc_dir(directory: Path): def read_and_validate_run_data_manager_yaml(path): - with open(path, "r") as f: + with open(path) as f: return RunDataManagers(**yaml.safe_load(f)) @@ -90,7 +90,10 @@ def test_split_genomes(tmp_path: Path): run = read_and_validate_run_data_manager_yaml(new_task_run_yaml) assert len(run.data_managers) == 1 data_manager = run.data_managers[0] - assert data_manager.id == "toolshed.g2.bx.psu.edu/repos/devteam/data_manager_twobit_builder/twobit_builder_data_manager/0.0.2" + assert ( + data_manager.id + == "toolshed.g2.bx.psu.edu/repos/devteam/data_manager_twobit_builder/twobit_builder_data_manager/0.0.2" + ) assert data_manager.items[0]["id"] == "hg19_rCRS_pUC18_phiX174" assert data_manager.items[0]["dbkey"] == "hg19_rCRS_pUC18_phiX174" From cf586a9651daa5e5b550cc234949c3e7ab04109d Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Wed, 31 Jan 2024 17:20:41 +0100 Subject: [PATCH 24/24] Fix conftest imports --- tests/test_setup_data_libraries_cli.py | 3 +-- tests/test_shed_tools_cli.py | 3 +-- tests/test_workflow_install_cli.py | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/test_setup_data_libraries_cli.py b/tests/test_setup_data_libraries_cli.py index 12f19b5..7ae6906 100644 --- 
a/tests/test_setup_data_libraries_cli.py +++ b/tests/test_setup_data_libraries_cli.py @@ -1,8 +1,7 @@ import pathlib -from conftest import GalaxyContainer - from ephemeris.setup_data_libraries import main as setup_data_libraries_cli +from .conftest import GalaxyContainer LIBRARY_DATA_EXAMPLE = pathlib.Path(__file__).parent / "library_data_example.yaml" LIBRARY_DATA_LEGACY_EXAMPLE = pathlib.Path(__file__).parent / "library_data_example_legacy.yaml" diff --git a/tests/test_shed_tools_cli.py b/tests/test_shed_tools_cli.py index 88780bf..d504140 100644 --- a/tests/test_shed_tools_cli.py +++ b/tests/test_shed_tools_cli.py @@ -1,11 +1,10 @@ import pathlib import tempfile -from conftest import GalaxyContainer - from ephemeris.generate_tool_list_from_ga_workflow_files import main as workflow_to_tools_cli from ephemeris.get_tool_list_from_galaxy import main as get_tool_list_cli from ephemeris.shed_tools import main as shed_tools_cli +from .conftest import GalaxyContainer OLD_TOOL_YAML = "{'owner':'jjohnson','name':'cdhit','revisions':['34a799d173f7'],'tool_panel_section_label':'CD_HIT'}" diff --git a/tests/test_workflow_install_cli.py b/tests/test_workflow_install_cli.py index bf12cef..d8c1db7 100644 --- a/tests/test_workflow_install_cli.py +++ b/tests/test_workflow_install_cli.py @@ -1,8 +1,7 @@ import pathlib -from conftest import GalaxyContainer - from ephemeris.workflow_install import main as workflow_install_cli +from .conftest import GalaxyContainer TEST_WORKFLOW_PATH = pathlib.Path(__file__).parent / "test_workflow.ga"