Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhancements to the IDC scripts #201

Merged
merged 24 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
fa219fd
Add data_manager_mode argument for IDC use
mvdbeek Mar 28, 2023
ac2103b
Not sure if that was necessary
mvdbeek Apr 18, 2023
0b60c00
Allow setting history name in run_data_managers.py
mvdbeek Jun 29, 2023
272d798
Pass args to get_or_create_history()
natefoo Jun 29, 2023
1f93beb
split genomes for IDC
jmchilton Jun 29, 2023
2f8e3bb
Actually return the parser
natefoo Jun 30, 2023
7f115d5
Fix _idc_split_data_manager_genomes parser
natefoo Jul 1, 2023
07e0504
Include description in fetch
natefoo Jul 1, 2023
8f53527
Add logging
natefoo Jul 1, 2023
c83ec68
Set defaults compatible with run-data-managers
natefoo Jul 1, 2023
230be45
Handle empty items and data reload fields in run_data_managers.
jmchilton Jul 1, 2023
ae3a75a
Don't serialize unset fields when splitting genomes.yml into run tasks.
jmchilton Jul 1, 2023
45fe43c
Lint fixes & fix for adding __init__ in tests.
jmchilton Jul 1, 2023
76b4519
Improved IDC linting...
jmchilton Jul 1, 2023
c0b4cd4
More shed linting...
jmchilton Jul 1, 2023
ff2d58f
Test case for data manager tools YAML generation.
jmchilton Jul 1, 2023
2ff111d
Implement stages and filtering in IDC split script.
jmchilton Jul 1, 2023
2d9e757
New defaults... EPHEMERIS_GALAXY and EPHEMERIS_API_KEY
jmchilton Jul 1, 2023
5356077
Skip broken data table checking logic if no data table reloads are fo…
jmchilton Jul 1, 2023
7b2773c
Implement --tool-id-mode=short option to get short IDs for run-data-m…
jmchilton Jul 1, 2023
f7ac124
Always use "New" dbkeys (even with UCSC builds) in IDC scripts, and
natefoo Jul 9, 2023
d5b92a7
Support checking what needs to be imported/published to CVMFS
natefoo Jul 16, 2023
a42771d
Run black
mvdbeek Jan 31, 2024
cf586a9
Fix conftest imports
mvdbeek Jan 31, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,11 @@ def get_var(var_name):
install_tool_deps=ephemeris.install_tool_deps:main
install-tool-deps=ephemeris.install_tool_deps:main
set-library-permissions=ephemeris.set_library_permissions:main
"""
_idc-lint=ephemeris._idc_lint:main
_idc-split-data-manager-genomes=ephemeris._idc_split_data_manager_genomes:main
_idc-data-managers-to-tools=ephemeris._idc_data_managers_to_tools:main
"""

PACKAGE_DATA = {
# Be sure to update MANIFEST.in for source dist.
}
Expand Down
12 changes: 11 additions & 1 deletion src/ephemeris/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import os

import yaml
from bioblend import galaxy

Expand All @@ -11,6 +13,14 @@
RAW_CONTENT_URL = f"https://raw.github.com/{PROJECT_USERAME}/{PROJECT_NAME}/master/"


def get_or_create_history(history_name: str, gi: galaxy.GalaxyInstance):
    """Return a history named ``history_name``, creating it when none exists.

    Histories are looked up by name via ``gi.histories.get_histories``. If
    the lookup yields any results, the first one is returned unchanged;
    otherwise a new history with that name is created and returned.
    """
    matching = gi.histories.get_histories(name=history_name)
    if not matching:
        # Nothing with this name yet - make one.
        return gi.histories.create_history(name=history_name)
    return matching[0]


def check_url(url, log=None):
if not url.startswith("http"):
if log:
Expand All @@ -32,7 +42,7 @@ def get_galaxy_connection(args, file=None, log=None, login_required=True):

url = args.galaxy or file_content.get("galaxy_instance")
galaxy_url = check_url(url, log)
api_key = args.api_key or file_content.get("api_key")
api_key = args.api_key or file_content.get("api_key") or os.environ.get("EPHEMERIS_API_KEY")

if args.user and args.password:
return galaxy.GalaxyInstance(url=galaxy_url, email=args.user, password=args.password)
Expand Down
85 changes: 85 additions & 0 deletions src/ephemeris/_config_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from pathlib import Path
from typing import (
Dict,
List,
Optional,
Union,
)

import yaml
from pydantic import (
BaseModel,
Extra,
)

StrOrPath = Union[Path, str]


class RepositoryInstallTarget(BaseModel):
    """One repository entry in a tool-install configuration.

    ``name`` and ``owner`` are required. Every other field is optional and
    is ``None`` when the entry does not supply it.
    """

    name: str
    owner: str
    # The ``None`` defaults are spelled out so that "optional" does not hinge
    # on the validation library treating a bare ``Optional[...]`` annotation
    # as "not required".
    tool_shed_url: Optional[str] = None
    tool_panel_section_id: Optional[str] = None
    tool_panel_section_label: Optional[str] = None
    revisions: Optional[List[str]] = None
    install_tool_dependencies: Optional[bool] = None
    install_repository_dependencies: Optional[bool] = None
    install_resolver_dependencies: Optional[bool] = None


class RepositoryInstallTargets(BaseModel):
    """Top-level model of a tool-install configuration file.

    ``tools`` is the required list of repositories to install. ``api_key``
    and ``galaxy_instance`` are optional and default to ``None``.
    """

    # Explicit ``None`` defaults: do not rely on a bare ``Optional[...]``
    # annotation being interpreted as "not required".
    api_key: Optional[str] = None
    galaxy_instance: Optional[str] = None
    tools: List[RepositoryInstallTarget]


class DataManager(BaseModel):
    """A single data manager entry: its tool ID and the tags attached to it.

    Unknown keys are rejected rather than ignored.
    """

    tags: List[str]
    tool_id: str

    class Config:
        extra = Extra.forbid


class DataManagers(BaseModel):
    """Mapping of a string key to its ``DataManager`` entry.

    The whole document is the mapping itself (custom root type), so there is
    no wrapping key around it.
    """

    __root__: Dict[str, DataManager]

    class Config:
        extra = Extra.forbid


class Genome(BaseModel):
    """One genome entry of a genomes file.

    ``id`` and ``description`` are required. All remaining fields are
    optional and are ``None`` when absent; the defaults are written out
    explicitly so optionality does not depend on how a bare
    ``Optional[...]`` annotation is interpreted.
    """

    id: str  # The unique id of the data in Galaxy
    description: str  # The description of the data, including its taxonomy, version and date
    dbkey: Optional[str] = None
    # The source of the data. Can be: 'ucsc', an NCBI accession number or a URL to a fasta file.
    source: Optional[str] = None

    # The following fields are currently purely for human consumption and unused by
    # IDC infrastructure.
    doi: Optional[str] = None  # Any DOI associated with the data
    blob: Optional[str] = None  # A blob for any other pertinent information
    checksum: Optional[str] = None  # A SHA256 checksum of the original
    version: Optional[str] = None  # Any version information associated with the data

    # Description of actions (data managers) to run on target genome.
    # indexers to run - keyed on repository name - see data_managers.yml for how to resolve these to tools
    indexers: Optional[List[str]] = None
    # unimplemented: but if we implement classes of indexers, these will be ones to skip
    skiplist: Optional[List[str]] = None


class Genomes(BaseModel):
    """Top-level model of a genomes file: a required list of ``Genome`` entries."""

    genomes: List[Genome]


def _read_yaml(path: StrOrPath):
    """Load the YAML document stored at ``path`` and return the parsed object.

    The file is opened with an explicit UTF-8 encoding instead of the
    platform's locale default, so non-ASCII content is decoded the same way
    on every system. Parsing is done with ``yaml.safe_load``.
    """
    with open(path, encoding="utf-8") as stream:
        return yaml.safe_load(stream)


def read_data_managers(path: StrOrPath) -> DataManagers:
    """Parse the YAML file at ``path`` into a ``DataManagers`` model.

    The parsed document is used as the model's root value.
    """
    parsed = _read_yaml(path)
    return DataManagers(__root__=parsed)


def read_genomes(path: StrOrPath) -> Genomes:
    """Parse the YAML file at ``path`` into a ``Genomes`` model.

    The top-level keys of the document are passed as keyword arguments to
    the model.
    """
    parsed = _read_yaml(path)
    return Genomes(**parsed)


def read_tools(path: StrOrPath) -> RepositoryInstallTargets:
    """Parse the YAML file at ``path`` into a ``RepositoryInstallTargets`` model.

    The top-level keys of the document are passed as keyword arguments to
    the model.
    """
    parsed = _read_yaml(path)
    return RepositoryInstallTargets(**parsed)
105 changes: 105 additions & 0 deletions src/ephemeris/_idc_data_managers_to_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#!/usr/bin/env python
"""Helper script for IDC - not yet meant for public consumption.

This script takes a data_managers.yml configuration describing the
set of data managers the IDC configuration targets and builds a
tools.yml file from it for use with shed_tools.
"""
import argparse
import logging
from typing import (
Dict,
List,
NamedTuple,
)

import yaml

from ._config_models import (
read_data_managers,
RepositoryInstallTargets,
)
from .common_parser import (
add_log_file_argument,
add_verbosity_argument,
)
from .ephemeris_log import (
disable_external_library_logging,
setup_global_logger,
)


class DataManager(NamedTuple):
    """Immutable record describing one configured data manager."""

    # Tool ID of the data manager tool.
    tool_id: str
    # Key the entry was listed under in the data managers configuration.
    repository_name: str
    # Tags attached to the entry (may be empty).
    tags: List[str]


def read_data_managers_configuration(path: str) -> Dict[str, DataManager]:
    """Read the data managers configuration at ``path`` into ``DataManager`` records.

    Returns a dict keyed on the same keys as the configuration file; each
    value carries the entry's tool ID, that key as ``repository_name``, and
    the entry's tags (an empty list when the tags value is falsy).
    """
    configured = read_data_managers(path).__root__
    return {
        repo_key: DataManager(
            tool_id=conf.tool_id,
            repository_name=repo_key,
            tags=conf.tags or [],
        )
        for repo_key, conf in configured.items()
    }


def build_shed_install_conf(path: str) -> dict:
    """Build a ``{"tools": [...]}`` install configuration from data managers.

    For every data manager read from ``path`` one entry is produced whose
    owner and repository name are the third and fourth ``/``-separated
    components of the tool ID. The tool shed URL is fixed to the main tool
    shed and the panel section label is left as ``None``.
    """

    def _entry_for(tool_id: str) -> dict:
        # Components at index 2 and 3 of the tool ID are taken as the
        # repository owner and repository name respectively.
        parts = tool_id.split("/")
        owner, name = parts[2], parts[3]
        return {
            "name": name,
            "owner": owner,
            "tool_panel_section_label": None,
            "tool_shed_url": "toolshed.g2.bx.psu.edu",
        }

    configured = read_data_managers_configuration(path)
    return {"tools": [_entry_for(dm.tool_id) for dm in configured.values()]}


def write_shed_install_conf(data_manager_conf_path: str, output_path: str) -> None:
    """Generate the install configuration and write it to ``output_path`` as YAML.

    The configuration is built from ``data_manager_conf_path`` and validated
    against ``RepositoryInstallTargets`` before anything is written.
    """
    install_conf = build_shed_install_conf(data_manager_conf_path)

    # The model instance is discarded; constructing it is only a sanity
    # check that the generated dict is a valid install configuration.
    RepositoryInstallTargets(**install_conf)

    with open(output_path, "w") as out:
        yaml.safe_dump(install_conf, out)


def _parser():
    """Build and return the command-line argument parser for this script."""
    parser = argparse.ArgumentParser(add_help=False)

    # Verbosity and log-file options live in their own titled group.
    general = parser.add_argument_group("General options")
    add_verbosity_argument(general)
    add_log_file_argument(general)

    # Input and output paths, each with a default filename.
    path_options = (
        ("--data-managers-conf", "data_managers.yml"),
        ("--shed-install-output-conf", "tools.yml"),
    )
    for flag, default in path_options:
        parser.add_argument(flag, default=default)
    return parser


def main():
    """Command-line entry point: parse arguments, set up logging, write the output file."""
    disable_external_library_logging()
    args = _parser().parse_args()

    log = setup_global_logger(name=__name__, log_file=args.log_file)
    # Verbose runs log at DEBUG, everything else at INFO.
    log.setLevel(logging.DEBUG if args.verbose else logging.INFO)

    write_shed_install_conf(args.data_managers_conf, args.shed_install_output_conf)


if __name__ == "__main__":
main()
44 changes: 44 additions & 0 deletions src/ephemeris/_idc_lint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import os
from pathlib import Path

import yaml

from ._config_models import (
read_data_managers,
read_genomes,
)


def read_yaml(path: Path):
    """Load the YAML document stored at ``path`` and return the parsed object.

    The file is opened with an explicit UTF-8 encoding instead of the
    platform's locale default, so non-ASCII content is decoded the same way
    on every system. Parsing is done with ``yaml.safe_load``.
    """
    with open(path, encoding="utf-8") as stream:
        return yaml.safe_load(stream)


def lint_idc_directory(directory: Path):
    """Check the IDC configuration files in ``directory`` for consistency.

    ``genomes.yml`` and ``data_managers.yml`` must both exist directly inside
    ``directory``. Every data manager tool ID must start with the main tool
    shed host, and every indexer named by a genome must be a key of the data
    managers file. Each genome is printed as it is checked.

    Raises:
        AssertionError: if either configuration file is missing.
        Exception: if a data manager is not from the main tool shed, or a
            genome references an indexer that is not a known data manager.
    """
    genomes_path = directory / "genomes.yml"
    data_managers_path = directory / "data_managers.yml"
    # Raised explicitly (rather than via ``assert``) so the check still runs
    # under ``python -O``; the exception type is kept as AssertionError for
    # compatibility with existing callers.
    for required_path in (genomes_path, data_managers_path):
        if not required_path.exists():
            raise AssertionError(f"Expected IDC configuration file {required_path} does not exist")
    data_managers = read_data_managers(data_managers_path).__root__
    genomes = read_genomes(genomes_path)

    for data_manager in data_managers.values():
        data_manager_tool_id = data_manager.tool_id
        if not data_manager_tool_id.startswith("toolshed.g2.bx.psu.edu/"):
            raise Exception(
                f"Expected a data manager repository from main Galaxy tool shed but discovered tool ID {data_manager_tool_id}"
            )

    for genome in genomes.genomes:
        print(genome)
        for indexer in genome.indexers or []:
            # Indexers are matched against the keys of the data managers file.
            if indexer not in data_managers:
                raise Exception(f"Failed to find data manager {indexer} referenced for genome {genome}")


def main():
    """Command-line entry point: lint the IDC configuration in the current directory."""
    current_directory = Path(os.curdir)
    lint_idc_directory(current_directory)


if __name__ == "__main__":
main()
Loading
Loading