diff --git a/scripts/can_bundle.py b/scripts/can_bundle.py index 7457b4a8a..8fbe67427 100644 --- a/scripts/can_bundle.py +++ b/scripts/can_bundle.py @@ -69,7 +69,7 @@ def fetch_bundle(source: str, bundle_uuid: str, bundle_version: str) -> Bundle: plugin = plugin_for(catalog) sources = set(map(str, plugin.sources)) if source in sources: - source = plugin.resolve_source(name=source) + source = plugin.resolve_source(spec=source) fqid = SourcedBundleFQID(source=source, uuid=bundle_uuid, version=bundle_version) diff --git a/scripts/recan_bundle_tdr.py b/scripts/recan_bundle_tdr.py index f4b2e55a6..6dbc3d59b 100644 --- a/scripts/recan_bundle_tdr.py +++ b/scripts/recan_bundle_tdr.py @@ -35,7 +35,7 @@ ) from azul.indexer import ( Bundle, - SimpleSourceName, + SimpleSourceSpec, SourcedBundleFQID, ) from azul.indexer.document import ( @@ -60,7 +60,7 @@ TDRSourceRef, ) from azul.terra import ( - TDRSourceName, + TDRSourceSpec, ) from azul.types import ( JSON, @@ -237,7 +237,7 @@ def __init__(self, bundle: TDRBundle, file_name: str): assert self.concrete_type.endswith('_file') self.file_manifest_entry = one(e for e in bundle.manifest if e['name'] == self.metadata['file_core']['file_name']) - assert bundle.fqid.source.name.is_snapshot + assert bundle.fqid.source.spec.is_snapshot assert self.file_manifest_entry['drs_path'] is not None def to_json_row(self) -> JSON: @@ -387,7 +387,7 @@ def main(argv): metadata = json.load(f) dss_source = DSSSourceRef(id='', - name=SimpleSourceName(prefix='', + spec=SimpleSourceSpec(prefix='', name=config.dss_endpoint)) dss_bundle = DSSBundle(fqid=SourcedBundleFQID(source=dss_source, uuid=args.bundle_uuid, @@ -396,7 +396,7 @@ def main(argv): metadata_files=metadata) tdr_source = TDRSourceRef(id=args.source_id, - name=TDRSourceName(project='test_project', + spec=TDRSourceSpec(project='test_project', name='test_name', is_snapshot=True)) tdr_bundle = dss_bundle_to_tdr(dss_bundle, tdr_source) diff --git a/scripts/register_sam.py 
b/scripts/register_sam.py index 5d046e78f..aa8df0e79 100644 --- a/scripts/register_sam.py +++ b/scripts/register_sam.py @@ -12,7 +12,7 @@ ) from azul.terra import ( TDRClient, - TDRSourceName, + TDRSourceSpec, ) log = logging.getLogger(__name__) @@ -29,7 +29,7 @@ def main(): if catalog.plugins['repository'] == 'tdr' ) for source in set(chain(*map(config.tdr_sources, tdr_catalogs))): - source = TDRSourceName.parse(source) + source = TDRSourceSpec.parse(source) api_project = tdr.lookup_source_project(source) require(api_project == source.project, 'Actual Google project of TDR source differs from configured ' diff --git a/src/azul/azulclient.py b/src/azul/azulclient.py index 0be3351cd..bc850f31a 100644 --- a/src/azul/azulclient.py +++ b/src/azul/azulclient.py @@ -103,7 +103,7 @@ def synthesize_notification(self, return { 'source': { 'id': bundle_fqid.source.id, - 'name': str(bundle_fqid.source.name), + 'spec': str(bundle_fqid.source.spec), }, 'query': self.query(catalog, prefix), 'subscription_id': 'cafebabe-feed-4bad-dead-beaf8badf00d', @@ -212,7 +212,7 @@ def list_bundles(self, ) -> List[SourcedBundleFQID]: validate_uuid_prefix(prefix) plugin = self.repository_plugin(catalog) - source = plugin.resolve_source(name=source) + source = plugin.resolve_source(spec=source) return plugin.list_bundles(source, prefix) @property @@ -292,8 +292,8 @@ def filter_obsolete_bundle_versions(cls, each bundle UUID. >>> AzulClient.filter_obsolete_bundle_versions([]) [] - >>> from azul.indexer import SimpleSourceName, SourceRef - >>> s = SourceRef(id='i', name=SimpleSourceName(prefix='42', name='n')) + >>> from azul.indexer import SimpleSourceSpec, SourceRef + >>> s = SourceRef(id='i', spec=SimpleSourceSpec(prefix='42', name='n')) >>> def b(u, v): ... return SourcedBundleFQID(source=s, uuid=u, version=v) >>> AzulClient.filter_obsolete_bundle_versions([ @@ -303,32 +303,32 @@ def filter_obsolete_bundle_versions(cls, ... 
]) # doctest: +NORMALIZE_WHITESPACE [SourcedBundleFQID(uuid='c', version='0', - source=SourceRef(id='i', name=SimpleSourceName(prefix='42', name='n'))), + source=SourceRef(id='i', spec=SimpleSourceSpec(prefix='42', name='n'))), SourcedBundleFQID(uuid='b', version='3', - source=SourceRef(id='i', name=SimpleSourceName(prefix='42', name='n'))), + source=SourceRef(id='i', spec=SimpleSourceSpec(prefix='42', name='n'))), SourcedBundleFQID(uuid='a', version='1', - source=SourceRef(id='i', name=SimpleSourceName(prefix='42', name='n')))] + source=SourceRef(id='i', spec=SimpleSourceSpec(prefix='42', name='n')))] >>> AzulClient.filter_obsolete_bundle_versions([ ... b('C', '0'), b('a', '1'), b('a', '0'), ... b('a', '2'), b('b', '1'), b('c', '2') ... ]) # doctest: +NORMALIZE_WHITESPACE [SourcedBundleFQID(uuid='c', version='2', - source=SourceRef(id='i', name=SimpleSourceName(prefix='42', name='n'))), + source=SourceRef(id='i', spec=SimpleSourceSpec(prefix='42', name='n'))), SourcedBundleFQID(uuid='b', version='1', - source=SourceRef(id='i', name=SimpleSourceName(prefix='42', name='n'))), + source=SourceRef(id='i', spec=SimpleSourceSpec(prefix='42', name='n'))), SourcedBundleFQID(uuid='a', version='2', - source=SourceRef(id='i', name=SimpleSourceName(prefix='42', name='n')))] + source=SourceRef(id='i', spec=SimpleSourceSpec(prefix='42', name='n')))] >>> AzulClient.filter_obsolete_bundle_versions([ ... b('a', '0'), b('A', '1') ... ]) # doctest: +NORMALIZE_WHITESPACE [SourcedBundleFQID(uuid='A', version='1', - source=SourceRef(id='i', name=SimpleSourceName(prefix='42', name='n')))] + source=SourceRef(id='i', spec=SimpleSourceSpec(prefix='42', name='n')))] """ # Sort lexicographically by source and FQID. 
diff --git a/src/azul/indexer/__init__.py b/src/azul/indexer/__init__.py index a3eee69a4..543110d2b 100644 --- a/src/azul/indexer/__init__.py +++ b/src/azul/indexer/__init__.py @@ -21,6 +21,7 @@ import attr from azul import ( + reject, require, ) from azul.types import ( @@ -31,6 +32,10 @@ # FIXME: Remove hacky import of SupportsLessThan # https://github.com/DataBiosphere/azul/issues/2783 +from azul.uuids import ( + validate_uuid_prefix, +) + if TYPE_CHECKING: from _typeshed import ( SupportsLessThan, @@ -50,25 +55,27 @@ class BundleFQID(SupportsLessThan): version: BundleVersion -SOURCE_NAME = TypeVar('SOURCE_NAME', bound='SourceName') +SOURCE_SPEC = TypeVar('SOURCE_SPEC', bound='SourceSpec') -# FIXME: Rename to SourceSpec/SOURCE_SPEC, and all .name to .spec -# https://github.com/DataBiosphere/azul/issues/2843 @attr.s(frozen=True, auto_attribs=True, kw_only=True) -class SourceName(ABC, Generic[SOURCE_NAME]): +class SourceSpec(ABC, Generic[SOURCE_SPEC]): """ The name of a repository source containing bundles to index. A repository has at least one source. Repository plugins whose repository source names are structured might want to implement this abstract class. Plugins that - have simple unstructured names may want to use :class:`StringSourceName`. + have simple unstructured names may want to use :class:`SimpleSourceSpec`. """ - prefix: Optional[str] = '' + prefix: str = '' + + def __attrs_post_init__(self): + validate_uuid_prefix(self.prefix) + assert ':' not in self.prefix, self.prefix @classmethod @abstractmethod - def parse(cls, name: str) -> SOURCE_NAME: + def parse(cls, spec: str) -> SOURCE_SPEC: raise NotImplementedError @abstractmethod @@ -77,25 +84,54 @@ def __str__(self) -> str: @attr.s(frozen=True, auto_attribs=True, kw_only=True) -class SimpleSourceName(SourceName['SimpleSourceName']): +class SimpleSourceSpec(SourceSpec['SimpleSourceSpec']): """ Default implementation for unstructured source names.
""" name: str @classmethod - def parse(cls, name: str) -> 'SimpleSourceName': - return cls(name=name) + def parse(cls, spec: str) -> 'SimpleSourceSpec': + """ + >>> SimpleSourceSpec.parse('https://foo.edu:12') + SimpleSourceSpec(prefix='12', name='https://foo.edu') + + >>> SimpleSourceSpec.parse('foo') + Traceback (most recent call last): + ... + azul.RequirementError: Source specifications must end in a colon followed by an optional UUID prefix + + >>> SimpleSourceSpec.parse('foo:8F53') + Traceback (most recent call last): + ... + azul.uuids.InvalidUUIDPrefixError: '8F53' is not a valid UUID prefix. + + >>> SimpleSourceSpec.parse('https://foo.edu') + Traceback (most recent call last): + ... + azul.uuids.InvalidUUIDPrefixError: '//foo.edu' is not a valid UUID prefix. + """ + + # FIXME: Move parsing of prefix to SourceSpec + # https://github.com/DataBiosphere/azul/issues/3073 + name, sep, prefix = spec.rpartition(':') + reject(sep == '', + 'Source specifications must end in a colon followed by an optional UUID prefix') + return cls(prefix=prefix, name=name) def __str__(self) -> str: - return self.name + """ + >>> str(SimpleSourceSpec(prefix='12', name='foo:bar/baz')) + 'foo:bar/baz:12' + """ + return f'{self.name}:{self.prefix}' SOURCE_REF = TypeVar('SOURCE_REF', bound='SourceRef') @attr.s(auto_attribs=True, frozen=True, kw_only=True) -class SourceRef(Generic[SOURCE_NAME, SOURCE_REF]): +class SourceRef(Generic[SOURCE_SPEC, SOURCE_REF]): """ A reference to a repository source containing bundles to index. A repository has at least one source. A source is primarily referenced by its ID but we @@ -112,40 +148,40 @@ class SourceRef(Generic[SOURCE_NAME, SOURCE_REF]): body is empty. 
""" id: str - name: SOURCE_NAME + spec: SOURCE_SPEC _lookup: ClassVar[Dict[Tuple[Type['SourceRef'], str], 'SourceRef']] = {} _lookup_lock = RLock() - def __new__(cls: Type[SOURCE_REF], *, id: str, name: SOURCE_NAME) -> SOURCE_REF: + def __new__(cls: Type[SOURCE_REF], *, id: str, spec: SOURCE_SPEC) -> SOURCE_REF: """ Interns instances by their ID and ensures that names are unambiguous for any given ID. Two different sources may still use the same name. >>> class S(SourceRef): pass - >>> a, b = SimpleSourceName.parse('a'), SimpleSourceName.parse('b') + >>> a, b = SimpleSourceSpec.parse('a:'), SimpleSourceSpec.parse('b:') - >>> S(id='1', name=a) is S(id='1', name=a) + >>> S(id='1', spec=a) is S(id='1', spec=a) True - >>> S(id='1', name=a) is S(id='2', name=a) + >>> S(id='1', spec=a) is S(id='2', spec=a) False - >>> S(id='1', name=b) # doctest: +NORMALIZE_WHITESPACE + >>> S(id='1', spec=b) # doctest: +NORMALIZE_WHITESPACE Traceback (most recent call last): ... - azul.RequirementError: ('Ambiguous source names for same ID.', - SimpleSourceName(prefix='', name='a'), - SimpleSourceName(prefix='', name='b'), + azul.RequirementError: ('Ambiguous source specs for same ID.', + SimpleSourceSpec(prefix='', name='a'), + SimpleSourceSpec(prefix='', name='b'), '1') Interning is done per class: >>> class T(S): pass - >>> T(id='1', name=a) is S(id='1', name=a) + >>> T(id='1', spec=a) is S(id='1', spec=a) False - >>> T(id='1', name=a) == S(id='1', name=a) + >>> T(id='1', spec=a) == S(id='1', spec=a) False """ with cls._lookup_lock: @@ -155,16 +191,16 @@ def __new__(cls: Type[SOURCE_REF], *, id: str, name: SOURCE_NAME) -> SOURCE_REF: except KeyError: self = super().__new__(cls) # noinspection PyArgumentList - self.__init__(id=id, name=name) + self.__init__(id=id, spec=spec) lookup[cls, id] = self else: assert self.id == id - require(self.name == name, - 'Ambiguous source names for same ID.', self.name, name, id) + require(self.spec == spec, + 'Ambiguous source specs for same ID.', 
self.spec, spec, id) return self def to_json(self): - return dict(id=self.id, name=str(self.name)) + return dict(id=self.id, spec=str(self.spec)) @attr.s(auto_attribs=True, frozen=True, kw_only=True, order=True) diff --git a/src/azul/indexer/document.py b/src/azul/indexer/document.py index 6799c92de..9979963cf 100644 --- a/src/azul/indexer/document.py +++ b/src/azul/indexer/document.py @@ -37,7 +37,7 @@ ) from azul.indexer import ( BundleFQID, - SimpleSourceName, + SimpleSourceSpec, SourceRef, ) from azul.types import ( @@ -596,11 +596,11 @@ def delete(self): return False -class DocumentSource(SourceRef[SimpleSourceName, SourceRef]): +class DocumentSource(SourceRef[SimpleSourceSpec, SourceRef]): @classmethod def from_json(cls, source: JSON) -> 'DocumentSource': - return cls(id=source['id'], name=SimpleSourceName(name=source['name'])) + return cls(id=source['id'], spec=SimpleSourceSpec.parse(source['spec'])) @dataclass @@ -664,7 +664,7 @@ class Aggregate(Document[AggregateCoordinates]): def __init__(self, coordinates: AggregateCoordinates, version: Optional[int], - sources: Set[SourceRef[SimpleSourceName, SourceRef]], + sources: Set[SourceRef[SimpleSourceSpec, SourceRef]], contents: Optional[JSON], bundles: Optional[List[JSON]], num_contributions: int) -> None: ... 
@@ -683,7 +683,7 @@ def field_types(cls, field_types: FieldTypes) -> FieldTypes: 'num_contributions': pass_thru_int, 'sources': { 'id': pass_thru_str, - 'name': pass_thru_str + 'spec': pass_thru_str }, 'bundles': { 'uuid': pass_thru_str, diff --git a/src/azul/indexer/index_controller.py b/src/azul/indexer/index_controller.py index 090358ba0..39622e848 100644 --- a/src/azul/indexer/index_controller.py +++ b/src/azul/indexer/index_controller.py @@ -176,7 +176,7 @@ def transform(self, catalog: CatalogName, notification: JSON, delete: bool) -> L """ match, source = notification['match'], notification['source'] plugin = self.repository_plugin(catalog) - source = plugin.resolve_source(name=source['name'], id=source['id']) + source = plugin.resolve_source(spec=source['spec'], id=source['id']) bundle_fqid = SourcedBundleFQID(source=source, uuid=match['bundle_uuid'], version=match['bundle_version']) diff --git a/src/azul/plugins/__init__.py b/src/azul/plugins/__init__.py index 51aa0cc87..cdadbeb4b 100644 --- a/src/azul/plugins/__init__.py +++ b/src/azul/plugins/__init__.py @@ -40,8 +40,8 @@ ) from azul.indexer import ( Bundle, - SOURCE_NAME, SOURCE_REF, + SOURCE_SPEC, SourcedBundleFQID, ) from azul.indexer.document import ( @@ -167,7 +167,7 @@ def aggregate_class(self) -> Type[Aggregate]: return Aggregate -class RepositoryPlugin(Generic[SOURCE_NAME, SOURCE_REF], Plugin): +class RepositoryPlugin(Generic[SOURCE_SPEC, SOURCE_REF], Plugin): @classmethod def type_name(cls) -> str: @@ -183,32 +183,33 @@ def create(cls, catalog: CatalogName) -> 'RepositoryPlugin': @property @abstractmethod - def sources(self) -> AbstractSet[SOURCE_NAME]: + def sources(self) -> AbstractSet[SOURCE_SPEC]: """ The names of the sources the plugin is configured to read metadata from. 
""" raise NotImplementedError - def resolve_source(self, *, name: str, id: Optional[str] = None) -> SOURCE_REF: + def resolve_source(self, *, spec: str, id: Optional[str] = None) -> SOURCE_REF: """ - Return an instance of :class:`SourceRef` for the repository source with - the specified name or raise an exception if no such source exists. If an - ID is given, ensure that it refers to the same source as the name. + Return an instance of :class:`SourceRef` for the repository source + matching the given specification or raise an exception if no such source + exists. If an ID is given, ensure that the source matching the + specification has the given ID. """ cls = type(self) base_cls = one(getattr(cls, '__orig_bases__')) - source_name_cls, source_ref_cls = get_args(base_cls) - name = source_name_cls.parse(name) - actual_id = self.lookup_source_id(name) + source_spec_cls, source_ref_cls = get_args(base_cls) + spec = source_spec_cls.parse(spec) + actual_id = self.lookup_source_id(spec) if id is None: id = actual_id else: require(id == actual_id, - 'Source ID changed unexpectedly', name, id, actual_id) - return source_ref_cls(id=id, name=name) + 'Source ID changed unexpectedly', spec, id, actual_id) + return source_ref_cls(id=id, spec=spec) @abstractmethod - def lookup_source_id(self, name: SOURCE_NAME) -> str: + def lookup_source_id(self, spec: SOURCE_SPEC) -> str: """ Return the ID of the repository source with the specified name or raise an exception if no such source exists. 
diff --git a/src/azul/plugins/metadata/hca/__init__.py b/src/azul/plugins/metadata/hca/__init__.py index 150243249..889513caa 100644 --- a/src/azul/plugins/metadata/hca/__init__.py +++ b/src/azul/plugins/metadata/hca/__init__.py @@ -188,7 +188,7 @@ def service_config(self) -> ServiceConfig: "entryId": "entity_id", "sourceId": "sources.id", - "sourceName": "sources.name", + "sourceSpec": "sources.spec", }, autocomplete_translation={ "files": { @@ -202,7 +202,7 @@ def service_config(self) -> ServiceConfig: manifest={ "sources": { "source_id": "id", - "source_name": "name", + "source_spec": "spec", }, "bundles": { "bundle_uuid": "uuid", diff --git a/src/azul/plugins/repository/canned/__init__.py b/src/azul/plugins/repository/canned/__init__.py index 198c3b9e4..99ecb38b6 100644 --- a/src/azul/plugins/repository/canned/__init__.py +++ b/src/azul/plugins/repository/canned/__init__.py @@ -40,7 +40,7 @@ ) from azul.indexer import ( Bundle, - SimpleSourceName, + SimpleSourceSpec, SourceRef, SourcedBundleFQID, ) @@ -60,7 +60,7 @@ log = logging.getLogger(__name__) -class CannedSourceRef(SourceRef[SimpleSourceName, 'CannedSourceRef']): +class CannedSourceRef(SourceRef[SimpleSourceSpec, 'CannedSourceRef']): pass @@ -68,27 +68,27 @@ class CannedSourceRef(SourceRef[SimpleSourceName, 'CannedSourceRef']): @dataclass(frozen=True) -class Plugin(RepositoryPlugin[SimpleSourceName, CannedSourceRef]): - _sources: AbstractSet[SimpleSourceName] +class Plugin(RepositoryPlugin[SimpleSourceSpec, CannedSourceRef]): + _sources: AbstractSet[SimpleSourceSpec] @classmethod def create(cls, catalog: CatalogName) -> RepositoryPlugin: return cls( frozenset( - SimpleSourceName.parse(name) + SimpleSourceSpec.parse(name) for name in config.canned_sources(catalog) ) ) @property - def sources(self) -> AbstractSet[SimpleSourceName]: + def sources(self) -> AbstractSet[SimpleSourceSpec]: return self._sources - def lookup_source_id(self, name: SimpleSourceName) -> str: + def lookup_source_id(self, name: 
SimpleSourceSpec) -> str: return name @lru_cache - def staging_area(self, source_name: SimpleSourceName) -> StagingArea: + def staging_area(self, source_name: SimpleSourceSpec) -> StagingArea: factory = GitHubStagingAreaFactory.from_url(source_name) return factory.load_staging_area() diff --git a/src/azul/plugins/repository/dss/__init__.py b/src/azul/plugins/repository/dss/__init__.py index de46ab5da..640285f29 100644 --- a/src/azul/plugins/repository/dss/__init__.py +++ b/src/azul/plugins/repository/dss/__init__.py @@ -48,7 +48,7 @@ ) from azul.indexer import ( Bundle, - SimpleSourceName, + SimpleSourceSpec, SourceRef, SourcedBundleFQID, ) @@ -68,7 +68,7 @@ log = logging.getLogger(__name__) -class DSSSourceRef(SourceRef[SimpleSourceName, 'DSSSourceRef']): +class DSSSourceRef(SourceRef[SimpleSourceSpec, 'DSSSourceRef']): """ Subclass of `Source` to create new namespace for source IDs. """ @@ -78,19 +78,20 @@ class DSSSourceRef(SourceRef[SimpleSourceName, 'DSSSourceRef']): def for_dss_endpoint(cls, endpoint: str): # We hash the endpoint instead of using it verbatim to distinguish them # within a document, which is helpful for testing. 
- return cls(id=cls.id_from_name(endpoint), - name=SimpleSourceName(prefix=config.dss_query_prefix, - name=endpoint)) + spec = SimpleSourceSpec(prefix=config.dss_query_prefix, + name=endpoint) + return cls(id=cls.id_from_spec(spec), + spec=spec) @classmethod - def id_from_name(cls, name: str) -> str: - return str(uuid5(cls.namespace, name)) + def id_from_spec(cls, spec: SimpleSourceSpec) -> str: + return str(uuid5(cls.namespace, spec.name)) DSSBundleFQID = SourcedBundleFQID[DSSSourceRef] -class Plugin(RepositoryPlugin[DSSSourceRef, SimpleSourceName]): +class Plugin(RepositoryPlugin[DSSSourceRef, SimpleSourceSpec]): @classmethod def create(cls, catalog: CatalogName) -> RepositoryPlugin: @@ -100,8 +101,8 @@ def create(cls, catalog: CatalogName) -> RepositoryPlugin: def sources(self) -> AbstractSet[str]: return {config.dss_endpoint} - def lookup_source_id(self, name: SimpleSourceName) -> str: - return DSSSourceRef.id_from_name(name.name) + def lookup_source_id(self, spec: SimpleSourceSpec) -> str: + return DSSSourceRef.id_from_spec(spec) @cached_property def dss_client(self): @@ -112,7 +113,7 @@ def _assert_source(self, source): def list_bundles(self, source: DSSSourceRef, prefix: str) -> List[DSSBundleFQID]: self._assert_source(source) - prefix = source.name.prefix + prefix + prefix = source.spec.prefix + prefix validate_uuid_prefix(prefix) log.info('Listing bundles with prefix %r in source %r.', prefix, source) bundle_fqids = [] diff --git a/src/azul/plugins/repository/tdr/__init__.py b/src/azul/plugins/repository/tdr/__init__.py index a53ec5481..04d9227ca 100644 --- a/src/azul/plugins/repository/tdr/__init__.py +++ b/src/azul/plugins/repository/tdr/__init__.py @@ -75,7 +75,7 @@ ) from azul.terra import ( TDRClient, - TDRSourceName, + TDRSourceSpec, TerraDRSClient, ) from azul.types import ( @@ -154,7 +154,7 @@ def dangling_inputs(self) -> Entities: } -class TDRSourceRef(SourceRef[TDRSourceName, 'TDRSourceRef']): +class TDRSourceRef(SourceRef[TDRSourceSpec, 
'TDRSourceRef']): pass @@ -162,18 +162,18 @@ class TDRSourceRef(SourceRef[TDRSourceName, 'TDRSourceRef']): @attr.s(kw_only=True, auto_attribs=True, frozen=True) -class Plugin(RepositoryPlugin[TDRSourceName, TDRSourceRef]): - _sources: AbstractSet[TDRSourceName] +class Plugin(RepositoryPlugin[TDRSourceSpec, TDRSourceRef]): + _sources: AbstractSet[TDRSourceSpec] @classmethod def create(cls, catalog: CatalogName) -> 'RepositoryPlugin': return cls(sources=frozenset( - TDRSourceName.parse(name) - for name in config.tdr_sources(catalog)) + TDRSourceSpec.parse(spec) + for spec in config.tdr_sources(catalog)) ) @property - def sources(self) -> AbstractSet[TDRSourceName]: + def sources(self) -> AbstractSet[TDRSourceSpec]: return self._sources @property @@ -198,10 +198,10 @@ def _tdr(cls): return TDRClient() def _assert_source(self, source: TDRSourceRef): - assert source.name in self.sources, (source, self.sources) + assert source.spec in self.sources, (source, self.sources) - def lookup_source_id(self, name: TDRSourceName) -> str: - return self.tdr.lookup_source_id(name) + def lookup_source_id(self, spec: TDRSourceSpec) -> str: + return self.tdr.lookup_source_id(spec) def list_bundles(self, source: TDRSourceRef, prefix: str) -> List[TDRBundleFQID]: self._assert_source(source) @@ -247,16 +247,16 @@ def format_version(cls, version: datetime.datetime) -> str: def _run_sql(self, query): return self.tdr.run_sql(query) - def _full_table_name(self, source: TDRSourceName, table_name: str) -> str: + def _full_table_name(self, source: TDRSourceSpec, table_name: str) -> str: return source.qualify_table(table_name) def _list_links_ids(self, source: TDRSourceRef, prefix: str) -> List[TDRBundleFQID]: validate_uuid_prefix(prefix) - current_bundles = self._query_latest_version(source.name, f''' + current_bundles = self._query_latest_version(source.spec, f''' SELECT links_id, version - FROM {self._full_table_name(source.name, 'links')} - WHERE STARTS_WITH(links_id, '{source.name.prefix + 
prefix}') + FROM {self._full_table_name(source.spec, 'links')} + WHERE STARTS_WITH(links_id, '{source.spec.prefix + prefix}') ''', group_by='links_id') return [ SourcedBundleFQID(source=source, @@ -265,13 +265,13 @@ def _list_links_ids(self, source: TDRSourceRef, prefix: str) -> List[TDRBundleFQ for row in current_bundles ] - def _query_latest_version(self, source: TDRSourceName, query: str, group_by: str) -> List[BigQueryRow]: + def _query_latest_version(self, source: TDRSourceSpec, query: str, group_by: str) -> List[BigQueryRow]: iter_rows = self._run_sql(query) key = itemgetter(group_by) groups = groupby(sorted(iter_rows, key=key), key=key) return [self._choose_one_version(source, group) for _, group in groups] - def _choose_one_version(self, source: TDRSourceName, versioned_items: BigQueryRows) -> BigQueryRow: + def _choose_one_version(self, source: TDRSourceSpec, versioned_items: BigQueryRows) -> BigQueryRow: if source.is_snapshot: return one(versioned_items) else: @@ -290,7 +290,7 @@ def _emulate_bundle(self, bundle_fqid: SourcedBundleFQID) -> Bundle: with ThreadPoolExecutor(max_workers=config.num_tdr_workers) as executor: futures = { entity_type: executor.submit(self._retrieve_entities, - bundle.fqid.source.name, + bundle.fqid.source.spec, entity_type, entity_ids) for entity_type, entity_ids in entities.items() @@ -368,7 +368,7 @@ def _retrieve_links(self, links_id: SourcedBundleFQID) -> JSON: links_columns = ', '.join( TDRBundle.metadata_columns | {'project_id', 'links_id'} ) - source = links_id.source.name + source = links_id.source.spec links = one(self._run_sql(f''' SELECT {links_columns} FROM {self._full_table_name(source, 'links')} @@ -380,7 +380,7 @@ def _retrieve_links(self, links_id: SourcedBundleFQID) -> JSON: return links def _retrieve_entities(self, - source: TDRSourceName, + source: TDRSourceSpec, entity_type: EntityType, entity_ids: Set[EntityID] ) -> BigQueryRows: @@ -416,7 +416,7 @@ def _find_upstream_bundles(self, output_id = 
'JSON_EXTRACT_SCALAR(link_output, "$.output_id")' rows = self._run_sql(f''' SELECT links_id, version, {output_id} AS output_id - FROM {self._full_table_name(source.name, 'links')} AS links + FROM {self._full_table_name(source.spec, 'links')} AS links JOIN UNNEST(JSON_EXTRACT_ARRAY(links.content, '$.links')) AS content_links ON JSON_EXTRACT_SCALAR(content_links, '$.link_type') = 'process_link' JOIN UNNEST(JSON_EXTRACT_ARRAY(content_links, '$.outputs')) AS link_output @@ -642,7 +642,7 @@ def _parse_file_id_column(self, file_id: Optional[str]) -> Optional[str]: # The file_id column is present for datasets, but is usually null, may # contain unexpected/unusable values, and NEVER produces usable DRS URLs, # so we avoid parsing the column altogether for datasets. - if self.fqid.source.name.is_snapshot: + if self.fqid.source.spec.is_snapshot: reject(file_id is None) # TDR stores the complete DRS URI in the file_id column, but we only # index the path component. These requirements prevent mismatches in diff --git a/src/azul/service/hca_response_v5.py b/src/azul/service/hca_response_v5.py index 9b10dd095..79c49f032 100644 --- a/src/azul/service/hca_response_v5.py +++ b/src/azul/service/hca_response_v5.py @@ -265,7 +265,7 @@ def make_bundles(self, entry): def make_sources(self, entry): return [ - {'sourceId': s['id'], 'sourceName': s['name']} + {'sourceId': s['id'], 'sourceSpec': s['spec']} for s in entry['sources'] ] diff --git a/src/azul/terra.py b/src/azul/terra.py index d8a70a1f2..fc13d450d 100644 --- a/src/azul/terra.py +++ b/src/azul/terra.py @@ -54,7 +54,7 @@ http_client, ) from azul.indexer import ( - SourceName, + SourceSpec, ) from azul.strings import ( trunc_ellipses, @@ -70,7 +70,7 @@ @attr.s(frozen=True, auto_attribs=True, kw_only=True) -class TDRSourceName(SourceName): +class TDRSourceSpec(SourceSpec): project: str name: str is_snapshot: bool @@ -80,44 +80,46 @@ class TDRSourceName(SourceName): _type_snapshot = 'snapshot' @classmethod - def parse(cls, source: 
str) -> 'TDRSourceName': + def parse(cls, spec: str) -> 'TDRSourceSpec': """ Construct an instance from its string representation, using the syntax 'tdr:{project}:{type}/{name}:{prefix}'. - >>> s = TDRSourceName.parse('tdr:foo:snapshot/bar:') + >>> s = TDRSourceSpec.parse('tdr:foo:snapshot/bar:') >>> s - TDRSourceName(prefix='', project='foo', name='bar', is_snapshot=True) + TDRSourceSpec(prefix='', project='foo', name='bar', is_snapshot=True) >>> s.bq_name 'bar' >>> str(s) 'tdr:foo:snapshot/bar:' - >>> d = TDRSourceName.parse('tdr:foo:dataset/bar:42') + >>> d = TDRSourceSpec.parse('tdr:foo:dataset/bar:42') >>> d - TDRSourceName(prefix='42', project='foo', name='bar', is_snapshot=False) + TDRSourceSpec(prefix='42', project='foo', name='bar', is_snapshot=False) >>> d.bq_name 'datarepo_bar' >>> str(d) 'tdr:foo:dataset/bar:42' - >>> TDRSourceName.parse('baz:foo:dataset/bar:') + >>> TDRSourceSpec.parse('baz:foo:dataset/bar:') Traceback (most recent call last): ... AssertionError: baz - >>> TDRSourceName.parse('tdr:foo:baz/bar:42') + >>> TDRSourceSpec.parse('tdr:foo:baz/bar:42') Traceback (most recent call last): ... AssertionError: baz - >>> TDRSourceName.parse('tdr:foo:snapshot/bar:n32') + >>> TDRSourceSpec.parse('tdr:foo:snapshot/bar:n32') Traceback (most recent call last): ... azul.uuids.InvalidUUIDPrefixError: 'n32' is not a valid UUID prefix. 
""" # BigQuery (and by extension the TDR) does not allow : or / in dataset names - service, project, name, prefix = source.split(':') + # FIXME: Move parsing of prefix to SourceSpec + # https://github.com/DataBiosphere/azul/issues/3073 + service, project, name, prefix = spec.split(':') type, name = name.split('/') assert service == 'tdr', service if type == cls._type_snapshot: @@ -128,7 +130,7 @@ def parse(cls, source: str) -> 'TDRSourceName': assert False, type validate_uuid_prefix(prefix) self = cls(prefix=prefix, project=project, name=name, is_snapshot=is_snapshot) - assert source == str(self), (source, self) + assert spec == str(self), (spec, self) return self @property @@ -236,7 +238,7 @@ class TDRClient(SAMClient): """ @cache - def lookup_source_project(self, source: TDRSourceName) -> str: + def lookup_source_project(self, source: TDRSourceSpec) -> str: """ Return the name of the Google Cloud project containing the source (snapshot or dataset) with the specified name. @@ -244,21 +246,21 @@ def lookup_source_project(self, source: TDRSourceName) -> str: return self._lookup_source(source)['dataProject'] @cache - def lookup_source_id(self, source: TDRSourceName) -> str: + def lookup_source_id(self, source: TDRSourceSpec) -> str: """ Return the primary identifier of the source (snapshot or dataset) with the specified name. """ return self._lookup_source(source)['id'] - def check_api_access(self, source: TDRSourceName) -> None: + def check_api_access(self, source: TDRSourceSpec) -> None: """ Verify that the client is authorized to read from the TDR service API. 
""" self._lookup_source(source) log.info('TDR client is authorized for API access to %s.', source) - def _lookup_source(self, source: TDRSourceName) -> JSON: + def _lookup_source(self, source: TDRSourceSpec) -> JSON: resource = f'{source.type_name} {source.name!r} via the TDR API' tdr_path = source.type_name + 's' endpoint = self._repository_endpoint(tdr_path) @@ -283,7 +285,7 @@ def _lookup_source(self, source: TDRSourceName) -> JSON: else: raise RequirementError('Unexpected response from TDR API', response.status) - def check_bigquery_access(self, source: TDRSourceName): + def check_bigquery_access(self, source: TDRSourceSpec): """ Verify that the client is authorized to read from TDR BigQuery tables. """ diff --git a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T113344.698028Z.results.json b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T113344.698028Z.results.json index 66750d912..d6c5859ed 100644 --- a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T113344.698028Z.results.json +++ b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T113344.698028Z.results.json @@ -8,7 +8,7 @@ "entity_id": "aaa96233-bf27-44c7-82df-b4dc15ad4d9d", "source": { "id": "4b737739-4dc9-5d4b-9989-a4942047c91c", - "name": "test" + "spec": "test:" }, "contents": { "samples": [ @@ -465,7 +465,7 @@ "entity_id": "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", "source": { "id": "4b737739-4dc9-5d4b-9989-a4942047c91c", - "name": "test" + "spec": "test:" }, "bundle_uuid": "aaa96233-bf27-44c7-82df-b4dc15ad4d9d", "bundle_version": "2018-11-02T113344.698028Z", @@ -699,7 +699,7 @@ "entity_id": "70d1af4a-82c8-478a-8960-e9028b3616ca", "source": { "id": "4b737739-4dc9-5d4b-9989-a4942047c91c", - "name": "test" + "spec": "test:" }, "bundle_uuid": "aaa96233-bf27-44c7-82df-b4dc15ad4d9d", "bundle_version": "2018-11-02T113344.698028Z", @@ -933,7 +933,7 @@ "entity_id": "a21dc760-a500-4236-bcff-da34a0e873d2", "source": { "id": 
"4b737739-4dc9-5d4b-9989-a4942047c91c", - "name": "test" + "spec": "test:" }, "bundle_uuid": "aaa96233-bf27-44c7-82df-b4dc15ad4d9d", "bundle_version": "2018-11-02T113344.698028Z", @@ -1192,7 +1192,7 @@ "entity_id": "e8642221-4c2c-4fd7-b926-a68bce363c88", "source": { "id": "4b737739-4dc9-5d4b-9989-a4942047c91c", - "name": "test" + "spec": "test:" }, "bundle_uuid": "aaa96233-bf27-44c7-82df-b4dc15ad4d9d", "bundle_version": "2018-11-02T113344.698028Z", @@ -1454,7 +1454,7 @@ "sources": [ { "id": "4b737739-4dc9-5d4b-9989-a4942047c91c", - "name": "test" + "spec": "test:" } ], "contents": { @@ -1970,7 +1970,7 @@ "sources": [ { "id": "4b737739-4dc9-5d4b-9989-a4942047c91c", - "name": "test" + "spec": "test:" } ], "contents": { @@ -2263,7 +2263,7 @@ "sources": [ { "id": "4b737739-4dc9-5d4b-9989-a4942047c91c", - "name": "test" + "spec": "test:" } ], "contents": { @@ -2556,7 +2556,7 @@ "sources": [ { "id": "4b737739-4dc9-5d4b-9989-a4942047c91c", - "name": "test" + "spec": "test:" } ], "contents": { @@ -2809,7 +2809,7 @@ "sources": [ { "id": "4b737739-4dc9-5d4b-9989-a4942047c91c", - "name": "test" + "spec": "test:" } ], "contents": { @@ -3123,7 +3123,7 @@ "entity_id": "412898c5-5b9b-4907-b07c-e9b89666e204", "source": { "id": "4b737739-4dc9-5d4b-9989-a4942047c91c", - "name": "test" + "spec": "test:" }, "contents": { "samples": [ @@ -3383,7 +3383,7 @@ "sources": [ { "id": "4b737739-4dc9-5d4b-9989-a4942047c91c", - "name": "test" + "spec": "test:" } ], "contents": { diff --git a/test/indexer/test_indexer_controller.py b/test/indexer/test_indexer_controller.py index b4096c928..1764ceff3 100644 --- a/test/indexer/test_indexer_controller.py +++ b/test/indexer/test_indexer_controller.py @@ -156,7 +156,7 @@ def test_contribute_and_aggregate(self): } self.maxDiff = None self.assertSetEqual(expected_entities, entities_from_tallies) - self.assertListEqual(len(bundles) * [mock.call(name=str(mock_source.name), + self.assertListEqual(len(bundles) * [mock.call(spec=str(mock_source.spec), 
id=mock_source.id)], mock_plugin.resolve_source.mock_calls) self.assertListEqual([mock.call(b) for b in bundle_fqids], diff --git a/test/indexer/test_tdr.py b/test/indexer/test_tdr.py index caef746ff..1da449732 100644 --- a/test/indexer/test_tdr.py +++ b/test/indexer/test_tdr.py @@ -46,7 +46,7 @@ TDRSourceRef, ) from azul.terra import ( - TDRSourceName, + TDRSourceSpec, ) from azul.types import ( JSON, @@ -65,7 +65,7 @@ class TestTDRPlugin(CannedBundleTestCase): mock_service_url = 'https://azul_tdr_service_url_testing.org' source = TDRSourceRef(id='test_id', - name=TDRSourceName(project='test_project', + spec=TDRSourceSpec(project='test_project', name='snapshot', is_snapshot=True)) @@ -74,14 +74,14 @@ def tinyquery(self) -> tinyquery.TinyQuery: return tinyquery.TinyQuery() @cache - def plugin_for_source_name(self, source_name) -> tdr.Plugin: - return TestPlugin(sources={source_name}, tinyquery=self.tinyquery) + def plugin_for_source_name(self, source_spec) -> tdr.Plugin: + return TestPlugin(sources={source_spec}, tinyquery=self.tinyquery) def test_list_bundles(self): source = self.source current_version = '2001-01-01T00:00:00.000001Z' links_ids = ['42-abc', '42-def', '42-ghi', '86-xyz'] - self._make_mock_entity_table(source=source.name, + self._make_mock_entity_table(source=source.spec, table_name='links', rows=[ dict(links_id=links_id, @@ -89,7 +89,7 @@ def test_list_bundles(self): content={}) for links_id in links_ids ]) - plugin = self.plugin_for_source_name(source.name) + plugin = self.plugin_for_source_name(source.spec) bundle_ids = plugin.list_bundles(source, prefix='42') bundle_ids.sort(key=attrgetter('uuid')) self.assertEqual(bundle_ids, [ @@ -118,7 +118,7 @@ def _make_mock_tdr_tables(self, version=None, extension='tables.tdr')['tables'] for table_name, table_rows in tables.items(): - self._make_mock_entity_table(bundle_fqid.source.name, + self._make_mock_entity_table(bundle_fqid.source.spec, table_name, table_rows['rows']) @@ -129,7 +129,7 @@ def 
test_fetch_bundle(self): # Directly modify the canned tables to test invalid links not present # in the canned bundle. - dataset = self.source.name.bq_name + dataset = self.source.spec.bq_name links_table = self.tinyquery.tables_by_name[dataset + '.links'] links_content_column = links_table.columns['content'].values links_content = json.loads(one(links_content_column)) @@ -185,7 +185,7 @@ def _test_fetch_bundle(self, load_tables: bool): if load_tables: self._make_mock_tdr_tables(test_bundle.fqid) - plugin = self.plugin_for_source_name(test_bundle.fqid.source.name) + plugin = self.plugin_for_source_name(test_bundle.fqid.source.spec) emulated_bundle = plugin.fetch_bundle(test_bundle.fqid) self.assertEqual(test_bundle.fqid, emulated_bundle.fqid) @@ -194,7 +194,7 @@ def _test_fetch_bundle(self, self.assertEqual(test_bundle.metadata_files, emulated_bundle.metadata_files) def _make_mock_entity_table(self, - source: TDRSourceName, + source: TDRSourceSpec, table_name: str, rows: JSONs) -> None: schema = self._bq_schema(rows[0]) @@ -242,7 +242,7 @@ def _run_sql(self, query: str) -> BigQueryRows: for i in range(num_rows): yield {k[1]: v.values[i] for k, v in columns.items()} - def _full_table_name(self, source: TDRSourceName, table_name: str) -> str: + def _full_table_name(self, source: TDRSourceSpec, table_name: str) -> str: return source.bq_name + '.' 
+ table_name diff --git a/test/service/test_manifest.py b/test/service/test_manifest.py index 5f79313af..1b6c9f419 100644 --- a/test/service/test_manifest.py +++ b/test/service/test_manifest.py @@ -212,7 +212,7 @@ def test_manifest_not_cached(self): def test_compact_manifest(self): expected = [ ('source_id', '4b737739-4dc9-5d4b-9989-a4942047c91c', '4b737739-4dc9-5d4b-9989-a4942047c91c'), - ('source_name', 'test', 'test'), + ('source_spec', 'test:', 'test:'), ('bundle_uuid', 'f79257a7-dfc6-46d6-ae00-ba4b25313c10', 'f79257a7-dfc6-46d6-ae00-ba4b25313c10'), ('bundle_version', '2018-09-14T133314.453337Z', '2018-09-14T133314.453337Z'), ('file_document_id', '89e313db-4423-4d53-b17e-164949acfa8f', '6c946b6c-040e-45cc-9114-a8b1454c8d20'), @@ -544,7 +544,7 @@ def test_terra_bdbag_manifest(self): 'bundle_uuid': '587d74b4-1075-4bbf-b96a-4d1ede0481b2', 'bundle_version': '2018-09-14T133314.453337Z', 'source_id': '4b737739-4dc9-5d4b-9989-a4942047c91c', - 'source_name': 'test', + 'source_spec': 'test:', 'cell_suspension__provenance__document_id': '377f2f5a-4a45-4c62-8fb0-db9ef33f5cf0', 'cell_suspension__biomaterial_core__biomaterial_id': 'Q4_DEMO-cellsus_SAMN02797092', 'cell_suspension__estimated_cell_count': '', @@ -641,7 +641,7 @@ def test_terra_bdbag_manifest(self): 'bundle_uuid': 'aaa96233-bf27-44c7-82df-b4dc15ad4d9d', 'bundle_version': '2018-11-02T113344.698028Z', 'source_id': '4b737739-4dc9-5d4b-9989-a4942047c91c', - 'source_name': 'test', + 'source_spec': 'test:', 'cell_suspension__provenance__document_id': '412898c5-5b9b-4907-b07c-e9b89666e204', 'cell_suspension__biomaterial_core__biomaterial_id': 'GSM2172585 1', 'cell_suspension__estimated_cell_count': '1', @@ -755,7 +755,7 @@ def sort_rows(rows: List[Dict[str, str]]) -> List[List[Tuple[str, str]]]: 'bundle_uuid', 'bundle_version', 'source_id', - 'source_name', + 'source_spec', 'cell_suspension__provenance__document_id', 'cell_suspension__biomaterial_core__biomaterial_id', 'cell_suspension__estimated_cell_count', diff 
--git a/test/service/test_response.py b/test/service/test_response.py index 40de09ccd..e5bf0c3bb 100644 --- a/test/service/test_response.py +++ b/test/service/test_response.py @@ -174,7 +174,7 @@ def test_key_search_files_response(self): "entryId": "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", "sources": [{ "sourceId": "4b737739-4dc9-5d4b-9989-a4942047c91c", - "sourceName": "test" + "sourceSpec": "test:" }], "files": [ { @@ -326,7 +326,7 @@ def test_key_search_samples_response(self): ], "sources": [{ "sourceId": "4b737739-4dc9-5d4b-9989-a4942047c91c", - "sourceName": "test" + "sourceSpec": "test:" }], "specimens": [ { @@ -461,7 +461,7 @@ def test_file_search_response(self): ], "sources": [{ "sourceId": "4b737739-4dc9-5d4b-9989-a4942047c91c", - "sourceName": "test" + "sourceSpec": "test:" }], "specimens": [ { @@ -821,7 +821,7 @@ def test_projects_key_search_response(self): ], "sources": [{ "sourceId": "4b737739-4dc9-5d4b-9989-a4942047c91c", - "sourceName": "test" + "sourceSpec": "test:" }], "specimens": [ { @@ -999,7 +999,7 @@ def test_projects_file_search_response(self): ], "sources": [{ "sourceId": "4b737739-4dc9-5d4b-9989-a4942047c91c", - "sourceName": "test" + "sourceSpec": "test:" }], "specimens": [ { @@ -1238,7 +1238,7 @@ def test_project_accessions_response(self): ], "sources": [{ "sourceId": "4b737739-4dc9-5d4b-9989-a4942047c91c", - "sourceName": "test" + "sourceSpec": "test:" }], "specimens": [ {