[r] Fix: No replicas for donors in HCA (#6582)

DataBiosphere · Sep 20, 2024 · b7c9cc2 · b7c9cc2
1 parent 7729898
commit b7c9cc2
Show file tree

Hide file tree

Showing 6 changed files with 73 additions and 137 deletions.
diff --git a/src/azul/indexer/index_service.py b/src/azul/indexer/index_service.py
@@ -38,7 +38,6 @@
 from more_itertools import (
     first,
     one,
-    unzip,
 )
 
 from azul import (
@@ -303,19 +302,19 @@ def transform(self,
             log.info('Transforming %i entities in partition %s of bundle %s, version %s.',
                      num_entities, partition, bundle.uuid, bundle.version)
             contributions = []
-            replicas = []
+            replicas_by_coords = {}
             for transformer in transformers:
-                # The cast is necessary because unzip()'s type stub doesn't
-                # support heterogeneous tuples.
-                transforms = cast(
-                    tuple[Iterable[Optional[Contribution]], Iterable[Optional[Replica]]],
-                    unzip(transformer.transform(partition))
-                )
-                if transforms:
-                    contributions_part, replicas_part = transforms
-                    contributions.extend(filter(None, contributions_part))
-                    replicas.extend(filter(None, replicas_part))
-            return contributions, replicas
+                for document in transformer.transform(partition):
+                    if isinstance(document, Contribution):
+                        contributions.append(document)
+                    elif isinstance(document, Replica):
+                        try:
+                            dup = replicas_by_coords[document.coordinates]
+                        except KeyError:
+                            replicas_by_coords[document.coordinates] = document
+                        else:
+                            dup.hub_ids.extend(document.hub_ids)
+            return contributions, list(replicas_by_coords.values())
 
     def create_indices(self, catalog: CatalogName):
         es_client = ESClientFactory.get()

diff --git a/src/azul/indexer/transform.py b/src/azul/indexer/transform.py
@@ -36,8 +36,6 @@
     JSON,
 )
 
-Transform = tuple[Optional[Contribution], Optional[Replica]]
-
 
 @attr.s(frozen=True, kw_only=True, auto_attribs=True)
 class Transformer(metaclass=ABCMeta):
@@ -87,7 +85,7 @@ def estimate(self, partition: BundlePartition) -> int:
         """
 
     @abstractmethod
-    def transform(self, partition: BundlePartition) -> Iterable[Transform]:
+    def transform(self, partition: BundlePartition) -> Iterable[Contribution | Replica]:
         """
         Return the contributions by the current bundle to the entities it
         contains metadata about. More than one bundle can contribute to a

diff --git a/src/azul/plugins/metadata/anvil/__init__.py b/src/azul/plugins/metadata/anvil/__init__.py
@@ -38,7 +38,6 @@
     BiosampleTransformer,
     BundleTransformer,
     DatasetTransformer,
-    DiagnosisTransformer,
     DonorTransformer,
     FileTransformer,
 )
@@ -98,7 +97,6 @@ def transformer_types(self) -> Iterable[Type[BaseTransformer]]:
             BiosampleTransformer,
             BundleTransformer,
             DatasetTransformer,
-            DiagnosisTransformer,
             DonorTransformer,
             FileTransformer,
         )

diff --git a/src/azul/plugins/metadata/anvil/indexer/transform.py b/src/azul/plugins/metadata/anvil/indexer/transform.py
@@ -54,14 +54,14 @@
     EntityReference,
     EntityType,
     FieldTypes,
+    Replica,
     null_bool,
     null_int,
     null_str,
     pass_thru_int,
     pass_thru_json,
 )
 from azul.indexer.transform import (
-    Transform,
     Transformer,
 )
 from azul.plugins.metadata.anvil.bundle import (
@@ -98,6 +98,12 @@ class LinkedEntities:
     def __getitem__(self, item: EntityType) -> set[EntityReference]:
         return self.ancestors[item] | self.descendants[item]
 
+    def __iter__(self) -> Iterable[EntityReference]:
+        for entities in self.ancestors.values():
+            yield from entities
+        for entities in self.descendants.values():
+            yield from entities
+
     @classmethod
     def from_links(cls,
                    origin: EntityReference,
@@ -168,33 +174,18 @@ def aggregator(cls, entity_type) -> EntityAggregator:
     def estimate(self, partition: BundlePartition) -> int:
         return sum(map(partial(self._contains, partition), self.bundle.entities))
 
-    def transform(self, partition: BundlePartition) -> Iterable[Transform]:
-        return (
-            self._transform(entity)
-            for entity in self._list_entities()
-            if self._contains(partition, entity)
-        )
+    def transform(self, partition: BundlePartition) -> Iterable[Contribution | Replica]:
+        for entity in self._list_entities():
+            if self._contains(partition, entity):
+                yield from self._transform(entity)
 
     def _list_entities(self) -> Iterable[EntityReference]:
         return self.bundle.entities
 
     @abstractmethod
-    def _transform(self, entity: EntityReference) -> Transform:
+    def _transform(self, entity: EntityReference) -> Iterable[Contribution | Replica]:
         raise NotImplementedError
 
-    def _add_replica(self,
-                     contribution: JSON | None,
-                     entity: EntityReference,
-                     hub_ids: list[EntityID]
-                     ) -> Transform:
-        no_replica = not config.enable_replicas or self.entity_type() == 'bundles'
-        return (
-            None if contribution is None else self._contribution(contribution, entity),
-            None if no_replica else self._replica(self.bundle.entities[entity],
-                                                  entity,
-                                                  hub_ids)
-        )
-
     def _pluralize(self, entity_type: str) -> str:
         if entity_type == 'diagnosis':
             return 'diagnoses'
@@ -474,8 +465,8 @@ def _complete_dataset_keys(cls) -> AbstractSet[str]:
 
 class SingletonTransformer(BaseTransformer, metaclass=ABCMeta):
 
-    def _contents(self) -> MutableJSON:
-        return dict(
+    def _transform(self, entity: EntityReference) -> Iterable[Contribution]:
+        contents = dict(
             activities=self._entities(self._activity, chain.from_iterable(
                 self._entities_by_type[activity_type]
                 for activity_type in self._activity_polymorphic_types
@@ -486,6 +477,7 @@ def _contents(self) -> MutableJSON:
             donors=self._entities(self._donor, self._entities_by_type['donor']),
             files=self._entities(self._file, self._entities_by_type['file'])
         )
+        yield self._contribution(contents, entity)
 
     @classmethod
     def field_types(cls) -> FieldTypes:
@@ -524,19 +516,17 @@ class ActivityTransformer(BaseTransformer):
     def entity_type(cls) -> str:
         return 'activities'
 
-    def _transform(self, entity: EntityReference) -> Transform:
+    def _transform(self, entity: EntityReference) -> Iterable[Contribution]:
         linked = self._linked_entities(entity)
-        files = linked['file']
         contents = dict(
             activities=[self._activity(entity)],
             biosamples=self._entities(self._biosample, linked['biosample']),
             datasets=[self._dataset(self._only_dataset())],
             diagnoses=self._entities(self._diagnosis, linked['diagnosis']),
             donors=self._entities(self._donor, linked['donor']),
-            files=self._entities(self._file, files),
+            files=self._entities(self._file, linked['file'])
         )
-        hub_ids = [f.entity_id for f in files]
-        return self._add_replica(contents, entity, hub_ids)
+        yield self._contribution(contents, entity)
 
 
 class BiosampleTransformer(BaseTransformer):
@@ -545,9 +535,8 @@ class BiosampleTransformer(BaseTransformer):
     def entity_type(cls) -> str:
         return 'biosamples'
 
-    def _transform(self, entity: EntityReference) -> Transform:
+    def _transform(self, entity: EntityReference) -> Iterable[Contribution]:
         linked = self._linked_entities(entity)
-        files = linked['file']
         contents = dict(
             activities=self._entities(self._activity, chain.from_iterable(
                 linked[activity_type]
@@ -557,22 +546,9 @@ def _transform(self, entity: EntityReference) -> Transform:
             datasets=[self._dataset(self._only_dataset())],
             diagnoses=self._entities(self._diagnosis, linked['diagnosis']),
             donors=self._entities(self._donor, linked['donor']),
-            files=self._entities(self._file, files),
+            files=self._entities(self._file, linked['file']),
         )
-        hub_ids = [f.entity_id for f in files]
-        return self._add_replica(contents, entity, hub_ids)
-
-
-class DiagnosisTransformer(BaseTransformer):
-
-    def _transform(self, entity: EntityReference) -> Transform:
-        files = self._linked_entities(entity)['file']
-        hub_ids = [f.entity_id for f in files]
-        return self._add_replica(None, entity, hub_ids)
-
-    @classmethod
-    def entity_type(cls) -> EntityType:
-        return 'diagnoses'
+        yield self._contribution(contents, entity)
 
 
 class BundleTransformer(SingletonTransformer):
@@ -585,11 +561,6 @@ def _singleton(self) -> EntityReference:
         return EntityReference(entity_type='bundle',
                                entity_id=self.bundle.uuid)
 
-    def _transform(self, entity: EntityReference) -> Transform:
-        contents = self._contents()
-        hub_ids = [f.entity_id for f in self._entities_by_type['file']]
-        return self._add_replica(contents, entity, hub_ids)
-
 
 class DatasetTransformer(SingletonTransformer):
 
@@ -600,28 +571,15 @@ def entity_type(cls) -> str:
     def _singleton(self) -> EntityReference:
         return self._only_dataset()
 
-    def _transform(self, entity: EntityReference) -> Transform:
-        contents = self._contents()
-        # Every file in a snapshot is linked to that snapshot's singular
-        # dataset, making an explicit list of hub IDs for the dataset both
-        # redundant and impractically large (we observe that for large
-        # snapshots, trying to track this many files in a single data structure
-        # causes a prohibitively high rate of conflicts during replica updates).
-        # Therefore, we leave the hub IDs field empty for datasets and rely on
-        # the tenet that every file is an implicit hub of its parent dataset.
-        hub_ids = []
-        return self._add_replica(contents, entity, hub_ids)
-
 
 class DonorTransformer(BaseTransformer):
 
     @classmethod
     def entity_type(cls) -> str:
         return 'donors'
 
-    def _transform(self, entity: EntityReference) -> Transform:
+    def _transform(self, entity: EntityReference) -> Iterable[Contribution]:
         linked = self._linked_entities(entity)
-        files = linked['file']
         contents = dict(
             activities=self._entities(self._activity, chain.from_iterable(
                 linked[activity_type]
@@ -631,10 +589,9 @@ def _transform(self, entity: EntityReference) -> Transform:
             datasets=[self._dataset(self._only_dataset())],
             diagnoses=self._entities(self._diagnosis, linked['diagnosis']),
             donors=[self._donor(entity)],
-            files=self._entities(self._file, files),
+            files=self._entities(self._file, linked['file']),
         )
-        hub_ids = [f.entity_id for f in files]
-        return self._add_replica(contents, entity, hub_ids)
+        yield self._contribution(contents, entity)
 
 
 class FileTransformer(BaseTransformer):
@@ -643,7 +600,7 @@ class FileTransformer(BaseTransformer):
     def entity_type(cls) -> str:
         return 'files'
 
-    def _transform(self, entity: EntityReference) -> Transform:
+    def _transform(self, entity: EntityReference) -> Iterable[Contribution | Replica]:
         linked = self._linked_entities(entity)
         contents = dict(
             activities=self._entities(self._activity, chain.from_iterable(
@@ -656,8 +613,9 @@ def _transform(self, entity: EntityReference) -> Transform:
             donors=self._entities(self._donor, linked['donor']),
             files=[self._file(entity)],
         )
-        # The result of the link traversal does not include the starting entity,
-        # so without this step the file itself wouldn't be included in its hubs
-        files = (entity, *linked['file'])
-        hub_ids = [f.entity_id for f in files]
-        return self._add_replica(contents, entity, hub_ids)
+        yield self._contribution(contents, entity)
+        if config.enable_replicas:
+            for linked_entity in linked:
+                content = self.bundle.entities[linked_entity]
+                hub_ids = [] if linked_entity.entity_type == 'dataset' else [entity.entity_id]
+                yield self._replica(content, linked_entity, hub_ids)