From b5674cf5ce9dabae33c6677775eff7fd70cf8ac3 Mon Sep 17 00:00:00 2001 From: Kori Kuzma Date: Tue, 16 Jul 2024 18:34:29 -0400 Subject: [PATCH] feat!: add enum for `as_version` + add restrictions to `ga4gh_serialize_as_version` Building off of #382 and #427 * Add `PrevVrsVersion` enum to store previous versions of VRS that are supported for computing digests/identifiers * Update function signatures + docstrings for ga4gh digest/serialize/identifier * Add restrictions to `ga4gh_serialize_as_version` * For `SequenceLocation`: `sequenceReference` must be provided and must be a valid `SequenceReference` object * For `Allele`: Only `LiteralSequenceExpression` and `ReferenceLengthExpression` are supported, and the state must provide a non-null `sequence` attribute. --- src/ga4gh/core/__init__.py | 4 +- src/ga4gh/core/identifiers.py | 71 +++++++++++++++++------------- src/ga4gh/vrs/models.py | 76 ++++++++++++++++++++++----------- tests/validation/test_models.py | 44 +++++++++++++++++-- 4 files changed, 135 insertions(+), 60 deletions(-) diff --git a/src/ga4gh/core/__init__.py b/src/ga4gh/core/__init__.py index 53c4edb9..cb19f47d 100644 --- a/src/ga4gh/core/__init__.py +++ b/src/ga4gh/core/__init__.py @@ -9,7 +9,8 @@ from .identifiers import ( ga4gh_digest, ga4gh_identify, ga4gh_serialize, is_ga4gh_identifier, parse_ga4gh_identifier, VrsObjectIdentifierIs, use_ga4gh_compute_identifier_when, - CURIE_NAMESPACE, CURIE_SEP, GA4GH_PREFIX_SEP, GA4GH_IR_REGEXP, GA4GH_DIGEST_REGEXP + CURIE_NAMESPACE, CURIE_SEP, GA4GH_PREFIX_SEP, GA4GH_IR_REGEXP, GA4GH_DIGEST_REGEXP, + PrevVrsVersion ) from .pydantic import ( is_pydantic_instance, is_curie_type, is_ga4gh_identifiable, is_literal, pydantic_copy @@ -32,6 +33,7 @@ "GA4GH_PREFIX_SEP", "GA4GH_IR_REGEXP", "GA4GH_DIGEST_REGEXP", + "PrevVrsVersion", "is_pydantic_instance", "is_curie_type", "is_ga4gh_identifiable", diff --git a/src/ga4gh/core/identifiers.py b/src/ga4gh/core/identifiers.py index 4fd62d8b..4503af88 100644 --- a/src/ga4gh/core/identifiers.py 
+++ b/src/ga4gh/core/identifiers.py @@ -19,7 +19,7 @@ import logging import re from contextlib import ContextDecorator -from enum import IntEnum +from enum import Enum, IntEnum from typing import Union, Optional from pydantic import BaseModel, RootModel @@ -58,6 +58,20 @@ class VrsObjectIdentifierIs(IntEnum): MISSING = 2 +class PrevVrsVersion(str, Enum): + """Define previous VRS versions that are supported for computing digests and + identifiers based on the current VRS model + """ + + V1_3 = "1.3" + + @classmethod + def validate(cls, version): + if version is not None and version not in cls.__members__.values(): + err_msg = f"Expected `PrevVrsVersion`, but got {version}" + raise ValueError(err_msg) + + ga4gh_compute_identifier_when = contextvars.ContextVar("ga4gh_compute_identifier_when") @@ -122,9 +136,8 @@ def parse_ga4gh_identifier(ir): raise ValueError(ir) from e -def ga4gh_identify(vro, in_place='default', as_version=None): - """ - Return the GA4GH digest-based id for the object, as a CURIE +def ga4gh_identify(vro, in_place: str = 'default', as_version: PrevVrsVersion | None = None) -> str | None: + """Return the GA4GH digest-based id for the object, as a CURIE (string). Returns None if object is not identifiable. This function has three options for in_place editing of vro.id: @@ -137,18 +150,18 @@ def ga4gh_identify(vro, in_place='default', as_version=None): - 'never': the vro.id field will not be edited in-place, even when empty - If 'as_version' is set to a version string, other parameters are - ignored and an identifier returned following the conventions of - the VRS version indicated by 'as_version'. + If ``as_version`` is provided, other parameters are ignored and an identifier is + returned following the conventions of the VRS version indicated by ``as_version``. + Raises ``ValueError`` if ``as_version`` is not a ``PrevVrsVersion``. 
- TODO update example for VRS 2.0 + >>> from ga4gh.core import ga4gh_identify >>> import ga4gh.vrs - >>> ival = ga4gh.vrs.models.SimpleInterval(start=44908821, end=44908822) - >>> location = ga4gh.vrs.models.Location(sequence_id="ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", interval=ival) + >>> location = ga4gh.vrs.models.SequenceLocation(start=44908821, end=44908822, sequenceReference=ga4gh.vrs.models.SequenceReference(refgetAccession="SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul")) >>> ga4gh_identify(location) - 'ga4gh:VSL.u5fspwVbQ79QkX6GHLF8tXPCAXFJqRPx' - + 'ga4gh:SL.4t6JnYWqHwYw9WzBT_lmWBb3tLQNalkT' """ + PrevVrsVersion.validate(as_version) + if vro.is_ga4gh_identifiable(): when_rule = ga4gh_compute_identifier_when.get(VrsObjectIdentifierIs.ANY) obj_id = None @@ -169,23 +182,21 @@ def ga4gh_identify(vro, in_place='default', as_version=None): return None -def ga4gh_digest(vro: BaseModel, overwrite=False, as_version=None): - """ - Return the GA4GH digest for the object. - - If 'as_version' is set to a version string, other parameters - are ignored and a digest returned following the conventions of - the VRS version indicated by 'as_version'. +def ga4gh_digest(vro: BaseModel, overwrite: bool = False, as_version: PrevVrsVersion | None = None) -> str: + """Return the GA4GH digest for the object. - TODO update example + If ``as_version`` is provided, other parameters are ignored and a digest is returned + following the conventions of the VRS version indicated by ``as_version``. + Raises ``ValueError`` if ``as_version`` is not a ``PrevVrsVersion``. 
+ >>> from ga4gh.core import ga4gh_digest >>> import ga4gh.vrs - >>> ival = ga4gh.vrs.models.SimpleInterval(start=44908821, end=44908822) - >>> location = ga4gh.vrs.models.Location(sequence_id="ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", interval=ival) + >>> location = ga4gh.vrs.models.SequenceLocation(start=44908821, end=44908822, sequenceReference=ga4gh.vrs.models.SequenceReference(refgetAccession="SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul")) >>> ga4gh_digest(location) - 'u5fspwVbQ79QkX6GHLF8tXPCAXFJqRPx' - + '4t6JnYWqHwYw9WzBT_lmWBb3tLQNalkT' """ + PrevVrsVersion.validate(as_version) + if vro.is_ga4gh_identifiable(): # Only GA4GH identifiable objects are GA4GH digestible if as_version is None: return vro.get_or_create_digest(overwrite) @@ -219,14 +230,14 @@ def collapse_identifiable_values(obj: dict) -> dict: return obj -def ga4gh_serialize(obj: BaseModel, as_version=None) -> Optional[bytes]: - """ - Serializes an object for use in computed digest computation. +def ga4gh_serialize(obj: BaseModel, as_version: PrevVrsVersion | None = None) -> Optional[bytes]: + """Serializes an object for use in computed digest computation. - If a VRS version string is specified for the 'as_version' parameter, - the returned serialization follows the convention of the specified - VRS version. + If ``as_version`` is provided, the returned serialization follows + the conventions of the VRS version indicated by ``as_version``. 
""" + PrevVrsVersion.validate(as_version) + if as_version is None: return obj.model_dump_json().encode("utf-8") else: diff --git a/src/ga4gh/vrs/models.py b/src/ga4gh/vrs/models.py index 6d5ddf71..79d4e572 100644 --- a/src/ga4gh/vrs/models.py +++ b/src/ga4gh/vrs/models.py @@ -15,7 +15,14 @@ from enum import Enum import inspect import sys -from ga4gh.core import sha512t24u, GA4GH_PREFIX_SEP, CURIE_SEP, CURIE_NAMESPACE, GA4GH_IR_REGEXP +from ga4gh.core import ( + sha512t24u, + GA4GH_PREFIX_SEP, + CURIE_SEP, + CURIE_NAMESPACE, + GA4GH_IR_REGEXP, + PrevVrsVersion +) from ga4gh.core.pydantic import get_pydantic_root from pydantic import BaseModel, Field, RootModel, StringConstraints, model_serializer @@ -230,11 +237,13 @@ def has_valid_ga4gh_id(self): def has_valid_digest(self): return bool(self.digest) # Pydantic constraint ensures digest field value is valid - def compute_digest(self, store=True, as_version=None) -> str: + def compute_digest(self, store=True, as_version: PrevVrsVersion | None = None) -> str: """A sha512t24u digest created using the VRS Computed Identifier algorithm. - Stores the digest in the object if store is True. If 'as_version' is set to - a version string, other parameters are ignored and a digest returned - following the conventions of the VRS version indicated by 'as_version'. + + Stores the digest in the object if ``store`` is ``True``. + + If ``as_version`` is provided, other parameters are ignored and a digest is + returned following the conventions of the VRS version indicated by ``as_version``. """ if as_version is None: digest = sha512t24u(self.model_dump_json().encode("utf-8")) @@ -262,9 +271,9 @@ def get_or_create_ga4gh_identifier(self, in_place='default', recompute=False, as Digests will be recalculated even if present if recompute is True. - If 'as_version' is set to a version string, other parameters are - ignored and an identifier returned following the conventions of - the VRS version indicated by 'as_version'. 
+ If ``as_version`` is provided, other parameters are ignored and an identifier is + returned following the conventions of the VRS version indicated by + ``as_version``. """ if as_version is not None: return self.compute_ga4gh_identifier(as_version=as_version) @@ -287,9 +296,9 @@ def get_or_create_ga4gh_identifier(self, in_place='default', recompute=False, as def compute_ga4gh_identifier(self, recompute=False, as_version=None): """Returns a GA4GH Computed Identifier. - If 'as_version' is set to a version string, other parameters are - ignored and a computed identifier returned following the conventions - of the VRS version indicated by 'as_version'. + If ``as_version`` is provided, other parameters are ignored and a computed + identifier is returned following the conventions of the VRS version indicated by + ``as_version``. """ if as_version is None: self.get_or_create_digest(recompute) @@ -456,12 +465,21 @@ class SequenceLocation(_Ga4ghIdentifiableObject): ) sequence: Optional[SequenceString] = Field(None, description="The literal sequence encoded by the `sequenceReference` at these coordinates.") - def ga4gh_serialize_as_version(self, as_version): + def ga4gh_serialize_as_version(self, as_version: PrevVrsVersion): """This method will return a serialized string following the conventions for - SequenceLocation serialization as defined in the VRS version specified by 'as_version`.""" - if as_version == '1.3': - out = list() - for value in [self.start,self.end]: + SequenceLocation serialization as defined in the VRS version specified by + ``as_version``. + + :raises ValueError: If ``sequenceReference`` is not a ``SequenceReference`` + object; ``start`` or ``end`` are not an int or list. 
+ """ + if as_version == PrevVrsVersion.V1_3: + if not isinstance(self.sequenceReference, SequenceReference): + err_msg = "Must provide `sequenceReference` and it must be a valid `SequenceReference`" + raise ValueError(err_msg) + + out = [] + for value in [self.start, self.end]: value = get_pydantic_root(value) if isinstance(value, int): result = f'{{"type":"Number","value":{value}}}' @@ -476,8 +494,6 @@ def ga4gh_serialize_as_version(self, as_version): raise ValueError(f'{value} is not int or list.') out.append(result) return f'{{"interval":{{"end":{out[1]},"start":{out[0]},"type":"SequenceInterval"}},"sequence_id":"{self.sequenceReference.refgetAccession.split(".")[1]}","type":"SequenceLocation"}}' - else: - raise ValueError(f'Serializing as version {as_version} not supported for this class.') def get_refget_accession(self): if isinstance(self.sequenceReference, SequenceReference): @@ -489,7 +505,7 @@ def get_refget_accession(self): class ga4gh(_Ga4ghIdentifiableObject.ga4gh): prefix = 'SL' - priorPrefix = {'1.3': 'VSL'} + priorPrefix = {PrevVrsVersion.V1_3.value: 'VSL'} keys = [ 'end', 'sequenceReference', @@ -523,21 +539,31 @@ class Allele(_VariationBase): ..., description='An expression of the sequence state' ) - def ga4gh_serialize_as_version(self, as_version): + def ga4gh_serialize_as_version(self, as_version: PrevVrsVersion): """This method will return a serialized string following the conventions for - Allele serialization as defined in the VRS version specified by 'as_version`.""" + Allele serialization as defined in the VRS version specified by ``as_version``. + + :raises ValueError: If ``state`` is not a ``LiteralSequenceExpression`` or + ``ReferenceLengthExpression``; ``state.sequence`` is null. 
+ """ location_digest = self.location.compute_digest(as_version=as_version) + + if not isinstance(self.state, (LiteralSequenceExpression, ReferenceLengthExpression)): + err_msg = "Only `LiteralSequenceExpression` and `ReferenceLengthExpression` are supported for previous versions of VRS" + raise ValueError(err_msg) + sequence = get_pydantic_root(self.state.sequence) + if sequence is None: raise ValueError('State sequence attribute must be defined.') - if as_version == '1.3': + + if as_version == PrevVrsVersion.V1_3: return f'{{"location":"{location_digest}","state":{{"sequence":"{sequence}","type":"LiteralSequenceExpression"}},"type":"Allele"}}' - else: - raise ValueError(f'Serializing as version {as_version} not supported for this class.') + class ga4gh(_Ga4ghIdentifiableObject.ga4gh): prefix = 'VA' - priorPrefix = {'1.3': 'VA'} + priorPrefix = {PrevVrsVersion.V1_3.value: 'VA'} keys = [ 'location', 'state', diff --git a/tests/validation/test_models.py b/tests/validation/test_models.py index a7652b34..50c1ec63 100644 --- a/tests/validation/test_models.py +++ b/tests/validation/test_models.py @@ -7,19 +7,19 @@ import pytest import yaml -from ga4gh.core import ga4gh_serialize, ga4gh_digest, ga4gh_identify +from ga4gh.core import ga4gh_serialize, ga4gh_digest, ga4gh_identify, PrevVrsVersion, entity_models from ga4gh.vrs import models def ga4gh_1_3_identify(*args, **kwargs): - kwargs['as_version'] = '1.3' + kwargs['as_version'] = PrevVrsVersion.V1_3 return ga4gh_identify(*args, **kwargs) def ga4gh_1_3_digest(*args, **kwargs): - kwargs['as_version'] = '1.3' + kwargs['as_version'] = PrevVrsVersion.V1_3 return ga4gh_digest(*args, **kwargs) def ga4gh_1_3_serialize(*args, **kwargs): - kwargs['as_version'] = '1.3' + kwargs['as_version'] = PrevVrsVersion.V1_3 return ga4gh_serialize(*args, **kwargs) fxs = { @@ -60,3 +60,39 @@ def test_validation(cls, data, fn, exp): o = getattr(models, cls)(**data) fx = fxs[fn] assert fx(o) == exp + + +def test_prev_vrs_version(): + """Ensure 
that support for previous VRS digests/identifiers works correctly""" + loc = models.SequenceLocation(start=44908821, end=44908822, sequenceReference=models.SequenceReference(refgetAccession="SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl")) + + # string representation should work as well + ga4gh_identify(loc, as_version="1.3") + + invalid_vrs_version = "0.0" + invalid_vrs_version_msg = f"Expected `PrevVrsVersion`, but got {invalid_vrs_version}" + + loc_no_seq_ref = models.SequenceLocation(start=44908821, end=44908822) + loc_iri = models.SequenceLocation(start=44908821, end=44908822, sequenceReference=entity_models.IRI("sequenceReferences.json#example1")) + allele_rle_no_seq = models.Allele(location=loc, state=models.ReferenceLengthExpression(length=11, repeatSubunitLength=3)) + allele_le = models.Allele(location=loc, state=models.LengthExpression(length=2)) + loc_seq_ref_msg = "Must provide `sequenceReference` and it must be a valid `SequenceReference`" + for ga4gh_func in [ga4gh_identify, ga4gh_digest, ga4gh_serialize]: + with pytest.raises(ValueError, match=invalid_vrs_version_msg): + ga4gh_func(loc, as_version=invalid_vrs_version_msg) + + with pytest.raises(ValueError, match=loc_seq_ref_msg): + ga4gh_func(loc_no_seq_ref, as_version=PrevVrsVersion.V1_3) + + with pytest.raises(ValueError, match=loc_seq_ref_msg): + ga4gh_func(loc_iri, as_version=PrevVrsVersion.V1_3) + + with pytest.raises(ValueError, match="State sequence attribute must be defined."): + ga4gh_func(allele_rle_no_seq, as_version=PrevVrsVersion.V1_3) + + allele_rlse_seq = allele_rle_no_seq.model_copy(deep=True) + allele_rlse_seq.state.sequence = "C" + assert ga4gh_func(allele_rlse_seq, as_version=PrevVrsVersion.V1_3) + + with pytest.raises(ValueError, match="Only `LiteralSequenceExpression` and `ReferenceLengthExpression` are supported for previous versions of VRS"): + ga4gh_func(allele_le, as_version=PrevVrsVersion.V1_3)