From 00f60e28adcccc41730474e48a376462a9506f82 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Wed, 18 Aug 2021 11:57:58 +0200 Subject: [PATCH 1/2] #174 - FSes that are only transitively referenced cannot be serialized (#179) - Turn on ID generation during FS scan in general because the FS scan internally uses a map in which the ID is the key and if the IDs are not uniquely generated, then the result of the method is incomplete. - Add consistency checks fo _find_all_fs to detect e.g. duplicate FS IDs - Fixed a test which introduced a duplicate FS ID --- cassis/cas.py | 25 ++++++++++++++++++++----- cassis/xmi.py | 2 +- tests/test_xmi.py | 2 +- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/cassis/cas.py b/cassis/cas.py index cf27142..b507157 100644 --- a/cassis/cas.py +++ b/cassis/cas.py @@ -576,7 +576,7 @@ def typecheck(self) -> List[TypeCheckError]: return all_errors - def _find_all_fs(self, generate_missing_ids: bool = False) -> Iterable[FeatureStructure]: + def _find_all_fs(self, generate_missing_ids: bool = True) -> Iterable[FeatureStructure]: """This function traverses the whole CAS in order to find all directly and indirectly referenced feature structures. Traversing is needed as it can be that a feature structure is not added to the sofa but referenced by another feature structure as a feature.""" @@ -590,8 +590,25 @@ def _find_all_fs(self, generate_missing_ids: bool = False) -> Iterable[FeatureSt ts = self.typesystem while openlist: fs = openlist.pop(0) - if generate_missing_ids and fs.xmiID is None: - fs.xmiID = self._get_next_xmi_id() + + # We do not want to return cas:NULL here as we handle serializing it later + if fs.xmiID == 0: + continue; + + if fs.xmiID is None: + if generate_missing_ids: + fs.xmiID = self._get_next_xmi_id() + else: + raise ValueError("FS has no ID and ID generation is disabled! {fs}".format(fs=fs)) + + existing_fs = all_fs.get(fs.xmiID) + if existing_fs is not None and existing_fs is not fs: + raise ValueError( + "Duplicate FS id [{fsId}] used for [{fs1}] and [{fs2}]".format( + fsId=fs.xmiID, fs1=existing_fs, fs2=fs + ) + ) + all_fs[fs.xmiID] = fs t = ts.get_type(fs.type) @@ -623,8 +640,6 @@ def _find_all_fs(self, generate_missing_ids: bool = False) -> Iterable[FeatureSt if referenced_fs.xmiID not in all_fs: openlist.append(referenced_fs) - # We do not want to return cas:NULL here as we handle serializing it later - all_fs.pop(0, None) yield from all_fs.values() def _get_next_xmi_id(self) -> int: diff --git a/cassis/xmi.py b/cassis/xmi.py index f4b7b94..2be378a 100644 --- a/cassis/xmi.py +++ b/cassis/xmi.py @@ -343,7 +343,7 @@ def serialize(self, sink: Union[IO, str], cas: Cas, pretty_print=True): self._serialize_cas_null(root) # Find all fs, even the ones that are not directly added to a sofa - for fs in sorted(cas._find_all_fs(generate_missing_ids=True), key=lambda a: a.xmiID): + for fs in sorted(cas._find_all_fs(), key=lambda a: a.xmiID): self._serialize_feature_structure(cas, root, fs) for sofa in cas.sofas: diff --git a/tests/test_xmi.py b/tests/test_xmi.py index 0d5eed0..04bb9b8 100644 --- a/tests/test_xmi.py +++ b/tests/test_xmi.py @@ -198,7 +198,7 @@ def test_serializing_xmi_ignores_none_features(small_xmi, small_typesystem_xml): typesystem = load_typesystem(small_typesystem_xml) cas = load_cas_from_xmi(small_xmi, typesystem=typesystem) TokenType = typesystem.get_type("cassis.Token") - cas.add(TokenType(xmiID=13, sofa=1, begin=0, end=3, id=None, pos=None)) + cas.add(TokenType(begin=0, end=3, id=None, pos=None)) actual_xml = cas.to_xmi() From 9caa3016b8eb682b1110db1049b8b59685ee441e Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Wed, 18 Aug 2021 12:11:45 +0200 Subject: [PATCH 2/2] #174 - FSes that are only transitively referenced cannot be serialized (#179) - Added test to fail on duplicate IDs --- cassis/cas.py | 2 +- tests/test_cas.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/cassis/cas.py b/cassis/cas.py index b507157..3ba745a 100644 --- a/cassis/cas.py +++ b/cassis/cas.py @@ -593,7 +593,7 @@ def _find_all_fs(self, generate_missing_ids: bool = True) -> Iterable[FeatureStr # We do not want to return cas:NULL here as we handle serializing it later if fs.xmiID == 0: - continue; + continue if fs.xmiID is None: if generate_missing_ids: diff --git a/tests/test_cas.py b/tests/test_cas.py index 11ad0a2..47f7571 100644 --- a/tests/test_cas.py +++ b/tests/test_cas.py @@ -443,3 +443,14 @@ def test_removing_throws_if_fs_in_other_view(small_typesystem_xml, tokens, sente with pytest.raises(ValueError): view.remove(tokens[0]) + + +def test_fail_on_duplicate_fs_id(small_typesystem_xml): + cas = Cas(typesystem=load_typesystem(small_typesystem_xml)) + + TokenType = cas.typesystem.get_type("cassis.Token") + cas.add_annotation(TokenType(xmiID=10, begin=0, end=0)) + cas.add_annotation(TokenType(xmiID=10, begin=10, end=10)) + + with pytest.raises(ValueError): + list(cas._find_all_fs())