From 307064a1e2d7cece06df5889362774cad9d760a9 Mon Sep 17 00:00:00 2001 From: Jan-Christoph Klie Date: Mon, 15 Nov 2021 17:20:35 +0100 Subject: [PATCH 1/5] #236 - Long output when printing type (#237) - Add shorter __str__ to type --- cassis/typesystem.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cassis/typesystem.py b/cassis/typesystem.py index 44cfc05..73de09c 100644 --- a/cassis/typesystem.py +++ b/cassis/typesystem.py @@ -589,6 +589,9 @@ def __hash__(self): def __eq__(self, other): return self.name == other.name + def __str__(self): + return f"Type(name={self.name})" + class TypeSystem: def __init__(self, add_document_annotation_type: bool = True): From 9aab09174401d6741a23b54cd5ce69dd31f87cca Mon Sep 17 00:00:00 2001 From: Jan-Christoph Klie Date: Tue, 23 Nov 2021 10:25:19 +0100 Subject: [PATCH 2/5] Create CITATION.cff --- CITATION.cff | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 CITATION.cff diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000..b7d1689 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,14 @@ +cff-version: 1.2.0 +message: "If you use this software, please cite it as below." 
+authors: +- family-names: "Klie" + given-names: "Jan-Christoph " + orcid: "https://orcid.org/0000-0003-0181-6450" +- family-names: "Eckart de Castilho" + given-names: "Richard" + orcid: "https://orcid.org/0000-0003-0991-7045" +title: "dkpro-cassis" +version: 0.6.1 +doi: 10.5281/zenodo.5537447 +date-released: 2021-11-23 +url: "https://github.com/dkpro/dkpro-cassis" From 71cbd3a4688f1ba97c409154b9a23f166fe7be94 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Thu, 2 Dec 2021 17:56:58 +0100 Subject: [PATCH 3/5] #238 - Error parsing FSList in CTAKES XMi - Added new test data allowing to reproduce the issue with all kinds of different UIMA "list" structures --- cassis/typesystem.py | 4 + cassis/xmi.py | 1 + tests/fixtures.py | 28 +++++ .../typesystem_with_list_features.xml | 117 ++++++++++++++++++ .../test_files/xmi/cas_with_list_features.xmi | 38 ++++++ tests/test_xmi.py | 4 + 6 files changed, 192 insertions(+) create mode 100644 tests/test_files/typesystems/typesystem_with_list_features.xml create mode 100644 tests/test_files/xmi/cas_with_list_features.xmi diff --git a/cassis/typesystem.py b/cassis/typesystem.py index 73de09c..31af63a 100644 --- a/cassis/typesystem.py +++ b/cassis/typesystem.py @@ -29,9 +29,13 @@ TYPE_NAME_DOUBLE = UIMA_CAS_PREFIX + "Double" TYPE_NAME_ARRAY_BASE = UIMA_CAS_PREFIX + "ArrayBase" TYPE_NAME_FS_ARRAY = UIMA_CAS_PREFIX + "FSArray" +TYPE_NAME_FS_LIST = UIMA_CAS_PREFIX + "FSList" TYPE_NAME_INTEGER_ARRAY = UIMA_CAS_PREFIX + "IntegerArray" +TYPE_NAME_INTEGER_LIST = UIMA_CAS_PREFIX + "IntegerList" TYPE_NAME_FLOAT_ARRAY = UIMA_CAS_PREFIX + "FloatArray" +TYPE_NAME_FLOAT_LIST = UIMA_CAS_PREFIX + "FloatList" TYPE_NAME_STRING_ARRAY = UIMA_CAS_PREFIX + "StringArray" +TYPE_NAME_STRING_LIST = UIMA_CAS_PREFIX + "StringList" TYPE_NAME_BOOLEAN_ARRAY = UIMA_CAS_PREFIX + "BooleanArray" TYPE_NAME_BYTE_ARRAY = UIMA_CAS_PREFIX + "ByteArray" TYPE_NAME_SHORT_ARRAY = UIMA_CAS_PREFIX + "ShortArray" diff --git a/cassis/xmi.py b/cassis/xmi.py index 
67657cc..0925f96 100644 --- a/cassis/xmi.py +++ b/cassis/xmi.py @@ -24,6 +24,7 @@ TYPE_NAME_FLOAT, TYPE_NAME_FLOAT_ARRAY, TYPE_NAME_FS_ARRAY, + TYPE_NAME_FS_LIST, TYPE_NAME_INTEGER, TYPE_NAME_INTEGER_ARRAY, TYPE_NAME_LONG, diff --git a/tests/fixtures.py b/tests/fixtures.py index a6bb721..8e9294c 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -62,6 +62,20 @@ def cas_with_collections_xmi(cas_with_collections_path): return f.read() +# CAS with all kinds of list features + + +@pytest.fixture +def cas_with_list_features_path(): + return os.path.join(FIXTURE_DIR, "xmi", "cas_with_list_features.xmi") + + +@pytest.fixture +def cas_with_list_features_xmi(cas_with_list_features_path): + with open(cas_with_list_features_path, "r") as f: + return f.read() + + # CAS with references @@ -378,6 +392,20 @@ def typesystem_merge_base_path(): return os.path.join(FIXTURE_DIR, "typesystems", "typesystem_merge_base.xml") +# Type system with all kinds of list features + + +@pytest.fixture +def typesystem_with_list_features_path(): + return os.path.join(FIXTURE_DIR, "typesystems", "typesystem_with_list_features.xml") + + +@pytest.fixture +def typesystem_with_list_features_xml(typesystem_with_list_features_path): + with open(typesystem_with_list_features_path, "r") as f: + return f.read() + + # Annotations diff --git a/tests/test_files/typesystems/typesystem_with_list_features.xml b/tests/test_files/typesystems/typesystem_with_list_features.xml new file mode 100644 index 0000000..61b1697 --- /dev/null +++ b/tests/test_files/typesystems/typesystem_with_list_features.xml @@ -0,0 +1,117 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + FloatListHolder + + uima.cas.TOP + + + floatList + + uima.cas.FloatList + + + + + FloatListHolderMR + + uima.cas.TOP + + + floatList + + uima.cas.FloatList + true + + + + + FsListHolder + + uima.cas.TOP + + + fsList + + uima.cas.FSList + + + + + FsListHolderMR + + uima.cas.TOP + + + fsList 
+ + uima.cas.FSList + true + + + + + IntListHolder + + uima.cas.TOP + + + intList + + uima.cas.IntegerList + + + + + IntListHolderMR + + uima.cas.TOP + + + intList + + uima.cas.IntegerList + true + + + + + StringListHolder + + uima.cas.TOP + + + stringList + + uima.cas.StringList + + + + + StringListHolderMR + + uima.cas.TOP + + + stringList + + uima.cas.StringList + true + + + + + diff --git a/tests/test_files/xmi/cas_with_list_features.xmi b/tests/test_files/xmi/cas_with_list_features.xmi new file mode 100644 index 0000000..148dcbb --- /dev/null +++ b/tests/test_files/xmi/cas_with_list_features.xmi @@ -0,0 +1,38 @@ + + + + + + + + + + + blub + blah + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/test_xmi.py b/tests/test_xmi.py index 17c7e6e..8adc317 100644 --- a/tests/test_xmi.py +++ b/tests/test_xmi.py @@ -35,6 +35,10 @@ pytest.lazy_fixture("cas_with_multiple_references_allowed_string_array_xmi"), pytest.lazy_fixture("typesystem_with_multiple_references_allowed_xml"), ), + ( + pytest.lazy_fixture("cas_with_list_features_xmi"), + pytest.lazy_fixture("typesystem_with_list_features_xml"), + ), ] From adeb2d725a135cec6041162f6a8ffc756d10725d Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Thu, 2 Dec 2021 18:17:05 +0100 Subject: [PATCH 4/5] #238 - Error parsing FSList in CTAKES XMi - Added type / feature constants --- cassis/typesystem.py | 4 ++++ cassis/xmi.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/cassis/typesystem.py b/cassis/typesystem.py index 31af63a..a581858 100644 --- a/cassis/typesystem.py +++ b/cassis/typesystem.py @@ -30,6 +30,8 @@ TYPE_NAME_ARRAY_BASE = UIMA_CAS_PREFIX + "ArrayBase" TYPE_NAME_FS_ARRAY = UIMA_CAS_PREFIX + "FSArray" TYPE_NAME_FS_LIST = UIMA_CAS_PREFIX + "FSList" +TYPE_NAME_EMPTY_FS_LIST = UIMA_CAS_PREFIX + "EmptyFSList" +TYPE_NAME_NON_EMPTY_FS_LIST = UIMA_CAS_PREFIX + "NonEmptyFSList" TYPE_NAME_INTEGER_ARRAY = UIMA_CAS_PREFIX + "IntegerArray" TYPE_NAME_INTEGER_LIST = UIMA_CAS_PREFIX + 
"IntegerList" TYPE_NAME_FLOAT_ARRAY = UIMA_CAS_PREFIX + "FloatArray" @@ -60,6 +62,8 @@ FEATURE_BASE_NAME_BEGIN = "begin" FEATURE_BASE_NAME_END = "end" FEATURE_BASE_NAME_LANGUAGE = "language" +FEATURE_BASE_NAME_HEAD = "head" +FEATURE_BASE_NAME_TAIL = "tail" _DOCUMENT_ANNOTATION_TYPE = "uima.tcas.DocumentAnnotation" diff --git a/cassis/xmi.py b/cassis/xmi.py index 0925f96..7b33013 100644 --- a/cassis/xmi.py +++ b/cassis/xmi.py @@ -13,7 +13,9 @@ _PRIMITIVE_ARRAY_TYPES, FEATURE_BASE_NAME_BEGIN, FEATURE_BASE_NAME_END, + FEATURE_BASE_NAME_HEAD, FEATURE_BASE_NAME_SOFA, + FEATURE_BASE_NAME_TAIL, TYPE_NAME_ANNOTATION, TYPE_NAME_BOOLEAN, TYPE_NAME_BOOLEAN_ARRAY, @@ -21,6 +23,7 @@ TYPE_NAME_BYTE_ARRAY, TYPE_NAME_DOUBLE, TYPE_NAME_DOUBLE_ARRAY, + TYPE_NAME_EMPTY_FS_LIST, TYPE_NAME_FLOAT, TYPE_NAME_FLOAT_ARRAY, TYPE_NAME_FS_ARRAY, @@ -29,6 +32,7 @@ TYPE_NAME_INTEGER_ARRAY, TYPE_NAME_LONG, TYPE_NAME_LONG_ARRAY, + TYPE_NAME_NON_EMPTY_FS_LIST, TYPE_NAME_SHORT, TYPE_NAME_SHORT_ARRAY, TYPE_NAME_SOFA, From 7ce65861140aa0471c41ea978284af7fccf2b42b Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Sun, 12 Dec 2021 20:48:06 +0100 Subject: [PATCH 5/5] #238 - Error parsing FSList in CTAKES XMi - Fixed test - Changed formatting of floats to the same way that Java does it - Handle inlined lists during (de)serilaization - Added new methods to check if a type is a list to the type system/type --- cassis/cas.py | 23 +++++--- cassis/typesystem.py | 65 ++++++++++++++++++++++- cassis/xmi.py | 124 +++++++++++++++++++++++++++++++++++++++---- 3 files changed, 193 insertions(+), 19 deletions(-) diff --git a/cassis/cas.py b/cassis/cas.py index 616e451..ec42fa6 100644 --- a/cassis/cas.py +++ b/cassis/cas.py @@ -9,7 +9,8 @@ from attr import validators from sortedcontainers import SortedKeyList -from cassis.typesystem import TYPE_NAME_SOFA, FeatureStructure, TypeCheckError, TypeSystem +from cassis.typesystem import TYPE_NAME_SOFA, FeatureStructure, TypeCheckError, TypeSystem, 
TYPE_NAME_FS_LIST, \ + TYPE_NAME_FS_ARRAY, FEATURE_BASE_NAME_HEAD _validator_optional_string = validators.optional(validators.instance_of(str)) @@ -590,7 +591,7 @@ def typecheck(self) -> List[TypeCheckError]: def _find_all_fs( self, generate_missing_ids: bool = True, - include_inlinable_arrays: bool = False, + include_inlinable_arrays_and_lists: bool = False, seeds: Iterable = None, ) -> Iterable[FeatureStructure]: """This function traverses the whole CAS in order to find all directly and indirectly referenced @@ -656,21 +657,29 @@ def _find_all_fs( continue if ( - not include_inlinable_arrays + not include_inlinable_arrays_and_lists and not feature.multipleReferencesAllowed - and ts.is_array(feature.rangeType) + and (ts.is_array(feature.rangeType) or ts.is_list(feature.rangeType)) ): - # For inlined FSArrays, we still need to scan their members - if feature.rangeType.name == "uima.cas.FSArray" and feature_value.elements: + # For inlined FSArrays / FSList, we still need to scan their members + if feature.rangeType.name == TYPE_NAME_FS_ARRAY and feature_value.elements: for ref in feature_value.elements: if not ref or ref.xmiID in all_fs: continue openlist.append(ref) + elif feature.rangeType.name == TYPE_NAME_FS_LIST and hasattr(feature_value, FEATURE_BASE_NAME_HEAD): + v = feature_value + while hasattr(v, FEATURE_BASE_NAME_HEAD): + if not v.head or v.head.xmiID in all_fs: + continue + openlist.append(v.head) + v = v.tail + # For primitive arrays / lists, we do not need to handle the elements continue if not hasattr(feature_value, "xmiID"): raise AttributeError( - f"Feature [{feature_name}] should point to a [{feature.rangeType.name}] but the feature value is a [{type(feature_value)}] with the value [{feature_value}]" + f"Feature [{feature.domainType.name}:{feature_name}] should point to a [{feature.rangeType.name}] but the feature value is a [{type(feature_value)}] with the value [{feature_value}]" ) if feature_value.xmiID in all_fs: diff --git 
a/cassis/typesystem.py b/cassis/typesystem.py index a581858..2957a0d 100644 --- a/cassis/typesystem.py +++ b/cassis/typesystem.py @@ -34,10 +34,16 @@ TYPE_NAME_NON_EMPTY_FS_LIST = UIMA_CAS_PREFIX + "NonEmptyFSList" TYPE_NAME_INTEGER_ARRAY = UIMA_CAS_PREFIX + "IntegerArray" TYPE_NAME_INTEGER_LIST = UIMA_CAS_PREFIX + "IntegerList" +TYPE_NAME_EMPTY_INTEGER_LIST = UIMA_CAS_PREFIX + "EmptyIntegerList" +TYPE_NAME_NON_EMPTY_INTEGER_LIST = UIMA_CAS_PREFIX + "NonEmptyIntegerList" TYPE_NAME_FLOAT_ARRAY = UIMA_CAS_PREFIX + "FloatArray" TYPE_NAME_FLOAT_LIST = UIMA_CAS_PREFIX + "FloatList" +TYPE_NAME_EMPTY_FLOAT_LIST = UIMA_CAS_PREFIX + "EmptyFloatList" +TYPE_NAME_NON_EMPTY_FLOAT_LIST = UIMA_CAS_PREFIX + "NonEmptyFloatList" TYPE_NAME_STRING_ARRAY = UIMA_CAS_PREFIX + "StringArray" TYPE_NAME_STRING_LIST = UIMA_CAS_PREFIX + "StringList" +TYPE_NAME_EMPTY_STRING_LIST = UIMA_CAS_PREFIX + "EmptyStringList" +TYPE_NAME_NON_EMPTY_STRING_LIST = UIMA_CAS_PREFIX + "NonEmptyStringList" TYPE_NAME_BOOLEAN_ARRAY = UIMA_CAS_PREFIX + "BooleanArray" TYPE_NAME_BYTE_ARRAY = UIMA_CAS_PREFIX + "ByteArray" TYPE_NAME_SHORT_ARRAY = UIMA_CAS_PREFIX + "ShortArray" @@ -174,10 +180,13 @@ "uima.cas.StringArray", } +_PRIMITIVE_LIST_TYPES = {TYPE_NAME_INTEGER_LIST, TYPE_NAME_FLOAT_LIST, TYPE_NAME_STRING_LIST} + _INHERITANCE_FINAL_TYPES = _PRIMITIVE_ARRAY_TYPES -_ARRAY_TYPES = _PRIMITIVE_ARRAY_TYPES | {"uima.cas.FSArray"} +_ARRAY_TYPES = _PRIMITIVE_ARRAY_TYPES | {TYPE_NAME_FS_ARRAY} +_LIST_TYPES = _PRIMITIVE_LIST_TYPES | {TYPE_NAME_FS_LIST} def _string_to_valid_classname(name: str): return re.sub("[^a-zA-Z0-9_]", "_", name) @@ -253,6 +262,23 @@ def is_primitive_array(type_: Union[str, "Type"]) -> bool: return type_name in _PRIMITIVE_ARRAY_TYPES +def is_primitive_list(type_: Union[str, "Type"]) -> bool: + """Checks if the type identified by `type` is a primitive list, e.g. list of primitives. 
+ + Args: + type_: Type to query for (`Type` or name as string) + Returns: + Returns `True` if the type identified by `type` is a primitive list type, else `False` + """ + type_name = type_ if isinstance(type_, str) else type_.name + + if type_name == TOP_TYPE_NAME: + return False + + # Only the exact primitive list type names are matched; the inheritance hierarchy is not checked + return type_name in _PRIMITIVE_LIST_TYPES + + def is_array(type_: Union[str, "Type"]) -> bool: """Checks if the type identified by `type` is an array. @@ -270,6 +296,23 @@ def is_array(type_: Union[str, "Type"]) -> bool: return type_name in _ARRAY_TYPES +def is_list(type_: Union[str, "Type"]) -> bool: + """Checks if the type identified by `type` is a list. + + Args: + type_: Type to query for (`Type` or name as string) + Returns: + Returns `True` if the type identified by `type` is a list type, else `False` + """ + type_name = type_ if isinstance(type_, str) else type_.name + + if type_name == TOP_TYPE_NAME: + return False + + # Lists are inheritance-final, so we do not need to check the inheritance hierarchy + return type_name in _LIST_TYPES + + @attr.s class TypeCheckError(Exception): xmiID: int = attr.ib() # xmiID of the feature structure with type error @@ -819,6 +862,16 @@ def is_primitive_array(self, type_: Union[str, Type]) -> bool: """ return is_primitive_array(type_) + def is_primitive_list(self, type_: Union[str, Type]) -> bool: + """Checks if the type identified by `type` is a primitive list, e.g. list of primitives. + + Args: + type_: Type to query for (`Type` or name as string) + Returns: + Returns `True` if the type identified by `type` is a primitive list type, else `False` + """ + return is_primitive_list(type_) + def is_array(self, type_: Union[str, Type]) -> bool: """Checks if the type identified by `type` is an array. 
@@ -829,6 +882,16 @@ def is_array(self, type_: Union[str, Type]) -> bool: """ return is_array(type_) + def is_list(self, type_: Union[str, Type]) -> bool: + """Checks if the type identified by `type` is a list. + + Args: + type_: Type to query for (`Type` or name as string) + Returns: + Returns `True` if the type identified by `type` is a list type, else `False` + """ + return is_list(type_) + def subsumes(self, parent: Union[str, Type], child: Union[str, Type]) -> bool: """Determines if the type `child` is a child of `parent`. diff --git a/cassis/xmi.py b/cassis/xmi.py index 7b33013..e2a41fd 100644 --- a/cassis/xmi.py +++ b/cassis/xmi.py @@ -23,16 +23,22 @@ TYPE_NAME_BYTE_ARRAY, TYPE_NAME_DOUBLE, TYPE_NAME_DOUBLE_ARRAY, + TYPE_NAME_EMPTY_FLOAT_LIST, TYPE_NAME_EMPTY_FS_LIST, + TYPE_NAME_EMPTY_INTEGER_LIST, TYPE_NAME_FLOAT, TYPE_NAME_FLOAT_ARRAY, + TYPE_NAME_FLOAT_LIST, TYPE_NAME_FS_ARRAY, TYPE_NAME_FS_LIST, TYPE_NAME_INTEGER, TYPE_NAME_INTEGER_ARRAY, + TYPE_NAME_INTEGER_LIST, TYPE_NAME_LONG, TYPE_NAME_LONG_ARRAY, + TYPE_NAME_NON_EMPTY_FLOAT_LIST, TYPE_NAME_NON_EMPTY_FS_LIST, + TYPE_NAME_NON_EMPTY_INTEGER_LIST, TYPE_NAME_SHORT, TYPE_NAME_SHORT_ARRAY, TYPE_NAME_SOFA, @@ -41,7 +47,8 @@ FeatureStructure, Type, TypeNotFoundError, - TypeSystem, + TypeSystem, TYPE_NAME_STRING_LIST, TYPE_NAME_EMPTY_STRING_LIST, TYPE_NAME_NON_EMPTY_STRING_LIST, + _PRIMITIVE_LIST_TYPES, _LIST_TYPES, ) NAN_VALUE = "NaN" @@ -195,8 +202,6 @@ def deserialize(self, source: Union[IO, str], typesystem: TypeSystem, lenient: b self._clear_elem(elem) # Post-process feature values - StringArray = typesystem.get_type("uima.cas.StringArray") - referenced_fs = set() for xmi_id, fs in feature_structures.items(): t = typesystem.get_type(fs.type.name) @@ -226,6 +231,13 @@ def deserialize(self, source: Union[IO, str], typesystem: TypeSystem, lenient: b if isinstance(value, str): FSType = feature.rangeType fs[feature_name] = FSType(elements=self._parse_primitive_array(feature.rangeType, value)) + elif 
typesystem.is_primitive_list(feature.rangeType) and not feature.multipleReferencesAllowed: + # Array feature rendered inline (multipleReferencesAllowed = False|None) + # We also end up here for array features that were rendered as child elements. No need to parse + # them again, so we check if the value is still a string (i.e. attribute value) and only then + # process it + if isinstance(value, str): + fs[feature_name] = self._parse_primitive_list(feature.rangeType, value) else: # Resolve references here if value is None: @@ -240,16 +252,23 @@ def deserialize(self, source: Union[IO, str], typesystem: TypeSystem, lenient: b target_id = int(ref) target = feature_structures[target_id] targets.append(target) - referenced_fs.add(target_id) + if feature.rangeType.name == TYPE_NAME_FS_ARRAY: # Wrap inline array into the appropriate array object ArrayType = typesystem.get_type(TYPE_NAME_FS_ARRAY) targets = ArrayType(elements=targets) + fs[feature_name] = targets + elif feature.rangeType.name == TYPE_NAME_FS_LIST and not feature.multipleReferencesAllowed: + # Array feature rendered inline (multipleReferencesAllowed = False|None) + # We also end up here for array features that were rendered as child elements. No need to parse + # them again, so we check if the value is still a string (i.e. attribute value) and only then + # process it + if isinstance(value, list) or isinstance(value, str): + fs[feature_name] = self._parse_fs_list(feature_structures, feature.rangeType, value) else: target_id = int(value) fs[feature_name] = feature_structures[target_id] - referenced_fs.add(target_id) cas = Cas(typesystem=typesystem, lenient=lenient) for sofa in sofas.values(): @@ -340,25 +359,73 @@ def _parse_feature_structure(self, typesystem: TypeSystem, elem, children: Dict[ attributes["type_"] = attributes.pop("type") # Arrays which were represented as nested elements in the XMI have so far have only been parsed into a Python - # arrays. 
Now we convert them to proper UIMA arrays + # arrays. Now we convert them to proper UIMA arrays/lists if not typesystem.is_primitive_array(type_name): for feature_name, feature_value in children.items(): feature = AnnotationType.get_feature(feature_name) if typesystem.is_primitive_array(feature.rangeType): ArrayType = feature.rangeType attributes[feature_name] = ArrayType(elements=attributes[feature_name]) + if typesystem.is_primitive_list(feature.rangeType): + attributes[feature_name] = self._parse_primitive_list(feature.rangeType, attributes[feature_name]) self._max_xmi_id = max(attributes["xmiID"], self._max_xmi_id) return AnnotationType(**attributes) - def _parse_primitive_array(self, type_: Type, value: str) -> List: + def _parse_primitive_list(self, type_: Type, value: Union[str, List[str]]): + if value is None: + return None + + # Convert the inline array into the linked NonEmptyList/EmptyList instances + if type_.name == TYPE_NAME_INTEGER_LIST: + EmptyList = type_.typesystem.get_type(TYPE_NAME_EMPTY_INTEGER_LIST) + NonEmptyList = type_.typesystem.get_type(TYPE_NAME_NON_EMPTY_INTEGER_LIST) + conv = int + elif type_.name == TYPE_NAME_FLOAT_LIST: + EmptyList = type_.typesystem.get_type(TYPE_NAME_EMPTY_FLOAT_LIST) + NonEmptyList = type_.typesystem.get_type(TYPE_NAME_NON_EMPTY_FLOAT_LIST) + conv = float + elif type_.name == TYPE_NAME_STRING_LIST: + EmptyList = type_.typesystem.get_type(TYPE_NAME_EMPTY_STRING_LIST) + NonEmptyList = type_.typesystem.get_type(TYPE_NAME_NON_EMPTY_STRING_LIST) + conv = str + else: + raise ValueError(f"Unexpected primitive list type: {type_.name}") + + elements = value.split() if isinstance(value, str) else value + + head = EmptyList() + for e in reversed(elements): + tail = head + head = NonEmptyList() + head.set(FEATURE_BASE_NAME_HEAD, conv(e)) + head.set(FEATURE_BASE_NAME_TAIL, tail) + return head + + def _parse_fs_list(self, feature_structures, type_: Type, value: str): + # Convert the inline array into the linked 
NonEmptyFSList/EmptyFSList instances + NonEmptyFSList = type_.typesystem.get_type(TYPE_NAME_NON_EMPTY_FS_LIST) + EmptyFSList = type_.typesystem.get_type(TYPE_NAME_EMPTY_FS_LIST) + + elements = value.split() if isinstance(value, str) else value + + head = EmptyFSList() + for e in reversed(elements): + tail = head + head = NonEmptyFSList() + head.set(FEATURE_BASE_NAME_HEAD, feature_structures[int(e)]) + head.set(FEATURE_BASE_NAME_TAIL, tail) + return head + + def _parse_primitive_array(self, type_: Type, value: Union[str, List[str]]) -> List: """Primitive collections are serialized as white space separated primitive values""" if value is None: return None # TODO: Use type name global variable here instead of hardcoded string literal - elements = value.split(" ") + elements = value.split() if isinstance(value, str) else value + type_name = type_.name if type_name in [TYPE_NAME_FLOAT_ARRAY, TYPE_NAME_DOUBLE_ARRAY]: return [float(e) for e in elements] if value else [] @@ -535,12 +602,23 @@ def _serialize_feature_structure(self, cas: Cas, root: etree.Element, fs: Featur for e in value.elements: child = etree.SubElement(elem, feature_name) child.text = e + elif ts.is_instance_of(feature.rangeType, TYPE_NAME_STRING_LIST) and not feature.multipleReferencesAllowed: + if value is not None: # Compare to none to not skip if elements is empty! + for e in self._collect_list_elements(feature.rangeType.name, value): + child = etree.SubElement(elem, feature_name) + child.text = e elif ts.is_primitive_array(feature.rangeType) and not feature.multipleReferencesAllowed: if value.elements is not None: # Compare to none to not skip if elements is empty! elem.attrib[feature_name] = self._serialize_primitive_array(feature.rangeType.name, value.elements) + elif ts.is_primitive_list(feature.rangeType) and not feature.multipleReferencesAllowed: + if value is not None: # Compare to none to not skip if elements is empty! 
+ elem.attrib[feature_name] = self._serialize_primitive_list(feature.rangeType.name, value) elif feature.rangeType.name == TYPE_NAME_FS_ARRAY and not feature.multipleReferencesAllowed: if value.elements is not None: # Compare to none to not skip if elements is empty! elem.attrib[feature_name] = " ".join(str(e.xmiID) for e in value.elements) + elif feature.rangeType.name == TYPE_NAME_FS_LIST and not feature.multipleReferencesAllowed: + if value is not None: # Compare to none to not skip if elements is empty! + elem.attrib[feature_name] = " ".join(str(e.xmiID) for e in self._collect_list_elements(feature.rangeType.name, value)) elif feature_name == FEATURE_BASE_NAME_SOFA: elem.attrib[feature_name] = str(value.xmiID) elif feature.rangeType.name == TYPE_NAME_BOOLEAN: @@ -560,8 +638,10 @@ def _serialize_sofa(self, root: etree.Element, sofa: Sofa): elem.attrib["{http://www.omg.org/XMI}id"] = str(sofa.xmiID) elem.attrib["sofaNum"] = str(sofa.sofaNum) elem.attrib["sofaID"] = str(sofa.sofaID) - elem.attrib["mimeType"] = str(sofa.mimeType) - elem.attrib["sofaString"] = str(sofa.sofaString) + if sofa.mimeType is not None: + elem.attrib["mimeType"] = str(sofa.mimeType) + if sofa.sofaString is not None: + elem.attrib["sofaString"] = str(sofa.sofaString) def _serialize_view(self, root: etree.Element, view: View): name = etree.QName(self._nsmap["cas"], "View") @@ -570,6 +650,26 @@ def _serialize_view(self, root: etree.Element, view: View): elem.attrib["sofa"] = str(view.sofa.xmiID) elem.attrib["members"] = " ".join(sorted((str(x.xmiID) for x in view.get_all_annotations()), key=int)) + def _collect_list_elements(self, type_name: str, value) -> List[str]: + if type_name not in _LIST_TYPES: + raise ValueError(f"Not a primitive list: {type_name}") + + elements = [] + current = value + while hasattr(current, "head"): + elements.append(current.head) + current = current.tail + return elements + + def _serialize_primitive_list(self, type_name: str, value) -> str: + elements = [] + for e 
in self._collect_list_elements(type_name, value): + if isinstance(e, float): + elements.append(self._serialize_float_value(e)) + else: + elements.append(str(e)) + return " ".join(elements) + def _serialize_primitive_array(self, type_name: str, values: List) -> str: """Primitive collections are serialized as white space seperated primitive values""" @@ -594,4 +694,6 @@ def _serialize_float_value(self, value) -> Union[float, str]: return POSITIVE_INFINITE_VALUE else: return NEGATIVE_INFINITE_VALUE - return str(value) + + # Formatting in the same way that Java does it, with a capital 'E' and without a '+' if the exponent is positive + return str(value).upper().replace("E+", "E")