Commit e607dd5

Merge branch 'feature/168-Experimental-JSON-CAS-support' of https://github.com/dkpro/dkpro-cassis into feature/168-Experimental-JSON-CAS-support

* 'feature/168-Experimental-JSON-CAS-support' of https://github.com/dkpro/dkpro-cassis:
  No issue. Formatting.
  #215 - Ability to exclude types from cas_to_comparable_text
  #212 - Allow loading/saving XMI/typesystems from/to Path
  #211 - Serializing an FSArray without any elements breaks
  #212 - Allow loading/saving XMI/typesystems from/to Path
  #168 - Experimental JSON CAS support
  #168 - Experimental JSON CAS support
  #168 - Experimental JSON CAS support
  #209 - Parsing an array that was serialized using multipleReferencesAllowed=true fails
  - Do not execute performance "tests" when running make test
  - Update JSON reference data with new data from UIMA Java SDK, including CAS examples using emojis and other Unicode characters
  - Enabled character offset conversion on import/export in JSON (de)serializer
  #209 - Parsing an array that was serialized using multipleReferencesAllowed=true fails
reckart committed Sep 27, 2021
2 parents 053bbf7 + a77935d commit e607dd5
Showing 12 changed files with 243 additions and 88 deletions.
6 changes: 3 additions & 3 deletions cassis/json.py
@@ -1,7 +1,7 @@
import base64
import json
from collections import OrderedDict
from io import TextIOWrapper
from io import TextIOBase, TextIOWrapper

from cassis.cas import NAME_DEFAULT_SOFA, Cas, IdGenerator, Sofa, View
from cassis.typesystem import *
@@ -268,7 +268,7 @@ def serialize(
json_fs = self._serialize_feature_structure(fs)
feature_structures.append(json_fs)

if isinstance(sink, BytesIO):
if sink and not isinstance(sink, TextIOBase):
sink = TextIOWrapper(sink, encoding="utf-8", write_through=True)

if sink:
@@ -316,7 +316,7 @@ def _serialize_feature(self, json_type, feature: Feature):
json_feature[MULTIPLE_REFERENCES_ALLOWED_FIELD] = feature.multipleReferencesAllowed

if feature.elementType is not None:
json_feature[ELEMENT_TYPE_FIELD] = self._to_external_type_name(feature.elementType)
json_feature[ELEMENT_TYPE_FIELD] = self._to_external_type_name(feature.elementType.name)

return json_feature

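A minimal usage sketch for the experimental JSON CAS support (#168), based on the to_json() call that the performance tests further down also use; the sofa text is an illustrative assumption.

    from cassis import Cas

    cas = Cas()                        # empty CAS with a default type system
    cas.sofa_string = "Hello JSON CAS"

    json_str = cas.to_json()           # no sink given, so the JSON is returned as a string
    print(json_str[:120])
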
6 changes: 5 additions & 1 deletion cassis/typesystem.py
@@ -971,12 +971,13 @@ def _add_document_annotation_type(self):
# Deserializing


def load_typesystem(source: Union[IO, str]) -> TypeSystem:
def load_typesystem(source: Union[IO, str, Path]) -> TypeSystem:
"""Loads a type system from a XML source.
Args:
source: The XML source. If `source` is a string, then it is assumed to be an XML string.
If `source` is a file-like object, then the data is read from it.
If `source` is a `Path`, then load the file at the given location.
Returns:
The deserialized type system
@@ -985,6 +986,9 @@ def load_typesystem(source: Union[IO, str]) -> TypeSystem:
deserializer = TypeSystemDeserializer()
if isinstance(source, str):
return deserializer.deserialize(BytesIO(source.encode("utf-8")))
elif isinstance(source, Path):
with source.open("rb") as src:
return deserializer.deserialize(src)
else:
return deserializer.deserialize(source)

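A short sketch of the new Path support in load_typesystem (#212). The fixture path is taken from the test files touched by this commit; the get_types() listing assumes the public TypeSystem API.

    from pathlib import Path
    from cassis import load_typesystem

    ts = load_typesystem(Path("tests/test_files/typesystems/typesystem_with_collections.xml"))
    print([t.name for t in ts.get_types()])   # user-defined types from the loaded type system
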
10 changes: 7 additions & 3 deletions cassis/util.py
@@ -2,12 +2,12 @@
from collections import defaultdict
from functools import cmp_to_key
from io import IOBase, StringIO
from typing import Dict, Iterable
from typing import Dict, Iterable, Set

import attr

from cassis import Cas
from cassis.typesystem import FEATURE_BASE_NAME_SOFA, TYPE_NAME_ANNOTATION, FeatureStructure, Type
from cassis.typesystem import FEATURE_BASE_NAME_SOFA, TYPE_NAME_ANNOTATION, FeatureStructure, Type, TypeSystem, is_array

_EXCLUDED_FEATURES = {FEATURE_BASE_NAME_SOFA}
_NULL_VALUE = "<NULL>"
@@ -19,9 +19,10 @@ def cas_to_comparable_text(
seeds: Iterable[FeatureStructure] = None,
mark_indexed: bool = True,
covered_text: bool = True,
exclude_types: Set[str] = None,
) -> [str, None]:
indexed_feature_structures = _get_indexed_feature_structures(cas)
all_feature_structures_by_type = _group_feature_structures_by_type(cas._find_all_fs(seeds))
all_feature_structures_by_type = _group_feature_structures_by_type(cas._find_all_fs(seeds=seeds))
types_sorted = sorted(all_feature_structures_by_type.keys())
fs_id_to_anchor = _generate_anchors(
cas, types_sorted, all_feature_structures_by_type, indexed_feature_structures, mark_indexed=mark_indexed
@@ -32,6 +33,9 @@

csv_writer = csv.writer(out, dialect=csv.unix_dialect)
for t in types_sorted:
if exclude_types and t in exclude_types:
continue

type_ = cas.typesystem.get_type(t)

csv_writer.writerow([type_.name])
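A hedged sketch of the new exclude_types parameter (#215); the excluded type name is an arbitrary example, not something this commit prescribes.

    from cassis import Cas
    from cassis.util import cas_to_comparable_text

    cas = Cas()
    cas.sofa_string = "Example document"

    # Render a diff-friendly textual view of the CAS, skipping rows for the excluded type.
    text = cas_to_comparable_text(cas, exclude_types={"uima.tcas.DocumentAnnotation"})
    print(text)
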
46 changes: 23 additions & 23 deletions cassis/xmi.py
@@ -1,6 +1,7 @@
import warnings
from collections import defaultdict
from io import BytesIO
from pathlib import Path
from typing import IO, Dict, Iterable, List, Set, Union

import attr
@@ -47,13 +48,14 @@ class ProtoView:


def load_cas_from_xmi(
source: Union[IO, str], typesystem: TypeSystem = None, lenient: bool = False, trusted: bool = False
source: Union[IO, Path, str], typesystem: TypeSystem = None, lenient: bool = False, trusted: bool = False
) -> Cas:
"""Loads a CAS from a XMI source.
Args:
source: The XML source. If `source` is a string, then it is assumed to be an XML string.
If `source` is a file-like object, then the data is read from it.
If `source` is a `Path`, then load the file at the given location.
typesystem: The type system that belongs to this CAS. If `None`, an empty type system is provided.
lenient: If `True`, unknown Types will be ignored. If `False`, unknown Types will cause an exception.
The default is `False`.
@@ -71,6 +73,9 @@ def load_cas_from_xmi(
return deserializer.deserialize(
BytesIO(source.encode("utf-8")), typesystem=typesystem, lenient=lenient, trusted=trusted
)
if isinstance(source, Path):
with source.open("rb") as src:
return deserializer.deserialize(src, typesystem=typesystem, lenient=lenient, trusted=trusted)
else:
return deserializer.deserialize(source, typesystem=typesystem, lenient=lenient, trusted=trusted)
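Usage sketch for loading a CAS from a Path (#212); the file names are assumptions for illustration only.

    from pathlib import Path
    from cassis import load_cas_from_xmi, load_typesystem

    ts = load_typesystem(Path("typesystem.xml"))                  # hypothetical files
    cas = load_cas_from_xmi(Path("document.xmi"), typesystem=ts)
    print(cas.sofa_string)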

@@ -203,7 +208,7 @@ def deserialize(self, source: Union[IO, str], typesystem: TypeSystem, lenient: b
elif typesystem.is_primitive_array(fs.type) and feature_name == "elements":
# Separately rendered arrays (typically used with multipleReferencesAllowed = True)
fs[feature_name] = self._parse_primitive_array(fs.type, value)
elif typesystem.is_primitive_array(feature.rangeType):
elif typesystem.is_primitive_array(feature.rangeType) and not feature.multipleReferencesAllowed:
# Array feature rendered inline (multipleReferencesAllowed = False|None)
# We also end up here for array features that were rendered as child elements. No need to parse
# them again, so we check if the value is still a string (i.e. attribute value) and only then
@@ -337,19 +342,19 @@ def _parse_feature_structure(self, typesystem: TypeSystem, elem, children: Dict[
return AnnotationType(**attributes)

def _parse_primitive_array(self, type_: Type, value: str) -> List:
"""Primitive collections are serialized as white space seperated primitive values"""
"""Primitive collections are serialized as white space separated primitive values"""

# TODO: Use type name global variable here instead of hardcoded string literal
elements = value.split(" ")
type_name = type_.name
if type_name in [TYPE_NAME_FLOAT_ARRAY, TYPE_NAME_DOUBLE_ARRAY]:
return [float(e) for e in elements]
return [float(e) for e in elements] if value else []
elif type_name in [TYPE_NAME_INTEGER_ARRAY, TYPE_NAME_SHORT_ARRAY, TYPE_NAME_LONG_ARRAY]:
return [int(e) for e in elements]
return [int(e) for e in elements] if value else []
elif type_name == TYPE_NAME_BOOLEAN_ARRAY:
return [self._parse_bool(e) for e in elements]
return [self._parse_bool(e) for e in elements] if value else []
elif type_name == TYPE_NAME_BYTE_ARRAY:
return list(bytearray.fromhex(value))
return list(bytearray.fromhex(value)) if value else []
else:
raise ValueError(f"Not a primitive collection type: {type_name}")
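The "if value else []" guards above are needed because splitting an empty attribute value in Python does not yield an empty list; a quick illustration:

    elements = "".split(" ")
    print(elements)    # ['']  -- a single empty string, not zero elements
    # int("") or float("") would then raise ValueError, hence the explicit check on value.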

@@ -519,22 +524,17 @@ def _serialize_feature_structure(self, cas: Cas, root: etree.Element, fs: Featur
sofa: Sofa = fs.sofa
value = sofa._offset_converter.cassis_to_uima(value)

if (
ts.is_instance_of(feature.rangeType, TYPE_NAME_STRING_ARRAY)
and not feature.multipleReferencesAllowed
and value.elements
):
for e in value.elements:
child = etree.SubElement(elem, feature_name)
child.text = e
elif ts.is_primitive_array(feature.rangeType) and not feature.multipleReferencesAllowed and value.elements:
elem.attrib[feature_name] = self._serialize_primitive_array(feature.rangeType.name, value.elements)
elif (
feature.rangeType.name == TYPE_NAME_FS_ARRAY
and not feature.multipleReferencesAllowed
and value.elements
):
elem.attrib[feature_name] = " ".join(str(e.xmiID) for e in value.elements)
if ts.is_instance_of(feature.rangeType, TYPE_NAME_STRING_ARRAY) and not feature.multipleReferencesAllowed:
if value.elements is not None: # Compare to none to not skip if elements is empty!
for e in value.elements:
child = etree.SubElement(elem, feature_name)
child.text = e
elif ts.is_primitive_array(feature.rangeType) and not feature.multipleReferencesAllowed:
if value.elements is not None: # Compare to none to not skip if elements is empty!
elem.attrib[feature_name] = self._serialize_primitive_array(feature.rangeType.name, value.elements)
elif feature.rangeType.name == TYPE_NAME_FS_ARRAY and not feature.multipleReferencesAllowed:
if value.elements is not None: # Compare to none to not skip if elements is empty!
elem.attrib[feature_name] = " ".join(str(e.xmiID) for e in value.elements)
elif feature_name == FEATURE_BASE_NAME_SOFA:
elem.attrib[feature_name] = str(value.xmiID)
elif feature.rangeType.name == TYPE_NAME_BOOLEAN:
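The switch from truthiness checks to explicit "is not None" comparisons above matters because an empty array is falsy in Python and would previously have been skipped during serialization (#211); a minimal illustration:

    elements = []
    print(bool(elements))          # False -- "and value.elements" used to skip empty arrays
    print(elements is not None)    # True  -- the new check still serializes them
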
28 changes: 28 additions & 0 deletions tests/fixtures.py
@@ -104,6 +104,20 @@ def cas_with_empty_array_references_xmi(cas_with_empty_array_references_path):
return f.read()


# CAS with multipleReferencesAllowed=true on string array


@pytest.fixture
def cas_with_multiple_references_allowed_string_array_path():
return os.path.join(FIXTURE_DIR, "xmi", "cas_with_multiple_references_allowed_string_array.xmi")


@pytest.fixture
def cas_with_multiple_references_allowed_string_array_xmi(cas_with_multiple_references_allowed_string_array_path):
with open(cas_with_multiple_references_allowed_string_array_path, "r") as f:
return f.read()


# CAS with reserved names


@@ -273,6 +287,20 @@ def typesystem_with_collections_xml(typesystem_with_collections_path):
return f.read()


# CAS with multipleReferencesAllowed=true on string array


@pytest.fixture
def typesystem_with_multiple_references_allowed_path():
return os.path.join(FIXTURE_DIR, "typesystems", "typesystem_with_multiple_references_allowed.xml")


@pytest.fixture
def typesystem_with_multiple_references_allowed_xml(typesystem_with_multiple_references_allowed_path):
with open(typesystem_with_multiple_references_allowed_path, "r") as f:
return f.read()


# DKPro types


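A hypothetical test consuming the new fixtures above; it is not part of this commit and the assertion is purely illustrative.

    from cassis import load_cas_from_xmi, load_typesystem


    def test_multiple_references_allowed_string_array_roundtrip(
        cas_with_multiple_references_allowed_string_array_xmi,
        typesystem_with_multiple_references_allowed_xml,
    ):
        ts = load_typesystem(typesystem_with_multiple_references_allowed_xml)
        cas = load_cas_from_xmi(cas_with_multiple_references_allowed_string_array_xmi, typesystem=ts)

        # Round-tripping should now succeed for separately rendered arrays (#209).
        assert cas.to_xmi() is not None
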
20 changes: 16 additions & 4 deletions tests/performance.py
@@ -13,8 +13,12 @@

typesystem = generator.generate_type_system()
randomized_cas = generator.generate_cas(typesystem)

randomized_cas_xmi = randomized_cas.to_xmi()
randomized_cas_xmi_bytes = randomized_cas_xmi.encode("utf-8")

randomized_cas_json = randomized_cas.to_json()
randomized_cas_json_bytes = randomized_cas_json.encode("utf-8")


@pytest.mark.performance
@@ -24,7 +28,9 @@ def test_xmi_serialization_performance():
randomized_cas.to_xmi()
end = timer()

print(f"XMI: Serializing {iterations} CASes took {end - start} seconds")
print(
f"XMI: Serializing {iterations} CASes with {generator.size} each took {end - start} seconds ({len(randomized_cas_xmi_bytes)} bytes each)"
)


@pytest.mark.performance
@@ -34,7 +40,9 @@ def test_json_serialization_performance():
randomized_cas.to_json()
end = timer()

print(f"JSON: Serializing {iterations} CASes took {end - start} seconds")
print(
f"JSON: Serializing {iterations} CASes with {generator.size} each took {end - start} seconds ({len(randomized_cas_json_bytes)} bytes each)"
)


@pytest.mark.performance
@@ -44,7 +52,9 @@ def test_xmi_deserialization_performance():
load_cas_from_xmi(randomized_cas_xmi, typesystem)
end = timer()

print(f"XMI: Deserializing {iterations} CASes took {end - start} seconds")
print(
f"XMI: Deserializing {iterations} CASes with {generator.size} each took {end - start} seconds ({len(randomized_cas_xmi_bytes)} bytes each)"
)


@pytest.mark.performance
@@ -54,4 +64,6 @@ def test_json_deserialization_performance():
load_cas_from_json(randomized_cas_json, typesystem)
end = timer()

print(f"JSON: Deserializing {iterations} CASes took {end - start} seconds")
print(
f"JSON: Deserializing {iterations} CASes with {generator.size} each took {end - start} seconds ({len(randomized_cas_json_bytes)} bytes each)"
)
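
The @pytest.mark.performance marker above is what lets the regular test run skip these benchmarks; assuming the marker is registered in the project's pytest configuration, they can be selected or deselected from the command line, for example with pytest -m performance tests/performance.py to run them and pytest -m "not performance" to exclude them.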
39 changes: 37 additions & 2 deletions tests/test_files/typesystems/typesystem_with_collections.xml
@@ -7,15 +7,50 @@
<supertypeName>uima.tcas.Annotation</supertypeName>
<features>
<featureDescription>
<name>collection1</name>
<name>strings</name>
<description/>
<rangeTypeName>uima.cas.StringArray</rangeTypeName>
</featureDescription>
<featureDescription>
<name>collection2</name>
<name>shorts</name>
<description/>
<rangeTypeName>uima.cas.ShortArray</rangeTypeName>
</featureDescription>
<featureDescription>
<name>integers</name>
<description/>
<rangeTypeName>uima.cas.IntegerArray</rangeTypeName>
</featureDescription>
<featureDescription>
<name>longs</name>
<description/>
<rangeTypeName>uima.cas.LongArray</rangeTypeName>
</featureDescription>
<featureDescription>
<name>booleans</name>
<description/>
<rangeTypeName>uima.cas.BooleanArray</rangeTypeName>
</featureDescription>
<featureDescription>
<name>bytes</name>
<description/>
<rangeTypeName>uima.cas.ByteArray</rangeTypeName>
</featureDescription>
<featureDescription>
<name>floats</name>
<description/>
<rangeTypeName>uima.cas.FloatArray</rangeTypeName>
</featureDescription>
<featureDescription>
<name>doubles</name>
<description/>
<rangeTypeName>uima.cas.DoubleArray</rangeTypeName>
</featureDescription>
<featureDescription>
<name>fses</name>
<description/>
<rangeTypeName>uima.cas.FSArray</rangeTypeName>
</featureDescription>
</features>
</typeDescription>
</types>
19 changes: 19 additions & 0 deletions tests/test_files/typesystems/typesystem_with_multiple_references_allowed.xml
@@ -0,0 +1,19 @@
<?xml version="1.0" encoding="UTF-8"?>
<typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
<types>
<typeDescription>
<name>test.type</name>
<description/>
<supertypeName>uima.tcas.Annotation</supertypeName>
<features>
<featureDescription>
<name>target</name>
<description/>
<rangeTypeName>uima.cas.StringArray</rangeTypeName>
<elementType>uima.cas.String</elementType>
<multipleReferencesAllowed>true</multipleReferencesAllowed>
</featureDescription>
</features>
</typeDescription>
</types>
</typeSystemDescription>
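
A small sanity check on the new fixture above; the get_feature() lookup is an assumption about the public Type API, and the path mirrors the fixture location registered in tests/fixtures.py.

    from pathlib import Path
    from cassis import load_typesystem

    ts = load_typesystem(Path("tests/test_files/typesystems/typesystem_with_multiple_references_allowed.xml"))
    feature = ts.get_type("test.type").get_feature("target")
    print(feature.multipleReferencesAllowed)   # True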