From 4aed3f21abb04e9ba838343091621da55c0d3c4f Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Fri, 13 Aug 2021 12:30:24 +0200 Subject: [PATCH 01/22] #168 - Experimental JSON CAS support - Added very basic JSON CAS support - No support for type systems yet - No support for lenient loading - Remove Cas:NULL via type name instead of simply purging the FS with ID 0 (which may not be a Cas:NULL fs) - Added various constants for type names and feature names in the Cas class (analogous to the Apache UIMA Java SDK impl) - WIP --- .gitignore | 2 + cassis/__init__.py | 2 + cassis/cas.py | 99 ++++++- cassis/json.py | 273 ++++++++++++++++++ cassis/typesystem.py | 10 +- tests/test_files/json/README.md | 5 + .../ser-ref/casWithSofaDataArray/data.json | 21 ++ .../casWithSofaDataArray/debug-typesystem.xml | 17 ++ .../ser-ref/casWithSofaDataArray/debug.xmi | 5 + .../ser-ref/casWithSofaDataURI/data.json | 17 ++ .../casWithSofaDataURI/debug-typesystem.xml | 17 ++ .../ser-ref/casWithSofaDataURI/debug.xmi | 4 + .../fs_as_array/ser-ref/casWithText/data.json | 24 ++ .../ser-ref/casWithText/debug-typesystem.xml | 17 ++ .../fs_as_array/ser-ref/casWithText/debug.xmi | 6 + .../casWithTextAndAnnotation/data.json | 30 ++ .../debug-typesystem.xml | 17 ++ .../casWithTextAndAnnotation/debug.xmi | 7 + tests/test_json.py | 26 ++ tests/util.py | 34 +++ 20 files changed, 628 insertions(+), 5 deletions(-) create mode 100644 cassis/json.py create mode 100644 tests/test_files/json/README.md create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/debug-typesystem.xml create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/debug.xmi create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/debug-typesystem.xml create mode 100644 
tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/debug.xmi create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithText/debug-typesystem.xml create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithText/debug.xmi create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/data.json create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/debug-typesystem.xml create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/debug.xmi create mode 100644 tests/test_json.py diff --git a/.gitignore b/.gitignore index 0d13f2a..e2de877 100644 --- a/.gitignore +++ b/.gitignore @@ -221,3 +221,5 @@ expected.xml difference.diff xml_issue.py +actual.json +expected.json diff --git a/cassis/__init__.py b/cassis/__init__.py index 651988c..4d90bba 100644 --- a/cassis/__init__.py +++ b/cassis/__init__.py @@ -1,6 +1,7 @@ """UIMA CAS processing library in Python.""" from .cas import Cas, Sofa, View +from .json import load_cas_from_json from .typesystem import TypeSystem, load_dkpro_core_typesystem, load_typesystem, merge_typesystems from .xmi import load_cas_from_xmi @@ -13,4 +14,5 @@ "load_dkpro_core_typesystem", "merge_typesystems", "load_cas_from_xmi", + "load_cas_from_json", ] diff --git a/cassis/cas.py b/cassis/cas.py index 10c06b6..d56abae 100644 --- a/cassis/cas.py +++ b/cassis/cas.py @@ -77,6 +77,9 @@ def cassis_to_uima(self, idx: Optional[int]) -> Optional[int]: class Sofa: """Each CAS has one or more Subject of Analysis (SofA)""" + #: str: The type + type = "uima.cas.Sofa" + #: int: The sofaNum sofaNum = attr.ib(validator=validators.instance_of(int)) @@ -95,6 +98,9 @@ class Sofa: #: str: The sofa URI, it references remote sofa data sofaURI = attr.ib(default=None, validator=_validator_optional_string) + #: str: The sofa data byte array + sofaArray = attr.ib(default=None) + #: 
OffsetConverter: Converts from UIMA UTF-16 based offsets to Unicode codepoint offsets and back _offset_converter = attr.ib(factory=OffsetConverter, eq=False, hash=False) @@ -171,6 +177,40 @@ def __init__(self, typesystem: TypeSystem): class Cas: """A CAS object is a container for text (sofa) and annotations""" + NAME_SPACE_UIMA_CAS = "uima" + TypeSystem.NAMESPACE_SEPARATOR + "cas" + UIMA_CAS_PREFIX = NAME_SPACE_UIMA_CAS + TypeSystem.NAMESPACE_SEPARATOR + TYPE_NAME_TOP = UIMA_CAS_PREFIX + "TOP" + TYPE_NAME_INTEGER = UIMA_CAS_PREFIX + "Integer" + TYPE_NAME_FLOAT = UIMA_CAS_PREFIX + "Float" + TYPE_NAME_STRING = UIMA_CAS_PREFIX + "String" + TYPE_NAME_BOOLEAN = UIMA_CAS_PREFIX + "Boolean" + TYPE_NAME_BYTE = UIMA_CAS_PREFIX + "Byte" + TYPE_NAME_SHORT = UIMA_CAS_PREFIX + "Short" + TYPE_NAME_LONG = UIMA_CAS_PREFIX + "Long" + TYPE_NAME_DOUBLE = UIMA_CAS_PREFIX + "Double" + TYPE_NAME_ARRAY_BASE = UIMA_CAS_PREFIX + "ArrayBase" + TYPE_NAME_FS_ARRAY = UIMA_CAS_PREFIX + "FSArray" + TYPE_NAME_INTEGER_ARRAY = UIMA_CAS_PREFIX + "IntegerArray" + TYPE_NAME_FLOAT_ARRAY = UIMA_CAS_PREFIX + "FloatArray" + TYPE_NAME_STRING_ARRAY = UIMA_CAS_PREFIX + "StringArray" + TYPE_NAME_BOOLEAN_ARRAY = UIMA_CAS_PREFIX + "BooleanArray" + TYPE_NAME_BYTE_ARRAY = UIMA_CAS_PREFIX + "ByteArray" + TYPE_NAME_SHORT_ARRAY = UIMA_CAS_PREFIX + "ShortArray" + TYPE_NAME_LONG_ARRAY = UIMA_CAS_PREFIX + "LongArray" + TYPE_NAME_DOUBLE_ARRAY = UIMA_CAS_PREFIX + "DoubleArray" + TYPE_NAME_FS_HASH_SET = UIMA_CAS_PREFIX + "FSHashSet" + TYPE_NAME_SOFA = UIMA_CAS_PREFIX + "Sofa" + TYPE_NAME_ANNOTATION_BASE = UIMA_CAS_PREFIX + "AnnotationBase" + + FEATURE_BASE_NAME_SOFANUM = "sofaNum" + FEATURE_BASE_NAME_SOFAID = "sofaID" + FEATURE_BASE_NAME_SOFAMIME = "mimeType" + FEATURE_BASE_NAME_SOFAURI = "sofaURI" + FEATURE_BASE_NAME_SOFASTRING = "sofaString" + FEATURE_BASE_NAME_SOFAARRAY = "sofaArray" + + NAME_DEFAULT_SOFA = "_InitialView" + def __init__(self, typesystem: TypeSystem = None, lenient: bool = False): """ Creates a CAS with 
the specified typesystem. If no typesystem is given, then the default one is used which only contains UIMA-predefined types. @@ -321,6 +361,7 @@ def get_covered_text(self, annotation: FeatureStructure) -> str: def select(self, type_name: str) -> List[FeatureStructure]: """ Finds all annotations of type `type_name`. + """Finds all annotations of type `type_name`. Args: type_name: The name of the type whose annotation instances are to be found @@ -492,14 +533,33 @@ def sofa_uri(self) -> str: @sofa_uri.setter def sofa_uri(self, value: str): - """ Sets the sofa URI to `value`. + """Sets the sofa URI to `value`. Args: - value: The new sofa MIME type. + value: The new sofa URI. """ self.get_sofa().sofaURI = value + @property + def sofa_array(self) -> str: + """The sofa byte array references a ByteArrayFS feature structure + + Returns: The sofa data byte array. + + """ + return self.get_sofa().sofaArray + + @sofa_array.setter + def sofa_array(self, value: "uima_cas_ByteArrayFS"): + """Sets the sofa byte array to the given ByteArrayFS feature structure. + + Args: + value: The new sofa byte array type. + + """ + self.get_sofa().sofaArray = value + def to_xmi(self, path: Union[str, Path, None] = None, pretty_print: bool = False) -> Optional[str]: """Creates a XMI representation of this CAS. @@ -514,8 +574,36 @@ def to_xmi(self, path: Union[str, Path, None] = None, pretty_print: bool = False """ from cassis.xmi import CasXmiSerializer - serializer = CasXmiSerializer() + return self.serialize(CasXmiSerializer(), path, pretty_print) + + def to_json(self, path: Union[str, Path, None] = None, pretty_print: bool = False) -> Optional[str]: + """Creates a JSON representation of this CAS. 
+ Args: + path: File path, if `None` is provided the result is returned as a string + pretty_print: `True` if the resulting JSON should be pretty-printed, else `False` + + + Returns: + If `path` is None, then the JSON representation of this CAS is returned as a string + + """ + from cassis.json import CasJsonSerializer + + return self.serialize(CasJsonSerializer(), path, pretty_print) + + def serialize(self, serializer, path: Union[str, Path, None] = None, pretty_print: bool = False): + """Runs this CAS through the given serializer. + + Args: + path: File path, if `None` is provided the result is returned as a string + pretty_print: `True` if the resulting data should be pretty-printed, else `False` + + + Returns: + If `path` is None, then the data representation of this CAS is returned as a string + + """ # If `path` is None, then serialize to a string and return it if path is None: sink = BytesIO() @@ -591,7 +679,10 @@ def _find_all_fs(self) -> Iterable[FeatureStructure]: openlist.append(referenced_fs) # We do not want to return cas:NULL here as we handle serializing it later - all_fs.pop(0, None) + for fs_id, fs in list(all_fs.items()): + if fs.type == "uima.cas.NULL": + all_fs.pop(fs_id) + yield from all_fs.values() def _get_next_xmi_id(self) -> int: diff --git a/cassis/json.py b/cassis/json.py new file mode 100644 index 0000000..b0bd9d3 --- /dev/null +++ b/cassis/json.py @@ -0,0 +1,273 @@ +import base64 +import json +import warnings +from collections import OrderedDict, defaultdict +from io import BytesIO, TextIOWrapper +from typing import IO, Dict, Iterable, List, Optional, Set, Union + +import attr +from lxml import etree + +from cassis.cas import Cas, IdGenerator, Sofa, View +from cassis.typesystem import FeatureStructure, TypeNotFoundError, TypeSystem + +RESERVED_FIELD_PREFIX = "%" +TYPE_FIELD = RESERVED_FIELD_PREFIX + "TYPE" +RANGE_FIELD = RESERVED_FIELD_PREFIX + "RANGE" +TYPES_FIELD = RESERVED_FIELD_PREFIX + "TYPES" +FEATURES_FIELD = 
RESERVED_FIELD_PREFIX + "FEATURES" +VIEWS_FIELD = RESERVED_FIELD_PREFIX + "VIEWS" +VIEW_SOFA_FIELD = RESERVED_FIELD_PREFIX + "SOFA" +VIEW_INDEX_FIELD = RESERVED_FIELD_PREFIX + "INDEX" +FEATURE_STRUCTURES_FIELD = RESERVED_FIELD_PREFIX + "FEATURE_STRUCTURES" +REF_FEATURE_PREFIX = "@" +NAME_FIELD = RESERVED_FIELD_PREFIX + "NAME" +SUPER_TYPE_FIELD = RESERVED_FIELD_PREFIX + "SUPER_TYPE" +ELEMENT_TYPE_FIELD = RESERVED_FIELD_PREFIX + "ELEMENT_TYPE" +ID_FIELD = RESERVED_FIELD_PREFIX + "ID" +FLAGS_FIELD = RESERVED_FIELD_PREFIX + "FLAGS" +FLAG_DOCUMENT_ANNOTATION = "DocumentAnnotation" +ARRAY_SUFFIX = "[]" +ELEMENTS_FIELD = RESERVED_FIELD_PREFIX + "ELEMENTS" + + +def load_cas_from_json(source: Union[IO, str], typesystem: TypeSystem = None) -> Cas: + """Loads a CAS from a JSON source. + + Args: + source: The JSON source. If `source` is a string, then it is assumed to be an JSON string. + If `source` is a file-like object, then the data is read from it. + typesystem: The type system that belongs to this CAS. If `None`, an empty type system is provided. + lenient: If `True`, unknown Types will be ignored. If `False`, unknown Types will cause an exception. + The default is `False`. 
+ + Returns: + The deserialized CAS + + """ + if typesystem is None: + typesystem = TypeSystem() + + deserializer = CasJsonDeserializer() + return deserializer.deserialize(source, typesystem=typesystem) + + +class CasJsonDeserializer: + def __init__(self): + self._max_xmi_id = 0 + self._max_sofa_num = 0 + self._post_processors = [] + + def deserialize(self, source: Union[IO, str], typesystem: TypeSystem) -> Cas: + if isinstance(source, str): + data = json.loads(source) + else: + data = json.load(source) + + feature_structures = {} + + self._max_xmi_id = 0 + self._max_sofa_num = 0 + self._post_processors = [] + + data.get(TYPES_FIELD) # FIXME + + cas = Cas(typesystem=typesystem) + + json_feature_structures = data.get(FEATURE_STRUCTURES_FIELD) + if isinstance(json_feature_structures, list): + for json_fs in json_feature_structures: + if json_fs.get(TYPE_FIELD) == Cas.TYPE_NAME_SOFA: + fs_id = json_fs.get(ID_FIELD) + fs = self._parse_sofa(cas, fs_id, json_fs, feature_structures) + else: + fs_id = json_fs.get(ID_FIELD) + fs = self._parse_feature_structure(typesystem, fs_id, json_fs, feature_structures) + feature_structures[fs.xmiID] = fs + + if isinstance(json_feature_structures, dict): + for fs_id, json_fs in json_feature_structures.items(): + if json_fs.get(TYPE_FIELD) == Cas.TYPE_NAME_SOFA: + fs_id = int(fs_id) + fs = self._parse_sofa(cas, fs_id, json_fs, feature_structures) + else: + fs_id = int(fs_id) + fs = self._parse_feature_structure(typesystem, fs_id, json_fs, feature_structures) + feature_structures[fs.xmiID] = fs + + for post_processor in self._post_processors: + post_processor() + + cas._xmi_id_generator = IdGenerator(self._max_xmi_id + 1) + cas._sofa_num_generator = IdGenerator(self._max_sofa_num + 1) + + # At this point all views for which we have a sofa with a known ID and sofaNum have already been created + # as part of parsing the feature structures. 
Thus, if there are any views remaining that are only declared + # in the views section, we just create them with auto-assigned IDs + json_views = data.get(VIEWS_FIELD) + for view_name, json_view in json_views.items(): + self._parse_view(cas, view_name, json_view, feature_structures) + + return cas + + def _get_or_create_view( + self, cas: Cas, view_name: str, fs_id: Optional[int] = None, sofa_num: Optional[int] = None + ) -> Cas: + if view_name == Cas.NAME_DEFAULT_SOFA: + view = cas.get_view(Cas.NAME_DEFAULT_SOFA) + + # We need to make sure that the sofa gets the real xmi, see #155 + if fs_id is not None: + view.get_sofa().xmiID = fs_id + + return view + else: + return cas.create_view(view_name, xmiID=fs_id, sofaNum=sofa_num) + + def _parse_view(self, cas: Cas, view_name: str, json_view: Dict[str, any], feature_structures: List): + view = self._get_or_create_view(cas, view_name) + for member_id in json_view[VIEW_INDEX_FIELD]: + fs = feature_structures[member_id] + view.add_annotation(fs, keep_id=True) + + def _parse_sofa(self, cas: Cas, fs_id: int, json_fs: Dict[str, any], feature_structures: Dict[int, any]) -> Sofa: + view = self._get_or_create_view( + cas, json_fs.get(Cas.FEATURE_BASE_NAME_SOFAID), fs_id, json_fs.get(Cas.FEATURE_BASE_NAME_SOFANUM) + ) + + view.sofa_string = json_fs.get(Cas.FEATURE_BASE_NAME_SOFASTRING) + view.sofa_mime = json_fs.get(Cas.FEATURE_BASE_NAME_SOFAMIME) + view.sofa_uri = json_fs.get(Cas.FEATURE_BASE_NAME_SOFAURI) + view.sofa_array = feature_structures.get(json_fs.get(REF_FEATURE_PREFIX + Cas.FEATURE_BASE_NAME_SOFAARRAY)) + + return view.get_sofa() + + def _parse_feature_structure( + self, typesystem: TypeSystem, fs_id: int, json_fs: Dict[str, any], feature_structures: Dict[int, any] + ): + AnnotationType = typesystem.get_type(json_fs.get(TYPE_FIELD)) + + attributes = dict(json_fs) + + # Map the JSON FS ID to xmiID + attributes["xmiID"] = fs_id + + # Remap features that use a reserved Python name + if "self" in attributes: + 
attributes["self_"] = attributes.pop("self") + + if "type" in attributes: + attributes["type_"] = attributes.pop("type") + + if AnnotationType.name == Cas.TYPE_NAME_BYTE_ARRAY: + attributes["elements"] = base64.b64decode(attributes.get(ELEMENTS_FIELD)) + + self._resolve_references(attributes, feature_structures) + self._strip_reserved_json_keys(attributes) + + self._max_xmi_id = max(attributes["xmiID"], self._max_xmi_id) + return AnnotationType(**attributes) + + def _resolve_references(self, attributes: Dict[str, any], feature_structures: Dict[int, any]): + for key, value in list(attributes.items()): + if key.startswith(REF_FEATURE_PREFIX): + attributes.pop(key) + feature_name = key[1:] + target_fs = feature_structures.get(value) + if target_fs: + # Resolve id-ref now + attributes[feature_name] = target_fs + else: + # Resolve id-ref at the end of processing + def fix_up(): + attributes[feature_name] = feature_structures.get(value) + + self._post_processors.append(fix_up) + + def _strip_reserved_json_keys( + self, + attributes: Dict[str, any], + ): + for key in list(attributes): + if key.startswith(RESERVED_FIELD_PREFIX): + attributes.pop(key) + + +class CasJsonSerializer: + _COMMON_FIELD_NAMES = {"xmiID", "type"} + + def __init__(self): + pass + + def serialize(self, sink: Union[IO, str], cas: Cas, pretty_print=True): + data = {} + types = data[TYPES_FIELD] = {} + views = data[VIEWS_FIELD] = {} + feature_structures = data[FEATURE_STRUCTURES_FIELD] = [] + + for view in cas.views: + views[view.sofa.sofaID] = self._serialize_view(view) + if view.sofa.sofaArray: + json_sofa_array_fs = self._serialize_feature_structure(cas, view.sofa.sofaArray) + feature_structures.append(json_sofa_array_fs) + json_sofa_fs = self._serialize_feature_structure(cas, view.sofa) + feature_structures.append(json_sofa_fs) + + # Find all fs, even the ones that are not directly added to a sofa + for fs in sorted(cas._find_all_fs(), key=lambda a: a.xmiID): + json_fs = 
self._serialize_feature_structure(cas, fs) + feature_structures.append(json_fs) + + if isinstance(sink, BytesIO): + sink = TextIOWrapper(sink, encoding="utf-8", write_through=True) + + if sink: + json.dump(data, sink, sort_keys=False) + else: + json.dumps(data, sort_keys=False) + + if isinstance(sink, TextIOWrapper): + sink.detach() # Prevent TextIOWrapper from closing the BytesIO + + def _serialize_feature_structure(self, cas, fs) -> dict: + json_fs = OrderedDict() + json_fs[ID_FIELD] = fs.xmiID + json_fs[TYPE_FIELD] = fs.type + + ts = cas.typesystem + t = ts.get_type(fs.type) + for feature in t.all_features: + if feature.name in CasJsonSerializer._COMMON_FIELD_NAMES: + continue + + feature_name = feature.name + + # Strip the underscore we added for reserved names + if feature._has_reserved_name: + feature_name = feature.name[:-1] + + # Skip over 'None' features + value = getattr(fs, feature.name) + if value is None: + continue + + # Map back from offsets in Unicode codepoints to UIMA UTF-16 based offsets + # if ts.is_instance_of(fs.type, "uima.tcas.Annotation") and feature_name == "begin" or feature_name == "end": + # sofa: Sofa = getattr(fs, "sofa") + # value = sofa._offset_converter.cassis_to_uima(value) + + if t.name == Cas.TYPE_NAME_BYTE_ARRAY and feature_name == "elements": + json_fs[ELEMENTS_FIELD] = base64.b64encode(value).decode("ascii") + elif t.supertypeName == Cas.TYPE_NAME_ARRAY_BASE and feature_name == "elements": + json_fs[ELEMENTS_FIELD] = value + elif ts.is_primitive(feature.rangeTypeName): + json_fs[feature_name] = value + elif ts.is_collection(fs.type, feature): + json_fs[REF_FEATURE_PREFIX + feature_name] = value.xmiID + else: + # We need to encode non-primitive features as a reference + json_fs[REF_FEATURE_PREFIX + feature_name] = value.xmiID + return json_fs + + def _serialize_view(self, view: View): + return {VIEW_SOFA_FIELD: view.sofa.xmiID, VIEW_INDEX_FIELD: sorted(x.xmiID for x in view.get_all_annotations())} diff --git 
a/cassis/typesystem.py b/cassis/typesystem.py index 38e0d1e..e846e37 100644 --- a/cassis/typesystem.py +++ b/cassis/typesystem.py @@ -349,6 +349,8 @@ def descendants(self) -> Iterator["Type"]: class TypeSystem: + NAMESPACE_SEPARATOR = "." + def __init__(self, add_document_annotation_type: bool = True): self._types = {} @@ -424,7 +426,13 @@ def __init__(self, add_document_annotation_type: bool = True): self.add_feature(t, name="sofaNum", rangeTypeName="uima.cas.Integer") self.add_feature(t, name="sofaID", rangeTypeName="uima.cas.String") self.add_feature(t, name="mimeType", rangeTypeName="uima.cas.String") - self.add_feature(t, name="sofaArray", rangeTypeName="uima.cas.TOP", multipleReferencesAllowed=True) + self.add_feature( + t, + name="sofaArray", + rangeTypeName="uima.cas.ByteArray", + elementType="uima.cas.Byte", + multipleReferencesAllowed=True, + ) self.add_feature(t, name="sofaString", rangeTypeName="uima.cas.String") self.add_feature(t, name="sofaURI", rangeTypeName="uima.cas.String") diff --git a/tests/test_files/json/README.md b/tests/test_files/json/README.md new file mode 100644 index 0000000..483853a --- /dev/null +++ b/tests/test_files/json/README.md @@ -0,0 +1,5 @@ +Test files in this folder were sourced from + +https://github.com/apache/uima-uimaj/tree/feature/UIMA-6266-Clean-JSON-Wire-Format-for-CAS/uimaj-json/src/test/resources/CasSerializationDeserialization_JsonCas2_FsAsArray_Test/ser-ref + +Apache License 2.0 diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json new file mode 100644 index 0000000..b732eaf --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json @@ -0,0 +1,21 @@ +{ + "%TYPES" : { }, + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 0, + "%INDEX" : [ ] + } + }, + "%FEATURE_STRUCTURES" : [ { + "%ID" : 1, + "%TYPE" : "uima.cas.ByteArray", + "%ELEMENTS" : "VGhpcyBpcyBhIHRlc3Q=" + }, { + "%ID" : 
0, + "%TYPE" : "uima.cas.Sofa", + "sofaNum" : 1, + "sofaID" : "_InitialView", + "mimeType" : "text/plain", + "@sofaArray" : 1 + } ] +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/debug-typesystem.xml new file mode 100644 index 0000000..07e327a --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/debug-typesystem.xml @@ -0,0 +1,17 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/debug.xmi new file mode 100644 index 0000000..89075f6 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/debug.xmi @@ -0,0 +1,5 @@ + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json new file mode 100644 index 0000000..678d9e2 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json @@ -0,0 +1,17 @@ +{ + "%TYPES" : { }, + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 0, + "%INDEX" : [ ] + } + }, + "%FEATURE_STRUCTURES" : [ { + "%ID" : 0, + "%TYPE" : "uima.cas.Sofa", + "sofaNum" : 1, + "sofaID" : "_InitialView", + "mimeType" : "text/plain", + "sofaURI" : "classpath:/ProgrammaticallyCreatedCasDataSuite/document.txt" + } ] +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/debug-typesystem.xml new file mode 100644 index 0000000..07e327a --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/debug-typesystem.xml @@ -0,0 +1,17 @@ + + + + + uima.tcas.DocumentAnnotation + + 
uima.tcas.Annotation + + + language + + uima.cas.String + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/debug.xmi new file mode 100644 index 0000000..89966e0 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/debug.xmi @@ -0,0 +1,4 @@ + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json new file mode 100644 index 0000000..416e6e1 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json @@ -0,0 +1,24 @@ +{ + "%TYPES" : { }, + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 0, + "%INDEX" : [ 1 ] + } + }, + "%FEATURE_STRUCTURES" : [ { + "%ID" : 0, + "%TYPE" : "uima.cas.Sofa", + "sofaNum" : 1, + "sofaID" : "_InitialView", + "mimeType" : "text", + "sofaString" : "This is a test." + }, { + "%ID" : 1, + "%TYPE" : "uima.tcas.DocumentAnnotation", + "@sofa" : 0, + "begin" : 0, + "end" : 15, + "language" : "x-unspecified" + } ] +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithText/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithText/debug-typesystem.xml new file mode 100644 index 0000000..07e327a --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithText/debug-typesystem.xml @@ -0,0 +1,17 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithText/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithText/debug.xmi new file mode 100644 index 0000000..943df5f --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithText/debug.xmi @@ -0,0 +1,6 @@ + + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/data.json 
b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/data.json new file mode 100644 index 0000000..aa71704 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/data.json @@ -0,0 +1,30 @@ +{ + "%TYPES" : { }, + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 0, + "%INDEX" : [ 1, 2 ] + } + }, + "%FEATURE_STRUCTURES" : [ { + "%ID" : 0, + "%TYPE" : "uima.cas.Sofa", + "sofaNum" : 1, + "sofaID" : "_InitialView", + "mimeType" : "text", + "sofaString" : "This is a test." + }, { + "%ID" : 1, + "%TYPE" : "uima.tcas.DocumentAnnotation", + "@sofa" : 0, + "begin" : 0, + "end" : 15, + "language" : "x-unspecified" + }, { + "%ID" : 2, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 0, + "begin" : 0, + "end" : 15 + } ] +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/debug-typesystem.xml new file mode 100644 index 0000000..07e327a --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/debug-typesystem.xml @@ -0,0 +1,17 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/debug.xmi new file mode 100644 index 0000000..7292031 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/debug.xmi @@ -0,0 +1,7 @@ + + + + + + + diff --git a/tests/test_json.py b/tests/test_json.py new file mode 100644 index 0000000..a7aaaab --- /dev/null +++ b/tests/test_json.py @@ -0,0 +1,26 @@ +import json + +from tests.fixtures import * +from tests.util import assert_json_equal + +FIXTURE_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "test_files") + +FIXTURES = [ + (os.path.join(FIXTURE_DIR, "json", "fs_as_array", 
"ser-ref", "casWithSofaDataArray")), + (os.path.join(FIXTURE_DIR, "json", "fs_as_array", "ser-ref", "casWithSofaDataURI")), + (os.path.join(FIXTURE_DIR, "json", "fs_as_array", "ser-ref", "casWithText")), + (os.path.join(FIXTURE_DIR, "json", "fs_as_array", "ser-ref", "casWithTextAndAnnotation")), +] + + +@pytest.mark.parametrize("json_path", FIXTURES) +def test_deserialization_serialization(json_path): + with open(os.path.join(json_path, "data.json"), "rb") as f: + cas = load_cas_from_json(f) + + with open(os.path.join(json_path, "data.json"), "rb") as f: + expected_json = json.load(f) + + actual_json = cas.to_json() + + assert_json_equal(actual_json, expected_json) diff --git a/tests/util.py b/tests/util.py index 7dbd925..8a7d780 100644 --- a/tests/util.py +++ b/tests/util.py @@ -1,4 +1,5 @@ import difflib +import json from typing import IO, Union import lxml_asserts @@ -39,6 +40,39 @@ def assert_xml_equal(actual: Union[IO, str], expected: Union[IO, str]): raise e +def assert_json_equal(actual: str, expected: Union[IO, str]): + """Checks whether the JSON trees behind `actual` and `expected` are equal. 
+ + Args: + actual: The actual JSON + expected: The expected JSON + + Throws: + AssertionError when json(actual) != json(expected) + """ + actual_json = json.dumps(json.loads(actual), sort_keys=True, indent=2) + expected_json = json.dumps(expected, sort_keys=True, indent=2) + + try: + assert actual_json == expected_json + except AssertionError as e: + # For debugging purposes, the trees are saved to later inspect their contents + with open("actual.json", "w") as f: + f.write(actual_json) + + with open("expected.json", "w") as f: + f.write(expected_json) + + with open("difference.diff", "w") as f: + diff = difflib.unified_diff( + actual_json.splitlines(), expected_json.splitlines(), fromfile="Actual", tofile="Expected" + ) + diff_string = "\n".join(diff) + f.write(diff_string) + + raise e + + def _to_etree(source: Union[IO, str]) -> etree.Element: parser = etree.XMLParser(remove_blank_text=True) From 79db38bf6984c5696212ed7e891a3a76f6b1612a Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Fri, 13 Aug 2021 12:33:44 +0200 Subject: [PATCH 02/22] #168 - Experimental JSON CAS support - Fixed bad PyDoc comment --- cassis/cas.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cassis/cas.py b/cassis/cas.py index d56abae..3bc34e4 100644 --- a/cassis/cas.py +++ b/cassis/cas.py @@ -360,7 +360,6 @@ def get_covered_text(self, annotation: FeatureStructure) -> str: return sofa.sofaString[annotation.begin : annotation.end] def select(self, type_name: str) -> List[FeatureStructure]: - """ Finds all annotations of type `type_name`. """Finds all annotations of type `type_name`. 
Args: From c47b2badf6e16b62bb600b5b413bbba62f578b55 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Fri, 13 Aug 2021 12:37:03 +0200 Subject: [PATCH 03/22] #168 - Experimental JSON CAS support - Fixed linter error because type hint was referring to a dynamically created type --- cassis/cas.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cassis/cas.py b/cassis/cas.py index 323b3cb..01bc22c 100644 --- a/cassis/cas.py +++ b/cassis/cas.py @@ -542,7 +542,7 @@ def sofa_uri(self, value: str): @property def sofa_array(self) -> str: - """The sofa byte array references a ByteArrayFS feature structure + """The sofa byte array references a uima.cas.ByteArray feature structure Returns: The sofa data byte array. @@ -550,11 +550,11 @@ def sofa_array(self) -> str: return self.get_sofa().sofaArray @sofa_array.setter - def sofa_array(self, value: "uima_cas_ByteArrayFS"): - """Sets the sofa byte array to the given ByteArrayFS feature structure. + def sofa_array(self, value): + """Sets the sofa byte array to the given uima.cas.ByteArray feature structure. Args: - value: The new sofa byte array type. + value: The new sofa byte array feature structure. """ self.get_sofa().sofaArray = value From 8239fe55eec7011253a525884b71843a71ab128e Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Fri, 13 Aug 2021 12:45:11 +0200 Subject: [PATCH 04/22] #168 - Experimental JSON CAS support - Roll back change of Sofa.sofaArray range type from uima.cas.ByteArray back to uima.cas.TOP which is indeed the range type also used in the Apache UIMA Java SDK - despite only uima.cas.ByteArray being acceptable... 
--- cassis/typesystem.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/cassis/typesystem.py b/cassis/typesystem.py index da77e64..fabed0f 100644 --- a/cassis/typesystem.py +++ b/cassis/typesystem.py @@ -426,13 +426,7 @@ def __init__(self, add_document_annotation_type: bool = True): self.add_feature(t, name="sofaNum", rangeTypeName="uima.cas.Integer") self.add_feature(t, name="sofaID", rangeTypeName="uima.cas.String") self.add_feature(t, name="mimeType", rangeTypeName="uima.cas.String") - self.add_feature( - t, - name="sofaArray", - rangeTypeName="uima.cas.ByteArray", - elementType="uima.cas.Byte", - multipleReferencesAllowed=True, - ) + self.add_feature(t, name="sofaArray", rangeTypeName="uima.cas.TOP", multipleReferencesAllowed=True) self.add_feature(t, name="sofaString", rangeTypeName="uima.cas.String") self.add_feature(t, name="sofaURI", rangeTypeName="uima.cas.String") From d1177581cb322c1090c4c323c219b429b4081b61 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Fri, 13 Aug 2021 16:05:13 +0200 Subject: [PATCH 05/22] #168 - Experimental JSON CAS support - Added generator for random CASes - Added JSON tests using random CAS generator - Added support for (de)serializing type system information in the JSON format - Move the type/feature name constants from Cas to typesystem.py --- cassis/cas.py | 40 +------- cassis/json.py | 122 ++++++++++++++++++------ cassis/typesystem.py | 53 +++++++++- tests/test_files/test_cas_generators.py | 42 ++++++++ tests/test_json.py | 16 ++++ tests/util.py | 8 +- 6 files changed, 213 insertions(+), 68 deletions(-) create mode 100644 tests/test_files/test_cas_generators.py diff --git a/cassis/cas.py b/cassis/cas.py index 01bc22c..ca74543 100644 --- a/cassis/cas.py +++ b/cassis/cas.py @@ -177,40 +177,6 @@ def __init__(self, typesystem: TypeSystem): class Cas: """A CAS object is a container for text (sofa) and annotations""" - NAME_SPACE_UIMA_CAS = "uima" + TypeSystem.NAMESPACE_SEPARATOR + "cas" - 
UIMA_CAS_PREFIX = NAME_SPACE_UIMA_CAS + TypeSystem.NAMESPACE_SEPARATOR - TYPE_NAME_TOP = UIMA_CAS_PREFIX + "TOP" - TYPE_NAME_INTEGER = UIMA_CAS_PREFIX + "Integer" - TYPE_NAME_FLOAT = UIMA_CAS_PREFIX + "Float" - TYPE_NAME_STRING = UIMA_CAS_PREFIX + "String" - TYPE_NAME_BOOLEAN = UIMA_CAS_PREFIX + "Boolean" - TYPE_NAME_BYTE = UIMA_CAS_PREFIX + "Byte" - TYPE_NAME_SHORT = UIMA_CAS_PREFIX + "Short" - TYPE_NAME_LONG = UIMA_CAS_PREFIX + "Long" - TYPE_NAME_DOUBLE = UIMA_CAS_PREFIX + "Double" - TYPE_NAME_ARRAY_BASE = UIMA_CAS_PREFIX + "ArrayBase" - TYPE_NAME_FS_ARRAY = UIMA_CAS_PREFIX + "FSArray" - TYPE_NAME_INTEGER_ARRAY = UIMA_CAS_PREFIX + "IntegerArray" - TYPE_NAME_FLOAT_ARRAY = UIMA_CAS_PREFIX + "FloatArray" - TYPE_NAME_STRING_ARRAY = UIMA_CAS_PREFIX + "StringArray" - TYPE_NAME_BOOLEAN_ARRAY = UIMA_CAS_PREFIX + "BooleanArray" - TYPE_NAME_BYTE_ARRAY = UIMA_CAS_PREFIX + "ByteArray" - TYPE_NAME_SHORT_ARRAY = UIMA_CAS_PREFIX + "ShortArray" - TYPE_NAME_LONG_ARRAY = UIMA_CAS_PREFIX + "LongArray" - TYPE_NAME_DOUBLE_ARRAY = UIMA_CAS_PREFIX + "DoubleArray" - TYPE_NAME_FS_HASH_SET = UIMA_CAS_PREFIX + "FSHashSet" - TYPE_NAME_SOFA = UIMA_CAS_PREFIX + "Sofa" - TYPE_NAME_ANNOTATION_BASE = UIMA_CAS_PREFIX + "AnnotationBase" - - FEATURE_BASE_NAME_SOFANUM = "sofaNum" - FEATURE_BASE_NAME_SOFAID = "sofaID" - FEATURE_BASE_NAME_SOFAMIME = "mimeType" - FEATURE_BASE_NAME_SOFAURI = "sofaURI" - FEATURE_BASE_NAME_SOFASTRING = "sofaString" - FEATURE_BASE_NAME_SOFAARRAY = "sofaArray" - - NAME_DEFAULT_SOFA = "_InitialView" - def __init__(self, typesystem: TypeSystem = None, lenient: bool = False): """Creates a CAS with the specified typesystem. If no typesystem is given, then the default one is used which only contains UIMA-predefined types. 
@@ -573,7 +539,7 @@ def to_xmi(self, path: Union[str, Path, None] = None, pretty_print: bool = False """ from cassis.xmi import CasXmiSerializer - return self.serialize(CasXmiSerializer(), path, pretty_print) + return self._serialize(CasXmiSerializer(), path, pretty_print) def to_json(self, path: Union[str, Path, None] = None, pretty_print: bool = False) -> Optional[str]: """Creates a JSON representation of this CAS. @@ -589,9 +555,9 @@ def to_json(self, path: Union[str, Path, None] = None, pretty_print: bool = Fals """ from cassis.json import CasJsonSerializer - return self.serialize(CasJsonSerializer(), path, pretty_print) + return self._serialize(CasJsonSerializer(), path, pretty_print) - def serialize(self, serializer, path: Union[str, Path, None] = None, pretty_print: bool = False): + def _serialize(self, serializer, path: Union[str, Path, None] = None, pretty_print: bool = False): """Runs this CAS through the given serializer. Args: diff --git a/cassis/json.py b/cassis/json.py index b0bd9d3..34cac0e 100644 --- a/cassis/json.py +++ b/cassis/json.py @@ -1,15 +1,10 @@ import base64 import json -import warnings -from collections import OrderedDict, defaultdict -from io import BytesIO, TextIOWrapper -from typing import IO, Dict, Iterable, List, Optional, Set, Union - -import attr -from lxml import etree +from collections import OrderedDict +from io import TextIOWrapper from cassis.cas import Cas, IdGenerator, Sofa, View -from cassis.typesystem import FeatureStructure, TypeNotFoundError, TypeSystem +from cassis.typesystem import * RESERVED_FIELD_PREFIX = "%" TYPE_FIELD = RESERVED_FIELD_PREFIX + "TYPE" @@ -23,7 +18,9 @@ REF_FEATURE_PREFIX = "@" NAME_FIELD = RESERVED_FIELD_PREFIX + "NAME" SUPER_TYPE_FIELD = RESERVED_FIELD_PREFIX + "SUPER_TYPE" +DESCRIPTION_FIELD = RESERVED_FIELD_PREFIX + "DESCRIPTION" ELEMENT_TYPE_FIELD = RESERVED_FIELD_PREFIX + "ELEMENT_TYPE" +MULTIPLE_REFERENCES_ALLOWED_FIELD = RESERVED_FIELD_PREFIX + "MULTIPLE_REFERENCES_ALLOWED" ID_FIELD = 
RESERVED_FIELD_PREFIX + "ID" FLAGS_FIELD = RESERVED_FIELD_PREFIX + "FLAGS" FLAG_DOCUMENT_ANNOTATION = "DocumentAnnotation" @@ -58,26 +55,30 @@ def __init__(self): self._max_sofa_num = 0 self._post_processors = [] - def deserialize(self, source: Union[IO, str], typesystem: TypeSystem) -> Cas: + def deserialize(self, source: Union[IO, str], typesystem: Optional[TypeSystem] = None) -> Cas: if isinstance(source, str): data = json.loads(source) else: data = json.load(source) - feature_structures = {} - self._max_xmi_id = 0 self._max_sofa_num = 0 self._post_processors = [] - data.get(TYPES_FIELD) # FIXME + embedded_typesystem = TypeSystem() + json_typesystem = data.get(TYPES_FIELD) + for type_name, json_type in json_typesystem.items(): + self._parse_type(embedded_typesystem, type_name, json_type) + + typesystem = merge_typesystems(typesystem, embedded_typesystem) cas = Cas(typesystem=typesystem) + feature_structures = {} json_feature_structures = data.get(FEATURE_STRUCTURES_FIELD) if isinstance(json_feature_structures, list): for json_fs in json_feature_structures: - if json_fs.get(TYPE_FIELD) == Cas.TYPE_NAME_SOFA: + if json_fs.get(TYPE_FIELD) == TYPE_NAME_SOFA: fs_id = json_fs.get(ID_FIELD) fs = self._parse_sofa(cas, fs_id, json_fs, feature_structures) else: @@ -87,7 +88,7 @@ def deserialize(self, source: Union[IO, str], typesystem: TypeSystem) -> Cas: if isinstance(json_feature_structures, dict): for fs_id, json_fs in json_feature_structures.items(): - if json_fs.get(TYPE_FIELD) == Cas.TYPE_NAME_SOFA: + if json_fs.get(TYPE_FIELD) == TYPE_NAME_SOFA: fs_id = int(fs_id) fs = self._parse_sofa(cas, fs_id, json_fs, feature_structures) else: @@ -110,11 +111,28 @@ def deserialize(self, source: Union[IO, str], typesystem: TypeSystem) -> Cas: return cas + def _parse_type(self, typesystem: TypeSystem, type_name: str, json_type: Dict[str, any]): + super_type_name = json_type[SUPER_TYPE_FIELD] + description = json_type.get(DESCRIPTION_FIELD) + new_type = 
typesystem.create_type(type_name, super_type_name, description=description) + + for key, value in json_type.items(): + if key.startswith(RESERVED_FIELD_PREFIX): + continue + typesystem.add_feature( + new_type, + name=key, + rangeTypeName=json_type[RANGE_FIELD], + description=json_type.get(DESCRIPTION_FIELD), + elementType=json_type.get(ELEMENT_TYPE_FIELD), + multipleReferencesAllowed=json_type.get(MULTIPLE_REFERENCES_ALLOWED_FIELD), + ) + def _get_or_create_view( self, cas: Cas, view_name: str, fs_id: Optional[int] = None, sofa_num: Optional[int] = None ) -> Cas: - if view_name == Cas.NAME_DEFAULT_SOFA: - view = cas.get_view(Cas.NAME_DEFAULT_SOFA) + if view_name == NAME_DEFAULT_SOFA: + view = cas.get_view(NAME_DEFAULT_SOFA) # We need to make sure that the sofa gets the real xmi, see #155 if fs_id is not None: @@ -124,7 +142,7 @@ def _get_or_create_view( else: return cas.create_view(view_name, xmiID=fs_id, sofaNum=sofa_num) - def _parse_view(self, cas: Cas, view_name: str, json_view: Dict[str, any], feature_structures: List): + def _parse_view(self, cas: Cas, view_name: str, json_view: Dict[str, any], feature_structures: Dict[str, any]): view = self._get_or_create_view(cas, view_name) for member_id in json_view[VIEW_INDEX_FIELD]: fs = feature_structures[member_id] @@ -132,13 +150,13 @@ def _parse_view(self, cas: Cas, view_name: str, json_view: Dict[str, any], featu def _parse_sofa(self, cas: Cas, fs_id: int, json_fs: Dict[str, any], feature_structures: Dict[int, any]) -> Sofa: view = self._get_or_create_view( - cas, json_fs.get(Cas.FEATURE_BASE_NAME_SOFAID), fs_id, json_fs.get(Cas.FEATURE_BASE_NAME_SOFANUM) + cas, json_fs.get(FEATURE_BASE_NAME_SOFAID), fs_id, json_fs.get(FEATURE_BASE_NAME_SOFANUM) ) - view.sofa_string = json_fs.get(Cas.FEATURE_BASE_NAME_SOFASTRING) - view.sofa_mime = json_fs.get(Cas.FEATURE_BASE_NAME_SOFAMIME) - view.sofa_uri = json_fs.get(Cas.FEATURE_BASE_NAME_SOFAURI) - view.sofa_array = feature_structures.get(json_fs.get(REF_FEATURE_PREFIX + 
Cas.FEATURE_BASE_NAME_SOFAARRAY)) + view.sofa_string = json_fs.get(FEATURE_BASE_NAME_SOFASTRING) + view.sofa_mime = json_fs.get(FEATURE_BASE_NAME_SOFAMIME) + view.sofa_uri = json_fs.get(FEATURE_BASE_NAME_SOFAURI) + view.sofa_array = feature_structures.get(json_fs.get(REF_FEATURE_PREFIX + FEATURE_BASE_NAME_SOFAARRAY)) return view.get_sofa() @@ -159,7 +177,7 @@ def _parse_feature_structure( if "type" in attributes: attributes["type_"] = attributes.pop("type") - if AnnotationType.name == Cas.TYPE_NAME_BYTE_ARRAY: + if AnnotationType.name == TYPE_NAME_BYTE_ARRAY: attributes["elements"] = base64.b64decode(attributes.get(ELEMENTS_FIELD)) self._resolve_references(attributes, feature_structures) @@ -205,6 +223,12 @@ def serialize(self, sink: Union[IO, str], cas: Cas, pretty_print=True): views = data[VIEWS_FIELD] = {} feature_structures = data[FEATURE_STRUCTURES_FIELD] = [] + for type_ in cas.typesystem.get_types(): + if type_.name == TYPE_NAME_DOCUMENT_ANNOTATION: + continue + json_type = self._serialize_type(type_) + types[json_type[NAME_FIELD]] = json_type + for view in cas.views: views[view.sofa.sofaID] = self._serialize_view(view) if view.sofa.sofaArray: @@ -222,13 +246,52 @@ def serialize(self, sink: Union[IO, str], cas: Cas, pretty_print=True): sink = TextIOWrapper(sink, encoding="utf-8", write_through=True) if sink: - json.dump(data, sink, sort_keys=False) + json.dump(data, sink, sort_keys=False, indent=2 if pretty_print else None) else: - json.dumps(data, sort_keys=False) + json.dumps(data, sort_keys=False, indent=2 if pretty_print else None) if isinstance(sink, TextIOWrapper): sink.detach() # Prevent TextIOWrapper from closing the BytesIO + def _serialize_type(self, type_: Type): + type_name = self._to_external_type_name(type_.name) + supertype_name = self._to_external_type_name(type_.supertypeName) + + json_type = { + NAME_FIELD: type_name, + SUPER_TYPE_FIELD: supertype_name, + DESCRIPTION_FIELD: type_.description, + } + + for feature in list(type_.features): + 
json_feature = self._serialize_feature(json_type, feature) + json_type[json_feature[NAME_FIELD]] = json_feature + + return json_type + + def _serialize_feature(self, json_type, feature: Feature): + # If the feature name is a reserved name like `self`, then we added an + # underscore to it before so Python can handle it. We now need to remove it. + feature_name = feature.name + if feature._has_reserved_name: + feature_name = feature_name[:-1] + + json_feature = { + NAME_FIELD: feature_name, + RANGE_FIELD: self._to_external_type_name(feature.rangeTypeName), + } + + if feature.description: + json_feature[DESCRIPTION_FIELD] = feature.description + + if feature.multipleReferencesAllowed is not None: + json_feature[MULTIPLE_REFERENCES_ALLOWED_FIELD] = feature.multipleReferencesAllowed + + if feature.elementType is not None: + json_feature[ELEMENT_TYPE_FIELD] = self._to_external_type_name(feature.elementType) + + return json_feature + def _serialize_feature_structure(self, cas, fs) -> dict: json_fs = OrderedDict() json_fs[ID_FIELD] = fs.xmiID @@ -256,9 +319,9 @@ def _serialize_feature_structure(self, cas, fs) -> dict: # sofa: Sofa = getattr(fs, "sofa") # value = sofa._offset_converter.cassis_to_uima(value) - if t.name == Cas.TYPE_NAME_BYTE_ARRAY and feature_name == "elements": + if t.name == TYPE_NAME_BYTE_ARRAY and feature_name == "elements": json_fs[ELEMENTS_FIELD] = base64.b64encode(value).decode("ascii") - elif t.supertypeName == Cas.TYPE_NAME_ARRAY_BASE and feature_name == "elements": + elif t.supertypeName == TYPE_NAME_ARRAY_BASE and feature_name == "elements": json_fs[ELEMENTS_FIELD] = value elif ts.is_primitive(feature.rangeTypeName): json_fs[feature_name] = value @@ -271,3 +334,8 @@ def _serialize_feature_structure(self, cas, fs) -> dict: def _serialize_view(self, view: View): return {VIEW_SOFA_FIELD: view.sofa.xmiID, VIEW_INDEX_FIELD: sorted(x.xmiID for x in view.get_all_annotations())} + + def _to_external_type_name(self, type_name: str): + if 
type_name.startswith("uima.noNamespace."): + return type_name.replace("uima.noNamespace.", "") + return type_name diff --git a/cassis/typesystem.py b/cassis/typesystem.py index fabed0f..c32d379 100644 --- a/cassis/typesystem.py +++ b/cassis/typesystem.py @@ -13,6 +13,50 @@ TOP_TYPE_NAME = "uima.cas.TOP" +NAMESPACE_SEPARATOR = "." + +NAME_SPACE_UIMA_CAS = "uima" + NAMESPACE_SEPARATOR + "cas" +UIMA_CAS_PREFIX = NAME_SPACE_UIMA_CAS + NAMESPACE_SEPARATOR +TYPE_NAME_TOP = UIMA_CAS_PREFIX + "TOP" +TYPE_NAME_INTEGER = UIMA_CAS_PREFIX + "Integer" +TYPE_NAME_FLOAT = UIMA_CAS_PREFIX + "Float" +TYPE_NAME_STRING = UIMA_CAS_PREFIX + "String" +TYPE_NAME_BOOLEAN = UIMA_CAS_PREFIX + "Boolean" +TYPE_NAME_BYTE = UIMA_CAS_PREFIX + "Byte" +TYPE_NAME_SHORT = UIMA_CAS_PREFIX + "Short" +TYPE_NAME_LONG = UIMA_CAS_PREFIX + "Long" +TYPE_NAME_DOUBLE = UIMA_CAS_PREFIX + "Double" +TYPE_NAME_ARRAY_BASE = UIMA_CAS_PREFIX + "ArrayBase" +TYPE_NAME_FS_ARRAY = UIMA_CAS_PREFIX + "FSArray" +TYPE_NAME_INTEGER_ARRAY = UIMA_CAS_PREFIX + "IntegerArray" +TYPE_NAME_FLOAT_ARRAY = UIMA_CAS_PREFIX + "FloatArray" +TYPE_NAME_STRING_ARRAY = UIMA_CAS_PREFIX + "StringArray" +TYPE_NAME_BOOLEAN_ARRAY = UIMA_CAS_PREFIX + "BooleanArray" +TYPE_NAME_BYTE_ARRAY = UIMA_CAS_PREFIX + "ByteArray" +TYPE_NAME_SHORT_ARRAY = UIMA_CAS_PREFIX + "ShortArray" +TYPE_NAME_LONG_ARRAY = UIMA_CAS_PREFIX + "LongArray" +TYPE_NAME_DOUBLE_ARRAY = UIMA_CAS_PREFIX + "DoubleArray" +TYPE_NAME_FS_HASH_SET = UIMA_CAS_PREFIX + "FSHashSet" +TYPE_NAME_ANNOTATION_BASE = UIMA_CAS_PREFIX + "AnnotationBase" + +NAME_DEFAULT_SOFA = "_InitialView" +TYPE_NAME_SOFA = UIMA_CAS_PREFIX + "Sofa" +FEATURE_BASE_NAME_SOFANUM = "sofaNum" +FEATURE_BASE_NAME_SOFAID = "sofaID" +FEATURE_BASE_NAME_SOFAMIME = "mimeType" +FEATURE_BASE_NAME_SOFAURI = "sofaURI" +FEATURE_BASE_NAME_SOFASTRING = "sofaString" +FEATURE_BASE_NAME_SOFAARRAY = "sofaArray" + +NAME_SPACE_UIMA_TCAS = "uima" + NAMESPACE_SEPARATOR + "tcas" +UIMA_TCAS_PREFIX = NAME_SPACE_UIMA_TCAS + NAMESPACE_SEPARATOR 
+TYPE_NAME_ANNOTATION = UIMA_TCAS_PREFIX + "Annotation" +TYPE_NAME_DOCUMENT_ANNOTATION = UIMA_TCAS_PREFIX + "DocumentAnnotation" +FEATURE_BASE_NAME_SOFA = "sofa" +FEATURE_BASE_NAME_BEGIN = "begin" +FEATURE_BASE_NAME_END = "end" +FEATURE_BASE_NAME_LANGUAGE = "language" + _DOCUMENT_ANNOTATION_TYPE = "uima.tcas.DocumentAnnotation" _PREDEFINED_TYPES = { @@ -349,8 +393,6 @@ def descendants(self) -> Iterator["Type"]: class TypeSystem: - NAMESPACE_SEPARATOR = "." - def __init__(self, add_document_annotation_type: bool = True): self._types = {} @@ -426,7 +468,12 @@ def __init__(self, add_document_annotation_type: bool = True): self.add_feature(t, name="sofaNum", rangeTypeName="uima.cas.Integer") self.add_feature(t, name="sofaID", rangeTypeName="uima.cas.String") self.add_feature(t, name="mimeType", rangeTypeName="uima.cas.String") - self.add_feature(t, name="sofaArray", rangeTypeName="uima.cas.TOP", multipleReferencesAllowed=True) + self.add_feature( + t, + name="sofaArray", + rangeTypeName="uima.cas.TOP", + multipleReferencesAllowed=True, + ) self.add_feature(t, name="sofaString", rangeTypeName="uima.cas.String") self.add_feature(t, name="sofaURI", rangeTypeName="uima.cas.String") diff --git a/tests/test_files/test_cas_generators.py b/tests/test_files/test_cas_generators.py new file mode 100644 index 0000000..25bb4da --- /dev/null +++ b/tests/test_files/test_cas_generators.py @@ -0,0 +1,42 @@ +from random import Random + +from cassis import Cas, TypeSystem +from cassis.typesystem import * + + +class MultiTypeRandomCasGenerator: + def __init__(self): + self.type_count = 10 + self.size = 10 + self.minimum_width = 0 + self.rnd = Random() + + def generate_type_system(self) -> TypeSystem: + typesystem = TypeSystem() + types = [] + + for ti in range(0, self.type_count): + type_name = f"test.Type{ti + 1}" + if self.rnd.randint(0, 1) == 0 or not types: + typesystem.create_type(type_name, TYPE_NAME_ANNOTATION) + else: + typesystem.create_type(type_name, self.rnd.choice(types)) + 
types.append(type_name) + + return typesystem + + def generate_cas(self, typesystem: TypeSystem) -> Cas: + cas = Cas(typesystem) + + types = [t for t in typesystem.get_types()] + types.remove(cas.typesystem.get_type(TYPE_NAME_DOCUMENT_ANNOTATION)) + self.rnd.shuffle(types) + + for n in range(0, self.size): + for T in types: + begin = self.rnd.randint(0, 100) + end = self.rnd.randint(0, 30) + self.minimum_width + fs = T(begin=begin, end=end) + cas.add_annotation(fs) + + return cas diff --git a/tests/test_json.py b/tests/test_json.py index a7aaaab..4633fd9 100644 --- a/tests/test_json.py +++ b/tests/test_json.py @@ -1,6 +1,7 @@ import json from tests.fixtures import * +from tests.test_files.test_cas_generators import MultiTypeRandomCasGenerator from tests.util import assert_json_equal FIXTURE_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "test_files") @@ -24,3 +25,18 @@ def test_deserialization_serialization(json_path): actual_json = cas.to_json() assert_json_equal(actual_json, expected_json) + + +def test_serialization_deserialization(): + generator = MultiTypeRandomCasGenerator() + for i in range(0, 10): + generator.size = (i + 1) * 10 + generator.type_count = i + 1 + typesystem = generator.generate_type_system() + randomized_cas = generator.generate_cas(typesystem) + expected_json = randomized_cas.to_json(pretty_print=True) + + loaded_cas = load_cas_from_json(expected_json) + actual_json = loaded_cas.to_json() + + assert_json_equal(actual_json, expected_json) diff --git a/tests/util.py b/tests/util.py index de1d26b..129cc84 100644 --- a/tests/util.py +++ b/tests/util.py @@ -50,7 +50,13 @@ def assert_json_equal(actual: str, expected: Union[IO, str]): Throws: AssertionError when json(actual) != json(expected) """ - actual_json = json.dumps(json.loads(actual), sort_keys=True, indent=2) + if isinstance(actual, str): + actual = json.loads(actual) + + if isinstance(expected, str): + expected = json.loads(expected) + + actual_json = json.dumps(actual, 
sort_keys=True, indent=2) expected_json = json.dumps(expected, sort_keys=True, indent=2) try: From 4f462be7841eb1a3c316f7bdc4f1ca0cde52d148 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Fri, 13 Aug 2021 17:34:49 +0200 Subject: [PATCH 06/22] #168 - Experimental JSON CAS support - Added another generator for random CASes - Added more tests - Commented out all testing of arrays in the new generator since array handling in cassis seems to have a few conceptual problems which need to be looked at first --- cassis/cas.py | 2 + cassis/json.py | 47 ++++++----- cassis/typesystem.py | 1 - tests/test_files/test_cas_generators.py | 102 ++++++++++++++++++++++++ tests/test_json.py | 23 +++++- 5 files changed, 150 insertions(+), 25 deletions(-) diff --git a/cassis/cas.py b/cassis/cas.py index ca74543..257bcbd 100644 --- a/cassis/cas.py +++ b/cassis/cas.py @@ -13,6 +13,8 @@ _validator_optional_string = validators.optional(validators.instance_of(str)) +NAME_DEFAULT_SOFA = "_InitialView" + class IdGenerator: def __init__(self, initial_id: int = 1): diff --git a/cassis/json.py b/cassis/json.py index 34cac0e..14b2e5d 100644 --- a/cassis/json.py +++ b/cassis/json.py @@ -3,7 +3,7 @@ from collections import OrderedDict from io import TextIOWrapper -from cassis.cas import Cas, IdGenerator, Sofa, View +from cassis.cas import NAME_DEFAULT_SOFA, Cas, IdGenerator, Sofa, View from cassis.typesystem import * RESERVED_FIELD_PREFIX = "%" @@ -116,16 +116,16 @@ def _parse_type(self, typesystem: TypeSystem, type_name: str, json_type: Dict[st description = json_type.get(DESCRIPTION_FIELD) new_type = typesystem.create_type(type_name, super_type_name, description=description) - for key, value in json_type.items(): + for key, json_feature in json_type.items(): if key.startswith(RESERVED_FIELD_PREFIX): continue typesystem.add_feature( new_type, name=key, - rangeTypeName=json_type[RANGE_FIELD], - description=json_type.get(DESCRIPTION_FIELD), -
elementType=json_type.get(ELEMENT_TYPE_FIELD), - multipleReferencesAllowed=json_type.get(MULTIPLE_REFERENCES_ALLOWED_FIELD), + rangeTypeName=json_feature[RANGE_FIELD], + description=json_feature.get(DESCRIPTION_FIELD), + elementType=json_feature.get(ELEMENT_TYPE_FIELD), + multipleReferencesAllowed=json_feature.get(MULTIPLE_REFERENCES_ALLOWED_FIELD), ) def _get_or_create_view( @@ -180,27 +180,32 @@ def _parse_feature_structure( if AnnotationType.name == TYPE_NAME_BYTE_ARRAY: attributes["elements"] = base64.b64decode(attributes.get(ELEMENTS_FIELD)) - self._resolve_references(attributes, feature_structures) self._strip_reserved_json_keys(attributes) - self._max_xmi_id = max(attributes["xmiID"], self._max_xmi_id) - return AnnotationType(**attributes) - - def _resolve_references(self, attributes: Dict[str, any], feature_structures: Dict[int, any]): + ref_features = {} for key, value in list(attributes.items()): if key.startswith(REF_FEATURE_PREFIX): + ref_features[key[1:]] = value attributes.pop(key) - feature_name = key[1:] - target_fs = feature_structures.get(value) - if target_fs: - # Resolve id-ref now - attributes[feature_name] = target_fs - else: - # Resolve id-ref at the end of processing - def fix_up(): - attributes[feature_name] = feature_structures.get(value) - self._post_processors.append(fix_up) + self._max_xmi_id = max(attributes["xmiID"], self._max_xmi_id) + fs = AnnotationType(**attributes) + + self._resolve_references(fs, ref_features, feature_structures) + return fs + + def _resolve_references(self, fs, ref_features: Dict[str, any], feature_structures: Dict[int, any]): + for key, value in ref_features.items(): + target_fs = feature_structures.get(value) + if target_fs: + # Resolve id-ref now + setattr(fs, key, target_fs) + else: + # Resolve id-ref at the end of processing + def fix_up(): + setattr(fs, key, feature_structures.get(value)) + + self._post_processors.append(fix_up) def _strip_reserved_json_keys( self, diff --git a/cassis/typesystem.py 
b/cassis/typesystem.py index c32d379..92898de 100644 --- a/cassis/typesystem.py +++ b/cassis/typesystem.py @@ -39,7 +39,6 @@ TYPE_NAME_FS_HASH_SET = UIMA_CAS_PREFIX + "FSHashSet" TYPE_NAME_ANNOTATION_BASE = UIMA_CAS_PREFIX + "AnnotationBase" -NAME_DEFAULT_SOFA = "_InitialView" TYPE_NAME_SOFA = UIMA_CAS_PREFIX + "Sofa" FEATURE_BASE_NAME_SOFANUM = "sofaNum" FEATURE_BASE_NAME_SOFAID = "sofaID" diff --git a/tests/test_files/test_cas_generators.py b/tests/test_files/test_cas_generators.py index 25bb4da..3284da6 100644 --- a/tests/test_files/test_cas_generators.py +++ b/tests/test_files/test_cas_generators.py @@ -40,3 +40,105 @@ def generate_cas(self, typesystem: TypeSystem) -> Cas: cas.add_annotation(fs) return cas + + +class MultiFeatureRandomCasGenerator: + STRING_VALUES = ["abc", "abcdef", None, "", "ghijklm", "a", "b"] + BYTE_VALUES = [1, 0, -1, 127, -128, 9, -9] + LONG_VALUES = [1, 0, -1, 9223372036854775807, -9223372036854775808, 11, -11] + SHORT_VALUES = [1, 0, -1, 32767, -32768, 22, -22] + DOUBLE_VALUES = [1, 0, -1, 999999999999, -999999999999, 33, -33.33] + FLOAT_VALUES = [1, 0, -1, 999999999999, -999999999999, 17, -22.33] + BOOL_VALUES = [True, False] + + def __init__(self): + self.size = 10 + self.rnd = Random() + + def generate_type_system(self) -> TypeSystem: + typesystem = TypeSystem() + Akof = typesystem.create_type("akof", TYPE_NAME_TOP, "all kinds of features") + typesystem.add_feature(Akof, "akofInt", TYPE_NAME_INTEGER) + typesystem.add_feature(Akof, "akofFs", TYPE_NAME_TOP) + typesystem.add_feature(Akof, "akofFloat", TYPE_NAME_FLOAT) + typesystem.add_feature(Akof, "akofDouble", TYPE_NAME_DOUBLE) + typesystem.add_feature(Akof, "akofLong", TYPE_NAME_LONG) + typesystem.add_feature(Akof, "akofShort", TYPE_NAME_SHORT) + typesystem.add_feature(Akof, "akofByte", TYPE_NAME_BYTE) + typesystem.add_feature(Akof, "akofBoolean", TYPE_NAME_BOOLEAN) + typesystem.add_feature(Akof, "akofString", TYPE_NAME_STRING) + # typesystem.add_feature(Akof, "akofAInt", 
TYPE_NAME_INTEGER_ARRAY) + # typesystem.add_feature(Akof, "akofAFs", TYPE_NAME_FS_ARRAY) + # typesystem.add_feature(Akof, "akofAFloat", TYPE_NAME_FLOAT_ARRAY) + # typesystem.add_feature(Akof, "akofADouble", TYPE_NAME_DOUBLE_ARRAY) + # typesystem.add_feature(Akof, "akofALong", TYPE_NAME_LONG_ARRAY) + # typesystem.add_feature(Akof, "akofAShort", TYPE_NAME_SHORT_ARRAY) + # typesystem.add_feature(Akof, "akofAByte", TYPE_NAME_BYTE_ARRAY) + # typesystem.add_feature(Akof, "akofABoolean", TYPE_NAME_BOOLEAN_ARRAY) + # typesystem.add_feature(Akof, "akofAString", TYPE_NAME_STRING_ARRAY) + return typesystem + + def generate_cas(self, typesystem: TypeSystem) -> Cas: + feature_structures = [] + + cas = Cas(typesystem) + + for i in range(0, self.size): + feature_structures.append(self._makeAkof(cas)) + + # Randomly link feature structures to each other +# FSArray = cas.typesystem.get_type(TYPE_NAME_FS_ARRAY) + for fs in feature_structures: + fs.akofFs = self.rnd.choice(feature_structures) +# fs.akofAFs = FSArray( +# elements=[self.rnd.choice(feature_structures) for i in range(0, self.rnd.randint(1, 3))] +# ) + + cas.add_annotations(feature_structures) + + return cas + + def _makeAkof(self, cas: Cas) -> Any: + Akof = cas.typesystem.get_type("akof") + # IntegerArray = cas.typesystem.get_type(TYPE_NAME_INTEGER_ARRAY) + # FloatArray = cas.typesystem.get_type(TYPE_NAME_FLOAT_ARRAY) + # DoubleArray = cas.typesystem.get_type(TYPE_NAME_DOUBLE_ARRAY) + # LongArray = cas.typesystem.get_type(TYPE_NAME_LONG_ARRAY) + # ShortArray = cas.typesystem.get_type(TYPE_NAME_SHORT_ARRAY) + # ByteArray = cas.typesystem.get_type(TYPE_NAME_BYTE_ARRAY) + # BooleanArray = cas.typesystem.get_type(TYPE_NAME_BOOLEAN_ARRAY) + # StringArray = cas.typesystem.get_type(TYPE_NAME_STRING_ARRAY) + akof = Akof() + akof.akofInt = self.rnd.randint(-2147483648, 2147483647) + akof.akofFloat = self.rnd.choice(self.FLOAT_VALUES) + akof.akofDouble = self.rnd.choice(self.DOUBLE_VALUES) + akof.akofLong = 
self.rnd.choice(self.LONG_VALUES) + akof.akofShort = self.rnd.choice(self.SHORT_VALUES) + akof.akofByte = self.rnd.choice(self.BYTE_VALUES) + akof.akofBoolean = self.rnd.choice(self.BOOL_VALUES) + akof.akofString = self.rnd.choice(self.STRING_VALUES) + # akof.akofAInt = IntegerArray( + # elements=[self.rnd.randint(-2147483648, 2147483647) for i in range(0, self.rnd.randint(1, 3))] + # ) + # akof.akofAFloat = FloatArray( + # elements=[self.rnd.choice(self.FLOAT_VALUES) for i in range(0, self.rnd.randint(1, 3))] + # ) + # akof.akofADouble = DoubleArray( + # elements=[self.rnd.choice(self.DOUBLE_VALUES) for i in range(0, self.rnd.randint(1, 3))] + # ) + # akof.akofALong = LongArray( + # elements=[self.rnd.choice(self.LONG_VALUES) for i in range(0, self.rnd.randint(1, 3))] + # ) + # akof.akofAShort = ShortArray( + # elements=[self.rnd.choice(self.SHORT_VALUES) for i in range(0, self.rnd.randint(1, 3))] + # ) + # akof.akofAByte = ByteArray( + # elements=[self.rnd.choice(self.BYTE_VALUES) for i in range(0, self.rnd.randint(1, 3))] + # ) + # akof.akofABoolean = BooleanArray( + # elements=[self.rnd.choice(self.BOOL_VALUES) for i in range(0, self.rnd.randint(1, 3))] + # ) + # akof.akofAString = StringArray( + # elements=[self.rnd.choice(self.STRING_VALUES) for i in range(0, self.rnd.randint(1, 3))] + # ) + return akof diff --git a/tests/test_json.py b/tests/test_json.py index 4633fd9..4b10495 100644 --- a/tests/test_json.py +++ b/tests/test_json.py @@ -1,7 +1,8 @@ import json +from cassis.cas import NAME_DEFAULT_SOFA from tests.fixtures import * -from tests.test_files.test_cas_generators import MultiTypeRandomCasGenerator +from tests.test_files.test_cas_generators import MultiFeatureRandomCasGenerator, MultiTypeRandomCasGenerator from tests.util import assert_json_equal FIXTURE_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "test_files") @@ -27,14 +28,30 @@ def test_deserialization_serialization(json_path): assert_json_equal(actual_json, expected_json) -def 
test_serialization_deserialization(): +def test_multi_type_random_serialization_deserialization(): generator = MultiTypeRandomCasGenerator() for i in range(0, 10): generator.size = (i + 1) * 10 generator.type_count = i + 1 typesystem = generator.generate_type_system() randomized_cas = generator.generate_cas(typesystem) - expected_json = randomized_cas.to_json(pretty_print=True) + print(f"CAS size: {sum(len(view.get_all_annotations()) for view in randomized_cas.views)}") + expected_json = randomized_cas.to_json() + + loaded_cas = load_cas_from_json(expected_json) + actual_json = loaded_cas.to_json() + + assert_json_equal(actual_json, expected_json) + + +def test_multi_feature_random_serialization_deserialization(): + generator = MultiFeatureRandomCasGenerator() + for i in range(0, 10): + generator.size = (i + 1) * 10 + typesystem = generator.generate_type_system() + randomized_cas = generator.generate_cas(typesystem) + print(f"CAS size: {sum(len(view.get_all_annotations()) for view in randomized_cas.views)}") + expected_json = randomized_cas.to_json() loaded_cas = load_cas_from_json(expected_json) actual_json = loaded_cas.to_json() From cbf086ecf1ab02ebfbd6b2338e5133529c4e3a34 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Fri, 13 Aug 2021 18:19:36 +0200 Subject: [PATCH 07/22] #168 - Experimental JSON CAS support - Revert change to stripping the null FS - Changed reference data so that IDs start at 1 and not at 0 leaving 0 reserved for the null FS --- cassis/cas.py | 5 +---- .../ser-ref/casWithSofaDataArray/data.json | 8 ++++---- .../ser-ref/casWithSofaDataURI/data.json | 4 ++-- .../json/fs_as_array/ser-ref/casWithText/data.json | 10 +++++----- .../ser-ref/casWithTextAndAnnotation/data.json | 14 +++++++------- 5 files changed, 19 insertions(+), 22 deletions(-) diff --git a/cassis/cas.py b/cassis/cas.py index 257bcbd..999771d 100644 --- a/cassis/cas.py +++ b/cassis/cas.py @@ -646,10 +646,7 @@ def _find_all_fs(self) -> Iterable[FeatureStructure]: 
openlist.append(referenced_fs) # We do not want to return cas:NULL here as we handle serializing it later - for fs_id, fs in list(all_fs.items()): - if fs.type == "uima.cas.NULL": - all_fs.pop(fs_id) - + all_fs.pop(0, None) yield from all_fs.values() def _get_next_xmi_id(self) -> int: diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json index b732eaf..054d442 100644 --- a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json @@ -2,20 +2,20 @@ "%TYPES" : { }, "%VIEWS" : { "_InitialView" : { - "%SOFA" : 0, + "%SOFA" : 1, "%INDEX" : [ ] } }, "%FEATURE_STRUCTURES" : [ { - "%ID" : 1, + "%ID" : 2, "%TYPE" : "uima.cas.ByteArray", "%ELEMENTS" : "VGhpcyBpcyBhIHRlc3Q=" }, { - "%ID" : 0, + "%ID" : 1, "%TYPE" : "uima.cas.Sofa", "sofaNum" : 1, "sofaID" : "_InitialView", "mimeType" : "text/plain", - "@sofaArray" : 1 + "@sofaArray" : 2 } ] } \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json index 678d9e2..9375241 100644 --- a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json @@ -2,12 +2,12 @@ "%TYPES" : { }, "%VIEWS" : { "_InitialView" : { - "%SOFA" : 0, + "%SOFA" : 1, "%INDEX" : [ ] } }, "%FEATURE_STRUCTURES" : [ { - "%ID" : 0, + "%ID" : 1, "%TYPE" : "uima.cas.Sofa", "sofaNum" : 1, "sofaID" : "_InitialView", diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json index 416e6e1..c8fd81a 100644 --- a/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json @@ -2,21 +2,21 @@ "%TYPES" : { }, 
"%VIEWS" : { "_InitialView" : { - "%SOFA" : 0, - "%INDEX" : [ 1 ] + "%SOFA" : 1, + "%INDEX" : [ 2 ] } }, "%FEATURE_STRUCTURES" : [ { - "%ID" : 0, + "%ID" : 1, "%TYPE" : "uima.cas.Sofa", "sofaNum" : 1, "sofaID" : "_InitialView", "mimeType" : "text", "sofaString" : "This is a test." }, { - "%ID" : 1, + "%ID" : 2, "%TYPE" : "uima.tcas.DocumentAnnotation", - "@sofa" : 0, + "@sofa" : 1, "begin" : 0, "end" : 15, "language" : "x-unspecified" diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/data.json index aa71704..8debb1f 100644 --- a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/data.json +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/data.json @@ -2,28 +2,28 @@ "%TYPES" : { }, "%VIEWS" : { "_InitialView" : { - "%SOFA" : 0, - "%INDEX" : [ 1, 2 ] + "%SOFA" : 1, + "%INDEX" : [ 2, 3 ] } }, "%FEATURE_STRUCTURES" : [ { - "%ID" : 0, + "%ID" : 1, "%TYPE" : "uima.cas.Sofa", "sofaNum" : 1, "sofaID" : "_InitialView", "mimeType" : "text", "sofaString" : "This is a test." 
}, { - "%ID" : 1, + "%ID" : 2, "%TYPE" : "uima.tcas.DocumentAnnotation", - "@sofa" : 0, + "@sofa" : 1, "begin" : 0, "end" : 15, "language" : "x-unspecified" }, { - "%ID" : 2, + "%ID" : 3, "%TYPE" : "uima.tcas.Annotation", - "@sofa" : 0, + "@sofa" : 1, "begin" : 0, "end" : 15 } ] From 3b50a8e6e3839f4ead3b06d04b05fe8220845113 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Thu, 19 Aug 2021 12:31:47 +0200 Subject: [PATCH 08/22] #168 - Experimental JSON CAS support - Fix array support - Enable array tests --- cassis/json.py | 57 ++++++++---- tests/test_files/test_cas_generators.py | 112 ++++++++++++------------ tests/test_json.py | 2 +- tests/util.py | 6 +- 4 files changed, 103 insertions(+), 74 deletions(-) diff --git a/cassis/json.py b/cassis/json.py index 14b2e5d..bd7655f 100644 --- a/cassis/json.py +++ b/cassis/json.py @@ -119,7 +119,7 @@ def _parse_type(self, typesystem: TypeSystem, type_name: str, json_type: Dict[st for key, json_feature in json_type.items(): if key.startswith(RESERVED_FIELD_PREFIX): continue - typesystem.add_feature( + typesystem.create_feature( new_type, name=key, rangeTypeName=json_feature[RANGE_FIELD], @@ -177,8 +177,14 @@ def _parse_feature_structure( if "type" in attributes: attributes["type_"] = attributes.pop("type") - if AnnotationType.name == TYPE_NAME_BYTE_ARRAY: - attributes["elements"] = base64.b64decode(attributes.get(ELEMENTS_FIELD)) + if typesystem.is_primitive_array(AnnotationType.name): + attributes["elements"] = self._parse_primitive_array(AnnotationType.name, json_fs.get(ELEMENTS_FIELD)) + elif AnnotationType.name == TYPE_NAME_FS_ARRAY: + # Resolve id-ref at the end of processing + def fix_up(elements): + return lambda: setattr(fs, "elements", [feature_structures.get(e) for e in elements]) + + self._post_processors.append(fix_up(json_fs.get(ELEMENTS_FIELD))) self._strip_reserved_json_keys(attributes) @@ -194,6 +200,12 @@ def _parse_feature_structure( self._resolve_references(fs, ref_features, 
feature_structures) return fs + def _parse_primitive_array(self, type_name: str, elements: [list, str]) -> List: + if type_name == TYPE_NAME_BYTE_ARRAY: + return base64.b64decode(elements) + else: + return elements + def _resolve_references(self, fs, ref_features: Dict[str, any], feature_structures: Dict[int, any]): for key, value in ref_features.items(): target_fs = feature_structures.get(value) @@ -202,10 +214,10 @@ def _resolve_references(self, fs, ref_features: Dict[str, any], feature_structur setattr(fs, key, target_fs) else: # Resolve id-ref at the end of processing - def fix_up(): - setattr(fs, key, feature_structures.get(value)) + def fix_up(k, v): + return lambda: setattr(fs, k, feature_structures.get(v)) - self._post_processors.append(fix_up) + self._post_processors.append(fix_up(key, value)) def _strip_reserved_json_keys( self, @@ -243,7 +255,7 @@ def serialize(self, sink: Union[IO, str], cas: Cas, pretty_print=True): feature_structures.append(json_sofa_fs) # Find all fs, even the ones that are not directly added to a sofa - for fs in sorted(cas._find_all_fs(), key=lambda a: a.xmiID): + for fs in sorted(cas._find_all_fs(include_inlinable_arrays=True), key=lambda a: a.xmiID): json_fs = self._serialize_feature_structure(cas, fs) feature_structures.append(json_fs) @@ -304,6 +316,20 @@ def _serialize_feature_structure(self, cas, fs) -> dict: ts = cas.typesystem t = ts.get_type(fs.type) + + if t.name == TYPE_NAME_BYTE_ARRAY: + if fs.elements: + json_fs[ELEMENTS_FIELD] = base64.b64encode(bytes(fs.elements)).decode("ascii") + return json_fs + elif ts.is_primitive_array(t.name): + if fs.elements: + json_fs[ELEMENTS_FIELD] = fs.elements + return json_fs + elif TYPE_NAME_FS_ARRAY == t.name: + if fs.elements: + json_fs[ELEMENTS_FIELD] = [self._serialize_ref(e) for e in fs.elements] + return json_fs + for feature in t.all_features: if feature.name in CasJsonSerializer._COMMON_FIELD_NAMES: continue @@ -324,19 +350,20 @@ def _serialize_feature_structure(self, cas, fs) 
-> dict: # sofa: Sofa = getattr(fs, "sofa") # value = sofa._offset_converter.cassis_to_uima(value) - if t.name == TYPE_NAME_BYTE_ARRAY and feature_name == "elements": - json_fs[ELEMENTS_FIELD] = base64.b64encode(value).decode("ascii") - elif t.supertypeName == TYPE_NAME_ARRAY_BASE and feature_name == "elements": - json_fs[ELEMENTS_FIELD] = value - elif ts.is_primitive(feature.rangeTypeName): + if ts.is_primitive(feature.rangeTypeName): json_fs[feature_name] = value - elif ts.is_collection(fs.type, feature): - json_fs[REF_FEATURE_PREFIX + feature_name] = value.xmiID else: # We need to encode non-primitive features as a reference - json_fs[REF_FEATURE_PREFIX + feature_name] = value.xmiID + json_fs[REF_FEATURE_PREFIX + feature_name] = self._serialize_ref(value) return json_fs + def _serialize_ref(self, fs) -> int: + if not fs: + return None + + return fs.xmiID + + def _serialize_view(self, view: View): return {VIEW_SOFA_FIELD: view.sofa.xmiID, VIEW_INDEX_FIELD: sorted(x.xmiID for x in view.get_all_annotations())} diff --git a/tests/test_files/test_cas_generators.py b/tests/test_files/test_cas_generators.py index 3284da6..2cd6e03 100644 --- a/tests/test_files/test_cas_generators.py +++ b/tests/test_files/test_cas_generators.py @@ -44,7 +44,9 @@ def generate_cas(self, typesystem: TypeSystem) -> Cas: class MultiFeatureRandomCasGenerator: STRING_VALUES = ["abc", "abcdef", None, "", "ghijklm", "a", "b"] - BYTE_VALUES = [1, 0, -1, 127, -128, 9, -9] + # In Java, bytes go from -128 to 127, in Python from 0 to 255. 
+ # BYTE_VALUES = [1, 0, -1, 127, -128, 9, -9] + BYTE_VALUES = [1, 0, 255, 0, 9] LONG_VALUES = [1, 0, -1, 9223372036854775807, -9223372036854775808, 11, -11] SHORT_VALUES = [1, 0, -1, 32767, -32768, 22, -22] DOUBLE_VALUES = [1, 0, -1, 999999999999, -999999999999, 33, -33.33] @@ -58,24 +60,24 @@ def __init__(self): def generate_type_system(self) -> TypeSystem: typesystem = TypeSystem() Akof = typesystem.create_type("akof", TYPE_NAME_TOP, "all kinds of features") - typesystem.add_feature(Akof, "akofInt", TYPE_NAME_INTEGER) - typesystem.add_feature(Akof, "akofFs", TYPE_NAME_TOP) - typesystem.add_feature(Akof, "akofFloat", TYPE_NAME_FLOAT) - typesystem.add_feature(Akof, "akofDouble", TYPE_NAME_DOUBLE) - typesystem.add_feature(Akof, "akofLong", TYPE_NAME_LONG) - typesystem.add_feature(Akof, "akofShort", TYPE_NAME_SHORT) - typesystem.add_feature(Akof, "akofByte", TYPE_NAME_BYTE) - typesystem.add_feature(Akof, "akofBoolean", TYPE_NAME_BOOLEAN) - typesystem.add_feature(Akof, "akofString", TYPE_NAME_STRING) - # typesystem.add_feature(Akof, "akofAInt", TYPE_NAME_INTEGER_ARRAY) - # typesystem.add_feature(Akof, "akofAFs", TYPE_NAME_FS_ARRAY) - # typesystem.add_feature(Akof, "akofAFloat", TYPE_NAME_FLOAT_ARRAY) - # typesystem.add_feature(Akof, "akofADouble", TYPE_NAME_DOUBLE_ARRAY) - # typesystem.add_feature(Akof, "akofALong", TYPE_NAME_LONG_ARRAY) - # typesystem.add_feature(Akof, "akofAShort", TYPE_NAME_SHORT_ARRAY) - # typesystem.add_feature(Akof, "akofAByte", TYPE_NAME_BYTE_ARRAY) - # typesystem.add_feature(Akof, "akofABoolean", TYPE_NAME_BOOLEAN_ARRAY) - # typesystem.add_feature(Akof, "akofAString", TYPE_NAME_STRING_ARRAY) + typesystem.create_feature(Akof, "akofInt", TYPE_NAME_INTEGER) + typesystem.create_feature(Akof, "akofFs", TYPE_NAME_TOP) + typesystem.create_feature(Akof, "akofFloat", TYPE_NAME_FLOAT) + typesystem.create_feature(Akof, "akofDouble", TYPE_NAME_DOUBLE) + typesystem.create_feature(Akof, "akofLong", TYPE_NAME_LONG) + typesystem.create_feature(Akof, 
"akofShort", TYPE_NAME_SHORT) + typesystem.create_feature(Akof, "akofByte", TYPE_NAME_BYTE) + typesystem.create_feature(Akof, "akofBoolean", TYPE_NAME_BOOLEAN) + typesystem.create_feature(Akof, "akofString", TYPE_NAME_STRING) + typesystem.create_feature(Akof, "akofAInt", TYPE_NAME_INTEGER_ARRAY) + typesystem.create_feature(Akof, "akofAFs", TYPE_NAME_FS_ARRAY) + typesystem.create_feature(Akof, "akofAFloat", TYPE_NAME_FLOAT_ARRAY) + typesystem.create_feature(Akof, "akofADouble", TYPE_NAME_DOUBLE_ARRAY) + typesystem.create_feature(Akof, "akofALong", TYPE_NAME_LONG_ARRAY) + typesystem.create_feature(Akof, "akofAShort", TYPE_NAME_SHORT_ARRAY) + typesystem.create_feature(Akof, "akofAByte", TYPE_NAME_BYTE_ARRAY) + typesystem.create_feature(Akof, "akofABoolean", TYPE_NAME_BOOLEAN_ARRAY) + typesystem.create_feature(Akof, "akofAString", TYPE_NAME_STRING_ARRAY) return typesystem def generate_cas(self, typesystem: TypeSystem) -> Cas: @@ -87,12 +89,12 @@ def generate_cas(self, typesystem: TypeSystem) -> Cas: feature_structures.append(self._makeAkof(cas)) # Randomly link feature structures to each other -# FSArray = cas.typesystem.get_type(TYPE_NAME_FS_ARRAY) + FSArray = cas.typesystem.get_type(TYPE_NAME_FS_ARRAY) for fs in feature_structures: fs.akofFs = self.rnd.choice(feature_structures) -# fs.akofAFs = FSArray( -# elements=[self.rnd.choice(feature_structures) for i in range(0, self.rnd.randint(1, 3))] -# ) + fs.akofAFs = FSArray( + elements=[self.rnd.choice(feature_structures) for i in range(0, self.rnd.randint(1, 3))] + ) cas.add_annotations(feature_structures) @@ -100,14 +102,14 @@ def generate_cas(self, typesystem: TypeSystem) -> Cas: def _makeAkof(self, cas: Cas) -> Any: Akof = cas.typesystem.get_type("akof") - # IntegerArray = cas.typesystem.get_type(TYPE_NAME_INTEGER_ARRAY) - # FloatArray = cas.typesystem.get_type(TYPE_NAME_FLOAT_ARRAY) - # DoubleArray = cas.typesystem.get_type(TYPE_NAME_DOUBLE_ARRAY) - # LongArray = cas.typesystem.get_type(TYPE_NAME_LONG_ARRAY) - # 
ShortArray = cas.typesystem.get_type(TYPE_NAME_SHORT_ARRAY) - # ByteArray = cas.typesystem.get_type(TYPE_NAME_BYTE_ARRAY) - # BooleanArray = cas.typesystem.get_type(TYPE_NAME_BOOLEAN_ARRAY) - # StringArray = cas.typesystem.get_type(TYPE_NAME_STRING_ARRAY) + IntegerArray = cas.typesystem.get_type(TYPE_NAME_INTEGER_ARRAY) + FloatArray = cas.typesystem.get_type(TYPE_NAME_FLOAT_ARRAY) + DoubleArray = cas.typesystem.get_type(TYPE_NAME_DOUBLE_ARRAY) + LongArray = cas.typesystem.get_type(TYPE_NAME_LONG_ARRAY) + ShortArray = cas.typesystem.get_type(TYPE_NAME_SHORT_ARRAY) + ByteArray = cas.typesystem.get_type(TYPE_NAME_BYTE_ARRAY) + BooleanArray = cas.typesystem.get_type(TYPE_NAME_BOOLEAN_ARRAY) + StringArray = cas.typesystem.get_type(TYPE_NAME_STRING_ARRAY) akof = Akof() akof.akofInt = self.rnd.randint(-2147483648, 2147483647) akof.akofFloat = self.rnd.choice(self.FLOAT_VALUES) @@ -117,28 +119,28 @@ def _makeAkof(self, cas: Cas) -> Any: akof.akofByte = self.rnd.choice(self.BYTE_VALUES) akof.akofBoolean = self.rnd.choice(self.BOOL_VALUES) akof.akofString = self.rnd.choice(self.STRING_VALUES) - # akof.akofAInt = IntegerArray( - # elements=[self.rnd.randint(-2147483648, 2147483647) for i in range(0, self.rnd.randint(1, 3))] - # ) - # akof.akofAFloat = FloatArray( - # elements=[self.rnd.choice(self.FLOAT_VALUES) for i in range(0, self.rnd.randint(1, 3))] - # ) - # akof.akofADouble = DoubleArray( - # elements=[self.rnd.choice(self.DOUBLE_VALUES) for i in range(0, self.rnd.randint(1, 3))] - # ) - # akof.akofALong = LongArray( - # elements=[self.rnd.choice(self.LONG_VALUES) for i in range(0, self.rnd.randint(1, 3))] - # ) - # akof.akofAShort = ShortArray( - # elements=[self.rnd.choice(self.SHORT_VALUES) for i in range(0, self.rnd.randint(1, 3))] - # ) - # akof.akofAByte = ByteArray( - # elements=[self.rnd.choice(self.BYTE_VALUES) for i in range(0, self.rnd.randint(1, 3))] - # ) - # akof.akofABoolean = BooleanArray( - # elements=[self.rnd.choice(self.BOOL_VALUES) for i in range(0, 
self.rnd.randint(1, 3))] - # ) - # akof.akofAString = StringArray( - # elements=[self.rnd.choice(self.STRING_VALUES) for i in range(0, self.rnd.randint(1, 3))] - # ) + akof.akofAInt = IntegerArray( + elements=[self.rnd.randint(-2147483648, 2147483647) for i in range(0, self.rnd.randint(1, 3))] + ) + akof.akofAFloat = FloatArray( + elements=[self.rnd.choice(self.FLOAT_VALUES) for i in range(0, self.rnd.randint(1, 3))] + ) + akof.akofADouble = DoubleArray( + elements=[self.rnd.choice(self.DOUBLE_VALUES) for i in range(0, self.rnd.randint(1, 3))] + ) + akof.akofALong = LongArray( + elements=[self.rnd.choice(self.LONG_VALUES) for i in range(0, self.rnd.randint(1, 3))] + ) + akof.akofAShort = ShortArray( + elements=[self.rnd.choice(self.SHORT_VALUES) for i in range(0, self.rnd.randint(1, 3))] + ) + akof.akofAByte = ByteArray( + elements=[self.rnd.choice(self.BYTE_VALUES) for i in range(0, self.rnd.randint(1, 3))] + ) + akof.akofABoolean = BooleanArray( + elements=[self.rnd.choice(self.BOOL_VALUES) for i in range(0, self.rnd.randint(1, 3))] + ) + akof.akofAString = StringArray( + elements=[self.rnd.choice(self.STRING_VALUES) for i in range(0, self.rnd.randint(1, 3))] + ) return akof diff --git a/tests/test_json.py b/tests/test_json.py index 4b10495..f9591fc 100644 --- a/tests/test_json.py +++ b/tests/test_json.py @@ -25,7 +25,7 @@ def test_deserialization_serialization(json_path): actual_json = cas.to_json() - assert_json_equal(actual_json, expected_json) + assert_json_equal(actual_json, expected_json, sort_keys=True) def test_multi_type_random_serialization_deserialization(): diff --git a/tests/util.py b/tests/util.py index 129cc84..1c84f89 100644 --- a/tests/util.py +++ b/tests/util.py @@ -40,7 +40,7 @@ def assert_xml_equal(actual: Union[IO, str], expected: Union[IO, str]): raise e -def assert_json_equal(actual: str, expected: Union[IO, str]): +def assert_json_equal(actual: str, expected: Union[IO, str], sort_keys: bool = False): """Checks whether the JSON trees behind 
`actual` and `expected` are equal. Args: @@ -56,8 +56,8 @@ def assert_json_equal(actual: str, expected: Union[IO, str]): if isinstance(expected, str): expected = json.loads(expected) - actual_json = json.dumps(actual, sort_keys=True, indent=2) - expected_json = json.dumps(expected, sort_keys=True, indent=2) + actual_json = json.dumps(actual, sort_keys=sort_keys, indent=2) + expected_json = json.dumps(expected, sort_keys=sort_keys, indent=2) try: assert actual_json == expected_json From 59ceea4e47e6a9abb65774c3c381b441af000da3 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Fri, 20 Aug 2021 11:53:07 +0200 Subject: [PATCH 09/22] #168 - Experimental JSON CAS support - Change view members field name --- cassis/json.py | 6 +++--- .../json/fs_as_array/ser-ref/casWithSofaDataArray/data.json | 2 +- .../json/fs_as_array/ser-ref/casWithSofaDataURI/data.json | 2 +- .../json/fs_as_array/ser-ref/casWithText/data.json | 2 +- .../fs_as_array/ser-ref/casWithTextAndAnnotation/data.json | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cassis/json.py b/cassis/json.py index bd7655f..320e0f8 100644 --- a/cassis/json.py +++ b/cassis/json.py @@ -13,7 +13,7 @@ FEATURES_FIELD = RESERVED_FIELD_PREFIX + "FEATURES" VIEWS_FIELD = RESERVED_FIELD_PREFIX + "VIEWS" VIEW_SOFA_FIELD = RESERVED_FIELD_PREFIX + "SOFA" -VIEW_INDEX_FIELD = RESERVED_FIELD_PREFIX + "INDEX" +VIEW_MEMBERS_FIELD = RESERVED_FIELD_PREFIX + "MEMBERS" FEATURE_STRUCTURES_FIELD = RESERVED_FIELD_PREFIX + "FEATURE_STRUCTURES" REF_FEATURE_PREFIX = "@" NAME_FIELD = RESERVED_FIELD_PREFIX + "NAME" @@ -144,7 +144,7 @@ def _get_or_create_view( def _parse_view(self, cas: Cas, view_name: str, json_view: Dict[str, any], feature_structures: Dict[str, any]): view = self._get_or_create_view(cas, view_name) - for member_id in json_view[VIEW_INDEX_FIELD]: + for member_id in json_view[VIEW_MEMBERS_FIELD]: fs = feature_structures[member_id] view.add_annotation(fs, keep_id=True) @@ -365,7 +365,7 @@ def 
_serialize_ref(self, fs) -> int: def _serialize_view(self, view: View): - return {VIEW_SOFA_FIELD: view.sofa.xmiID, VIEW_INDEX_FIELD: sorted(x.xmiID for x in view.get_all_annotations())} + return {VIEW_SOFA_FIELD: view.sofa.xmiID, VIEW_MEMBERS_FIELD: sorted(x.xmiID for x in view.get_all_annotations())} def _to_external_type_name(self, type_name: str): if type_name.startswith("uima.noNamespace."): diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json index 054d442..edf6ddc 100644 --- a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json @@ -3,7 +3,7 @@ "%VIEWS" : { "_InitialView" : { "%SOFA" : 1, - "%INDEX" : [ ] + "%MEMBERS" : [ ] } }, "%FEATURE_STRUCTURES" : [ { diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json index 9375241..266ab55 100644 --- a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json @@ -3,7 +3,7 @@ "%VIEWS" : { "_InitialView" : { "%SOFA" : 1, - "%INDEX" : [ ] + "%MEMBERS" : [ ] } }, "%FEATURE_STRUCTURES" : [ { diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json index c8fd81a..1fe9f02 100644 --- a/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json @@ -3,7 +3,7 @@ "%VIEWS" : { "_InitialView" : { "%SOFA" : 1, - "%INDEX" : [ 2 ] + "%MEMBERS" : [ 2 ] } }, "%FEATURE_STRUCTURES" : [ { diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/data.json index 8debb1f..7879a33 100644 --- 
a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/data.json +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/data.json @@ -3,7 +3,7 @@ "%VIEWS" : { "_InitialView" : { "%SOFA" : 1, - "%INDEX" : [ 2, 3 ] + "%MEMBERS" : [ 2, 3 ] } }, "%FEATURE_STRUCTURES" : [ { From f42992f1bd2a82ef1b8219669f561101a7f260ba Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Wed, 25 Aug 2021 11:20:31 +0200 Subject: [PATCH 10/22] #168 - Experimental JSON CAS support - Formatting - Removed unused import --- cassis/json.py | 6 ++++-- cassis/xmi.py | 10 ++++++++-- tests/test_json.py | 1 - 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/cassis/json.py b/cassis/json.py index 320e0f8..b09bf42 100644 --- a/cassis/json.py +++ b/cassis/json.py @@ -363,9 +363,11 @@ def _serialize_ref(self, fs) -> int: return fs.xmiID - def _serialize_view(self, view: View): - return {VIEW_SOFA_FIELD: view.sofa.xmiID, VIEW_MEMBERS_FIELD: sorted(x.xmiID for x in view.get_all_annotations())} + return { + VIEW_SOFA_FIELD: view.sofa.xmiID, + VIEW_MEMBERS_FIELD: sorted(x.xmiID for x in view.get_all_annotations()), + } def _to_external_type_name(self, type_name: str): if type_name.startswith("uima.noNamespace."): diff --git a/cassis/xmi.py b/cassis/xmi.py index 9c7c753..9d4bf50 100644 --- a/cassis/xmi.py +++ b/cassis/xmi.py @@ -478,9 +478,15 @@ def _serialize_feature_structure(self, cas: Cas, root: etree.Element, fs: Featur for e in value.elements: child = etree.SubElement(elem, feature_name) child.text = e - elif ts.is_primitive_array(feature.rangeTypeName) and not feature.multipleReferencesAllowed and value.elements: + elif ( + ts.is_primitive_array(feature.rangeTypeName) + and not feature.multipleReferencesAllowed + and value.elements + ): elem.attrib[feature_name] = self._serialize_primitive_array(feature.rangeTypeName, value.elements) - elif feature.rangeTypeName == "uima.cas.FSArray" and not feature.multipleReferencesAllowed and value.elements: 
+ elif ( + feature.rangeTypeName == "uima.cas.FSArray" and not feature.multipleReferencesAllowed and value.elements + ): elem.attrib[feature_name] = " ".join(str(e.xmiID) for e in value.elements) elif feature_name == "sofa": elem.attrib[feature_name] = str(value.xmiID) diff --git a/tests/test_json.py b/tests/test_json.py index f9591fc..0765ca3 100644 --- a/tests/test_json.py +++ b/tests/test_json.py @@ -1,6 +1,5 @@ import json -from cassis.cas import NAME_DEFAULT_SOFA from tests.fixtures import * from tests.test_files.test_cas_generators import MultiFeatureRandomCasGenerator, MultiTypeRandomCasGenerator from tests.util import assert_json_equal From dafd693abd459110ba6a26e271f58e4585ba1277 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Wed, 25 Aug 2021 12:35:08 +0200 Subject: [PATCH 11/22] #168 - Experimental JSON CAS support - Adjust to changes from #190 --- cassis/json.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cassis/json.py b/cassis/json.py index b09bf42..ab66ef9 100644 --- a/cassis/json.py +++ b/cassis/json.py @@ -272,7 +272,7 @@ def serialize(self, sink: Union[IO, str], cas: Cas, pretty_print=True): def _serialize_type(self, type_: Type): type_name = self._to_external_type_name(type_.name) - supertype_name = self._to_external_type_name(type_.supertypeName) + supertype_name = self._to_external_type_name(type_.supertype.name) json_type = { NAME_FIELD: type_name, @@ -310,27 +310,27 @@ def _serialize_feature(self, json_type, feature: Feature): return json_feature def _serialize_feature_structure(self, cas, fs) -> dict: + ts = cas.typesystem + type_name = fs.type.name + json_fs = OrderedDict() json_fs[ID_FIELD] = fs.xmiID - json_fs[TYPE_FIELD] = fs.type + json_fs[TYPE_FIELD] = type_name - ts = cas.typesystem - t = ts.get_type(fs.type) - - if t.name == TYPE_NAME_BYTE_ARRAY: + if type_name == TYPE_NAME_BYTE_ARRAY: if fs.elements: json_fs[ELEMENTS_FIELD] = 
base64.b64encode(bytes(fs.elements)).decode("ascii") return json_fs - elif ts.is_primitive_array(t.name): + elif ts.is_primitive_array(type_name): if fs.elements: json_fs[ELEMENTS_FIELD] = fs.elements return json_fs - elif TYPE_NAME_FS_ARRAY == t.name: + elif TYPE_NAME_FS_ARRAY == type_name: if fs.elements: json_fs[ELEMENTS_FIELD] = [self._serialize_ref(e) for e in fs.elements] return json_fs - for feature in t.all_features: + for feature in fs.type.all_features: if feature.name in CasJsonSerializer._COMMON_FIELD_NAMES: continue From 026fb9df5c27ea6e803e955e1ceb31299246b9b8 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Wed, 25 Aug 2021 15:01:27 +0200 Subject: [PATCH 12/22] #168 - Experimental JSON CAS support - Adjust to changes from #190 --- cassis/json.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/cassis/json.py b/cassis/json.py index ab66ef9..4f962ab 100644 --- a/cassis/json.py +++ b/cassis/json.py @@ -122,7 +122,7 @@ def _parse_type(self, typesystem: TypeSystem, type_name: str, json_type: Dict[st typesystem.create_feature( new_type, name=key, - rangeTypeName=json_feature[RANGE_FIELD], + rangeType=json_feature[RANGE_FIELD], description=json_feature.get(DESCRIPTION_FIELD), elementType=json_feature.get(ELEMENT_TYPE_FIELD), multipleReferencesAllowed=json_feature.get(MULTIPLE_REFERENCES_ALLOWED_FIELD), @@ -249,14 +249,14 @@ def serialize(self, sink: Union[IO, str], cas: Cas, pretty_print=True): for view in cas.views: views[view.sofa.sofaID] = self._serialize_view(view) if view.sofa.sofaArray: - json_sofa_array_fs = self._serialize_feature_structure(cas, view.sofa.sofaArray) + json_sofa_array_fs = self._serialize_feature_structure(view.sofa.sofaArray) feature_structures.append(json_sofa_array_fs) - json_sofa_fs = self._serialize_feature_structure(cas, view.sofa) + json_sofa_fs = self._serialize_feature_structure(view.sofa) feature_structures.append(json_sofa_fs) # Find all fs, even the ones that are not 
directly added to a sofa for fs in sorted(cas._find_all_fs(include_inlinable_arrays=True), key=lambda a: a.xmiID): - json_fs = self._serialize_feature_structure(cas, fs) + json_fs = self._serialize_feature_structure(fs) feature_structures.append(json_fs) if isinstance(sink, BytesIO): @@ -295,7 +295,7 @@ def _serialize_feature(self, json_type, feature: Feature): json_feature = { NAME_FIELD: feature_name, - RANGE_FIELD: self._to_external_type_name(feature.rangeTypeName), + RANGE_FIELD: self._to_external_type_name(feature.rangeType.name), } if feature.description: @@ -309,8 +309,7 @@ def _serialize_feature(self, json_type, feature: Feature): return json_feature - def _serialize_feature_structure(self, cas, fs) -> dict: - ts = cas.typesystem + def _serialize_feature_structure(self, fs) -> dict: type_name = fs.type.name json_fs = OrderedDict() @@ -321,7 +320,7 @@ def _serialize_feature_structure(self, cas, fs) -> dict: if fs.elements: json_fs[ELEMENTS_FIELD] = base64.b64encode(bytes(fs.elements)).decode("ascii") return json_fs - elif ts.is_primitive_array(type_name): + elif is_primitive_array(fs.type): if fs.elements: json_fs[ELEMENTS_FIELD] = fs.elements return json_fs @@ -350,7 +349,7 @@ def _serialize_feature_structure(self, cas, fs) -> dict: # sofa: Sofa = getattr(fs, "sofa") # value = sofa._offset_converter.cassis_to_uima(value) - if ts.is_primitive(feature.rangeTypeName): + if is_primitive(feature.rangeType): json_fs[feature_name] = value else: # We need to encode non-primitive features as a reference From 88ec59b379463e33ff00d5c0cc8ca0ca3f306406 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Wed, 25 Aug 2021 22:39:30 +0200 Subject: [PATCH 13/22] #168 - Experimental JSON CAS support - Tune performance when serializing JSON to string instead of writing to disk - Added rudimentary performance "test" --- cassis/cas.py | 4 +-- cassis/json.py | 6 ++-- cassis/xmi.py | 11 +++++++- tests/performance.py | 67 ++++++++++++++++++++++++++++++++++++++++++++ 4 
files changed, 82 insertions(+), 6 deletions(-) create mode 100644 tests/performance.py diff --git a/cassis/cas.py b/cassis/cas.py index 34fe87f..b2e1825 100644 --- a/cassis/cas.py +++ b/cassis/cas.py @@ -604,9 +604,7 @@ def _serialize(self, serializer, path: Union[str, Path, None] = None, pretty_pri """ # If `path` is None, then serialize to a string and return it if path is None: - sink = BytesIO() - serializer.serialize(sink, self, pretty_print=pretty_print) - return sink.getvalue().decode("utf-8") + return serializer.serialize(None, self, pretty_print=pretty_print) elif isinstance(path, str): with open(path, "wb") as f: serializer.serialize(f, self, pretty_print=pretty_print) diff --git a/cassis/json.py b/cassis/json.py index 4f962ab..4cd74e2 100644 --- a/cassis/json.py +++ b/cassis/json.py @@ -234,7 +234,7 @@ class CasJsonSerializer: def __init__(self): pass - def serialize(self, sink: Union[IO, str], cas: Cas, pretty_print=True): + def serialize(self, sink: Union[IO, str, None], cas: Cas, pretty_print=True) -> Union[str, None]: data = {} types = data[TYPES_FIELD] = {} views = data[VIEWS_FIELD] = {} @@ -265,11 +265,13 @@ def serialize(self, sink: Union[IO, str], cas: Cas, pretty_print=True): if sink: json.dump(data, sink, sort_keys=False, indent=2 if pretty_print else None) else: - json.dumps(data, sort_keys=False, indent=2 if pretty_print else None) + return json.dumps(data, sort_keys=False, indent=2 if pretty_print else None) if isinstance(sink, TextIOWrapper): sink.detach() # Prevent TextIOWrapper from closing the BytesIO + return None + def _serialize_type(self, type_: Type): type_name = self._to_external_type_name(type_.name) supertype_name = self._to_external_type_name(type_.supertype.name) diff --git a/cassis/xmi.py b/cassis/xmi.py index 3ea65a5..72448ac 100644 --- a/cassis/xmi.py +++ b/cassis/xmi.py @@ -363,7 +363,7 @@ def __init__(self): self._urls_to_prefixes = {} self._duplicate_namespaces = defaultdict(int) - def serialize(self, sink: Union[IO, 
str], cas: Cas, pretty_print=True): + def serialize(self, sink: Union[IO, str, None], cas: Cas, pretty_print=True) -> Union[str, None]: xmi_attrs = {"{http://www.omg.org/XMI}version": "2.0"} root = etree.Element(etree.QName(self._nsmap["xmi"], "XMI"), nsmap=self._nsmap, **xmi_attrs) @@ -383,8 +383,17 @@ def serialize(self, sink: Union[IO, str], cas: Cas, pretty_print=True): doc = etree.ElementTree(root) etree.cleanup_namespaces(doc, top_nsmap=self._nsmap) + return_str = sink is None + if return_str: + sink = BytesIO() + doc.write(sink, xml_declaration=True, pretty_print=pretty_print, encoding="UTF-8") + if return_str: + return sink.getvalue().decode("utf-8") + + return None + def _serialize_cas_null(self, root: etree.Element): name = etree.QName(self._nsmap["cas"], "NULL") elem = etree.SubElement(root, name) diff --git a/tests/performance.py b/tests/performance.py new file mode 100644 index 0000000..6a1b289 --- /dev/null +++ b/tests/performance.py @@ -0,0 +1,67 @@ +from random import Random +from timeit import default_timer as timer + +from cassis import load_cas_from_json, load_cas_from_xmi +from tests.test_files.test_cas_generators import MultiFeatureRandomCasGenerator + +generator = MultiFeatureRandomCasGenerator() +generator.rnd = Random(123456) +generator.size = 1000 +iterations = 100 + +typesystem = generator.generate_type_system() +randomized_cas = generator.generate_cas(typesystem) +randomized_cas_xmi = randomized_cas.to_xmi() +randomized_cas_json = randomized_cas.to_json() + + +def test_xmi_serialization_performance(): + start = timer() + for i in range(0, iterations): + if i % 10 == 0: + print(".", end='') + if i % 100 == 0: + print(f"{i}") + randomized_cas.to_xmi() + end = timer() + + print(f"XMI: Serializing {iterations} CASes took {end - start} seconds") + + +def test_json_serialization_performance(): + start = timer() + for i in range(0, iterations): + if i % 10 == 0: + print(".", end='') + if i % 100 == 0: + print(f"{i}") + randomized_cas.to_json() + 
end = timer() + + print(f"JSON: Serializing {iterations} CASes took {end - start} seconds") + + +def test_xmi_deserialization_performance(): + start = timer() + for i in range(0, iterations): + if i % 10 == 0: + print(".", end='') + if i % 100 == 0: + print(f"{i}") + load_cas_from_xmi(randomized_cas_xmi, typesystem) + end = timer() + + print(f"XMI: Deserializing {iterations} CASes took {end - start} seconds") + + +def test_json_deserialization_performance(): + start = timer() + for i in range(0, iterations): + if i % 10 == 0: + print(".", end='') + if i % 100 == 0: + print(f"{i}") + load_cas_from_json(randomized_cas_json, typesystem) + end = timer() + + print(f"JSON: Deserializing {iterations} CASes took {end - start} seconds") From 053bbf73e9c2df9f064c376709a937c6184fc1ad Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Mon, 30 Aug 2021 16:46:44 +0200 Subject: [PATCH 14/22] [UIMA-6266] Clean JSON Wire Format for CAS - Do not execute performance "tests" when running make test - Update JSON reference data with new data from UIMA Java SDK - including CAS examples using emojis and other Unicode characters - Enabled character offset conversion on import/export in JSON (de)serializer --- Makefile | 2 +- cassis/cas.py | 36 +++--- cassis/json.py | 21 +++- tests/performance.py | 22 +--- .../data.json | 78 ++++++++++++ .../debug-typesystem.xml | 0 .../debug.xmi | 15 +++ .../data.json | 30 +++-- .../debug-typesystem.xml | 17 +++ .../debug.xmi | 8 ++ .../ser-ref/casWithSofaDataArray/data.json | 20 ++-- .../ser-ref/casWithSofaDataURI/data.json | 14 +-- .../fs_as_array/ser-ref/casWithText/data.json | 14 +-- .../casWithTextAndAnnotation/debug.xmi | 7 -- .../casWithTextAndAnnotations/data.json | 48 ++++++++ .../debug-typesystem.xml | 17 +++ .../casWithTextAndAnnotations/debug.xmi | 9 ++ .../data.json | 48 ++++++++ .../debug-typesystem.xml | 17 +++ .../debug.xmi | 10 ++ .../data.json | 39 ++++++ .../debug-typesystem.xml | 17 +++ .../debug.xmi | 9 ++ 
.../fs_as_array/ser-ref/emptyCas/data.json | 9 ++ .../ser-ref/emptyCas/debug-typesystem.xml | 17 +++ .../fs_as_array/ser-ref/emptyCas/debug.xmi | 3 + tests/test_json.py | 111 ++++++++++++++++-- 27 files changed, 550 insertions(+), 88 deletions(-) create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/data.json rename tests/test_files/json/fs_as_array/ser-ref/{casWithTextAndAnnotation => casWithEmojiUnicodeTextAndAnnotations}/debug-typesystem.xml (100%) create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/debug.xmi rename tests/test_files/json/fs_as_array/ser-ref/{casWithTextAndAnnotation => casWithLeftToRightTextAndAnnotations}/data.json (67%) create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug-typesystem.xml create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug.xmi delete mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/debug.xmi create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/data.json create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug-typesystem.xml create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug.xmi create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/data.json create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug-typesystem.xml create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug.xmi create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/data.json create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug-typesystem.xml create mode 100644 
tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug.xmi create mode 100644 tests/test_files/json/fs_as_array/ser-ref/emptyCas/data.json create mode 100644 tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug-typesystem.xml create mode 100644 tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug.xmi diff --git a/Makefile b/Makefile index affc02b..584220e 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ PYTHON_FILES = cassis tests test: - python -m pytest tests/ + python -m pytest -m "not performance" tests/ format: black -l 120 cassis/ diff --git a/cassis/cas.py b/cassis/cas.py index b2e1825..a65dcce 100644 --- a/cassis/cas.py +++ b/cassis/cas.py @@ -1,6 +1,5 @@ import sys from collections import defaultdict -from io import BytesIO from pathlib import Path from typing import Dict, Iterable, List, Optional, Tuple, Union @@ -37,16 +36,16 @@ class OffsetConverter: """ def __init__(self): - self._uima_to_cassis: Dict[int, int] = {0: 0} - self._cassis_to_uima: Dict[int, int] = {0: 0} + self._uima_to_cassis: Union[Dict[int, int], None] = None + self._cassis_to_uima: Union[Dict[int, int], None] = None def create_index(self, sofa_string: str): - self._uima_to_cassis.clear() - self._cassis_to_uima.clear() - if sofa_string is None: return + self._uima_to_cassis = {0: 0} + self._cassis_to_uima = {0: 0} + count_uima = 0 count_cassis = 0 @@ -67,11 +66,19 @@ def create_index(self, sofa_string: str): def uima_to_cassis(self, idx: Optional[int]) -> Optional[int]: if idx is None: return None + + if self._uima_to_cassis is None: + return idx + return self._uima_to_cassis[idx] def cassis_to_uima(self, idx: Optional[int]) -> Optional[int]: if idx is None: return None + + if self._cassis_to_uima is None: + return idx + return self._cassis_to_uima[idx] @@ -572,9 +579,11 @@ def to_xmi(self, path: Union[str, Path, None] = None, pretty_print: bool = False """ from cassis.xmi import CasXmiSerializer - return self._serialize(CasXmiSerializer(), path, 
pretty_print) + return self._serialize(CasXmiSerializer(), path, pretty_print=pretty_print) - def to_json(self, path: Union[str, Path, None] = None, pretty_print: bool = False) -> Optional[str]: + def to_json( + self, path: Union[str, Path, None] = None, pretty_print: bool = False, ensure_ascii=False + ) -> Optional[str]: """Creates a JSON representation of this CAS. Args: @@ -588,14 +597,13 @@ def to_json(self, path: Union[str, Path, None] = None, pretty_print: bool = Fals """ from cassis.json import CasJsonSerializer - return self._serialize(CasJsonSerializer(), path, pretty_print) + return self._serialize(CasJsonSerializer(), path, pretty_print=pretty_print, ensure_ascii=ensure_ascii) - def _serialize(self, serializer, path: Union[str, Path, None] = None, pretty_print: bool = False): + def _serialize(self, serializer, path: Union[str, Path, None] = None, **kwargs): """Runs this CAS through the given serializer. Args: path: File path, if `None` is provided the result is returned as a string - pretty_print: `True` if the resulting data should be pretty-printed, else `False` Returns: @@ -604,13 +612,13 @@ def _serialize(self, serializer, path: Union[str, Path, None] = None, pretty_pri """ # If `path` is None, then serialize to a string and return it if path is None: - return serializer.serialize(None, self, pretty_print=pretty_print) + return serializer.serialize(None, self, **kwargs) elif isinstance(path, str): with open(path, "wb") as f: - serializer.serialize(f, self, pretty_print=pretty_print) + serializer.serialize(f, self, **kwargs) elif isinstance(path, Path): with path.open("wb") as f: - serializer.serialize(f, self, pretty_print=pretty_print) + serializer.serialize(f, self, **kwargs) else: raise TypeError("`path` needs to be one of [str, None, Path], but was <{0}>".format(type(path))) diff --git a/cassis/json.py b/cassis/json.py index 4cd74e2..d4e2fee 100644 --- a/cassis/json.py +++ b/cassis/json.py @@ -198,6 +198,13 @@ def fix_up(elements): fs = 
AnnotationType(**attributes) self._resolve_references(fs, ref_features, feature_structures) + + # Map from offsets in UIMA UTF-16 based offsets to Unicode codepoints + if typesystem.is_instance_of(fs.type.name, TYPE_NAME_ANNOTATION): + sofa = fs.sofa + fs.begin = sofa._offset_converter.uima_to_cassis(fs.begin) + fs.end = sofa._offset_converter.uima_to_cassis(fs.end) + return fs def _parse_primitive_array(self, type_name: str, elements: [list, str]) -> List: @@ -234,7 +241,9 @@ class CasJsonSerializer: def __init__(self): pass - def serialize(self, sink: Union[IO, str, None], cas: Cas, pretty_print=True) -> Union[str, None]: + def serialize( + self, sink: Union[IO, str, None], cas: Cas, pretty_print: bool = True, ensure_ascii: bool = False + ) -> Union[str, None]: data = {} types = data[TYPES_FIELD] = {} views = data[VIEWS_FIELD] = {} @@ -263,9 +272,9 @@ def serialize(self, sink: Union[IO, str, None], cas: Cas, pretty_print=True) -> sink = TextIOWrapper(sink, encoding="utf-8", write_through=True) if sink: - json.dump(data, sink, sort_keys=False, indent=2 if pretty_print else None) + json.dump(data, sink, sort_keys=False, indent=2 if pretty_print else None, ensure_ascii=ensure_ascii) else: - return json.dumps(data, sort_keys=False, indent=2 if pretty_print else None) + return json.dumps(data, sort_keys=False, indent=2 if pretty_print else None, ensure_ascii=ensure_ascii) if isinstance(sink, TextIOWrapper): sink.detach() # Prevent TextIOWrapper from closing the BytesIO @@ -347,9 +356,9 @@ def _serialize_feature_structure(self, fs) -> dict: continue # Map back from offsets in Unicode codepoints to UIMA UTF-16 based offsets - # if ts.is_instance_of(fs.type, "uima.tcas.Annotation") and feature_name == "begin" or feature_name == "end": - # sofa: Sofa = getattr(fs, "sofa") - # value = sofa._offset_converter.cassis_to_uima(value) + if feature.domainType.name == TYPE_NAME_ANNOTATION and feature_name == "begin" or feature_name == "end": + sofa: Sofa = getattr(fs, "sofa") + 
value = sofa._offset_converter.cassis_to_uima(value) if is_primitive(feature.rangeType): json_fs[feature_name] = value diff --git a/tests/performance.py b/tests/performance.py index 6a1b289..aaff08a 100644 --- a/tests/performance.py +++ b/tests/performance.py @@ -1,6 +1,8 @@ from random import Random from timeit import default_timer as timer +import pytest + from cassis import load_cas_from_json, load_cas_from_xmi from tests.test_files.test_cas_generators import MultiFeatureRandomCasGenerator @@ -15,52 +17,40 @@ randomized_cas_json = randomized_cas.to_json() +@pytest.mark.performance def test_xmi_serialization_performance(): start = timer() for i in range(0, iterations): - if i % 10 == 0: - print(".", end='') - if i % 100 == 0: - print(f"{i}") randomized_cas.to_xmi() end = timer() print(f"XMI: Serializing {iterations} CASes took {end - start} seconds") +@pytest.mark.performance def test_json_serialization_performance(): start = timer() for i in range(0, iterations): - if i % 10 == 0: - print(".", end='') - if i % 100 == 0: - print(f"{i}") randomized_cas.to_json() end = timer() print(f"JSON: Serializing {iterations} CASes took {end - start} seconds") +@pytest.mark.performance def test_xmi_deserialization_performance(): start = timer() for i in range(0, iterations): - if i % 10 == 0: - print(".", end='') - if i % 100 == 0: - print(f"{i}") load_cas_from_xmi(randomized_cas_xmi, typesystem) end = timer() print(f"XMI: Deserializing {iterations} CASes took {end - start} seconds") +@pytest.mark.performance def test_json_deserialization_performance(): start = timer() for i in range(0, iterations): - if i % 10 == 0: - print(".", end='') - if i % 100 == 0: - print(f"{i}") load_cas_from_json(randomized_cas_json, typesystem) end = timer() diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/data.json new file mode 100644 index 0000000..422cea5 --- 
/dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/data.json @@ -0,0 +1,78 @@ +{ + "%TYPES" : { }, + "%FEATURE_STRUCTURES" : [ { + "%ID" : 1, + "%TYPE" : "uima.cas.Sofa", + "sofaNum" : 1, + "sofaID" : "_InitialView", + "mimeType" : "text", + "sofaString" : "\uD83E\uDD73 This \uD83D\uDC73\uD83C\uDFFB‍♀️ is ✆ a \uD83E\uDDD4\uD83C\uDFFE‍♂️ test \uD83D\uDC7B" + }, { + "%ID" : 2, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 0, + "end" : 2 + }, { + "%ID" : 3, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 3, + "end" : 7 + }, { + "%ID" : 4, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 8, + "end" : 15 + }, { + "%ID" : 5, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 16, + "end" : 18 + }, { + "%ID" : 6, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 19, + "end" : 20 + }, { + "%ID" : 7, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 21, + "end" : 22 + }, { + "%ID" : 8, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 23, + "end" : 30 + }, { + "%ID" : 9, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 31, + "end" : 35 + }, { + "%ID" : 10, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 36, + "end" : 38 + }, { + "%ID" : 11, + "%TYPE" : "uima.tcas.DocumentAnnotation", + "@sofa" : 1, + "begin" : 0, + "end" : 38, + "language" : "x-unspecified" + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 1, + "%MEMBERS" : [ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 ] + } + } +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/debug-typesystem.xml similarity index 100% rename from tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/debug-typesystem.xml rename to 
tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/debug-typesystem.xml diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/debug.xmi new file mode 100644 index 0000000..6d8ec43 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/debug.xmi @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/data.json similarity index 67% rename from tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/data.json rename to tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/data.json index 7879a33..1944181 100644 --- a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/data.json +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/data.json @@ -1,30 +1,36 @@ { "%TYPES" : { }, - "%VIEWS" : { - "_InitialView" : { - "%SOFA" : 1, - "%MEMBERS" : [ 2, 3 ] - } - }, "%FEATURE_STRUCTURES" : [ { "%ID" : 1, "%TYPE" : "uima.cas.Sofa", "sofaNum" : 1, "sofaID" : "_InitialView", "mimeType" : "text", - "sofaString" : "This is a test." 
+ "sofaString" : "هذا اختبار" }, { "%ID" : 2, - "%TYPE" : "uima.tcas.DocumentAnnotation", + "%TYPE" : "uima.tcas.Annotation", "@sofa" : 1, "begin" : 0, - "end" : 15, - "language" : "x-unspecified" + "end" : 3 }, { "%ID" : 3, "%TYPE" : "uima.tcas.Annotation", "@sofa" : 1, + "begin" : 4, + "end" : 10 + }, { + "%ID" : 4, + "%TYPE" : "uima.tcas.DocumentAnnotation", + "@sofa" : 1, "begin" : 0, - "end" : 15 - } ] + "end" : 10, + "language" : "x-unspecified" + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 1, + "%MEMBERS" : [ 2, 3, 4 ] + } + } } \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug-typesystem.xml new file mode 100644 index 0000000..07e327a --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug-typesystem.xml @@ -0,0 +1,17 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug.xmi new file mode 100644 index 0000000..108d362 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug.xmi @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json index edf6ddc..20d935b 100644 --- a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json @@ -1,21 +1,21 @@ { "%TYPES" : { }, - "%VIEWS" : { - "_InitialView" : { - "%SOFA" : 1, - "%MEMBERS" : [ ] - } - }, "%FEATURE_STRUCTURES" : [ { - "%ID" : 2, + "%ID" : 1, "%TYPE" : 
"uima.cas.ByteArray", "%ELEMENTS" : "VGhpcyBpcyBhIHRlc3Q=" }, { - "%ID" : 1, + "%ID" : 2, "%TYPE" : "uima.cas.Sofa", "sofaNum" : 1, "sofaID" : "_InitialView", "mimeType" : "text/plain", - "@sofaArray" : 2 - } ] + "@sofaArray" : 1 + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 2, + "%MEMBERS" : [ ] + } + } } \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json index 266ab55..0b142a8 100644 --- a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json @@ -1,11 +1,5 @@ { "%TYPES" : { }, - "%VIEWS" : { - "_InitialView" : { - "%SOFA" : 1, - "%MEMBERS" : [ ] - } - }, "%FEATURE_STRUCTURES" : [ { "%ID" : 1, "%TYPE" : "uima.cas.Sofa", @@ -13,5 +7,11 @@ "sofaID" : "_InitialView", "mimeType" : "text/plain", "sofaURI" : "classpath:/ProgrammaticallyCreatedCasDataSuite/document.txt" - } ] + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 1, + "%MEMBERS" : [ ] + } + } } \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json index 1fe9f02..39f5ffe 100644 --- a/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json @@ -1,11 +1,5 @@ { "%TYPES" : { }, - "%VIEWS" : { - "_InitialView" : { - "%SOFA" : 1, - "%MEMBERS" : [ 2 ] - } - }, "%FEATURE_STRUCTURES" : [ { "%ID" : 1, "%TYPE" : "uima.cas.Sofa", @@ -20,5 +14,11 @@ "begin" : 0, "end" : 15, "language" : "x-unspecified" - } ] + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 1, + "%MEMBERS" : [ 2 ] + } + } } \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/debug.xmi 
deleted file mode 100644 index 7292031..0000000 --- a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/debug.xmi +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - - diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/data.json new file mode 100644 index 0000000..a9522cf --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/data.json @@ -0,0 +1,48 @@ +{ + "%TYPES" : { }, + "%FEATURE_STRUCTURES" : [ { + "%ID" : 1, + "%TYPE" : "uima.cas.Sofa", + "sofaNum" : 1, + "sofaID" : "_InitialView", + "mimeType" : "text", + "sofaString" : "This is a test" + }, { + "%ID" : 2, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 0, + "end" : 4 + }, { + "%ID" : 3, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 5, + "end" : 7 + }, { + "%ID" : 4, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 8, + "end" : 9 + }, { + "%ID" : 5, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 10, + "end" : 14 + }, { + "%ID" : 6, + "%TYPE" : "uima.tcas.DocumentAnnotation", + "@sofa" : 1, + "begin" : 0, + "end" : 14, + "language" : "x-unspecified" + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 1, + "%MEMBERS" : [ 2, 3, 4, 5, 6 ] + } + } +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug-typesystem.xml new file mode 100644 index 0000000..07e327a --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug-typesystem.xml @@ -0,0 +1,17 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug.xmi 
b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug.xmi new file mode 100644 index 0000000..37c1e9b --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug.xmi @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/data.json new file mode 100644 index 0000000..d586738 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/data.json @@ -0,0 +1,48 @@ +{ + "%TYPES" : { }, + "%FEATURE_STRUCTURES" : [ { + "%ID" : 1, + "%TYPE" : "uima.cas.Sofa", + "sofaNum" : 1, + "sofaID" : "_InitialView", + "mimeType" : "text", + "sofaString" : "這是一個測試" + }, { + "%ID" : 2, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 0, + "end" : 1 + }, { + "%ID" : 3, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 1, + "end" : 2 + }, { + "%ID" : 4, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 2, + "end" : 4 + }, { + "%ID" : 5, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 4, + "end" : 6 + }, { + "%ID" : 6, + "%TYPE" : "uima.tcas.DocumentAnnotation", + "@sofa" : 1, + "begin" : 0, + "end" : 6, + "language" : "x-unspecified" + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 1, + "%MEMBERS" : [ 2, 3, 4, 5, 6 ] + } + } +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug-typesystem.xml new file mode 100644 index 0000000..07e327a --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug-typesystem.xml @@ -0,0 +1,17 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + 
uima.cas.String + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug.xmi new file mode 100644 index 0000000..0087d72 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug.xmi @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/data.json new file mode 100644 index 0000000..56784fe --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/data.json @@ -0,0 +1,39 @@ +{ + "%TYPES" : { }, + "%FEATURE_STRUCTURES" : [ { + "%ID" : 1, + "%TYPE" : "uima.cas.Sofa", + "sofaNum" : 1, + "sofaID" : "_InitialView" + }, { + "%ID" : 2, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 0, + "end" : 4 + }, { + "%ID" : 3, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 5, + "end" : 7 + }, { + "%ID" : 4, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 8, + "end" : 9 + }, { + "%ID" : 5, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 10, + "end" : 14 + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 1, + "%MEMBERS" : [ 2, 3, 4, 5 ] + } + } +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug-typesystem.xml new file mode 100644 index 0000000..07e327a --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug-typesystem.xml @@ -0,0 +1,17 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + diff --git 
a/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug.xmi new file mode 100644 index 0000000..37c1e9b --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug.xmi @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/emptyCas/data.json b/tests/test_files/json/fs_as_array/ser-ref/emptyCas/data.json new file mode 100644 index 0000000..fcd8582 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/emptyCas/data.json @@ -0,0 +1,9 @@ +{ + "%TYPES" : { }, + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 1, + "%MEMBERS" : [ ] + } + } +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug-typesystem.xml new file mode 100644 index 0000000..07e327a --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug-typesystem.xml @@ -0,0 +1,17 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug.xmi new file mode 100644 index 0000000..6fd88bd --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug.xmi @@ -0,0 +1,3 @@ + + + diff --git a/tests/test_json.py b/tests/test_json.py index 0765ca3..cc02c56 100644 --- a/tests/test_json.py +++ b/tests/test_json.py @@ -1,28 +1,92 @@ import json +from cassis.typesystem import TYPE_NAME_ANNOTATION from tests.fixtures import * from tests.test_files.test_cas_generators import MultiFeatureRandomCasGenerator, MultiTypeRandomCasGenerator from tests.util import assert_json_equal -FIXTURE_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "test_files") +FIXTURE_DIR = 
os.path.join(os.path.dirname(os.path.realpath(__file__)), "test_files", "json", "fs_as_array", "ser-ref") FIXTURES = [ - (os.path.join(FIXTURE_DIR, "json", "fs_as_array", "ser-ref", "casWithSofaDataArray")), - (os.path.join(FIXTURE_DIR, "json", "fs_as_array", "ser-ref", "casWithSofaDataURI")), - (os.path.join(FIXTURE_DIR, "json", "fs_as_array", "ser-ref", "casWithText")), - (os.path.join(FIXTURE_DIR, "json", "fs_as_array", "ser-ref", "casWithTextAndAnnotation")), + (os.path.join(FIXTURE_DIR, "casWithSofaDataArray"), []), + (os.path.join(FIXTURE_DIR, "casWithSofaDataURI"), []), + (os.path.join(FIXTURE_DIR, "casWithText"), [["uima.tcas.DocumentAnnotation", 0, 15, "This is a test."]]), + ( + os.path.join(FIXTURE_DIR, "casWithoutTextButWithAnnotations"), + [ + ["uima.tcas.Annotation", 0, 4, None], + ["uima.tcas.Annotation", 5, 7, None], + ["uima.tcas.Annotation", 8, 9, None], + ["uima.tcas.Annotation", 10, 14, None], + ], + ), + ( + os.path.join(FIXTURE_DIR, "casWithTextAndAnnotations"), + [ + ["uima.tcas.Annotation", 0, 4, "This"], + ["uima.tcas.Annotation", 5, 7, "is"], + ["uima.tcas.Annotation", 8, 9, "a"], + ["uima.tcas.Annotation", 10, 14, "test"], + ["uima.tcas.DocumentAnnotation", 0, 14, "This is a test"], + ], + ), + ( + os.path.join(FIXTURE_DIR, "casWithEmojiUnicodeTextAndAnnotations"), + [ + ["uima.tcas.Annotation", 0, 1, "🥳", b"\xf0\x9f\xa5\xb3"], + ["uima.tcas.Annotation", 2, 6, "This"], + [ + "uima.tcas.Annotation", + 7, + 12, + "👳🏻\u200d♀️", + b"\xf0\x9f\x91\xb3\xf0\x9f\x8f\xbb\xe2\x80\x8d\xe2\x99\x80\xef\xb8\x8f", + ], + ["uima.tcas.Annotation", 13, 15, "is"], + ["uima.tcas.Annotation", 16, 17, "✆", b"\xe2\x9c\x86"], + ["uima.tcas.Annotation", 18, 19, "a"], + [ + "uima.tcas.Annotation", + 20, + 25, + "🧔🏾\u200d♂️", + b"\xf0\x9f\xa7\x94\xf0\x9f\x8f\xbe\xe2\x80\x8d\xe2\x99\x82\xef\xb8\x8f", + ], + ["uima.tcas.Annotation", 26, 30, "test"], + ["uima.tcas.Annotation", 31, 32, "👻", b"\xf0\x9f\x91\xbb"], + ["uima.tcas.DocumentAnnotation", 0, 32, "🥳 This 
👳🏻\u200d♀️ is ✆ a 🧔🏾\u200d♂️ test 👻"], + ], + ), + ( + os.path.join(FIXTURE_DIR, "casWithLeftToRightTextAndAnnotations"), + [ + ["uima.tcas.Annotation", 0, 3, "هذا"], + ["uima.tcas.Annotation", 4, 10, "اختبار"], + ["uima.tcas.DocumentAnnotation", 0, 10, "هذا اختبار"], + ], + ), + ( + os.path.join(FIXTURE_DIR, "casWithTraditionalChineseTextAndAnnotations"), + [ + ["uima.tcas.Annotation", 0, 1, "這"], + ["uima.tcas.Annotation", 1, 2, "是"], + ["uima.tcas.Annotation", 2, 4, "一個"], + ["uima.tcas.Annotation", 4, 6, "測試"], + ["uima.tcas.DocumentAnnotation", 0, 6, "這是一個測試"], + ], + ), ] -@pytest.mark.parametrize("json_path", FIXTURES) -def test_deserialization_serialization(json_path): +@pytest.mark.parametrize("json_path, annotations", FIXTURES) +def test_deserialization_serialization(json_path, annotations): with open(os.path.join(json_path, "data.json"), "rb") as f: cas = load_cas_from_json(f) with open(os.path.join(json_path, "data.json"), "rb") as f: expected_json = json.load(f) - actual_json = cas.to_json() + actual_json = cas.to_json(pretty_print=True) assert_json_equal(actual_json, expected_json, sort_keys=True) @@ -56,3 +120,34 @@ def test_multi_feature_random_serialization_deserialization(): actual_json = loaded_cas.to_json() assert_json_equal(actual_json, expected_json) + + +@pytest.mark.parametrize("json_path, annotations", FIXTURES) +def test_unicode(json_path, annotations): + with open(os.path.join(json_path, "data.json"), "rb") as f: + cas = load_cas_from_json(f) + + actual_annotations = [ + [a.type.name, a.begin, a.end, a.get_covered_text()] + for a in sorted(cas.select(TYPE_NAME_ANNOTATION), key=lambda k: k.type.name) + ] + expected_annotations = [a[0:4] for a in annotations] + assert actual_annotations == expected_annotations + + for i in range(0, len(annotations)): + expected = annotations[i] + actual = actual_annotations[i] + + expected_covered_text = expected[3] + actual_covered_text = actual[3] + + if not expected_covered_text: + continue + + for n in 
range(len(actual_covered_text)): + print(f"{n}: [{actual_covered_text[n]}] {hex(ord(actual_covered_text[n]))}") + + if len(expected) >= 5: + expected_utf8_bytes = expected[4] + actual_utf8_bytes = bytes(actual_covered_text, "UTF-8") + assert actual_utf8_bytes == expected_utf8_bytes From 36709b1ac6abf6f471ca37e2c881f58707d6589e Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Mon, 30 Aug 2021 16:46:44 +0200 Subject: [PATCH 15/22] - Do not execute performance "tests" when running make test - Update JSON reference data with new data from UIMA Java SDK - including CAS examples using emojis and other Unicode characters - Enabled character offset conversion on import/export in JSON (de)serializer --- Makefile | 2 +- cassis/cas.py | 36 +++--- cassis/json.py | 21 +++- tests/performance.py | 22 +--- .../data.json | 78 ++++++++++++ .../debug-typesystem.xml | 0 .../debug.xmi | 15 +++ .../data.json | 30 +++-- .../debug-typesystem.xml | 17 +++ .../debug.xmi | 8 ++ .../ser-ref/casWithSofaDataArray/data.json | 20 ++-- .../ser-ref/casWithSofaDataURI/data.json | 14 +-- .../fs_as_array/ser-ref/casWithText/data.json | 14 +-- .../casWithTextAndAnnotation/debug.xmi | 7 -- .../casWithTextAndAnnotations/data.json | 48 ++++++++ .../debug-typesystem.xml | 17 +++ .../casWithTextAndAnnotations/debug.xmi | 9 ++ .../data.json | 48 ++++++++ .../debug-typesystem.xml | 17 +++ .../debug.xmi | 10 ++ .../data.json | 39 ++++++ .../debug-typesystem.xml | 17 +++ .../debug.xmi | 9 ++ .../fs_as_array/ser-ref/emptyCas/data.json | 9 ++ .../ser-ref/emptyCas/debug-typesystem.xml | 17 +++ .../fs_as_array/ser-ref/emptyCas/debug.xmi | 3 + tests/test_json.py | 111 ++++++++++++++++-- 27 files changed, 550 insertions(+), 88 deletions(-) create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/data.json rename tests/test_files/json/fs_as_array/ser-ref/{casWithTextAndAnnotation => casWithEmojiUnicodeTextAndAnnotations}/debug-typesystem.xml (100%) create mode 
100644 tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/debug.xmi rename tests/test_files/json/fs_as_array/ser-ref/{casWithTextAndAnnotation => casWithLeftToRightTextAndAnnotations}/data.json (67%) create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug-typesystem.xml create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug.xmi delete mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/debug.xmi create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/data.json create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug-typesystem.xml create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug.xmi create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/data.json create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug-typesystem.xml create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug.xmi create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/data.json create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug-typesystem.xml create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug.xmi create mode 100644 tests/test_files/json/fs_as_array/ser-ref/emptyCas/data.json create mode 100644 tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug-typesystem.xml create mode 100644 tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug.xmi diff --git a/Makefile b/Makefile index affc02b..584220e 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ PYTHON_FILES = cassis tests test: - python -m pytest tests/ + python -m pytest -m "not performance" tests/ 
format: black -l 120 cassis/ diff --git a/cassis/cas.py b/cassis/cas.py index b2e1825..a65dcce 100644 --- a/cassis/cas.py +++ b/cassis/cas.py @@ -1,6 +1,5 @@ import sys from collections import defaultdict -from io import BytesIO from pathlib import Path from typing import Dict, Iterable, List, Optional, Tuple, Union @@ -37,16 +36,16 @@ class OffsetConverter: """ def __init__(self): - self._uima_to_cassis: Dict[int, int] = {0: 0} - self._cassis_to_uima: Dict[int, int] = {0: 0} + self._uima_to_cassis: Union[Dict[int, int], None] = None + self._cassis_to_uima: Union[Dict[int, int], None] = None def create_index(self, sofa_string: str): - self._uima_to_cassis.clear() - self._cassis_to_uima.clear() - if sofa_string is None: return + self._uima_to_cassis = {0: 0} + self._cassis_to_uima = {0: 0} + count_uima = 0 count_cassis = 0 @@ -67,11 +66,19 @@ def create_index(self, sofa_string: str): def uima_to_cassis(self, idx: Optional[int]) -> Optional[int]: if idx is None: return None + + if self._uima_to_cassis is None: + return idx + return self._uima_to_cassis[idx] def cassis_to_uima(self, idx: Optional[int]) -> Optional[int]: if idx is None: return None + + if self._cassis_to_uima is None: + return idx + return self._cassis_to_uima[idx] @@ -572,9 +579,11 @@ def to_xmi(self, path: Union[str, Path, None] = None, pretty_print: bool = False """ from cassis.xmi import CasXmiSerializer - return self._serialize(CasXmiSerializer(), path, pretty_print) + return self._serialize(CasXmiSerializer(), path, pretty_print=pretty_print) - def to_json(self, path: Union[str, Path, None] = None, pretty_print: bool = False) -> Optional[str]: + def to_json( + self, path: Union[str, Path, None] = None, pretty_print: bool = False, ensure_ascii=False + ) -> Optional[str]: """Creates a JSON representation of this CAS. 
Args: @@ -588,14 +597,13 @@ def to_json(self, path: Union[str, Path, None] = None, pretty_print: bool = Fals """ from cassis.json import CasJsonSerializer - return self._serialize(CasJsonSerializer(), path, pretty_print) + return self._serialize(CasJsonSerializer(), path, pretty_print=pretty_print, ensure_ascii=ensure_ascii) - def _serialize(self, serializer, path: Union[str, Path, None] = None, pretty_print: bool = False): + def _serialize(self, serializer, path: Union[str, Path, None] = None, **kwargs): """Runs this CAS through the given serializer. Args: path: File path, if `None` is provided the result is returned as a string - pretty_print: `True` if the resulting data should be pretty-printed, else `False` Returns: @@ -604,13 +612,13 @@ def _serialize(self, serializer, path: Union[str, Path, None] = None, pretty_pri """ # If `path` is None, then serialize to a string and return it if path is None: - return serializer.serialize(None, self, pretty_print=pretty_print) + return serializer.serialize(None, self, **kwargs) elif isinstance(path, str): with open(path, "wb") as f: - serializer.serialize(f, self, pretty_print=pretty_print) + serializer.serialize(f, self, **kwargs) elif isinstance(path, Path): with path.open("wb") as f: - serializer.serialize(f, self, pretty_print=pretty_print) + serializer.serialize(f, self, **kwargs) else: raise TypeError("`path` needs to be one of [str, None, Path], but was <{0}>".format(type(path))) diff --git a/cassis/json.py b/cassis/json.py index 4cd74e2..d4e2fee 100644 --- a/cassis/json.py +++ b/cassis/json.py @@ -198,6 +198,13 @@ def fix_up(elements): fs = AnnotationType(**attributes) self._resolve_references(fs, ref_features, feature_structures) + + # Map from offsets in UIMA UTF-16 based offsets to Unicode codepoints + if typesystem.is_instance_of(fs.type.name, TYPE_NAME_ANNOTATION): + sofa = fs.sofa + fs.begin = sofa._offset_converter.uima_to_cassis(fs.begin) + fs.end = sofa._offset_converter.uima_to_cassis(fs.end) + return 
fs def _parse_primitive_array(self, type_name: str, elements: [list, str]) -> List: @@ -234,7 +241,9 @@ class CasJsonSerializer: def __init__(self): pass - def serialize(self, sink: Union[IO, str, None], cas: Cas, pretty_print=True) -> Union[str, None]: + def serialize( + self, sink: Union[IO, str, None], cas: Cas, pretty_print: bool = True, ensure_ascii: bool = False + ) -> Union[str, None]: data = {} types = data[TYPES_FIELD] = {} views = data[VIEWS_FIELD] = {} @@ -263,9 +272,9 @@ def serialize(self, sink: Union[IO, str, None], cas: Cas, pretty_print=True) -> sink = TextIOWrapper(sink, encoding="utf-8", write_through=True) if sink: - json.dump(data, sink, sort_keys=False, indent=2 if pretty_print else None) + json.dump(data, sink, sort_keys=False, indent=2 if pretty_print else None, ensure_ascii=ensure_ascii) else: - return json.dumps(data, sort_keys=False, indent=2 if pretty_print else None) + return json.dumps(data, sort_keys=False, indent=2 if pretty_print else None, ensure_ascii=ensure_ascii) if isinstance(sink, TextIOWrapper): sink.detach() # Prevent TextIOWrapper from closing the BytesIO @@ -347,9 +356,9 @@ def _serialize_feature_structure(self, fs) -> dict: continue # Map back from offsets in Unicode codepoints to UIMA UTF-16 based offsets - # if ts.is_instance_of(fs.type, "uima.tcas.Annotation") and feature_name == "begin" or feature_name == "end": - # sofa: Sofa = getattr(fs, "sofa") - # value = sofa._offset_converter.cassis_to_uima(value) + if feature.domainType.name == TYPE_NAME_ANNOTATION and feature_name == "begin" or feature_name == "end": + sofa: Sofa = getattr(fs, "sofa") + value = sofa._offset_converter.cassis_to_uima(value) if is_primitive(feature.rangeType): json_fs[feature_name] = value diff --git a/tests/performance.py b/tests/performance.py index 6a1b289..aaff08a 100644 --- a/tests/performance.py +++ b/tests/performance.py @@ -1,6 +1,8 @@ from random import Random from timeit import default_timer as timer +import pytest + from cassis import 
load_cas_from_json, load_cas_from_xmi from tests.test_files.test_cas_generators import MultiFeatureRandomCasGenerator @@ -15,52 +17,40 @@ randomized_cas_json = randomized_cas.to_json() +@pytest.mark.performance def test_xmi_serialization_performance(): start = timer() for i in range(0, iterations): - if i % 10 == 0: - print(".", end='') - if i % 100 == 0: - print(f"{i}") randomized_cas.to_xmi() end = timer() print(f"XMI: Serializing {iterations} CASes took {end - start} seconds") +@pytest.mark.performance def test_json_serialization_performance(): start = timer() for i in range(0, iterations): - if i % 10 == 0: - print(".", end='') - if i % 100 == 0: - print(f"{i}") randomized_cas.to_json() end = timer() print(f"JSON: Serializing {iterations} CASes took {end - start} seconds") +@pytest.mark.performance def test_xmi_deserialization_performance(): start = timer() for i in range(0, iterations): - if i % 10 == 0: - print(".", end='') - if i % 100 == 0: - print(f"{i}") load_cas_from_xmi(randomized_cas_xmi, typesystem) end = timer() print(f"XMI: Deserializing {iterations} CASes took {end - start} seconds") +@pytest.mark.performance def test_json_deserialization_performance(): start = timer() for i in range(0, iterations): - if i % 10 == 0: - print(".", end='') - if i % 100 == 0: - print(f"{i}") load_cas_from_json(randomized_cas_json, typesystem) end = timer() diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/data.json new file mode 100644 index 0000000..422cea5 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/data.json @@ -0,0 +1,78 @@ +{ + "%TYPES" : { }, + "%FEATURE_STRUCTURES" : [ { + "%ID" : 1, + "%TYPE" : "uima.cas.Sofa", + "sofaNum" : 1, + "sofaID" : "_InitialView", + "mimeType" : "text", + "sofaString" : "\uD83E\uDD73 This \uD83D\uDC73\uD83C\uDFFB‍♀️ is ✆ a 
\uD83E\uDDD4\uD83C\uDFFE‍♂️ test \uD83D\uDC7B" + }, { + "%ID" : 2, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 0, + "end" : 2 + }, { + "%ID" : 3, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 3, + "end" : 7 + }, { + "%ID" : 4, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 8, + "end" : 15 + }, { + "%ID" : 5, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 16, + "end" : 18 + }, { + "%ID" : 6, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 19, + "end" : 20 + }, { + "%ID" : 7, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 21, + "end" : 22 + }, { + "%ID" : 8, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 23, + "end" : 30 + }, { + "%ID" : 9, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 31, + "end" : 35 + }, { + "%ID" : 10, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 36, + "end" : 38 + }, { + "%ID" : 11, + "%TYPE" : "uima.tcas.DocumentAnnotation", + "@sofa" : 1, + "begin" : 0, + "end" : 38, + "language" : "x-unspecified" + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 1, + "%MEMBERS" : [ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 ] + } + } +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/debug-typesystem.xml similarity index 100% rename from tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/debug-typesystem.xml rename to tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/debug-typesystem.xml diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/debug.xmi new file mode 100644 index 0000000..6d8ec43 --- /dev/null +++ 
b/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/debug.xmi @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/data.json similarity index 67% rename from tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/data.json rename to tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/data.json index 7879a33..1944181 100644 --- a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/data.json +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/data.json @@ -1,30 +1,36 @@ { "%TYPES" : { }, - "%VIEWS" : { - "_InitialView" : { - "%SOFA" : 1, - "%MEMBERS" : [ 2, 3 ] - } - }, "%FEATURE_STRUCTURES" : [ { "%ID" : 1, "%TYPE" : "uima.cas.Sofa", "sofaNum" : 1, "sofaID" : "_InitialView", "mimeType" : "text", - "sofaString" : "This is a test." 
+ "sofaString" : "هذا اختبار" }, { "%ID" : 2, - "%TYPE" : "uima.tcas.DocumentAnnotation", + "%TYPE" : "uima.tcas.Annotation", "@sofa" : 1, "begin" : 0, - "end" : 15, - "language" : "x-unspecified" + "end" : 3 }, { "%ID" : 3, "%TYPE" : "uima.tcas.Annotation", "@sofa" : 1, + "begin" : 4, + "end" : 10 + }, { + "%ID" : 4, + "%TYPE" : "uima.tcas.DocumentAnnotation", + "@sofa" : 1, "begin" : 0, - "end" : 15 - } ] + "end" : 10, + "language" : "x-unspecified" + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 1, + "%MEMBERS" : [ 2, 3, 4 ] + } + } } \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug-typesystem.xml new file mode 100644 index 0000000..07e327a --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug-typesystem.xml @@ -0,0 +1,17 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug.xmi new file mode 100644 index 0000000..108d362 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug.xmi @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json index edf6ddc..20d935b 100644 --- a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json @@ -1,21 +1,21 @@ { "%TYPES" : { }, - "%VIEWS" : { - "_InitialView" : { - "%SOFA" : 1, - "%MEMBERS" : [ ] - } - }, "%FEATURE_STRUCTURES" : [ { - "%ID" : 2, + "%ID" : 1, "%TYPE" : 
"uima.cas.ByteArray", "%ELEMENTS" : "VGhpcyBpcyBhIHRlc3Q=" }, { - "%ID" : 1, + "%ID" : 2, "%TYPE" : "uima.cas.Sofa", "sofaNum" : 1, "sofaID" : "_InitialView", "mimeType" : "text/plain", - "@sofaArray" : 2 - } ] + "@sofaArray" : 1 + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 2, + "%MEMBERS" : [ ] + } + } } \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json index 266ab55..0b142a8 100644 --- a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json @@ -1,11 +1,5 @@ { "%TYPES" : { }, - "%VIEWS" : { - "_InitialView" : { - "%SOFA" : 1, - "%MEMBERS" : [ ] - } - }, "%FEATURE_STRUCTURES" : [ { "%ID" : 1, "%TYPE" : "uima.cas.Sofa", @@ -13,5 +7,11 @@ "sofaID" : "_InitialView", "mimeType" : "text/plain", "sofaURI" : "classpath:/ProgrammaticallyCreatedCasDataSuite/document.txt" - } ] + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 1, + "%MEMBERS" : [ ] + } + } } \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json index 1fe9f02..39f5ffe 100644 --- a/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json @@ -1,11 +1,5 @@ { "%TYPES" : { }, - "%VIEWS" : { - "_InitialView" : { - "%SOFA" : 1, - "%MEMBERS" : [ 2 ] - } - }, "%FEATURE_STRUCTURES" : [ { "%ID" : 1, "%TYPE" : "uima.cas.Sofa", @@ -20,5 +14,11 @@ "begin" : 0, "end" : 15, "language" : "x-unspecified" - } ] + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 1, + "%MEMBERS" : [ 2 ] + } + } } \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/debug.xmi 
deleted file mode 100644 index 7292031..0000000 --- a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotation/debug.xmi +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - - diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/data.json new file mode 100644 index 0000000..a9522cf --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/data.json @@ -0,0 +1,48 @@ +{ + "%TYPES" : { }, + "%FEATURE_STRUCTURES" : [ { + "%ID" : 1, + "%TYPE" : "uima.cas.Sofa", + "sofaNum" : 1, + "sofaID" : "_InitialView", + "mimeType" : "text", + "sofaString" : "This is a test" + }, { + "%ID" : 2, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 0, + "end" : 4 + }, { + "%ID" : 3, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 5, + "end" : 7 + }, { + "%ID" : 4, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 8, + "end" : 9 + }, { + "%ID" : 5, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 10, + "end" : 14 + }, { + "%ID" : 6, + "%TYPE" : "uima.tcas.DocumentAnnotation", + "@sofa" : 1, + "begin" : 0, + "end" : 14, + "language" : "x-unspecified" + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 1, + "%MEMBERS" : [ 2, 3, 4, 5, 6 ] + } + } +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug-typesystem.xml new file mode 100644 index 0000000..07e327a --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug-typesystem.xml @@ -0,0 +1,17 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug.xmi 
b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug.xmi new file mode 100644 index 0000000..37c1e9b --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug.xmi @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/data.json new file mode 100644 index 0000000..d586738 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/data.json @@ -0,0 +1,48 @@ +{ + "%TYPES" : { }, + "%FEATURE_STRUCTURES" : [ { + "%ID" : 1, + "%TYPE" : "uima.cas.Sofa", + "sofaNum" : 1, + "sofaID" : "_InitialView", + "mimeType" : "text", + "sofaString" : "這是一個測試" + }, { + "%ID" : 2, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 0, + "end" : 1 + }, { + "%ID" : 3, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 1, + "end" : 2 + }, { + "%ID" : 4, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 2, + "end" : 4 + }, { + "%ID" : 5, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 4, + "end" : 6 + }, { + "%ID" : 6, + "%TYPE" : "uima.tcas.DocumentAnnotation", + "@sofa" : 1, + "begin" : 0, + "end" : 6, + "language" : "x-unspecified" + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 1, + "%MEMBERS" : [ 2, 3, 4, 5, 6 ] + } + } +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug-typesystem.xml new file mode 100644 index 0000000..07e327a --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug-typesystem.xml @@ -0,0 +1,17 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + 
uima.cas.String + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug.xmi new file mode 100644 index 0000000..0087d72 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug.xmi @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/data.json new file mode 100644 index 0000000..56784fe --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/data.json @@ -0,0 +1,39 @@ +{ + "%TYPES" : { }, + "%FEATURE_STRUCTURES" : [ { + "%ID" : 1, + "%TYPE" : "uima.cas.Sofa", + "sofaNum" : 1, + "sofaID" : "_InitialView" + }, { + "%ID" : 2, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 0, + "end" : 4 + }, { + "%ID" : 3, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 5, + "end" : 7 + }, { + "%ID" : 4, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 8, + "end" : 9 + }, { + "%ID" : 5, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 10, + "end" : 14 + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 1, + "%MEMBERS" : [ 2, 3, 4, 5 ] + } + } +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug-typesystem.xml new file mode 100644 index 0000000..07e327a --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug-typesystem.xml @@ -0,0 +1,17 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + diff --git 
a/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug.xmi new file mode 100644 index 0000000..37c1e9b --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug.xmi @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/emptyCas/data.json b/tests/test_files/json/fs_as_array/ser-ref/emptyCas/data.json new file mode 100644 index 0000000..fcd8582 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/emptyCas/data.json @@ -0,0 +1,9 @@ +{ + "%TYPES" : { }, + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 1, + "%MEMBERS" : [ ] + } + } +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug-typesystem.xml new file mode 100644 index 0000000..07e327a --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug-typesystem.xml @@ -0,0 +1,17 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug.xmi new file mode 100644 index 0000000..6fd88bd --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug.xmi @@ -0,0 +1,3 @@ + + + diff --git a/tests/test_json.py b/tests/test_json.py index 0765ca3..cc02c56 100644 --- a/tests/test_json.py +++ b/tests/test_json.py @@ -1,28 +1,92 @@ import json +from cassis.typesystem import TYPE_NAME_ANNOTATION from tests.fixtures import * from tests.test_files.test_cas_generators import MultiFeatureRandomCasGenerator, MultiTypeRandomCasGenerator from tests.util import assert_json_equal -FIXTURE_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "test_files") +FIXTURE_DIR = 
os.path.join(os.path.dirname(os.path.realpath(__file__)), "test_files", "json", "fs_as_array", "ser-ref") FIXTURES = [ - (os.path.join(FIXTURE_DIR, "json", "fs_as_array", "ser-ref", "casWithSofaDataArray")), - (os.path.join(FIXTURE_DIR, "json", "fs_as_array", "ser-ref", "casWithSofaDataURI")), - (os.path.join(FIXTURE_DIR, "json", "fs_as_array", "ser-ref", "casWithText")), - (os.path.join(FIXTURE_DIR, "json", "fs_as_array", "ser-ref", "casWithTextAndAnnotation")), + (os.path.join(FIXTURE_DIR, "casWithSofaDataArray"), []), + (os.path.join(FIXTURE_DIR, "casWithSofaDataURI"), []), + (os.path.join(FIXTURE_DIR, "casWithText"), [["uima.tcas.DocumentAnnotation", 0, 15, "This is a test."]]), + ( + os.path.join(FIXTURE_DIR, "casWithoutTextButWithAnnotations"), + [ + ["uima.tcas.Annotation", 0, 4, None], + ["uima.tcas.Annotation", 5, 7, None], + ["uima.tcas.Annotation", 8, 9, None], + ["uima.tcas.Annotation", 10, 14, None], + ], + ), + ( + os.path.join(FIXTURE_DIR, "casWithTextAndAnnotations"), + [ + ["uima.tcas.Annotation", 0, 4, "This"], + ["uima.tcas.Annotation", 5, 7, "is"], + ["uima.tcas.Annotation", 8, 9, "a"], + ["uima.tcas.Annotation", 10, 14, "test"], + ["uima.tcas.DocumentAnnotation", 0, 14, "This is a test"], + ], + ), + ( + os.path.join(FIXTURE_DIR, "casWithEmojiUnicodeTextAndAnnotations"), + [ + ["uima.tcas.Annotation", 0, 1, "🥳", b"\xf0\x9f\xa5\xb3"], + ["uima.tcas.Annotation", 2, 6, "This"], + [ + "uima.tcas.Annotation", + 7, + 12, + "👳🏻\u200d♀️", + b"\xf0\x9f\x91\xb3\xf0\x9f\x8f\xbb\xe2\x80\x8d\xe2\x99\x80\xef\xb8\x8f", + ], + ["uima.tcas.Annotation", 13, 15, "is"], + ["uima.tcas.Annotation", 16, 17, "✆", b"\xe2\x9c\x86"], + ["uima.tcas.Annotation", 18, 19, "a"], + [ + "uima.tcas.Annotation", + 20, + 25, + "🧔🏾\u200d♂️", + b"\xf0\x9f\xa7\x94\xf0\x9f\x8f\xbe\xe2\x80\x8d\xe2\x99\x82\xef\xb8\x8f", + ], + ["uima.tcas.Annotation", 26, 30, "test"], + ["uima.tcas.Annotation", 31, 32, "👻", b"\xf0\x9f\x91\xbb"], + ["uima.tcas.DocumentAnnotation", 0, 32, "🥳 This 
👳🏻\u200d♀️ is ✆ a 🧔🏾\u200d♂️ test 👻"], + ], + ), + ( + os.path.join(FIXTURE_DIR, "casWithLeftToRightTextAndAnnotations"), + [ + ["uima.tcas.Annotation", 0, 3, "هذا"], + ["uima.tcas.Annotation", 4, 10, "اختبار"], + ["uima.tcas.DocumentAnnotation", 0, 10, "هذا اختبار"], + ], + ), + ( + os.path.join(FIXTURE_DIR, "casWithTraditionalChineseTextAndAnnotations"), + [ + ["uima.tcas.Annotation", 0, 1, "這"], + ["uima.tcas.Annotation", 1, 2, "是"], + ["uima.tcas.Annotation", 2, 4, "一個"], + ["uima.tcas.Annotation", 4, 6, "測試"], + ["uima.tcas.DocumentAnnotation", 0, 6, "這是一個測試"], + ], + ), ] -@pytest.mark.parametrize("json_path", FIXTURES) -def test_deserialization_serialization(json_path): +@pytest.mark.parametrize("json_path, annotations", FIXTURES) +def test_deserialization_serialization(json_path, annotations): with open(os.path.join(json_path, "data.json"), "rb") as f: cas = load_cas_from_json(f) with open(os.path.join(json_path, "data.json"), "rb") as f: expected_json = json.load(f) - actual_json = cas.to_json() + actual_json = cas.to_json(pretty_print=True) assert_json_equal(actual_json, expected_json, sort_keys=True) @@ -56,3 +120,34 @@ def test_multi_feature_random_serialization_deserialization(): actual_json = loaded_cas.to_json() assert_json_equal(actual_json, expected_json) + + +@pytest.mark.parametrize("json_path, annotations", FIXTURES) +def test_unicode(json_path, annotations): + with open(os.path.join(json_path, "data.json"), "rb") as f: + cas = load_cas_from_json(f) + + actual_annotations = [ + [a.type.name, a.begin, a.end, a.get_covered_text()] + for a in sorted(cas.select(TYPE_NAME_ANNOTATION), key=lambda k: k.type.name) + ] + expected_annotations = [a[0:4] for a in annotations] + assert actual_annotations == expected_annotations + + for i in range(0, len(annotations)): + expected = annotations[i] + actual = actual_annotations[i] + + expected_covered_text = expected[3] + actual_covered_text = actual[3] + + if not expected_covered_text: + continue + + for n in 
range(len(actual_covered_text)): + print(f"{n}: [{actual_covered_text[n]}] {hex(ord(actual_covered_text[n]))}") + + if len(expected) >= 5: + expected_utf8_bytes = expected[4] + actual_utf8_bytes = bytes(actual_covered_text, "UTF-8") + assert actual_utf8_bytes == expected_utf8_bytes From 381a7ec2d2b0189d439cf233f82c3320deb08346 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Fri, 10 Sep 2021 15:38:22 +0200 Subject: [PATCH 16/22] #209 - Parsing an array that was serialized using multipleReferencesAllowed=true fails - Fixed problem by checking the multipleReferencesAllowed feature during deserialization - Added test --- cassis/xmi.py | 4 +-- tests/fixtures.py | 28 +++++++++++++++++++ ...ystem_with_multiple_references_allowed.xml | 19 +++++++++++++ ...ltiple_references_allowed_string_array.xmi | 22 +++++++++++++++ tests/test_xmi.py | 8 ++++-- 5 files changed, 77 insertions(+), 4 deletions(-) create mode 100644 tests/test_files/typesystems/typesystem_with_multiple_references_allowed.xml create mode 100644 tests/test_files/xmi/cas_with_multiple_references_allowed_string_array.xmi diff --git a/cassis/xmi.py b/cassis/xmi.py index f402a28..c05dd2e 100644 --- a/cassis/xmi.py +++ b/cassis/xmi.py @@ -203,7 +203,7 @@ def deserialize(self, source: Union[IO, str], typesystem: TypeSystem, lenient: b elif typesystem.is_primitive_array(fs.type) and feature_name == "elements": # Separately rendered arrays (typically used with multipleReferencesAllowed = True) fs[feature_name] = self._parse_primitive_array(fs.type, value) - elif typesystem.is_primitive_array(feature.rangeType): + elif typesystem.is_primitive_array(feature.rangeType) and not feature.multipleReferencesAllowed: # Array feature rendered inline (multipleReferencesAllowed = False|None) # We also end up here for array features that were rendered as child elements. No need to parse # them again, so we check if the value is still a string (i.e. 
attribute value) and only then @@ -337,7 +337,7 @@ def _parse_feature_structure(self, typesystem: TypeSystem, elem, children: Dict[ return AnnotationType(**attributes) def _parse_primitive_array(self, type_: Type, value: str) -> List: - """Primitive collections are serialized as white space seperated primitive values""" + """Primitive collections are serialized as white space separated primitive values""" # TODO: Use type name global variable here instead of hardcoded string literal elements = value.split(" ") diff --git a/tests/fixtures.py b/tests/fixtures.py index 3a670f3..9cde40c 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -104,6 +104,20 @@ def cas_with_empty_array_references_xmi(cas_with_empty_array_references_path): return f.read() +# CAS with multipleReferencesAllowed=true on string array + + +@pytest.fixture +def cas_with_multiple_references_allowed_string_array_path(): + return os.path.join(FIXTURE_DIR, "xmi", "cas_with_multiple_references_allowed_string_array.xmi") + + +@pytest.fixture +def cas_with_multiple_references_allowed_string_array_xmi(cas_with_multiple_references_allowed_string_array_path): + with open(cas_with_multiple_references_allowed_string_array_path, "r") as f: + return f.read() + + # CAS with reserved names @@ -273,6 +287,20 @@ def typesystem_with_collections_xml(typesystem_with_collections_path): return f.read() +# CAS with multipleReferencesAllowed=true on string array + + +@pytest.fixture +def typesystem_with_multiple_references_allowed_path(): + return os.path.join(FIXTURE_DIR, "typesystems", "typesystem_with_multiple_references_allowed.xml") + + +@pytest.fixture +def typesystem_with_multiple_references_allowed_xml(typesystem_with_multiple_references_allowed_path): + with open(typesystem_with_multiple_references_allowed_path, "r") as f: + return f.read() + + # DKPro types diff --git a/tests/test_files/typesystems/typesystem_with_multiple_references_allowed.xml 
b/tests/test_files/typesystems/typesystem_with_multiple_references_allowed.xml new file mode 100644 index 0000000..530a0f3 --- /dev/null +++ b/tests/test_files/typesystems/typesystem_with_multiple_references_allowed.xml @@ -0,0 +1,19 @@ + + + + + test.type + + uima.tcas.Annotation + + + target + + uima.cas.StringArray + uima.cas.String + true + + + + + diff --git a/tests/test_files/xmi/cas_with_multiple_references_allowed_string_array.xmi b/tests/test_files/xmi/cas_with_multiple_references_allowed_string_array.xmi new file mode 100644 index 0000000..41577b0 --- /dev/null +++ b/tests/test_files/xmi/cas_with_multiple_references_allowed_string_array.xmi @@ -0,0 +1,22 @@ + + + + + + + + + + + LNC + MTH + SNOMEDCT_US + + + + + diff --git a/tests/test_xmi.py b/tests/test_xmi.py index 49b5875..39ed338 100644 --- a/tests/test_xmi.py +++ b/tests/test_xmi.py @@ -28,6 +28,10 @@ pytest.lazy_fixture("cas_has_fs_with_no_namespace_xmi"), pytest.lazy_fixture("typesystem_has_types_with_no_namespace_xml"), ), + ( + pytest.lazy_fixture("cas_with_multiple_references_allowed_string_array_xmi"), + pytest.lazy_fixture("typesystem_with_multiple_references_allowed_xml"), + ), ] @@ -294,14 +298,14 @@ def test_offsets_work_for_empty_sofastring(): # Leniency -def test_leniency_type_not_in_typeystem_lenient(cas_with_leniency_xmi, small_typesystem_xml): +def test_leniency_type_not_in_typesystem_lenient(cas_with_leniency_xmi, small_typesystem_xml): typesystem = load_typesystem(small_typesystem_xml) with pytest.warns(UserWarning): cas = load_cas_from_xmi(cas_with_leniency_xmi, typesystem=typesystem, lenient=True) -def test_leniency_type_not_in_typeystem_not_lenient(cas_with_leniency_xmi, small_typesystem_xml): +def test_leniency_type_not_in_typesystem_not_lenient(cas_with_leniency_xmi, small_typesystem_xml): typesystem = load_typesystem(small_typesystem_xml) with pytest.raises(TypeNotFoundError): From fbcda8e46c75d0bbb7fd9e1455e43b37321f238c Mon Sep 17 00:00:00 2001 From: Richard Eckart de 
Castilho Date: Fri, 10 Sep 2021 15:43:46 +0200 Subject: [PATCH 17/22] #168 - Experimental JSON CAS support - Better check whether adding a TextIOWrapper is necessary during serialization - Fixed bad access to element type name - Formatting --- cassis/json.py | 6 +++--- tests/performance.py | 20 ++++++++++++++++---- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/cassis/json.py b/cassis/json.py index d4e2fee..370188f 100644 --- a/cassis/json.py +++ b/cassis/json.py @@ -1,7 +1,7 @@ import base64 import json from collections import OrderedDict -from io import TextIOWrapper +from io import TextIOBase, TextIOWrapper from cassis.cas import NAME_DEFAULT_SOFA, Cas, IdGenerator, Sofa, View from cassis.typesystem import * @@ -268,7 +268,7 @@ def serialize( json_fs = self._serialize_feature_structure(fs) feature_structures.append(json_fs) - if isinstance(sink, BytesIO): + if not isinstance(sink, TextIOBase): sink = TextIOWrapper(sink, encoding="utf-8", write_through=True) if sink: @@ -316,7 +316,7 @@ def _serialize_feature(self, json_type, feature: Feature): json_feature[MULTIPLE_REFERENCES_ALLOWED_FIELD] = feature.multipleReferencesAllowed if feature.elementType is not None: - json_feature[ELEMENT_TYPE_FIELD] = self._to_external_type_name(feature.elementType) + json_feature[ELEMENT_TYPE_FIELD] = self._to_external_type_name(feature.elementType.name) return json_feature diff --git a/tests/performance.py b/tests/performance.py index aaff08a..69575de 100644 --- a/tests/performance.py +++ b/tests/performance.py @@ -13,8 +13,12 @@ typesystem = generator.generate_type_system() randomized_cas = generator.generate_cas(typesystem) + randomized_cas_xmi = randomized_cas.to_xmi() +randomized_cas_xmi_bytes = randomized_cas_xmi.encode("utf-8") + randomized_cas_json = randomized_cas.to_json() +randomized_cas_json_bytes = randomized_cas_json.encode("utf-8") @pytest.mark.performance @@ -24,7 +28,9 @@ def test_xmi_serialization_performance(): randomized_cas.to_xmi() end = 
timer() - print(f"XMI: Serializing {iterations} CASes took {end - start} seconds") + print( + f"XMI: Serializing {iterations} CASes with {generator.size} each took {end - start} seconds ({len(randomized_cas_xmi_bytes)} bytes each)" + ) @pytest.mark.performance @@ -34,7 +40,9 @@ def test_json_serialization_performance(): randomized_cas.to_json() end = timer() - print(f"JSON: Serializing {iterations} CASes took {end - start} seconds") + print( + f"JSON: Serializing {iterations} CASes with {generator.size} each took {end - start} seconds ({len(randomized_cas_json_bytes)} bytes each)" + ) @pytest.mark.performance @@ -44,7 +52,9 @@ def test_xmi_deserialization_performance(): load_cas_from_xmi(randomized_cas_xmi, typesystem) end = timer() - print(f"XMI: Deserializing {iterations} CASes took {end - start} seconds") + print( + f"XMI: Deserializing {iterations} CASes with {generator.size} each took {end - start} seconds ({len(randomized_cas_xmi_bytes)} bytes each)" + ) @pytest.mark.performance @@ -54,4 +64,6 @@ def test_json_deserialization_performance(): load_cas_from_json(randomized_cas_json, typesystem) end = timer() - print(f"JSON: Deserializing {iterations} CASes took {end - start} seconds") + print( + f"JSON: Deserializing {iterations} CASes with {generator.size} each took {end - start} seconds ({len(randomized_cas_json_bytes)} bytes each)" + ) From 1e97c37d270f1ee474f53a763176fa804f9abdb7 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Mon, 20 Sep 2021 17:17:57 +0200 Subject: [PATCH 18/22] #168 - Experimental JSON CAS support - Better test if using a TextIOWrapper is really necessary --- cassis/json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cassis/json.py b/cassis/json.py index 370188f..506df1b 100644 --- a/cassis/json.py +++ b/cassis/json.py @@ -268,7 +268,7 @@ def serialize( json_fs = self._serialize_feature_structure(fs) feature_structures.append(json_fs) - if not isinstance(sink, TextIOBase): + if sink and not 
isinstance(sink, TextIOBase): sink = TextIOWrapper(sink, encoding="utf-8", write_through=True) if sink: From 555ed6fc6e08891b483d115bdc58b6429429eb21 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Mon, 20 Sep 2021 18:35:53 +0200 Subject: [PATCH 19/22] #168 - Experimental JSON CAS support - Work around issues with cas_to_comparable_text and FSArrays --- cassis/util.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cassis/util.py b/cassis/util.py index 3bbac33..c54de57 100644 --- a/cassis/util.py +++ b/cassis/util.py @@ -7,7 +7,7 @@ import attr from cassis import Cas -from cassis.typesystem import FEATURE_BASE_NAME_SOFA, TYPE_NAME_ANNOTATION, FeatureStructure, Type +from cassis.typesystem import FEATURE_BASE_NAME_SOFA, TYPE_NAME_ANNOTATION, FeatureStructure, Type, TypeSystem, is_array _EXCLUDED_FEATURES = {FEATURE_BASE_NAME_SOFA} _NULL_VALUE = "" @@ -21,7 +21,7 @@ def cas_to_comparable_text( covered_text: bool = True, ) -> [str, None]: indexed_feature_structures = _get_indexed_feature_structures(cas) - all_feature_structures_by_type = _group_feature_structures_by_type(cas._find_all_fs(seeds)) + all_feature_structures_by_type = _group_feature_structures_by_type(cas._find_all_fs(seeds=seeds)) types_sorted = sorted(all_feature_structures_by_type.keys()) fs_id_to_anchor = _generate_anchors( cas, types_sorted, all_feature_structures_by_type, indexed_feature_structures, mark_indexed=mark_indexed @@ -32,6 +32,11 @@ def cas_to_comparable_text( csv_writer = csv.writer(out, dialect=csv.unix_dialect) for t in types_sorted: + # FIXME This avoids problems with FSArrays which are indexed in a view - need to write a test case for + # FSArrays that are in the index and have as elements another set of FSArrays ... 
+ if is_array(t): + continue + type_ = cas.typesystem.get_type(t) csv_writer.writerow([type_.name]) From ceeabb74b63e2c4012abe8e1dea666370b71ec03 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Mon, 27 Sep 2021 17:29:52 +0200 Subject: [PATCH 20/22] #168 - Experimental JSON CAS support - Support for floating point special values in JSON - Support for not serializing the full type system in JSON but only the minimal or none at all --- cassis/cas.py | 21 ++- cassis/json.py | 120 +++++++++++++++--- cassis/typesystem.py | 46 ++++++- .../data.json | 102 +++++++++++++++ .../debug-typesystem.xml | 74 +++++++++++ .../debug.xmi | 7 + .../typesystem.xml | 74 +++++++++++ tests/test_json.py | 1 + tests/test_typesystem.py | 33 +++++ 9 files changed, 453 insertions(+), 25 deletions(-) create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/data.json create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/debug-typesystem.xml create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/debug.xmi create mode 100644 tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/typesystem.xml diff --git a/cassis/cas.py b/cassis/cas.py index a65dcce..75d3af1 100644 --- a/cassis/cas.py +++ b/cassis/cas.py @@ -8,7 +8,7 @@ from attr import validators from sortedcontainers import SortedKeyList -from cassis.typesystem import TYPE_NAME_SOFA, FeatureStructure, TypeCheckError, TypeSystem +from cassis.typesystem import TYPE_NAME_SOFA, FeatureStructure, TypeCheckError, TypeSystem, TypeSystemMode _validator_optional_string = validators.optional(validators.instance_of(str)) @@ -582,22 +582,33 @@ def to_xmi(self, path: Union[str, Path, None] = None, pretty_print: bool = False return self._serialize(CasXmiSerializer(), path, pretty_print=pretty_print) def to_json( - self, path: Union[str, Path, None] = None, pretty_print: bool = False, ensure_ascii=False + 
self, + path: Union[str, Path, None] = None, + pretty_print: bool = False, + ensure_ascii=False, + type_system_mode: TypeSystemMode = TypeSystemMode.FULL, ) -> Optional[str]: """Creates a JSON representation of this CAS. Args: path: File path, if `None` is provided the result is returned as a string pretty_print: `True` if the resulting JSON should be pretty-printed, else `False` - + ensure_ascii: Whether to escape non-ASCII Unicode characters or not + type_system_mode: Whether to serialize the full type system (`FULL`), only the types used (`MINIMAL`), or no + type system information at all (`NONE`) Returns: If `path` is None, then the JSON representation of this CAS is returned as a string - """ from cassis.json import CasJsonSerializer - return self._serialize(CasJsonSerializer(), path, pretty_print=pretty_print, ensure_ascii=ensure_ascii) + return self._serialize( + CasJsonSerializer(), + path, + pretty_print=pretty_print, + ensure_ascii=ensure_ascii, + type_system_mode=type_system_mode, + ) def _serialize(self, serializer, path: Union[str, Path, None] = None, **kwargs): """Runs this CAS through the given serializer. 
diff --git a/cassis/json.py b/cassis/json.py index 506df1b..da9d66d 100644 --- a/cassis/json.py +++ b/cassis/json.py @@ -1,12 +1,17 @@ import base64 import json +import math from collections import OrderedDict from io import TextIOBase, TextIOWrapper +from math import isnan from cassis.cas import NAME_DEFAULT_SOFA, Cas, IdGenerator, Sofa, View from cassis.typesystem import * RESERVED_FIELD_PREFIX = "%" +REF_FEATURE_PREFIX = "@" +NUMBER_FEATURE_PREFIX = "#" +ANCHOR_FEATURE_PREFIX = "^" TYPE_FIELD = RESERVED_FIELD_PREFIX + "TYPE" RANGE_FIELD = RESERVED_FIELD_PREFIX + "RANGE" TYPES_FIELD = RESERVED_FIELD_PREFIX + "TYPES" @@ -15,7 +20,6 @@ VIEW_SOFA_FIELD = RESERVED_FIELD_PREFIX + "SOFA" VIEW_MEMBERS_FIELD = RESERVED_FIELD_PREFIX + "MEMBERS" FEATURE_STRUCTURES_FIELD = RESERVED_FIELD_PREFIX + "FEATURE_STRUCTURES" -REF_FEATURE_PREFIX = "@" NAME_FIELD = RESERVED_FIELD_PREFIX + "NAME" SUPER_TYPE_FIELD = RESERVED_FIELD_PREFIX + "SUPER_TYPE" DESCRIPTION_FIELD = RESERVED_FIELD_PREFIX + "DESCRIPTION" @@ -26,6 +30,11 @@ FLAG_DOCUMENT_ANNOTATION = "DocumentAnnotation" ARRAY_SUFFIX = "[]" ELEMENTS_FIELD = RESERVED_FIELD_PREFIX + "ELEMENTS" +NAN_VALUE = "NaN" +POSITIVE_INFINITE_VALUE = "Infinity" +POSITIVE_INFINITE_VALUE_ABBR = "Inf" +NEGATIVE_INFINITE_VALUE = "-Infinity" +NEGATIVE_INFINITE_VALUE_ABBR = "-Inf" def load_cas_from_json(source: Union[IO, str], typesystem: TypeSystem = None) -> Cas: @@ -193,6 +202,9 @@ def fix_up(elements): if key.startswith(REF_FEATURE_PREFIX): ref_features[key[1:]] = value attributes.pop(key) + if key.startswith(NUMBER_FEATURE_PREFIX): + attributes[key[1:]] = self._parse_float_value(value) + attributes.pop(key) self._max_xmi_id = max(attributes["xmiID"], self._max_xmi_id) fs = AnnotationType(**attributes) @@ -200,16 +212,34 @@ def fix_up(elements): self._resolve_references(fs, ref_features, feature_structures) # Map from offsets in UIMA UTF-16 based offsets to Unicode codepoints - if typesystem.is_instance_of(fs.type.name, TYPE_NAME_ANNOTATION): + if 
typesystem.is_instance_of(fs.type, TYPE_NAME_ANNOTATION): sofa = fs.sofa fs.begin = sofa._offset_converter.uima_to_cassis(fs.begin) fs.end = sofa._offset_converter.uima_to_cassis(fs.end) return fs + def _parse_float_value(self, value: Union[str, float]) -> float: + if isinstance(value, float): + return value + elif value == NAN_VALUE: + return float("nan") + elif value == POSITIVE_INFINITE_VALUE or value == POSITIVE_INFINITE_VALUE_ABBR: + return float("inf") + elif value == NEGATIVE_INFINITE_VALUE or value == NEGATIVE_INFINITE_VALUE_ABBR: + return float("-inf") + + raise ValueError( + f"Illegal floating point value [{value}]. Must be a float literal or one of {NAN_VALUE}, " + f"{POSITIVE_INFINITE_VALUE}, {POSITIVE_INFINITE_VALUE_ABBR}, {NEGATIVE_INFINITE_VALUE}, or " + f"{NEGATIVE_INFINITE_VALUE_ABBR}" + ) + def _parse_primitive_array(self, type_name: str, elements: [list, str]) -> List: - if type_name == TYPE_NAME_BYTE_ARRAY: + if elements and type_name == TYPE_NAME_BYTE_ARRAY: return base64.b64decode(elements) + if elements and (type_name == TYPE_NAME_FLOAT_ARRAY or type_name == TYPE_NAME_DOUBLE_ARRAY): + return [self._parse_float_value(v) for v in elements] else: return elements @@ -242,21 +272,19 @@ def __init__(self): pass def serialize( - self, sink: Union[IO, str, None], cas: Cas, pretty_print: bool = True, ensure_ascii: bool = False + self, + sink: Union[IO, str, None], + cas: Cas, + pretty_print: bool = True, + ensure_ascii: bool = False, + type_system_mode: TypeSystemMode = TypeSystemMode.FULL, ) -> Union[str, None]: - data = {} - types = data[TYPES_FIELD] = {} - views = data[VIEWS_FIELD] = {} - feature_structures = data[FEATURE_STRUCTURES_FIELD] = [] - - for type_ in cas.typesystem.get_types(): - if type_.name == TYPE_NAME_DOCUMENT_ANNOTATION: - continue - json_type = self._serialize_type(type_) - types[json_type[NAME_FIELD]] = json_type + feature_structures = [] + views = {} for view in cas.views: views[view.sofa.sofaID] = self._serialize_view(view) + 
if view.sofa.sofaArray: json_sofa_array_fs = self._serialize_feature_structure(view.sofa.sofaArray) feature_structures.append(json_sofa_array_fs) @@ -264,17 +292,52 @@ def serialize( feature_structures.append(json_sofa_fs) # Find all fs, even the ones that are not directly added to a sofa + used_types = set() for fs in sorted(cas._find_all_fs(include_inlinable_arrays=True), key=lambda a: a.xmiID): + used_types.add(fs.type) json_fs = self._serialize_feature_structure(fs) feature_structures.append(json_fs) + types = None + if type_system_mode is not TypeSystemMode.NONE: + types = {} + + if type_system_mode is TypeSystemMode.MINIMAL: + # Build transitive closure of used types by following parents, features, etc. + types_to_include = cas.typesystem.transitive_closure(used_types) + elif type_system_mode is TypeSystemMode.FULL: + types_to_include = cas.typesystem.get_types() + + for type_ in types_to_include: + if type_.name == TYPE_NAME_DOCUMENT_ANNOTATION: + continue + json_type = self._serialize_type(type_) + types[json_type[NAME_FIELD]] = json_type + + data = {} + if types is not None: + data[TYPES_FIELD] = types + if feature_structures is not None: + data[FEATURE_STRUCTURES_FIELD] = feature_structures + if views is not None: + data[VIEWS_FIELD] = views + if sink and not isinstance(sink, TextIOBase): sink = TextIOWrapper(sink, encoding="utf-8", write_through=True) if sink: - json.dump(data, sink, sort_keys=False, indent=2 if pretty_print else None, ensure_ascii=ensure_ascii) + json.dump( + data, + sink, + sort_keys=False, + indent=2 if pretty_print else None, + ensure_ascii=ensure_ascii, + allow_nan=False, + ) else: - return json.dumps(data, sort_keys=False, indent=2 if pretty_print else None, ensure_ascii=ensure_ascii) + return json.dumps( + data, sort_keys=False, indent=2 if pretty_print else None, ensure_ascii=ensure_ascii, allow_nan=False + ) if isinstance(sink, TextIOWrapper): sink.detach() # Prevent TextIOWrapper from closing the BytesIO @@ -288,9 +351,11 @@ 
def _serialize_type(self, type_: Type): json_type = { NAME_FIELD: type_name, SUPER_TYPE_FIELD: supertype_name, - DESCRIPTION_FIELD: type_.description, } + if type_.description: + json_type[DESCRIPTION_FIELD] = type_.description + for feature in list(type_.features): json_feature = self._serialize_feature(json_type, feature) json_type[json_feature[NAME_FIELD]] = json_feature @@ -331,6 +396,10 @@ def _serialize_feature_structure(self, fs) -> dict: if fs.elements: json_fs[ELEMENTS_FIELD] = base64.b64encode(bytes(fs.elements)).decode("ascii") return json_fs + elif type_name in {TYPE_NAME_DOUBLE_ARRAY, TYPE_NAME_FLOAT_ARRAY}: + if fs.elements: + json_fs[ELEMENTS_FIELD] = [self._serialize_float_value(e) for e in fs.elements] + return json_fs elif is_primitive_array(fs.type): if fs.elements: json_fs[ELEMENTS_FIELD] = fs.elements @@ -360,13 +429,28 @@ def _serialize_feature_structure(self, fs) -> dict: sofa: Sofa = getattr(fs, "sofa") value = sofa._offset_converter.cassis_to_uima(value) - if is_primitive(feature.rangeType): + if feature.rangeType.name in {TYPE_NAME_DOUBLE, TYPE_NAME_FLOAT}: + float_value = self._serialize_float_value(value) + if isinstance(float_value, str): + feature_name = NUMBER_FEATURE_PREFIX + feature_name + json_fs[feature_name] = self._serialize_float_value(value) + elif is_primitive(feature.rangeType): json_fs[feature_name] = value else: # We need to encode non-primitive features as a reference json_fs[REF_FEATURE_PREFIX + feature_name] = self._serialize_ref(value) return json_fs + def _serialize_float_value(self, value) -> Union[float, str]: + if isnan(value): + return NAN_VALUE + elif math.isinf(value): + if value > 0: + return POSITIVE_INFINITE_VALUE + else: + return NEGATIVE_INFINITE_VALUE + return value + def _serialize_ref(self, fs) -> int: if not fs: return None diff --git a/cassis/typesystem.py b/cassis/typesystem.py index 026daeb..c9e3244 100644 --- a/cassis/typesystem.py +++ b/cassis/typesystem.py @@ -1,10 +1,11 @@ import re import 
warnings from collections import defaultdict +from enum import Enum, auto from io import BytesIO from itertools import chain, filterfalse from pathlib import Path -from typing import IO, Any, Callable, Dict, Iterator, List, Optional, Union +from typing import IO, Any, Callable, Dict, Iterator, List, Optional, Set, Union import attr import deprecation @@ -172,6 +173,14 @@ _ARRAY_TYPES = _PRIMITIVE_ARRAY_TYPES | {"uima.cas.FSArray"} +class TypeSystemMode(Enum): + """How much type system information to include.""" + + FULL = auto() + MINIMAL = auto() + NONE = auto() + + def _string_to_valid_classname(name: str): return re.sub("[^a-zA-Z0-9_]", "_", name) @@ -402,7 +411,7 @@ def __lt__(self, other): return self.name < other.name -@attr.s(slots=True) +@attr.s(slots=True, hash=False, eq=True) class Type: """Describes types in a type system. @@ -584,6 +593,12 @@ def subsumes(self, other_type: "Type") -> bool: return False + def __hash__(self): + return hash(self.name) + + def __eq__(self, other): + return self.name == other.name + class TypeSystem: def __init__(self, add_document_annotation_type: bool = True): @@ -967,6 +982,33 @@ def _add_document_annotation_type(self): t = self.create_type(name=_DOCUMENT_ANNOTATION_TYPE, supertypeName="uima.tcas.Annotation") self.create_feature(t, name="language", rangeType="uima.cas.String") + def transitive_closure(self, seed_types: Set[Type], built_in: bool = False) -> Set[Type]: + # Build transitive closure of used types by following parents, features, etc. 
+ transitively_referenced_types = set() + openlist = [] + openlist.extend(seed_types) + while openlist: + type_ = openlist.pop(0) + + if type_ in transitively_referenced_types: + continue + + if not built_in and type_.name in _PREDEFINED_TYPES: + continue + + transitively_referenced_types.add(type_) + + if type_.supertype and type_.supertype not in transitively_referenced_types: + openlist.append(type_.supertype) + + for feature in type_.all_features: + if feature.rangeType not in transitively_referenced_types: + openlist.append(feature.rangeType) + if feature.elementType and feature.elementType not in transitively_referenced_types: + openlist.append(feature.elementType) + + return transitively_referenced_types + # Deserializing diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/data.json new file mode 100644 index 0000000..0d97fb3 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/data.json @@ -0,0 +1,102 @@ +{ + "%FEATURE_STRUCTURES": [ + { + "%ID": 1, + "%TYPE": "uima.cas.Sofa", + "sofaID": "_InitialView", + "sofaNum": 1 + }, + { + "#doubleNan": "NaN", + "#doubleNegInfinity": "-Infinity", + "#doublePosInfinity": "Infinity", + "#floatNan": "NaN", + "#floatNegInfinity": "-Infinity", + "#floatPosInfinity": "Infinity", + "%ID": 1, + "%TYPE": "SpecialValuesType", + "doubleOne": 1.0, + "doubleZero": 0.0, + "floatOne": 1.0, + "floatZero": 0.0 + }, + { + "%ELEMENTS": [ + 0.0, + 1.0, + "-Infinity", + "Infinity", + "NaN" + ], + "%ID": 2, + "%TYPE": "uima.cas.DoubleArray" + }, + { + "%ELEMENTS": [ + 0.0, + 1.0, + "-Infinity", + "Infinity", + "NaN" + ], + "%ID": 3, + "%TYPE": "uima.cas.FloatArray" + } + ], + "%TYPES": { + "SpecialValuesType": { + "%NAME": "SpecialValuesType", + "%SUPER_TYPE": "uima.cas.TOP", + "doubleNan": { + "%NAME": "doubleNan", + "%RANGE": "uima.cas.Double" + }, + "doubleNegInfinity": 
{ + "%NAME": "doubleNegInfinity", + "%RANGE": "uima.cas.Double" + }, + "doubleOne": { + "%NAME": "doubleOne", + "%RANGE": "uima.cas.Double" + }, + "doublePosInfinity": { + "%NAME": "doublePosInfinity", + "%RANGE": "uima.cas.Double" + }, + "doubleZero": { + "%NAME": "doubleZero", + "%RANGE": "uima.cas.Double" + }, + "floatNan": { + "%NAME": "floatNan", + "%RANGE": "uima.cas.Float" + }, + "floatNegInfinity": { + "%NAME": "floatNegInfinity", + "%RANGE": "uima.cas.Float" + }, + "floatOne": { + "%NAME": "floatOne", + "%RANGE": "uima.cas.Float" + }, + "floatPosInfinity": { + "%NAME": "floatPosInfinity", + "%RANGE": "uima.cas.Float" + }, + "floatZero": { + "%NAME": "floatZero", + "%RANGE": "uima.cas.Float" + } + } + }, + "%VIEWS": { + "_InitialView": { + "%MEMBERS": [ + 1, + 2, + 3 + ], + "%SOFA": 1 + } + } +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/debug-typesystem.xml new file mode 100644 index 0000000..9a8766d --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/debug-typesystem.xml @@ -0,0 +1,74 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + SpecialValuesType + + uima.cas.TOP + + + doubleZero + + uima.cas.Double + + + doubleOne + + uima.cas.Double + + + doublePosInfinity + + uima.cas.Double + + + doubleNegInfinity + + uima.cas.Double + + + doubleNan + + uima.cas.Double + + + floatZero + + uima.cas.Float + + + floatOne + + uima.cas.Float + + + floatPosInfinity + + uima.cas.Float + + + floatNegInfinity + + uima.cas.Float + + + floatNan + + uima.cas.Float + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/debug.xmi new file mode 100644 index 0000000..e02d4cb 
--- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/debug.xmi @@ -0,0 +1,7 @@ + + + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/typesystem.xml new file mode 100644 index 0000000..9a8766d --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/typesystem.xml @@ -0,0 +1,74 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + SpecialValuesType + + uima.cas.TOP + + + doubleZero + + uima.cas.Double + + + doubleOne + + uima.cas.Double + + + doublePosInfinity + + uima.cas.Double + + + doubleNegInfinity + + uima.cas.Double + + + doubleNan + + uima.cas.Double + + + floatZero + + uima.cas.Float + + + floatOne + + uima.cas.Float + + + floatPosInfinity + + uima.cas.Float + + + floatNegInfinity + + uima.cas.Float + + + floatNan + + uima.cas.Float + + + + + diff --git a/tests/test_json.py b/tests/test_json.py index cc02c56..3889b11 100644 --- a/tests/test_json.py +++ b/tests/test_json.py @@ -10,6 +10,7 @@ FIXTURES = [ (os.path.join(FIXTURE_DIR, "casWithSofaDataArray"), []), (os.path.join(FIXTURE_DIR, "casWithSofaDataURI"), []), + (os.path.join(FIXTURE_DIR, "casWithFloatingPointSpecialValues"), []), (os.path.join(FIXTURE_DIR, "casWithText"), [["uima.tcas.DocumentAnnotation", 0, 15, "This is a test."]]), ( os.path.join(FIXTURE_DIR, "casWithoutTextButWithAnnotations"), diff --git a/tests/test_typesystem.py b/tests/test_typesystem.py index 82f8264..7d3a70a 100644 --- a/tests/test_typesystem.py +++ b/tests/test_typesystem.py @@ -7,9 +7,14 @@ from cassis.typesystem import ( _COLLECTION_TYPES, TOP_TYPE_NAME, + TYPE_NAME_ANNOTATION, + TYPE_NAME_ANNOTATION_BASE, + TYPE_NAME_ARRAY_BASE, TYPE_NAME_BOOLEAN, TYPE_NAME_INTEGER, + TYPE_NAME_SOFA, TYPE_NAME_STRING, + TYPE_NAME_STRING_ARRAY, TYPE_NAME_TOP, 
Feature, TypeCheckError, @@ -861,3 +866,31 @@ def test_create_same_type_twice_fails(): typesystem.create_type("my.Type") with pytest.raises(ValueError): typesystem.create_type("my.Type") + + +def test_transitive_closure(): + typesystem = TypeSystem() + base_type = typesystem.create_type("BaseType", supertypeName=TYPE_NAME_ANNOTATION) + child_type = typesystem.create_type("ChildType", supertypeName="BaseType") + typesystem.create_feature("ChildType", "primitiveFeature", TYPE_NAME_STRING) + typesystem.create_feature("ChildType", "arrayFeature", TYPE_NAME_STRING_ARRAY, elementType=TYPE_NAME_STRING) + typesystem.create_feature("ChildType", "fsFeature", "BaseType") + + transitive_closure_without_builtins = typesystem.transitive_closure({child_type}, built_in=False) + + assert transitive_closure_without_builtins == {base_type, child_type} + + transitive_closure_with_builtins = typesystem.transitive_closure({child_type}, built_in=True) + + assert transitive_closure_with_builtins == { + base_type, + child_type, + typesystem.get_type(TYPE_NAME_TOP), + typesystem.get_type(TYPE_NAME_ANNOTATION_BASE), + typesystem.get_type(TYPE_NAME_ANNOTATION), + typesystem.get_type(TYPE_NAME_STRING), + typesystem.get_type(TYPE_NAME_ARRAY_BASE), + typesystem.get_type(TYPE_NAME_STRING_ARRAY), + typesystem.get_type(TYPE_NAME_INTEGER), + typesystem.get_type(TYPE_NAME_SOFA), + } From 03b14afc4101c233b8e1be3da8170b25c83b07eb Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Wed, 29 Sep 2021 11:17:04 +0200 Subject: [PATCH 21/22] #192 - Cleanup stuff - Run pyupgrade --- tests/test_typesystem.py | 8 ++++---- tests/test_xmi.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_typesystem.py b/tests/test_typesystem.py index 96746d5..e2b9df2 100644 --- a/tests/test_typesystem.py +++ b/tests/test_typesystem.py @@ -292,7 +292,7 @@ def test_is_instance_of(child_name: str, parent_name: str, expected: bool): # manually load the type system path = 
os.path.join(FIXTURE_DIR, "typesystems", "important_dkpro_types.xml") - with open(path, "r") as f: + with open(path) as f: ts = load_typesystem(f.read()) assert ts.is_instance_of(child_name, parent_name) == expected @@ -648,7 +648,7 @@ def test_that_typesystem_with_redefined_documentation_annotation_works( ], ) def test_that_merging_compatible_typesystem_works(name, rangeTypeName, elementType, multipleReferencesAllowed): - with open(typesystem_merge_base_path(), "r") as f: + with open(typesystem_merge_base_path()) as f: base = load_typesystem(f.read()) ts = TypeSystem() @@ -682,7 +682,7 @@ def test_that_merging_compatible_typesystem_works(name, rangeTypeName, elementTy ], ) def test_that_merging_incompatible_typesystem_throws(name, rangeTypeName, elementType, multipleReferencesAllowed): - with open(typesystem_merge_base_path(), "r") as f: + with open(typesystem_merge_base_path()) as f: base = load_typesystem(f.read()) ts = TypeSystem() @@ -697,7 +697,7 @@ def test_that_merging_incompatible_typesystem_throws(name, rangeTypeName, elemen with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=UserWarning) - with pytest.raises(ValueError, match=r".*\[{0}\].*".format(name)): + with pytest.raises(ValueError, match=fr".*\[{name}\].*"): merge_typesystems(base, ts) diff --git a/tests/test_xmi.py b/tests/test_xmi.py index 17c7e6e..5c5de01 100644 --- a/tests/test_xmi.py +++ b/tests/test_xmi.py @@ -181,7 +181,7 @@ def test_serializing_cas_to_file_path(tmpdir, xmi, typesystem_xml): cas.to_xmi(path) - with open(path, "r") as actual: + with open(path) as actual: assert_xml_equal(actual.read(), xmi) From e89ada4bf40d09be22072d48e256adb31fb01bb1 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Sun, 12 Dec 2021 21:37:17 +0100 Subject: [PATCH 22/22] #168 - Experimental JSON CAS support - Added mention about non-final status in README file --- README.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/README.rst b/README.rst index aac0fe2..cbb308d 
100644 --- a/README.rst +++ b/README.rst @@ -59,6 +59,7 @@ Some features are still under development, e.g. - Proper type checking - XML/XMI schema validation +- UIMA JSON CAS support (the format is not yet finalized) Installation ------------