diff --git a/.gitignore b/.gitignore index 0d13f2a..e2de877 100644 --- a/.gitignore +++ b/.gitignore @@ -221,3 +221,5 @@ expected.xml difference.diff xml_issue.py +actual.json +expected.json diff --git a/Makefile b/Makefile index 6f2a1b3..d5528c4 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ PYTHON_FILES = cassis tests test: - python -m pytest tests/ + python -m pytest -m "not performance" tests/ format: black -l 120 cassis/ diff --git a/README.rst b/README.rst index aac0fe2..cbb308d 100644 --- a/README.rst +++ b/README.rst @@ -59,6 +59,7 @@ Some features are still under development, e.g. - Proper type checking - XML/XMI schema validation +- UIMA JSON CAS support (the format is not yet finalized) Installation ------------ diff --git a/cassis/__init__.py b/cassis/__init__.py index 712c73a..9a5239a 100644 --- a/cassis/__init__.py +++ b/cassis/__init__.py @@ -1,6 +1,7 @@ """UIMA CAS processing library in Python.""" from .cas import Cas, Sofa, View +from .json import load_cas_from_json from .typesystem import TypeSystem, load_dkpro_core_typesystem, load_typesystem, merge_typesystems from .util import cas_to_comparable_text from .xmi import load_cas_from_xmi @@ -14,5 +15,6 @@ "load_dkpro_core_typesystem", "merge_typesystems", "load_cas_from_xmi", + "load_cas_from_json", "cas_to_comparable_text", ] diff --git a/cassis/cas.py b/cassis/cas.py index ec42fa6..d786543 100644 --- a/cassis/cas.py +++ b/cassis/cas.py @@ -1,6 +1,5 @@ import sys from collections import defaultdict -from io import BytesIO from pathlib import Path from typing import Dict, Iterable, List, Optional, Tuple, Union @@ -10,10 +9,12 @@ from sortedcontainers import SortedKeyList from cassis.typesystem import TYPE_NAME_SOFA, FeatureStructure, TypeCheckError, TypeSystem, TYPE_NAME_FS_LIST, \ - TYPE_NAME_FS_ARRAY, FEATURE_BASE_NAME_HEAD + TYPE_NAME_FS_ARRAY, FEATURE_BASE_NAME_HEAD, TypeSystemMode _validator_optional_string = validators.optional(validators.instance_of(str)) +NAME_DEFAULT_SOFA = 
"_InitialView" + class IdGenerator: def __init__(self, initial_id: int = 1): @@ -107,6 +108,9 @@ class Sofa: #: str: The sofa URI, it references remote sofa data sofaURI = attr.ib(default=None, validator=_validator_optional_string) + #: str: The sofa data byte array + sofaArray = attr.ib(default=None) + #: OffsetConverter: Converts from UIMA UTF-16 based offsets to Unicode codepoint offsets and back _offset_converter = attr.ib(factory=OffsetConverter, eq=False, hash=False) @@ -543,6 +547,25 @@ def sofa_uri(self, value: str): """ self.get_sofa().sofaURI = value + @property + def sofa_array(self) -> str: + """The sofa byte array references a uima.cas.ByteArray feature structure + + Returns: The sofa data byte array. + + """ + return self.get_sofa().sofaArray + + @sofa_array.setter + def sofa_array(self, value): + """Sets the sofa byte array to the given uima.cas.ByteArray feature structure. + + Args: + value: The new sofa byte array feature structure. + + """ + self.get_sofa().sofaArray = value + def to_xmi(self, path: Union[str, Path, None] = None, pretty_print: bool = False) -> Optional[str]: """Creates a XMI representation of this CAS. @@ -557,19 +580,57 @@ def to_xmi(self, path: Union[str, Path, None] = None, pretty_print: bool = False """ from cassis.xmi import CasXmiSerializer - serializer = CasXmiSerializer() + return self._serialize(CasXmiSerializer(), path, pretty_print=pretty_print) + def to_json( + self, + path: Union[str, Path, None] = None, + pretty_print: bool = False, + ensure_ascii=False, + type_system_mode: TypeSystemMode = TypeSystemMode.FULL, + ) -> Optional[str]: + """Creates a JSON representation of this CAS. 
+ + Args: + path: File path, if `None` is provided the result is returned as a string + pretty_print: `True` if the resulting JSON should be pretty-printed, else `False` + ensure_ascii: Whether to escape non-ASCII Unicode characters or not + type_system_mode: Whether to serialize the full type system (`FULL`), only the types used (`MINIMAL`), or no + type system information at all (`NONE`) + + Returns: + If `path` is None, then the JSON representation of this CAS is returned as a string + """ + from cassis.json import CasJsonSerializer + + return self._serialize( + CasJsonSerializer(), + path, + pretty_print=pretty_print, + ensure_ascii=ensure_ascii, + type_system_mode=type_system_mode, + ) + + def _serialize(self, serializer, path: Union[str, Path, None] = None, **kwargs): + """Runs this CAS through the given serializer. + + Args: + path: File path, if `None` is provided the result is returned as a string + + + Returns: + If `path` is None, then the data representation of this CAS is returned as a string + + """ # If `path` is None, then serialize to a string and return it if path is None: - sink = BytesIO() - serializer.serialize(sink, self, pretty_print=pretty_print) - return sink.getvalue().decode("utf-8") + return serializer.serialize(None, self, **kwargs) elif isinstance(path, str): with open(path, "wb") as f: - serializer.serialize(f, self, pretty_print=pretty_print) + serializer.serialize(f, self, **kwargs) elif isinstance(path, Path): with path.open("wb") as f: - serializer.serialize(f, self, pretty_print=pretty_print) + serializer.serialize(f, self, **kwargs) else: raise TypeError(f"`path` needs to be one of [str, None, Path], but was <{type(path)}>") diff --git a/cassis/json.py b/cassis/json.py new file mode 100644 index 0000000..ecb1c37 --- /dev/null +++ b/cassis/json.py @@ -0,0 +1,469 @@ +import base64 +import json +import math +from collections import OrderedDict +from io import TextIOBase, TextIOWrapper +from math import isnan + +from cassis.cas
import NAME_DEFAULT_SOFA, Cas, IdGenerator, Sofa, View +from cassis.typesystem import * + +RESERVED_FIELD_PREFIX = "%" +REF_FEATURE_PREFIX = "@" +NUMBER_FEATURE_PREFIX = "#" +ANCHOR_FEATURE_PREFIX = "^" +TYPE_FIELD = RESERVED_FIELD_PREFIX + "TYPE" +RANGE_FIELD = RESERVED_FIELD_PREFIX + "RANGE" +TYPES_FIELD = RESERVED_FIELD_PREFIX + "TYPES" +FEATURES_FIELD = RESERVED_FIELD_PREFIX + "FEATURES" +VIEWS_FIELD = RESERVED_FIELD_PREFIX + "VIEWS" +VIEW_SOFA_FIELD = RESERVED_FIELD_PREFIX + "SOFA" +VIEW_MEMBERS_FIELD = RESERVED_FIELD_PREFIX + "MEMBERS" +FEATURE_STRUCTURES_FIELD = RESERVED_FIELD_PREFIX + "FEATURE_STRUCTURES" +NAME_FIELD = RESERVED_FIELD_PREFIX + "NAME" +SUPER_TYPE_FIELD = RESERVED_FIELD_PREFIX + "SUPER_TYPE" +DESCRIPTION_FIELD = RESERVED_FIELD_PREFIX + "DESCRIPTION" +ELEMENT_TYPE_FIELD = RESERVED_FIELD_PREFIX + "ELEMENT_TYPE" +MULTIPLE_REFERENCES_ALLOWED_FIELD = RESERVED_FIELD_PREFIX + "MULTIPLE_REFERENCES_ALLOWED" +ID_FIELD = RESERVED_FIELD_PREFIX + "ID" +FLAGS_FIELD = RESERVED_FIELD_PREFIX + "FLAGS" +FLAG_DOCUMENT_ANNOTATION = "DocumentAnnotation" +ARRAY_SUFFIX = "[]" +ELEMENTS_FIELD = RESERVED_FIELD_PREFIX + "ELEMENTS" +NAN_VALUE = "NaN" +POSITIVE_INFINITE_VALUE = "Infinity" +POSITIVE_INFINITE_VALUE_ABBR = "Inf" +NEGATIVE_INFINITE_VALUE = "-Infinity" +NEGATIVE_INFINITE_VALUE_ABBR = "-Inf" + + +def load_cas_from_json(source: Union[IO, str], typesystem: TypeSystem = None) -> Cas: + """Loads a CAS from a JSON source. + + Args: + source: The JSON source. If `source` is a string, then it is assumed to be a JSON string. + If `source` is a file-like object, then the data is read from it. + typesystem: The type system that belongs to this CAS. If `None`, an empty type system is provided. + Types declared in the `%TYPES` section of the JSON document are merged into this type system + before the feature structures are parsed.
+ + Returns: + The deserialized CAS + + """ + if typesystem is None: + typesystem = TypeSystem() + + deserializer = CasJsonDeserializer() + return deserializer.deserialize(source, typesystem=typesystem) + + +class CasJsonDeserializer: + def __init__(self): + self._max_xmi_id = 0 + self._max_sofa_num = 0 + self._post_processors = [] + + def deserialize(self, source: Union[IO, str], typesystem: Optional[TypeSystem] = None) -> Cas: + if isinstance(source, str): + data = json.loads(source) + else: + data = json.load(source) + + self._max_xmi_id = 0 + self._max_sofa_num = 0 + self._post_processors = [] + + embedded_typesystem = TypeSystem() + json_typesystem = data.get(TYPES_FIELD) + for type_name, json_type in json_typesystem.items(): + self._parse_type(embedded_typesystem, type_name, json_type) + + typesystem = merge_typesystems(typesystem, embedded_typesystem) + + cas = Cas(typesystem=typesystem) + + feature_structures = {} + json_feature_structures = data.get(FEATURE_STRUCTURES_FIELD) + if isinstance(json_feature_structures, list): + for json_fs in json_feature_structures: + if json_fs.get(TYPE_FIELD) == TYPE_NAME_SOFA: + fs_id = json_fs.get(ID_FIELD) + fs = self._parse_sofa(cas, fs_id, json_fs, feature_structures) + else: + fs_id = json_fs.get(ID_FIELD) + fs = self._parse_feature_structure(typesystem, fs_id, json_fs, feature_structures) + feature_structures[fs.xmiID] = fs + + if isinstance(json_feature_structures, dict): + for fs_id, json_fs in json_feature_structures.items(): + if json_fs.get(TYPE_FIELD) == TYPE_NAME_SOFA: + fs_id = int(fs_id) + fs = self._parse_sofa(cas, fs_id, json_fs, feature_structures) + else: + fs_id = int(fs_id) + fs = self._parse_feature_structure(typesystem, fs_id, json_fs, feature_structures) + feature_structures[fs.xmiID] = fs + + for post_processor in self._post_processors: + post_processor() + + cas._xmi_id_generator = IdGenerator(self._max_xmi_id + 1) + cas._sofa_num_generator = IdGenerator(self._max_sofa_num + 1) + + # At this point 
all views for which we have a sofa with a known ID and sofaNum have already been created + # as part of parsing the feature structures. Thus, if there are any views remaining that are only declared + # in the views section, we just create them with auto-assigned IDs + json_views = data.get(VIEWS_FIELD) + for view_name, json_view in json_views.items(): + self._parse_view(cas, view_name, json_view, feature_structures) + + return cas + + def _parse_type(self, typesystem: TypeSystem, type_name: str, json_type: Dict[str, any]): + super_type_name = json_type[SUPER_TYPE_FIELD] + description = json_type.get(DESCRIPTION_FIELD) + new_type = typesystem.create_type(type_name, super_type_name, description=description) + + for key, json_feature in json_type.items(): + if key.startswith(RESERVED_FIELD_PREFIX): + continue + typesystem.create_feature( + new_type, + name=key, + rangeType=json_feature[RANGE_FIELD], + description=json_feature.get(DESCRIPTION_FIELD), + elementType=json_feature.get(ELEMENT_TYPE_FIELD), + multipleReferencesAllowed=json_feature.get(MULTIPLE_REFERENCES_ALLOWED_FIELD), + ) + + def _get_or_create_view( + self, cas: Cas, view_name: str, fs_id: Optional[int] = None, sofa_num: Optional[int] = None + ) -> Cas: + if view_name == NAME_DEFAULT_SOFA: + view = cas.get_view(NAME_DEFAULT_SOFA) + + # We need to make sure that the sofa gets the real xmi, see #155 + if fs_id is not None: + view.get_sofa().xmiID = fs_id + + return view + else: + return cas.create_view(view_name, xmiID=fs_id, sofaNum=sofa_num) + + def _parse_view(self, cas: Cas, view_name: str, json_view: Dict[str, any], feature_structures: Dict[str, any]): + view = self._get_or_create_view(cas, view_name) + for member_id in json_view[VIEW_MEMBERS_FIELD]: + fs = feature_structures[member_id] + view.add_annotation(fs, keep_id=True) + + def _parse_sofa(self, cas: Cas, fs_id: int, json_fs: Dict[str, any], feature_structures: Dict[int, any]) -> Sofa: + view = self._get_or_create_view( + cas, 
json_fs.get(FEATURE_BASE_NAME_SOFAID), fs_id, json_fs.get(FEATURE_BASE_NAME_SOFANUM) + ) + + view.sofa_string = json_fs.get(FEATURE_BASE_NAME_SOFASTRING) + view.sofa_mime = json_fs.get(FEATURE_BASE_NAME_SOFAMIME) + view.sofa_uri = json_fs.get(FEATURE_BASE_NAME_SOFAURI) + view.sofa_array = feature_structures.get(json_fs.get(REF_FEATURE_PREFIX + FEATURE_BASE_NAME_SOFAARRAY)) + + return view.get_sofa() + + def _parse_feature_structure( + self, typesystem: TypeSystem, fs_id: int, json_fs: Dict[str, any], feature_structures: Dict[int, any] + ): + AnnotationType = typesystem.get_type(json_fs.get(TYPE_FIELD)) + + attributes = dict(json_fs) + + # Map the JSON FS ID to xmiID + attributes["xmiID"] = fs_id + + # Remap features that use a reserved Python name + if "self" in attributes: + attributes["self_"] = attributes.pop("self") + + if "type" in attributes: + attributes["type_"] = attributes.pop("type") + + if typesystem.is_primitive_array(AnnotationType.name): + attributes["elements"] = self._parse_primitive_array(AnnotationType.name, json_fs.get(ELEMENTS_FIELD)) + elif AnnotationType.name == TYPE_NAME_FS_ARRAY: + # Resolve id-ref at the end of processing + def fix_up(elements): + return lambda: setattr(fs, "elements", [feature_structures.get(e) for e in elements]) + + self._post_processors.append(fix_up(json_fs.get(ELEMENTS_FIELD))) + + self._strip_reserved_json_keys(attributes) + + ref_features = {} + for key, value in list(attributes.items()): + if key.startswith(REF_FEATURE_PREFIX): + ref_features[key[1:]] = value + attributes.pop(key) + if key.startswith(NUMBER_FEATURE_PREFIX): + attributes[key[1:]] = self._parse_float_value(value) + attributes.pop(key) + + self._max_xmi_id = max(attributes["xmiID"], self._max_xmi_id) + fs = AnnotationType(**attributes) + + self._resolve_references(fs, ref_features, feature_structures) + + # Map from offsets in UIMA UTF-16 based offsets to Unicode codepoints + if typesystem.is_instance_of(fs.type, TYPE_NAME_ANNOTATION): + sofa = 
fs.sofa + fs.begin = sofa._offset_converter.uima_to_cassis(fs.begin) + fs.end = sofa._offset_converter.uima_to_cassis(fs.end) + + return fs + + def _parse_float_value(self, value: Union[str, float]) -> float: + if isinstance(value, float): + return value + elif value == NAN_VALUE: + return float("nan") + elif value == POSITIVE_INFINITE_VALUE or value == POSITIVE_INFINITE_VALUE_ABBR: + return float("inf") + elif value == NEGATIVE_INFINITE_VALUE or value == NEGATIVE_INFINITE_VALUE_ABBR: + return float("-inf") + + raise ValueError( + f"Illegal floating point value [{value}]. Must be a float literal or one of {NAN_VALUE}, " + f"{POSITIVE_INFINITE_VALUE}, {POSITIVE_INFINITE_VALUE_ABBR}, {NEGATIVE_INFINITE_VALUE}, or " + f"{NEGATIVE_INFINITE_VALUE_ABBR}" + ) + + def _parse_primitive_array(self, type_name: str, elements: [list, str]) -> List: + if elements and type_name == TYPE_NAME_BYTE_ARRAY: + return base64.b64decode(elements) + if elements and (type_name == TYPE_NAME_FLOAT_ARRAY or type_name == TYPE_NAME_DOUBLE_ARRAY): + return [self._parse_float_value(v) for v in elements] + else: + return elements + + def _resolve_references(self, fs, ref_features: Dict[str, any], feature_structures: Dict[int, any]): + for key, value in ref_features.items(): + target_fs = feature_structures.get(value) + if target_fs: + # Resolve id-ref now + setattr(fs, key, target_fs) + else: + # Resolve id-ref at the end of processing + def fix_up(k, v): + return lambda: setattr(fs, k, feature_structures.get(v)) + + self._post_processors.append(fix_up(key, value)) + + def _strip_reserved_json_keys( + self, + attributes: Dict[str, any], + ): + for key in list(attributes): + if key.startswith(RESERVED_FIELD_PREFIX): + attributes.pop(key) + + +class CasJsonSerializer: + _COMMON_FIELD_NAMES = {"xmiID", "type"} + + def __init__(self): + pass + + def serialize( + self, + sink: Union[IO, str, None], + cas: Cas, + pretty_print: bool = True, + ensure_ascii: bool = False, + type_system_mode: TypeSystemMode 
= TypeSystemMode.FULL, + ) -> Union[str, None]: + feature_structures = [] + + views = {} + for view in cas.views: + views[view.sofa.sofaID] = self._serialize_view(view) + + if view.sofa.sofaArray: + json_sofa_array_fs = self._serialize_feature_structure(view.sofa.sofaArray) + feature_structures.append(json_sofa_array_fs) + json_sofa_fs = self._serialize_feature_structure(view.sofa) + feature_structures.append(json_sofa_fs) + + # Find all fs, even the ones that are not directly added to a sofa + used_types = set() + for fs in sorted(cas._find_all_fs(include_inlinable_arrays_and_lists=True), key=lambda a: a.xmiID): + used_types.add(fs.type) + json_fs = self._serialize_feature_structure(fs) + feature_structures.append(json_fs) + + types = None + if type_system_mode is not TypeSystemMode.NONE: + types = {} + + if type_system_mode is TypeSystemMode.MINIMAL: + # Build transitive closure of used types by following parents, features, etc. + types_to_include = cas.typesystem.transitive_closure(used_types) + elif type_system_mode is TypeSystemMode.FULL: + types_to_include = cas.typesystem.get_types() + + for type_ in types_to_include: + if type_.name == TYPE_NAME_DOCUMENT_ANNOTATION: + continue + json_type = self._serialize_type(type_) + types[json_type[NAME_FIELD]] = json_type + + data = {} + if types is not None: + data[TYPES_FIELD] = types + if feature_structures is not None: + data[FEATURE_STRUCTURES_FIELD] = feature_structures + if views is not None: + data[VIEWS_FIELD] = views + + if sink and not isinstance(sink, TextIOBase): + sink = TextIOWrapper(sink, encoding="utf-8", write_through=True) + + if sink: + json.dump( + data, + sink, + sort_keys=False, + indent=2 if pretty_print else None, + ensure_ascii=ensure_ascii, + allow_nan=False, + ) + else: + return json.dumps( + data, sort_keys=False, indent=2 if pretty_print else None, ensure_ascii=ensure_ascii, allow_nan=False + ) + + if isinstance(sink, TextIOWrapper): + sink.detach() # Prevent TextIOWrapper from closing the 
BytesIO + + return None + + def _serialize_type(self, type_: Type): + type_name = self._to_external_type_name(type_.name) + supertype_name = self._to_external_type_name(type_.supertype.name) + + json_type = { + NAME_FIELD: type_name, + SUPER_TYPE_FIELD: supertype_name, + } + + if type_.description: + json_type[DESCRIPTION_FIELD] = type_.description + + for feature in list(type_.features): + json_feature = self._serialize_feature(json_type, feature) + json_type[json_feature[NAME_FIELD]] = json_feature + + return json_type + + def _serialize_feature(self, json_type, feature: Feature): + # If the feature name is a reserved name like `self`, then we added an + # underscore to it before so Python can handle it. We now need to remove it. + feature_name = feature.name + if feature._has_reserved_name: + feature_name = feature_name[:-1] + + json_feature = { + NAME_FIELD: feature_name, + RANGE_FIELD: self._to_external_type_name(feature.rangeType.name), + } + + if feature.description: + json_feature[DESCRIPTION_FIELD] = feature.description + + if feature.multipleReferencesAllowed is not None: + json_feature[MULTIPLE_REFERENCES_ALLOWED_FIELD] = feature.multipleReferencesAllowed + + if feature.elementType is not None: + json_feature[ELEMENT_TYPE_FIELD] = self._to_external_type_name(feature.elementType.name) + + return json_feature + + def _serialize_feature_structure(self, fs) -> dict: + type_name = fs.type.name + + json_fs = OrderedDict() + json_fs[ID_FIELD] = fs.xmiID + json_fs[TYPE_FIELD] = type_name + + if type_name == TYPE_NAME_BYTE_ARRAY: + if fs.elements: + json_fs[ELEMENTS_FIELD] = base64.b64encode(bytes(fs.elements)).decode("ascii") + return json_fs + elif type_name in {TYPE_NAME_DOUBLE_ARRAY, TYPE_NAME_FLOAT_ARRAY}: + if fs.elements: + json_fs[ELEMENTS_FIELD] = [self._serialize_float_value(e) for e in fs.elements] + return json_fs + elif is_primitive_array(fs.type): + if fs.elements: + json_fs[ELEMENTS_FIELD] = fs.elements + return json_fs + elif TYPE_NAME_FS_ARRAY == 
type_name: + if fs.elements: + json_fs[ELEMENTS_FIELD] = [self._serialize_ref(e) for e in fs.elements] + return json_fs + + for feature in fs.type.all_features: + if feature.name in CasJsonSerializer._COMMON_FIELD_NAMES: + continue + + feature_name = feature.name + + # Strip the underscore we added for reserved names + if feature._has_reserved_name: + feature_name = feature.name[:-1] + + # Skip over 'None' features + value = getattr(fs, feature.name) + if value is None: + continue + + # Map back from offsets in Unicode codepoints to UIMA UTF-16 based offsets + if feature.domainType.name == TYPE_NAME_ANNOTATION and feature_name == "begin" or feature_name == "end": + sofa: Sofa = getattr(fs, "sofa") + value = sofa._offset_converter.cassis_to_uima(value) + + if feature.rangeType.name in {TYPE_NAME_DOUBLE, TYPE_NAME_FLOAT}: + float_value = self._serialize_float_value(value) + if isinstance(float_value, str): + feature_name = NUMBER_FEATURE_PREFIX + feature_name + json_fs[feature_name] = self._serialize_float_value(value) + elif is_primitive(feature.rangeType): + json_fs[feature_name] = value + else: + # We need to encode non-primitive features as a reference + json_fs[REF_FEATURE_PREFIX + feature_name] = self._serialize_ref(value) + return json_fs + + def _serialize_float_value(self, value) -> Union[float, str]: + if isnan(value): + return NAN_VALUE + elif math.isinf(value): + if value > 0: + return POSITIVE_INFINITE_VALUE + else: + return NEGATIVE_INFINITE_VALUE + return value + + def _serialize_ref(self, fs) -> int: + if not fs: + return None + + return fs.xmiID + + def _serialize_view(self, view: View): + return { + VIEW_SOFA_FIELD: view.sofa.xmiID, + VIEW_MEMBERS_FIELD: sorted(x.xmiID for x in view.get_all_annotations()), + } + + def _to_external_type_name(self, type_name: str): + if type_name.startswith("uima.noNamespace."): + return type_name.replace("uima.noNamespace.", "") + return type_name diff --git a/cassis/typesystem.py b/cassis/typesystem.py index 
2957a0d..12d8122 100644 --- a/cassis/typesystem.py +++ b/cassis/typesystem.py @@ -1,6 +1,7 @@ import re import warnings from collections import defaultdict +from enum import Enum, auto from io import BytesIO from itertools import chain, filterfalse from pathlib import Path @@ -188,6 +189,14 @@ _LIST_TYPES = _PRIMITIVE_LIST_TYPES | {TYPE_NAME_FS_LIST} +class TypeSystemMode(Enum): + """How much type system information to include.""" + + FULL = auto() + MINIMAL = auto() + NONE = auto() + + def _string_to_valid_classname(name: str): return re.sub("[^a-zA-Z0-9_]", "_", name) diff --git a/cassis/xmi.py b/cassis/xmi.py index e2a41fd..fc6f2d7 100644 --- a/cassis/xmi.py +++ b/cassis/xmi.py @@ -475,7 +475,7 @@ def __init__(self): self._urls_to_prefixes = {} self._duplicate_namespaces = defaultdict(int) - def serialize(self, sink: Union[IO, str], cas: Cas, pretty_print=True): + def serialize(self, sink: Union[IO, str, None], cas: Cas, pretty_print=True) -> Union[str, None]: xmi_attrs = {"{http://www.omg.org/XMI}version": "2.0"} root = etree.Element(etree.QName(self._nsmap["xmi"], "XMI"), nsmap=self._nsmap, **xmi_attrs) @@ -495,8 +495,17 @@ def serialize(self, sink: Union[IO, str], cas: Cas, pretty_print=True): doc = etree.ElementTree(root) etree.cleanup_namespaces(doc, top_nsmap=self._nsmap) + return_str = sink is None + if return_str: + sink = BytesIO() + doc.write(sink, xml_declaration=True, pretty_print=pretty_print, encoding="UTF-8") + if return_str: + return sink.getvalue().decode("utf-8") + + return None + def _serialize_cas_null(self, root: etree.Element): name = etree.QName(self._nsmap["cas"], "NULL") elem = etree.SubElement(root, name) diff --git a/tests/performance.py b/tests/performance.py new file mode 100644 index 0000000..69575de --- /dev/null +++ b/tests/performance.py @@ -0,0 +1,69 @@ +from random import Random +from timeit import default_timer as timer + +import pytest + +from cassis import load_cas_from_json, load_cas_from_xmi +from 
tests.test_files.test_cas_generators import MultiFeatureRandomCasGenerator + +generator = MultiFeatureRandomCasGenerator() +generator.rnd = Random(123456) +generator.size = 1000 +iterations = 100 + +typesystem = generator.generate_type_system() +randomized_cas = generator.generate_cas(typesystem) + +randomized_cas_xmi = randomized_cas.to_xmi() +randomized_cas_xmi_bytes = randomized_cas_xmi.encode("utf-8") + +randomized_cas_json = randomized_cas.to_json() +randomized_cas_json_bytes = randomized_cas_json.encode("utf-8") + + +@pytest.mark.performance +def test_xmi_serialization_performance(): + start = timer() + for i in range(0, iterations): + randomized_cas.to_xmi() + end = timer() + + print( + f"XMI: Serializing {iterations} CASes with {generator.size} each took {end - start} seconds ({len(randomized_cas_xmi_bytes)} bytes each)" + ) + + +@pytest.mark.performance +def test_json_serialization_performance(): + start = timer() + for i in range(0, iterations): + randomized_cas.to_json() + end = timer() + + print( + f"JSON: Serializing {iterations} CASes with {generator.size} each took {end - start} seconds ({len(randomized_cas_json_bytes)} bytes each)" + ) + + +@pytest.mark.performance +def test_xmi_deserialization_performance(): + start = timer() + for i in range(0, iterations): + load_cas_from_xmi(randomized_cas_xmi, typesystem) + end = timer() + + print( + f"XMI: Deserializing {iterations} CASes with {generator.size} each took {end - start} seconds ({len(randomized_cas_xmi_bytes)} bytes each)" + ) + + +@pytest.mark.performance +def test_json_deserialization_performance(): + start = timer() + for i in range(0, iterations): + load_cas_from_json(randomized_cas_json, typesystem) + end = timer() + + print( + f"JSON: Deserializing {iterations} CASes with {generator.size} each took {end - start} seconds ({len(randomized_cas_json_bytes)} bytes each)" + ) diff --git a/tests/test_files/json/README.md b/tests/test_files/json/README.md new file mode 100644 index 0000000..483853a 
--- /dev/null +++ b/tests/test_files/json/README.md @@ -0,0 +1,5 @@ +Test files in this folder were sourced from + +https://github.com/apache/uima-uimaj/tree/feature/UIMA-6266-Clean-JSON-Wire-Format-for-CAS/uimaj-json/src/test/resources/CasSerializationDeserialization_JsonCas2_FsAsArray_Test/ser-ref + +Apache License 2.0 diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/data.json new file mode 100644 index 0000000..422cea5 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/data.json @@ -0,0 +1,78 @@ +{ + "%TYPES" : { }, + "%FEATURE_STRUCTURES" : [ { + "%ID" : 1, + "%TYPE" : "uima.cas.Sofa", + "sofaNum" : 1, + "sofaID" : "_InitialView", + "mimeType" : "text", + "sofaString" : "\uD83E\uDD73 This \uD83D\uDC73\uD83C\uDFFB‍♀️ is ✆ a \uD83E\uDDD4\uD83C\uDFFE‍♂️ test \uD83D\uDC7B" + }, { + "%ID" : 2, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 0, + "end" : 2 + }, { + "%ID" : 3, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 3, + "end" : 7 + }, { + "%ID" : 4, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 8, + "end" : 15 + }, { + "%ID" : 5, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 16, + "end" : 18 + }, { + "%ID" : 6, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 19, + "end" : 20 + }, { + "%ID" : 7, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 21, + "end" : 22 + }, { + "%ID" : 8, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 23, + "end" : 30 + }, { + "%ID" : 9, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 31, + "end" : 35 + }, { + "%ID" : 10, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 36, + "end" : 38 + }, { + "%ID" : 11, + "%TYPE" : "uima.tcas.DocumentAnnotation", + "@sofa" : 1, + "begin" : 0, + "end" : 38, + "language" : 
"x-unspecified" + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 1, + "%MEMBERS" : [ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 ] + } + } +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/debug-typesystem.xml new file mode 100644 index 0000000..07e327a --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/debug-typesystem.xml @@ -0,0 +1,17 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/debug.xmi new file mode 100644 index 0000000..6d8ec43 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/debug.xmi @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/data.json new file mode 100644 index 0000000..0d97fb3 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/data.json @@ -0,0 +1,102 @@ +{ + "%FEATURE_STRUCTURES": [ + { + "%ID": 1, + "%TYPE": "uima.cas.Sofa", + "sofaID": "_InitialView", + "sofaNum": 1 + }, + { + "#doubleNan": "NaN", + "#doubleNegInfinity": "-Infinity", + "#doublePosInfinity": "Infinity", + "#floatNan": "NaN", + "#floatNegInfinity": "-Infinity", + "#floatPosInfinity": "Infinity", + "%ID": 1, + "%TYPE": "SpecialValuesType", + "doubleOne": 1.0, + "doubleZero": 0.0, + "floatOne": 1.0, + "floatZero": 0.0 + }, + { + "%ELEMENTS": [ + 0.0, + 1.0, + "-Infinity", + "Infinity", + "NaN" + ], + "%ID": 2, + "%TYPE": "uima.cas.DoubleArray" + }, + { + 
"%ELEMENTS": [ + 0.0, + 1.0, + "-Infinity", + "Infinity", + "NaN" + ], + "%ID": 3, + "%TYPE": "uima.cas.FloatArray" + } + ], + "%TYPES": { + "SpecialValuesType": { + "%NAME": "SpecialValuesType", + "%SUPER_TYPE": "uima.cas.TOP", + "doubleNan": { + "%NAME": "doubleNan", + "%RANGE": "uima.cas.Double" + }, + "doubleNegInfinity": { + "%NAME": "doubleNegInfinity", + "%RANGE": "uima.cas.Double" + }, + "doubleOne": { + "%NAME": "doubleOne", + "%RANGE": "uima.cas.Double" + }, + "doublePosInfinity": { + "%NAME": "doublePosInfinity", + "%RANGE": "uima.cas.Double" + }, + "doubleZero": { + "%NAME": "doubleZero", + "%RANGE": "uima.cas.Double" + }, + "floatNan": { + "%NAME": "floatNan", + "%RANGE": "uima.cas.Float" + }, + "floatNegInfinity": { + "%NAME": "floatNegInfinity", + "%RANGE": "uima.cas.Float" + }, + "floatOne": { + "%NAME": "floatOne", + "%RANGE": "uima.cas.Float" + }, + "floatPosInfinity": { + "%NAME": "floatPosInfinity", + "%RANGE": "uima.cas.Float" + }, + "floatZero": { + "%NAME": "floatZero", + "%RANGE": "uima.cas.Float" + } + } + }, + "%VIEWS": { + "_InitialView": { + "%MEMBERS": [ + 1, + 2, + 3 + ], + "%SOFA": 1 + } + } +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/debug-typesystem.xml new file mode 100644 index 0000000..9a8766d --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/debug-typesystem.xml @@ -0,0 +1,74 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + SpecialValuesType + + uima.cas.TOP + + + doubleZero + + uima.cas.Double + + + doubleOne + + uima.cas.Double + + + doublePosInfinity + + uima.cas.Double + + + doubleNegInfinity + + uima.cas.Double + + + doubleNan + + uima.cas.Double + + + floatZero + + uima.cas.Float + + + floatOne + + uima.cas.Float + + + floatPosInfinity + + 
uima.cas.Float + + + floatNegInfinity + + uima.cas.Float + + + floatNan + + uima.cas.Float + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/debug.xmi new file mode 100644 index 0000000..e02d4cb --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/debug.xmi @@ -0,0 +1,7 @@ + + + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/typesystem.xml new file mode 100644 index 0000000..9a8766d --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/typesystem.xml @@ -0,0 +1,74 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + SpecialValuesType + + uima.cas.TOP + + + doubleZero + + uima.cas.Double + + + doubleOne + + uima.cas.Double + + + doublePosInfinity + + uima.cas.Double + + + doubleNegInfinity + + uima.cas.Double + + + doubleNan + + uima.cas.Double + + + floatZero + + uima.cas.Float + + + floatOne + + uima.cas.Float + + + floatPosInfinity + + uima.cas.Float + + + floatNegInfinity + + uima.cas.Float + + + floatNan + + uima.cas.Float + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/data.json new file mode 100644 index 0000000..1944181 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/data.json @@ -0,0 +1,36 @@ +{ + "%TYPES" : { }, + "%FEATURE_STRUCTURES" : [ { + "%ID" : 1, + "%TYPE" : "uima.cas.Sofa", + "sofaNum" : 1, + "sofaID" : "_InitialView", + "mimeType" : "text", + "sofaString" : "هذا اختبار" + }, { + "%ID" : 2, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 
1, + "begin" : 0, + "end" : 3 + }, { + "%ID" : 3, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 4, + "end" : 10 + }, { + "%ID" : 4, + "%TYPE" : "uima.tcas.DocumentAnnotation", + "@sofa" : 1, + "begin" : 0, + "end" : 10, + "language" : "x-unspecified" + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 1, + "%MEMBERS" : [ 2, 3, 4 ] + } + } +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug-typesystem.xml new file mode 100644 index 0000000..07e327a --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug-typesystem.xml @@ -0,0 +1,17 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug.xmi new file mode 100644 index 0000000..108d362 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug.xmi @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json new file mode 100644 index 0000000..20d935b --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json @@ -0,0 +1,21 @@ +{ + "%TYPES" : { }, + "%FEATURE_STRUCTURES" : [ { + "%ID" : 1, + "%TYPE" : "uima.cas.ByteArray", + "%ELEMENTS" : "VGhpcyBpcyBhIHRlc3Q=" + }, { + "%ID" : 2, + "%TYPE" : "uima.cas.Sofa", + "sofaNum" : 1, + "sofaID" : "_InitialView", + "mimeType" : "text/plain", + "@sofaArray" : 1 + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 2, + "%MEMBERS" : [ ] + } + } +} \ No newline at end of file diff --git 
a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/debug-typesystem.xml new file mode 100644 index 0000000..07e327a --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/debug-typesystem.xml @@ -0,0 +1,17 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/debug.xmi new file mode 100644 index 0000000..89075f6 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/debug.xmi @@ -0,0 +1,5 @@ + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json new file mode 100644 index 0000000..0b142a8 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json @@ -0,0 +1,17 @@ +{ + "%TYPES" : { }, + "%FEATURE_STRUCTURES" : [ { + "%ID" : 1, + "%TYPE" : "uima.cas.Sofa", + "sofaNum" : 1, + "sofaID" : "_InitialView", + "mimeType" : "text/plain", + "sofaURI" : "classpath:/ProgrammaticallyCreatedCasDataSuite/document.txt" + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 1, + "%MEMBERS" : [ ] + } + } +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/debug-typesystem.xml new file mode 100644 index 0000000..07e327a --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/debug-typesystem.xml @@ -0,0 +1,17 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/debug.xmi 
b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/debug.xmi new file mode 100644 index 0000000..89966e0 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/debug.xmi @@ -0,0 +1,4 @@ + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json new file mode 100644 index 0000000..39f5ffe --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json @@ -0,0 +1,24 @@ +{ + "%TYPES" : { }, + "%FEATURE_STRUCTURES" : [ { + "%ID" : 1, + "%TYPE" : "uima.cas.Sofa", + "sofaNum" : 1, + "sofaID" : "_InitialView", + "mimeType" : "text", + "sofaString" : "This is a test." + }, { + "%ID" : 2, + "%TYPE" : "uima.tcas.DocumentAnnotation", + "@sofa" : 1, + "begin" : 0, + "end" : 15, + "language" : "x-unspecified" + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 1, + "%MEMBERS" : [ 2 ] + } + } +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithText/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithText/debug-typesystem.xml new file mode 100644 index 0000000..07e327a --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithText/debug-typesystem.xml @@ -0,0 +1,17 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithText/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithText/debug.xmi new file mode 100644 index 0000000..943df5f --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithText/debug.xmi @@ -0,0 +1,6 @@ + + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/data.json new file mode 100644 index 0000000..a9522cf --- /dev/null +++ 
b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/data.json @@ -0,0 +1,48 @@ +{ + "%TYPES" : { }, + "%FEATURE_STRUCTURES" : [ { + "%ID" : 1, + "%TYPE" : "uima.cas.Sofa", + "sofaNum" : 1, + "sofaID" : "_InitialView", + "mimeType" : "text", + "sofaString" : "This is a test" + }, { + "%ID" : 2, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 0, + "end" : 4 + }, { + "%ID" : 3, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 5, + "end" : 7 + }, { + "%ID" : 4, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 8, + "end" : 9 + }, { + "%ID" : 5, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 10, + "end" : 14 + }, { + "%ID" : 6, + "%TYPE" : "uima.tcas.DocumentAnnotation", + "@sofa" : 1, + "begin" : 0, + "end" : 14, + "language" : "x-unspecified" + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 1, + "%MEMBERS" : [ 2, 3, 4, 5, 6 ] + } + } +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug-typesystem.xml new file mode 100644 index 0000000..07e327a --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug-typesystem.xml @@ -0,0 +1,17 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug.xmi new file mode 100644 index 0000000..37c1e9b --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug.xmi @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/data.json new file mode 
100644 index 0000000..d586738 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/data.json @@ -0,0 +1,48 @@ +{ + "%TYPES" : { }, + "%FEATURE_STRUCTURES" : [ { + "%ID" : 1, + "%TYPE" : "uima.cas.Sofa", + "sofaNum" : 1, + "sofaID" : "_InitialView", + "mimeType" : "text", + "sofaString" : "這是一個測試" + }, { + "%ID" : 2, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 0, + "end" : 1 + }, { + "%ID" : 3, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 1, + "end" : 2 + }, { + "%ID" : 4, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 2, + "end" : 4 + }, { + "%ID" : 5, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 4, + "end" : 6 + }, { + "%ID" : 6, + "%TYPE" : "uima.tcas.DocumentAnnotation", + "@sofa" : 1, + "begin" : 0, + "end" : 6, + "language" : "x-unspecified" + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 1, + "%MEMBERS" : [ 2, 3, 4, 5, 6 ] + } + } +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug-typesystem.xml new file mode 100644 index 0000000..07e327a --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug-typesystem.xml @@ -0,0 +1,17 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug.xmi new file mode 100644 index 0000000..0087d72 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug.xmi @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git 
a/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/data.json new file mode 100644 index 0000000..56784fe --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/data.json @@ -0,0 +1,39 @@ +{ + "%TYPES" : { }, + "%FEATURE_STRUCTURES" : [ { + "%ID" : 1, + "%TYPE" : "uima.cas.Sofa", + "sofaNum" : 1, + "sofaID" : "_InitialView" + }, { + "%ID" : 2, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 0, + "end" : 4 + }, { + "%ID" : 3, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 5, + "end" : 7 + }, { + "%ID" : 4, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 8, + "end" : 9 + }, { + "%ID" : 5, + "%TYPE" : "uima.tcas.Annotation", + "@sofa" : 1, + "begin" : 10, + "end" : 14 + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 1, + "%MEMBERS" : [ 2, 3, 4, 5 ] + } + } +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug-typesystem.xml new file mode 100644 index 0000000..07e327a --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug-typesystem.xml @@ -0,0 +1,17 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug.xmi new file mode 100644 index 0000000..37c1e9b --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug.xmi @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/emptyCas/data.json b/tests/test_files/json/fs_as_array/ser-ref/emptyCas/data.json 
new file mode 100644 index 0000000..fcd8582 --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/emptyCas/data.json @@ -0,0 +1,9 @@ +{ + "%TYPES" : { }, + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 1, + "%MEMBERS" : [ ] + } + } +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug-typesystem.xml new file mode 100644 index 0000000..07e327a --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug-typesystem.xml @@ -0,0 +1,17 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + diff --git a/tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug.xmi new file mode 100644 index 0000000..6fd88bd --- /dev/null +++ b/tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug.xmi @@ -0,0 +1,3 @@ + + + diff --git a/tests/test_json.py b/tests/test_json.py new file mode 100644 index 0000000..3889b11 --- /dev/null +++ b/tests/test_json.py @@ -0,0 +1,154 @@ +import json + +from cassis.typesystem import TYPE_NAME_ANNOTATION +from tests.fixtures import * +from tests.test_files.test_cas_generators import MultiFeatureRandomCasGenerator, MultiTypeRandomCasGenerator +from tests.util import assert_json_equal + +FIXTURE_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "test_files", "json", "fs_as_array", "ser-ref") + +FIXTURES = [ + (os.path.join(FIXTURE_DIR, "casWithSofaDataArray"), []), + (os.path.join(FIXTURE_DIR, "casWithSofaDataURI"), []), + (os.path.join(FIXTURE_DIR, "casWithFloatingPointSpecialValues"), []), + (os.path.join(FIXTURE_DIR, "casWithText"), [["uima.tcas.DocumentAnnotation", 0, 15, "This is a test."]]), + ( + os.path.join(FIXTURE_DIR, "casWithoutTextButWithAnnotations"), + [ + ["uima.tcas.Annotation", 0, 4, None], + ["uima.tcas.Annotation", 5, 7, None], + ["uima.tcas.Annotation", 8, 9, 
None], + ["uima.tcas.Annotation", 10, 14, None], + ], + ), + ( + os.path.join(FIXTURE_DIR, "casWithTextAndAnnotations"), + [ + ["uima.tcas.Annotation", 0, 4, "This"], + ["uima.tcas.Annotation", 5, 7, "is"], + ["uima.tcas.Annotation", 8, 9, "a"], + ["uima.tcas.Annotation", 10, 14, "test"], + ["uima.tcas.DocumentAnnotation", 0, 14, "This is a test"], + ], + ), + ( + os.path.join(FIXTURE_DIR, "casWithEmojiUnicodeTextAndAnnotations"), + [ + ["uima.tcas.Annotation", 0, 1, "🥳", b"\xf0\x9f\xa5\xb3"], + ["uima.tcas.Annotation", 2, 6, "This"], + [ + "uima.tcas.Annotation", + 7, + 12, + "👳🏻\u200d♀️", + b"\xf0\x9f\x91\xb3\xf0\x9f\x8f\xbb\xe2\x80\x8d\xe2\x99\x80\xef\xb8\x8f", + ], + ["uima.tcas.Annotation", 13, 15, "is"], + ["uima.tcas.Annotation", 16, 17, "✆", b"\xe2\x9c\x86"], + ["uima.tcas.Annotation", 18, 19, "a"], + [ + "uima.tcas.Annotation", + 20, + 25, + "🧔🏾\u200d♂️", + b"\xf0\x9f\xa7\x94\xf0\x9f\x8f\xbe\xe2\x80\x8d\xe2\x99\x82\xef\xb8\x8f", + ], + ["uima.tcas.Annotation", 26, 30, "test"], + ["uima.tcas.Annotation", 31, 32, "👻", b"\xf0\x9f\x91\xbb"], + ["uima.tcas.DocumentAnnotation", 0, 32, "🥳 This 👳🏻\u200d♀️ is ✆ a 🧔🏾\u200d♂️ test 👻"], + ], + ), + ( + os.path.join(FIXTURE_DIR, "casWithLeftToRightTextAndAnnotations"), + [ + ["uima.tcas.Annotation", 0, 3, "هذا"], + ["uima.tcas.Annotation", 4, 10, "اختبار"], + ["uima.tcas.DocumentAnnotation", 0, 10, "هذا اختبار"], + ], + ), + ( + os.path.join(FIXTURE_DIR, "casWithTraditionalChineseTextAndAnnotations"), + [ + ["uima.tcas.Annotation", 0, 1, "這"], + ["uima.tcas.Annotation", 1, 2, "是"], + ["uima.tcas.Annotation", 2, 4, "一個"], + ["uima.tcas.Annotation", 4, 6, "測試"], + ["uima.tcas.DocumentAnnotation", 0, 6, "這是一個測試"], + ], + ), +] + + +@pytest.mark.parametrize("json_path, annotations", FIXTURES) +def test_deserialization_serialization(json_path, annotations): + with open(os.path.join(json_path, "data.json"), "rb") as f: + cas = load_cas_from_json(f) + + with open(os.path.join(json_path, "data.json"), "rb") as f: + 
expected_json = json.load(f) + + actual_json = cas.to_json(pretty_print=True) + + assert_json_equal(actual_json, expected_json, sort_keys=True) + + +def test_multi_type_random_serialization_deserialization(): + generator = MultiTypeRandomCasGenerator() + for i in range(0, 10): + generator.size = (i + 1) * 10 + generator.type_count = i + 1 + typesystem = generator.generate_type_system() + randomized_cas = generator.generate_cas(typesystem) + print(f"CAS size: {sum(len(view.get_all_annotations()) for view in randomized_cas.views)}") + expected_json = randomized_cas.to_json() + + loaded_cas = load_cas_from_json(expected_json) + actual_json = loaded_cas.to_json() + + assert_json_equal(actual_json, expected_json) + + +def test_multi_feature_random_serialization_deserialization(): + generator = MultiFeatureRandomCasGenerator() + for i in range(0, 10): + generator.size = (i + 1) * 10 + typesystem = generator.generate_type_system() + randomized_cas = generator.generate_cas(typesystem) + print(f"CAS size: {sum(len(view.get_all_annotations()) for view in randomized_cas.views)}") + expected_json = randomized_cas.to_json() + + loaded_cas = load_cas_from_json(expected_json) + actual_json = loaded_cas.to_json() + + assert_json_equal(actual_json, expected_json) + + +@pytest.mark.parametrize("json_path, annotations", FIXTURES) +def test_unicode(json_path, annotations): + with open(os.path.join(json_path, "data.json"), "rb") as f: + cas = load_cas_from_json(f) + + actual_annotations = [ + [a.type.name, a.begin, a.end, a.get_covered_text()] + for a in sorted(cas.select(TYPE_NAME_ANNOTATION), key=lambda k: k.type.name) + ] + expected_annotations = [a[0:4] for a in annotations] + assert actual_annotations == expected_annotations + + for i in range(0, len(annotations)): + expected = annotations[i] + actual = actual_annotations[i] + + expected_covered_text = expected[3] + actual_covered_text = actual[3] + + if not expected_covered_text: + continue + + for n in 
range(len(actual_covered_text)): + print(f"{n}: [{actual_covered_text[n]}] {hex(ord(actual_covered_text[n]))}") + + if len(expected) >= 5: + expected_utf8_bytes = expected[4] + actual_utf8_bytes = bytes(actual_covered_text, "UTF-8") + assert actual_utf8_bytes == expected_utf8_bytes diff --git a/tests/test_xmi.py b/tests/test_xmi.py index 8adc317..ac290d5 100644 --- a/tests/test_xmi.py +++ b/tests/test_xmi.py @@ -185,7 +185,7 @@ def test_serializing_cas_to_file_path(tmpdir, xmi, typesystem_xml): cas.to_xmi(path) - with open(path, "r") as actual: + with open(path) as actual: assert_xml_equal(actual.read(), xmi) diff --git a/tests/util.py b/tests/util.py index 3f6367c..1c84f89 100644 --- a/tests/util.py +++ b/tests/util.py @@ -1,4 +1,5 @@ import difflib +import json from typing import IO, Union import lxml_asserts @@ -39,6 +40,45 @@ def assert_xml_equal(actual: Union[IO, str], expected: Union[IO, str]): raise e +def assert_json_equal(actual: str, expected: Union[IO, str], sort_keys: bool = False): + """Checks whether the JSON trees behind `actual` and `expected` are equal. 
+ + Args: + actual: The actual JSON + expected: The expected JSON + + Throws: + AssertionError when json(actual) != json(expected) + """ + if isinstance(actual, str): + actual = json.loads(actual) + + if isinstance(expected, str): + expected = json.loads(expected) + + actual_json = json.dumps(actual, sort_keys=sort_keys, indent=2) + expected_json = json.dumps(expected, sort_keys=sort_keys, indent=2) + + try: + assert actual_json == expected_json + except AssertionError as e: + # For debugging purposes, the trees are saved to later inspect their contents + with open("actual.json", "w") as f: + f.write(actual_json) + + with open("expected.json", "w") as f: + f.write(expected_json) + + with open("difference.diff", "w") as f: + diff = difflib.unified_diff( + actual_json.splitlines(), expected_json.splitlines(), fromfile="Actual", tofile="Expected" + ) + diff_string = "\n".join(diff) + f.write(diff_string) + + raise e + + def _to_etree(source: Union[IO, str]) -> etree.Element: parser = etree.XMLParser(remove_blank_text=True)