diff --git a/.gitignore b/.gitignore
index 0d13f2a..e2de877 100644
--- a/.gitignore
+++ b/.gitignore
@@ -221,3 +221,5 @@ expected.xml
difference.diff
xml_issue.py
+actual.json
+expected.json
diff --git a/Makefile b/Makefile
index 6f2a1b3..d5528c4 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
PYTHON_FILES = cassis tests
test:
- python -m pytest tests/
+ python -m pytest -m "not performance" tests/
format:
black -l 120 cassis/
diff --git a/README.rst b/README.rst
index aac0fe2..cbb308d 100644
--- a/README.rst
+++ b/README.rst
@@ -59,6 +59,7 @@ Some features are still under development, e.g.
- Proper type checking
- XML/XMI schema validation
+- UIMA JSON CAS support (the format is not yet finalized)
Installation
------------
diff --git a/cassis/__init__.py b/cassis/__init__.py
index 712c73a..9a5239a 100644
--- a/cassis/__init__.py
+++ b/cassis/__init__.py
@@ -1,6 +1,7 @@
"""UIMA CAS processing library in Python."""
from .cas import Cas, Sofa, View
+from .json import load_cas_from_json
from .typesystem import TypeSystem, load_dkpro_core_typesystem, load_typesystem, merge_typesystems
from .util import cas_to_comparable_text
from .xmi import load_cas_from_xmi
@@ -14,5 +15,6 @@
"load_dkpro_core_typesystem",
"merge_typesystems",
"load_cas_from_xmi",
+ "load_cas_from_json",
"cas_to_comparable_text",
]
diff --git a/cassis/cas.py b/cassis/cas.py
index ec42fa6..d786543 100644
--- a/cassis/cas.py
+++ b/cassis/cas.py
@@ -1,6 +1,5 @@
import sys
from collections import defaultdict
-from io import BytesIO
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple, Union
@@ -10,10 +9,12 @@
from sortedcontainers import SortedKeyList
from cassis.typesystem import TYPE_NAME_SOFA, FeatureStructure, TypeCheckError, TypeSystem, TYPE_NAME_FS_LIST, \
- TYPE_NAME_FS_ARRAY, FEATURE_BASE_NAME_HEAD
+ TYPE_NAME_FS_ARRAY, FEATURE_BASE_NAME_HEAD, TypeSystemMode
_validator_optional_string = validators.optional(validators.instance_of(str))
+NAME_DEFAULT_SOFA = "_InitialView"
+
class IdGenerator:
def __init__(self, initial_id: int = 1):
@@ -107,6 +108,9 @@ class Sofa:
#: str: The sofa URI, it references remote sofa data
sofaURI = attr.ib(default=None, validator=_validator_optional_string)
+ #: str: The sofa data byte array
+ sofaArray = attr.ib(default=None)
+
#: OffsetConverter: Converts from UIMA UTF-16 based offsets to Unicode codepoint offsets and back
_offset_converter = attr.ib(factory=OffsetConverter, eq=False, hash=False)
@@ -543,6 +547,25 @@ def sofa_uri(self, value: str):
"""
self.get_sofa().sofaURI = value
+ @property
+ def sofa_array(self) -> Optional[FeatureStructure]:
+ """The sofa byte array references a uima.cas.ByteArray feature structure
+
+ Returns: The sofa data byte array feature structure, or `None` if it is not set.
+
+ """
+ return self.get_sofa().sofaArray
+
+ @sofa_array.setter
+ def sofa_array(self, value):
+ """Sets the sofa byte array to the given uima.cas.ByteArray feature structure.
+
+ Args:
+ value: The new sofa byte array feature structure.
+
+ """
+ self.get_sofa().sofaArray = value
+
def to_xmi(self, path: Union[str, Path, None] = None, pretty_print: bool = False) -> Optional[str]:
"""Creates a XMI representation of this CAS.
@@ -557,19 +580,57 @@ def to_xmi(self, path: Union[str, Path, None] = None, pretty_print: bool = False
"""
from cassis.xmi import CasXmiSerializer
- serializer = CasXmiSerializer()
+ return self._serialize(CasXmiSerializer(), path, pretty_print=pretty_print)
+ def to_json(
+ self,
+ path: Union[str, Path, None] = None,
+ pretty_print: bool = False,
+ ensure_ascii=False,
+ type_system_mode: TypeSystemMode = TypeSystemMode.FULL,
+ ) -> Optional[str]:
+ """Creates a JSON representation of this CAS.
+
+ Args:
+ path: File path, if `None` is provided the result is returned as a string
+ pretty_print: `True` if the resulting JSON should be pretty-printed, else `False`
+ ensure_ascii: Whether to escape non-ASCII Unicode characters or not
+ type_system_mode: Whether to serialize the full type system (`FULL`), only the types used (`MINIMAL`), or no
+ type system information at all (`NONE`)
+
+ Returns:
+ If `path` is None, then the JSON representation of this CAS is returned as a string
+ """
+ from cassis.json import CasJsonSerializer
+
+ return self._serialize(
+ CasJsonSerializer(),
+ path,
+ pretty_print=pretty_print,
+ ensure_ascii=ensure_ascii,
+ type_system_mode=type_system_mode,
+ )
+
+ def _serialize(self, serializer, path: Union[str, Path, None] = None, **kwargs):
+ """Runs this CAS through the given serializer.
+
+ Args:
+ path: File path, if `None` is provided the result is returned as a string
+
+
+ Returns:
+ If `path` is None, then the data representation of this CAS is returned as a string
+
+ """
# If `path` is None, then serialize to a string and return it
if path is None:
- sink = BytesIO()
- serializer.serialize(sink, self, pretty_print=pretty_print)
- return sink.getvalue().decode("utf-8")
+ return serializer.serialize(None, self, **kwargs)
elif isinstance(path, str):
with open(path, "wb") as f:
- serializer.serialize(f, self, pretty_print=pretty_print)
+ serializer.serialize(f, self, **kwargs)
elif isinstance(path, Path):
with path.open("wb") as f:
- serializer.serialize(f, self, pretty_print=pretty_print)
+ serializer.serialize(f, self, **kwargs)
else:
raise TypeError(f"`path` needs to be one of [str, None, Path], but was <{type(path)}>")
diff --git a/cassis/json.py b/cassis/json.py
new file mode 100644
index 0000000..ecb1c37
--- /dev/null
+++ b/cassis/json.py
@@ -0,0 +1,469 @@
+import base64
+import json
+import math
+from collections import OrderedDict
+from io import TextIOBase, TextIOWrapper
+from math import isnan
+
+from cassis.cas import NAME_DEFAULT_SOFA, Cas, IdGenerator, Sofa, View
+from cassis.typesystem import *
+
+RESERVED_FIELD_PREFIX = "%"
+REF_FEATURE_PREFIX = "@"
+NUMBER_FEATURE_PREFIX = "#"
+ANCHOR_FEATURE_PREFIX = "^"
+TYPE_FIELD = RESERVED_FIELD_PREFIX + "TYPE"
+RANGE_FIELD = RESERVED_FIELD_PREFIX + "RANGE"
+TYPES_FIELD = RESERVED_FIELD_PREFIX + "TYPES"
+FEATURES_FIELD = RESERVED_FIELD_PREFIX + "FEATURES"
+VIEWS_FIELD = RESERVED_FIELD_PREFIX + "VIEWS"
+VIEW_SOFA_FIELD = RESERVED_FIELD_PREFIX + "SOFA"
+VIEW_MEMBERS_FIELD = RESERVED_FIELD_PREFIX + "MEMBERS"
+FEATURE_STRUCTURES_FIELD = RESERVED_FIELD_PREFIX + "FEATURE_STRUCTURES"
+NAME_FIELD = RESERVED_FIELD_PREFIX + "NAME"
+SUPER_TYPE_FIELD = RESERVED_FIELD_PREFIX + "SUPER_TYPE"
+DESCRIPTION_FIELD = RESERVED_FIELD_PREFIX + "DESCRIPTION"
+ELEMENT_TYPE_FIELD = RESERVED_FIELD_PREFIX + "ELEMENT_TYPE"
+MULTIPLE_REFERENCES_ALLOWED_FIELD = RESERVED_FIELD_PREFIX + "MULTIPLE_REFERENCES_ALLOWED"
+ID_FIELD = RESERVED_FIELD_PREFIX + "ID"
+FLAGS_FIELD = RESERVED_FIELD_PREFIX + "FLAGS"
+FLAG_DOCUMENT_ANNOTATION = "DocumentAnnotation"
+ARRAY_SUFFIX = "[]"
+ELEMENTS_FIELD = RESERVED_FIELD_PREFIX + "ELEMENTS"
+NAN_VALUE = "NaN"
+POSITIVE_INFINITE_VALUE = "Infinity"
+POSITIVE_INFINITE_VALUE_ABBR = "Inf"
+NEGATIVE_INFINITE_VALUE = "-Infinity"
+NEGATIVE_INFINITE_VALUE_ABBR = "-Inf"
+
+
+def load_cas_from_json(source: Union[IO, str], typesystem: TypeSystem = None) -> Cas:
+ """Loads a CAS from a JSON source.
+
+ Args:
+ source: The JSON source. If `source` is a string, then it is assumed to be a JSON string.
+ If `source` is a file-like object, then the data is read from it.
+ typesystem: The type system that belongs to this CAS. If `None`, an empty type system is provided.
+ The types embedded in the JSON document are merged into this type system
+ before the CAS is created.
+
+ Returns:
+ The deserialized CAS
+
+ """
+ if typesystem is None:
+ typesystem = TypeSystem()
+
+ deserializer = CasJsonDeserializer()
+ return deserializer.deserialize(source, typesystem=typesystem)
+
+
+class CasJsonDeserializer:
+ def __init__(self):
+ self._max_xmi_id = 0
+ self._max_sofa_num = 0
+ self._post_processors = []
+
+    def deserialize(self, source: Union[IO, str], typesystem: Optional[TypeSystem] = None) -> Cas:
+        """Parse a JSON CAS from `source` and return it as a new `Cas`.
+
+        Args:
+            source: The JSON document, either as a string or as a file-like object to read from.
+            typesystem: Type system that is merged with the types embedded in the document.
+        """
+        if isinstance(source, str):
+            data = json.loads(source)
+        else:
+            data = json.load(source)
+
+        self._max_xmi_id = 0
+        self._max_sofa_num = 0
+        self._post_processors = []
+
+        # The type section is optional, e.g. it is absent if the CAS was written with `TypeSystemMode.NONE`
+        embedded_typesystem = TypeSystem()
+        for type_name, json_type in (data.get(TYPES_FIELD) or {}).items():
+            self._parse_type(embedded_typesystem, type_name, json_type)
+
+        typesystem = merge_typesystems(typesystem, embedded_typesystem)
+
+        cas = Cas(typesystem=typesystem)
+
+        # Feature structures come either as a list (each entry carries its own ID) or as a dict keyed by ID
+        json_feature_structures = data.get(FEATURE_STRUCTURES_FIELD)
+        if isinstance(json_feature_structures, list):
+            id_fs_pairs = [(json_fs.get(ID_FIELD), json_fs) for json_fs in json_feature_structures]
+        elif isinstance(json_feature_structures, dict):
+            id_fs_pairs = [(int(fs_id), json_fs) for fs_id, json_fs in json_feature_structures.items()]
+        else:
+            id_fs_pairs = []
+
+        feature_structures = {}
+        for fs_id, json_fs in id_fs_pairs:
+            if json_fs.get(TYPE_FIELD) == TYPE_NAME_SOFA:
+                fs = self._parse_sofa(cas, fs_id, json_fs, feature_structures)
+            else:
+                fs = self._parse_feature_structure(typesystem, fs_id, json_fs, feature_structures)
+            feature_structures[fs.xmiID] = fs
+
+        for post_processor in self._post_processors:
+            post_processor()
+
+        cas._xmi_id_generator = IdGenerator(self._max_xmi_id + 1)
+        cas._sofa_num_generator = IdGenerator(self._max_sofa_num + 1)
+
+        # At this point all views for which we have a sofa with a known ID and sofaNum have already been created
+        # as part of parsing the feature structures. Thus, if there are any views remaining that are only declared
+        # in the views section, we just create them with auto-assigned IDs
+        for view_name, json_view in (data.get(VIEWS_FIELD) or {}).items():
+            self._parse_view(cas, view_name, json_view, feature_structures)
+
+        return cas
+
+ def _parse_type(self, typesystem: TypeSystem, type_name: str, json_type: Dict[str, any]):
+ super_type_name = json_type[SUPER_TYPE_FIELD]
+ description = json_type.get(DESCRIPTION_FIELD)
+ new_type = typesystem.create_type(type_name, super_type_name, description=description)
+
+ for key, json_feature in json_type.items():
+ if key.startswith(RESERVED_FIELD_PREFIX):
+ continue
+ typesystem.create_feature(
+ new_type,
+ name=key,
+ rangeType=json_feature[RANGE_FIELD],
+ description=json_feature.get(DESCRIPTION_FIELD),
+ elementType=json_feature.get(ELEMENT_TYPE_FIELD),
+ multipleReferencesAllowed=json_feature.get(MULTIPLE_REFERENCES_ALLOWED_FIELD),
+ )
+
+ def _get_or_create_view(
+ self, cas: Cas, view_name: str, fs_id: Optional[int] = None, sofa_num: Optional[int] = None
+ ) -> Cas:
+ if view_name == NAME_DEFAULT_SOFA:
+ view = cas.get_view(NAME_DEFAULT_SOFA)
+
+ # We need to make sure that the sofa gets the real xmi, see #155
+ if fs_id is not None:
+ view.get_sofa().xmiID = fs_id
+
+ return view
+ else:
+ return cas.create_view(view_name, xmiID=fs_id, sofaNum=sofa_num)
+
+ def _parse_view(self, cas: Cas, view_name: str, json_view: Dict[str, any], feature_structures: Dict[str, any]):
+ view = self._get_or_create_view(cas, view_name)
+ for member_id in json_view[VIEW_MEMBERS_FIELD]:
+ fs = feature_structures[member_id]
+ view.add_annotation(fs, keep_id=True)
+
+    def _parse_sofa(self, cas: Cas, fs_id: int, json_fs: Dict[str, any], feature_structures: Dict[int, any]) -> Sofa:
+        """Apply a serialized Sofa to its view and record its IDs so that newly generated IDs cannot collide."""
+        sofa_num = json_fs.get(FEATURE_BASE_NAME_SOFANUM)
+        view = self._get_or_create_view(cas, json_fs.get(FEATURE_BASE_NAME_SOFAID), fs_id, sofa_num)
+        self._max_xmi_id = max(fs_id or 0, self._max_xmi_id)
+        self._max_sofa_num = max(sofa_num or 0, self._max_sofa_num)
+        view.sofa_string = json_fs.get(FEATURE_BASE_NAME_SOFASTRING)
+        view.sofa_mime = json_fs.get(FEATURE_BASE_NAME_SOFAMIME)
+        view.sofa_uri = json_fs.get(FEATURE_BASE_NAME_SOFAURI)
+        view.sofa_array = feature_structures.get(json_fs.get(REF_FEATURE_PREFIX + FEATURE_BASE_NAME_SOFAARRAY))
+        return view.get_sofa()
+
+ def _parse_feature_structure(
+ self, typesystem: TypeSystem, fs_id: int, json_fs: Dict[str, any], feature_structures: Dict[int, any]
+ ):
+ AnnotationType = typesystem.get_type(json_fs.get(TYPE_FIELD))
+
+ attributes = dict(json_fs)
+
+ # Map the JSON FS ID to xmiID
+ attributes["xmiID"] = fs_id
+
+ # Remap features that use a reserved Python name
+ if "self" in attributes:
+ attributes["self_"] = attributes.pop("self")
+
+ if "type" in attributes:
+ attributes["type_"] = attributes.pop("type")
+
+ if typesystem.is_primitive_array(AnnotationType.name):
+ attributes["elements"] = self._parse_primitive_array(AnnotationType.name, json_fs.get(ELEMENTS_FIELD))
+ elif AnnotationType.name == TYPE_NAME_FS_ARRAY:
+ # Resolve id-ref at the end of processing
+ def fix_up(elements):
+ return lambda: setattr(fs, "elements", [feature_structures.get(e) for e in elements])
+
+ self._post_processors.append(fix_up(json_fs.get(ELEMENTS_FIELD)))
+
+ self._strip_reserved_json_keys(attributes)
+
+ ref_features = {}
+ for key, value in list(attributes.items()):
+ if key.startswith(REF_FEATURE_PREFIX):
+ ref_features[key[1:]] = value
+ attributes.pop(key)
+ if key.startswith(NUMBER_FEATURE_PREFIX):
+ attributes[key[1:]] = self._parse_float_value(value)
+ attributes.pop(key)
+
+ self._max_xmi_id = max(attributes["xmiID"], self._max_xmi_id)
+ fs = AnnotationType(**attributes)
+
+ self._resolve_references(fs, ref_features, feature_structures)
+
+ # Map from offsets in UIMA UTF-16 based offsets to Unicode codepoints
+ if typesystem.is_instance_of(fs.type, TYPE_NAME_ANNOTATION):
+ sofa = fs.sofa
+ fs.begin = sofa._offset_converter.uima_to_cassis(fs.begin)
+ fs.end = sofa._offset_converter.uima_to_cassis(fs.end)
+
+ return fs
+
+    def _parse_float_value(self, value: Union[str, int, float]) -> float:
+        """Convert a JSON number or a NaN/Infinity marker string to a float; raise `ValueError` otherwise."""
+        if isinstance(value, (int, float)) and not isinstance(value, bool):
+            return float(value)  # JSON whole numbers such as `1` are parsed as `int`
+        elif value == NAN_VALUE:
+            return float("nan")
+        elif value in (POSITIVE_INFINITE_VALUE, POSITIVE_INFINITE_VALUE_ABBR):
+            return float("inf")
+        elif value in (NEGATIVE_INFINITE_VALUE, NEGATIVE_INFINITE_VALUE_ABBR):
+            return float("-inf")
+        raise ValueError(
+            f"Illegal floating point value [{value}]. Must be a float literal or one of {NAN_VALUE}, "
+            f"{POSITIVE_INFINITE_VALUE}, {POSITIVE_INFINITE_VALUE_ABBR}, {NEGATIVE_INFINITE_VALUE}, or "
+            f"{NEGATIVE_INFINITE_VALUE_ABBR}"
+        )
+
+    def _parse_primitive_array(self, type_name: str, elements: [list, str]) -> List:
+        """Decode the JSON form of a primitive array: base64 for byte arrays, marker strings for float arrays."""
+        if elements and type_name == TYPE_NAME_BYTE_ARRAY:
+            return base64.b64decode(elements)
+        if elements and type_name in (TYPE_NAME_FLOAT_ARRAY, TYPE_NAME_DOUBLE_ARRAY):
+            return [self._parse_float_value(item) for item in elements]
+        return elements
+
+ def _resolve_references(self, fs, ref_features: Dict[str, any], feature_structures: Dict[int, any]):
+ for key, value in ref_features.items():
+ target_fs = feature_structures.get(value)
+ if target_fs:
+ # Resolve id-ref now
+ setattr(fs, key, target_fs)
+ else:
+ # Resolve id-ref at the end of processing
+ def fix_up(k, v):
+ return lambda: setattr(fs, k, feature_structures.get(v))
+
+ self._post_processors.append(fix_up(key, value))
+
+    def _strip_reserved_json_keys(self, attributes: Dict[str, any]):
+        """Remove, in place, every entry of `attributes` whose key starts with the reserved-field prefix."""
+        # Collect the keys up front so that the dict is not
+        # modified while it is being iterated
+        reserved_keys = [key for key in attributes if key.startswith(RESERVED_FIELD_PREFIX)]
+        for reserved_key in reserved_keys:
+            attributes.pop(reserved_key)
+
+
+class CasJsonSerializer:
+ _COMMON_FIELD_NAMES = {"xmiID", "type"}
+
+ def __init__(self):
+ pass
+
+ def serialize(
+ self,
+ sink: Union[IO, str, None],
+ cas: Cas,
+ pretty_print: bool = True,
+ ensure_ascii: bool = False,
+ type_system_mode: TypeSystemMode = TypeSystemMode.FULL,
+ ) -> Union[str, None]:
+ feature_structures = []
+
+ views = {}
+ for view in cas.views:
+ views[view.sofa.sofaID] = self._serialize_view(view)
+
+ if view.sofa.sofaArray:
+ json_sofa_array_fs = self._serialize_feature_structure(view.sofa.sofaArray)
+ feature_structures.append(json_sofa_array_fs)
+ json_sofa_fs = self._serialize_feature_structure(view.sofa)
+ feature_structures.append(json_sofa_fs)
+
+ # Find all fs, even the ones that are not directly added to a sofa
+ used_types = set()
+ for fs in sorted(cas._find_all_fs(include_inlinable_arrays_and_lists=True), key=lambda a: a.xmiID):
+ used_types.add(fs.type)
+ json_fs = self._serialize_feature_structure(fs)
+ feature_structures.append(json_fs)
+
+ types = None
+ if type_system_mode is not TypeSystemMode.NONE:
+ types = {}
+
+ if type_system_mode is TypeSystemMode.MINIMAL:
+ # Build transitive closure of used types by following parents, features, etc.
+ types_to_include = cas.typesystem.transitive_closure(used_types)
+ elif type_system_mode is TypeSystemMode.FULL:
+ types_to_include = cas.typesystem.get_types()
+
+ for type_ in types_to_include:
+ if type_.name == TYPE_NAME_DOCUMENT_ANNOTATION:
+ continue
+ json_type = self._serialize_type(type_)
+ types[json_type[NAME_FIELD]] = json_type
+
+ data = {}
+ if types is not None:
+ data[TYPES_FIELD] = types
+ if feature_structures is not None:
+ data[FEATURE_STRUCTURES_FIELD] = feature_structures
+ if views is not None:
+ data[VIEWS_FIELD] = views
+
+ if sink and not isinstance(sink, TextIOBase):
+ sink = TextIOWrapper(sink, encoding="utf-8", write_through=True)
+
+ if sink:
+ json.dump(
+ data,
+ sink,
+ sort_keys=False,
+ indent=2 if pretty_print else None,
+ ensure_ascii=ensure_ascii,
+ allow_nan=False,
+ )
+ else:
+ return json.dumps(
+ data, sort_keys=False, indent=2 if pretty_print else None, ensure_ascii=ensure_ascii, allow_nan=False
+ )
+
+ if isinstance(sink, TextIOWrapper):
+ sink.detach() # Prevent TextIOWrapper from closing the BytesIO
+
+ return None
+
+ def _serialize_type(self, type_: Type):
+ type_name = self._to_external_type_name(type_.name)
+ supertype_name = self._to_external_type_name(type_.supertype.name)
+
+ json_type = {
+ NAME_FIELD: type_name,
+ SUPER_TYPE_FIELD: supertype_name,
+ }
+
+ if type_.description:
+ json_type[DESCRIPTION_FIELD] = type_.description
+
+ for feature in list(type_.features):
+ json_feature = self._serialize_feature(json_type, feature)
+ json_type[json_feature[NAME_FIELD]] = json_feature
+
+ return json_type
+
+ def _serialize_feature(self, json_type, feature: Feature):
+ # If the feature name is a reserved name like `self`, then we added an
+ # underscore to it before so Python can handle it. We now need to remove it.
+ feature_name = feature.name
+ if feature._has_reserved_name:
+ feature_name = feature_name[:-1]
+
+ json_feature = {
+ NAME_FIELD: feature_name,
+ RANGE_FIELD: self._to_external_type_name(feature.rangeType.name),
+ }
+
+ if feature.description:
+ json_feature[DESCRIPTION_FIELD] = feature.description
+
+ if feature.multipleReferencesAllowed is not None:
+ json_feature[MULTIPLE_REFERENCES_ALLOWED_FIELD] = feature.multipleReferencesAllowed
+
+ if feature.elementType is not None:
+ json_feature[ELEMENT_TYPE_FIELD] = self._to_external_type_name(feature.elementType.name)
+
+ return json_feature
+
+    def _serialize_feature_structure(self, fs) -> dict:
+        """Convert a feature structure into its JSON dict: ID, type, then array elements or feature values."""
+        type_name = fs.type.name
+        json_fs = OrderedDict()
+        json_fs[ID_FIELD] = fs.xmiID
+        json_fs[TYPE_FIELD] = type_name
+
+        if type_name == TYPE_NAME_BYTE_ARRAY:
+            if fs.elements:
+                json_fs[ELEMENTS_FIELD] = base64.b64encode(bytes(fs.elements)).decode("ascii")
+            return json_fs
+        elif type_name in {TYPE_NAME_DOUBLE_ARRAY, TYPE_NAME_FLOAT_ARRAY}:
+            if fs.elements:
+                json_fs[ELEMENTS_FIELD] = [self._serialize_float_value(e) for e in fs.elements]
+            return json_fs
+        elif is_primitive_array(fs.type):
+            if fs.elements:
+                json_fs[ELEMENTS_FIELD] = fs.elements
+            return json_fs
+        elif TYPE_NAME_FS_ARRAY == type_name:
+            if fs.elements:
+                json_fs[ELEMENTS_FIELD] = [self._serialize_ref(e) for e in fs.elements]
+            return json_fs
+
+        for feature in fs.type.all_features:
+            if feature.name in CasJsonSerializer._COMMON_FIELD_NAMES:
+                continue
+
+            feature_name = feature.name
+            # Strip the underscore we added for reserved names
+            if feature._has_reserved_name:
+                feature_name = feature.name[:-1]
+
+            # Skip over 'None' features
+            value = getattr(fs, feature.name)
+            if value is None:
+                continue
+
+            # Map back from offsets in Unicode codepoints to UIMA UTF-16 based offsets. Only the offsets declared
+            # by the annotation type are converted, not arbitrary features that happen to be called `end`.
+            if feature.domainType.name == TYPE_NAME_ANNOTATION and feature_name in {"begin", "end"}:
+                sofa: Sofa = getattr(fs, "sofa")
+                value = sofa._offset_converter.cassis_to_uima(value)
+
+            if feature.rangeType.name in {TYPE_NAME_DOUBLE, TYPE_NAME_FLOAT}:
+                float_value = self._serialize_float_value(value)
+                if isinstance(float_value, str):
+                    feature_name = NUMBER_FEATURE_PREFIX + feature_name
+                json_fs[feature_name] = float_value
+            elif is_primitive(feature.rangeType):
+                json_fs[feature_name] = value
+            else:
+                # We need to encode non-primitive features as a reference
+                json_fs[REF_FEATURE_PREFIX + feature_name] = self._serialize_ref(value)
+        return json_fs
+
+    def _serialize_float_value(self, value) -> Union[float, str]:
+        """Return `value` unchanged if it is finite, else the marker string for NaN, +Infinity or -Infinity."""
+        if math.isfinite(value):
+            return value
+        if isnan(value):
+            return NAN_VALUE
+        if value > 0:
+            return POSITIVE_INFINITE_VALUE
+        return NEGATIVE_INFINITE_VALUE
+
+    def _serialize_ref(self, fs) -> int:
+        """Return the xmiID used to reference `fs`, or `None` for a falsy (unset) reference."""
+        if fs:
+            return fs.xmiID
+        return None
+
+    def _serialize_view(self, view: View):
+        """Describe a view by the ID of its sofa and the sorted IDs of its member annotations."""
+        sofa_id = view.sofa.xmiID
+        member_ids = sorted(member.xmiID for member in view.get_all_annotations())
+        return {VIEW_SOFA_FIELD: sofa_id, VIEW_MEMBERS_FIELD: member_ids}
+
+    def _to_external_type_name(self, type_name: str):
+        """Strip the leading `uima.noNamespace.` prefix (only the prefix, not later occurrences) if present."""
+        prefix = "uima.noNamespace."
+        return type_name[len(prefix) :] if type_name.startswith(prefix) else type_name
diff --git a/cassis/typesystem.py b/cassis/typesystem.py
index 2957a0d..12d8122 100644
--- a/cassis/typesystem.py
+++ b/cassis/typesystem.py
@@ -1,6 +1,7 @@
import re
import warnings
from collections import defaultdict
+from enum import Enum, auto
from io import BytesIO
from itertools import chain, filterfalse
from pathlib import Path
@@ -188,6 +189,14 @@
_LIST_TYPES = _PRIMITIVE_LIST_TYPES | {TYPE_NAME_FS_LIST}
+class TypeSystemMode(Enum):
+ """How much type system information to include."""
+
+ FULL = auto()
+ MINIMAL = auto()
+ NONE = auto()
+
+
def _string_to_valid_classname(name: str):
return re.sub("[^a-zA-Z0-9_]", "_", name)
diff --git a/cassis/xmi.py b/cassis/xmi.py
index e2a41fd..fc6f2d7 100644
--- a/cassis/xmi.py
+++ b/cassis/xmi.py
@@ -475,7 +475,7 @@ def __init__(self):
self._urls_to_prefixes = {}
self._duplicate_namespaces = defaultdict(int)
- def serialize(self, sink: Union[IO, str], cas: Cas, pretty_print=True):
+ def serialize(self, sink: Union[IO, str, None], cas: Cas, pretty_print=True) -> Union[str, None]:
xmi_attrs = {"{http://www.omg.org/XMI}version": "2.0"}
root = etree.Element(etree.QName(self._nsmap["xmi"], "XMI"), nsmap=self._nsmap, **xmi_attrs)
@@ -495,8 +495,17 @@ def serialize(self, sink: Union[IO, str], cas: Cas, pretty_print=True):
doc = etree.ElementTree(root)
etree.cleanup_namespaces(doc, top_nsmap=self._nsmap)
+ return_str = sink is None
+ if return_str:
+ sink = BytesIO()
+
doc.write(sink, xml_declaration=True, pretty_print=pretty_print, encoding="UTF-8")
+ if return_str:
+ return sink.getvalue().decode("utf-8")
+
+ return None
+
def _serialize_cas_null(self, root: etree.Element):
name = etree.QName(self._nsmap["cas"], "NULL")
elem = etree.SubElement(root, name)
diff --git a/tests/performance.py b/tests/performance.py
new file mode 100644
index 0000000..69575de
--- /dev/null
+++ b/tests/performance.py
@@ -0,0 +1,69 @@
+from random import Random
+from timeit import default_timer as timer
+
+import pytest
+
+from cassis import load_cas_from_json, load_cas_from_xmi
+from tests.test_files.test_cas_generators import MultiFeatureRandomCasGenerator
+
+generator = MultiFeatureRandomCasGenerator()
+generator.rnd = Random(123456)
+generator.size = 1000
+iterations = 100
+
+typesystem = generator.generate_type_system()
+randomized_cas = generator.generate_cas(typesystem)
+
+randomized_cas_xmi = randomized_cas.to_xmi()
+randomized_cas_xmi_bytes = randomized_cas_xmi.encode("utf-8")
+
+randomized_cas_json = randomized_cas.to_json()
+randomized_cas_json_bytes = randomized_cas_json.encode("utf-8")
+
+
+@pytest.mark.performance
+def test_xmi_serialization_performance():
+ start = timer()
+ for i in range(0, iterations):
+ randomized_cas.to_xmi()
+ end = timer()
+
+ print(
+ f"XMI: Serializing {iterations} CASes with {generator.size} each took {end - start} seconds ({len(randomized_cas_xmi_bytes)} bytes each)"
+ )
+
+
+@pytest.mark.performance
+def test_json_serialization_performance():
+ start = timer()
+ for i in range(0, iterations):
+ randomized_cas.to_json()
+ end = timer()
+
+ print(
+ f"JSON: Serializing {iterations} CASes with {generator.size} each took {end - start} seconds ({len(randomized_cas_json_bytes)} bytes each)"
+ )
+
+
+@pytest.mark.performance
+def test_xmi_deserialization_performance():
+ start = timer()
+ for i in range(0, iterations):
+ load_cas_from_xmi(randomized_cas_xmi, typesystem)
+ end = timer()
+
+ print(
+ f"XMI: Deserializing {iterations} CASes with {generator.size} each took {end - start} seconds ({len(randomized_cas_xmi_bytes)} bytes each)"
+ )
+
+
+@pytest.mark.performance
+def test_json_deserialization_performance():
+ start = timer()
+ for i in range(0, iterations):
+ load_cas_from_json(randomized_cas_json, typesystem)
+ end = timer()
+
+ print(
+ f"JSON: Deserializing {iterations} CASes with {generator.size} each took {end - start} seconds ({len(randomized_cas_json_bytes)} bytes each)"
+ )
diff --git a/tests/test_files/json/README.md b/tests/test_files/json/README.md
new file mode 100644
index 0000000..483853a
--- /dev/null
+++ b/tests/test_files/json/README.md
@@ -0,0 +1,5 @@
+Test files in this folder were sourced from
+
+https://github.com/apache/uima-uimaj/tree/feature/UIMA-6266-Clean-JSON-Wire-Format-for-CAS/uimaj-json/src/test/resources/CasSerializationDeserialization_JsonCas2_FsAsArray_Test/ser-ref
+
+Apache License 2.0
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/data.json
new file mode 100644
index 0000000..422cea5
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/data.json
@@ -0,0 +1,78 @@
+{
+ "%TYPES" : { },
+ "%FEATURE_STRUCTURES" : [ {
+ "%ID" : 1,
+ "%TYPE" : "uima.cas.Sofa",
+ "sofaNum" : 1,
+ "sofaID" : "_InitialView",
+ "mimeType" : "text",
+ "sofaString" : "\uD83E\uDD73 This \uD83D\uDC73\uD83C\uDFFB♀️ is ✆ a \uD83E\uDDD4\uD83C\uDFFE♂️ test \uD83D\uDC7B"
+ }, {
+ "%ID" : 2,
+ "%TYPE" : "uima.tcas.Annotation",
+ "@sofa" : 1,
+ "begin" : 0,
+ "end" : 2
+ }, {
+ "%ID" : 3,
+ "%TYPE" : "uima.tcas.Annotation",
+ "@sofa" : 1,
+ "begin" : 3,
+ "end" : 7
+ }, {
+ "%ID" : 4,
+ "%TYPE" : "uima.tcas.Annotation",
+ "@sofa" : 1,
+ "begin" : 8,
+ "end" : 15
+ }, {
+ "%ID" : 5,
+ "%TYPE" : "uima.tcas.Annotation",
+ "@sofa" : 1,
+ "begin" : 16,
+ "end" : 18
+ }, {
+ "%ID" : 6,
+ "%TYPE" : "uima.tcas.Annotation",
+ "@sofa" : 1,
+ "begin" : 19,
+ "end" : 20
+ }, {
+ "%ID" : 7,
+ "%TYPE" : "uima.tcas.Annotation",
+ "@sofa" : 1,
+ "begin" : 21,
+ "end" : 22
+ }, {
+ "%ID" : 8,
+ "%TYPE" : "uima.tcas.Annotation",
+ "@sofa" : 1,
+ "begin" : 23,
+ "end" : 30
+ }, {
+ "%ID" : 9,
+ "%TYPE" : "uima.tcas.Annotation",
+ "@sofa" : 1,
+ "begin" : 31,
+ "end" : 35
+ }, {
+ "%ID" : 10,
+ "%TYPE" : "uima.tcas.Annotation",
+ "@sofa" : 1,
+ "begin" : 36,
+ "end" : 38
+ }, {
+ "%ID" : 11,
+ "%TYPE" : "uima.tcas.DocumentAnnotation",
+ "@sofa" : 1,
+ "begin" : 0,
+ "end" : 38,
+ "language" : "x-unspecified"
+ } ],
+ "%VIEWS" : {
+ "_InitialView" : {
+ "%SOFA" : 1,
+ "%MEMBERS" : [ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 ]
+ }
+ }
+}
\ No newline at end of file
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/debug-typesystem.xml
new file mode 100644
index 0000000..07e327a
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/debug-typesystem.xml
@@ -0,0 +1,17 @@
+
+
+
+
+ uima.tcas.DocumentAnnotation
+
+ uima.tcas.Annotation
+
+
+ language
+
+ uima.cas.String
+
+
+
+
+
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/debug.xmi
new file mode 100644
index 0000000..6d8ec43
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithEmojiUnicodeTextAndAnnotations/debug.xmi
@@ -0,0 +1,15 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/data.json
new file mode 100644
index 0000000..0d97fb3
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/data.json
@@ -0,0 +1,102 @@
+{
+ "%FEATURE_STRUCTURES": [
+ {
+ "%ID": 1,
+ "%TYPE": "uima.cas.Sofa",
+ "sofaID": "_InitialView",
+ "sofaNum": 1
+ },
+ {
+ "#doubleNan": "NaN",
+ "#doubleNegInfinity": "-Infinity",
+ "#doublePosInfinity": "Infinity",
+ "#floatNan": "NaN",
+ "#floatNegInfinity": "-Infinity",
+ "#floatPosInfinity": "Infinity",
+ "%ID": 1,
+ "%TYPE": "SpecialValuesType",
+ "doubleOne": 1.0,
+ "doubleZero": 0.0,
+ "floatOne": 1.0,
+ "floatZero": 0.0
+ },
+ {
+ "%ELEMENTS": [
+ 0.0,
+ 1.0,
+ "-Infinity",
+ "Infinity",
+ "NaN"
+ ],
+ "%ID": 2,
+ "%TYPE": "uima.cas.DoubleArray"
+ },
+ {
+ "%ELEMENTS": [
+ 0.0,
+ 1.0,
+ "-Infinity",
+ "Infinity",
+ "NaN"
+ ],
+ "%ID": 3,
+ "%TYPE": "uima.cas.FloatArray"
+ }
+ ],
+ "%TYPES": {
+ "SpecialValuesType": {
+ "%NAME": "SpecialValuesType",
+ "%SUPER_TYPE": "uima.cas.TOP",
+ "doubleNan": {
+ "%NAME": "doubleNan",
+ "%RANGE": "uima.cas.Double"
+ },
+ "doubleNegInfinity": {
+ "%NAME": "doubleNegInfinity",
+ "%RANGE": "uima.cas.Double"
+ },
+ "doubleOne": {
+ "%NAME": "doubleOne",
+ "%RANGE": "uima.cas.Double"
+ },
+ "doublePosInfinity": {
+ "%NAME": "doublePosInfinity",
+ "%RANGE": "uima.cas.Double"
+ },
+ "doubleZero": {
+ "%NAME": "doubleZero",
+ "%RANGE": "uima.cas.Double"
+ },
+ "floatNan": {
+ "%NAME": "floatNan",
+ "%RANGE": "uima.cas.Float"
+ },
+ "floatNegInfinity": {
+ "%NAME": "floatNegInfinity",
+ "%RANGE": "uima.cas.Float"
+ },
+ "floatOne": {
+ "%NAME": "floatOne",
+ "%RANGE": "uima.cas.Float"
+ },
+ "floatPosInfinity": {
+ "%NAME": "floatPosInfinity",
+ "%RANGE": "uima.cas.Float"
+ },
+ "floatZero": {
+ "%NAME": "floatZero",
+ "%RANGE": "uima.cas.Float"
+ }
+ }
+ },
+ "%VIEWS": {
+ "_InitialView": {
+ "%MEMBERS": [
+ 1,
+ 2,
+ 3
+ ],
+ "%SOFA": 1
+ }
+ }
+}
\ No newline at end of file
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/debug-typesystem.xml
new file mode 100644
index 0000000..9a8766d
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/debug-typesystem.xml
@@ -0,0 +1,74 @@
+
+
+
+
+ uima.tcas.DocumentAnnotation
+
+ uima.tcas.Annotation
+
+
+ language
+
+ uima.cas.String
+
+
+
+
+ SpecialValuesType
+
+ uima.cas.TOP
+
+
+ doubleZero
+
+ uima.cas.Double
+
+
+ doubleOne
+
+ uima.cas.Double
+
+
+ doublePosInfinity
+
+ uima.cas.Double
+
+
+ doubleNegInfinity
+
+ uima.cas.Double
+
+
+ doubleNan
+
+ uima.cas.Double
+
+
+ floatZero
+
+ uima.cas.Float
+
+
+ floatOne
+
+ uima.cas.Float
+
+
+ floatPosInfinity
+
+ uima.cas.Float
+
+
+ floatNegInfinity
+
+ uima.cas.Float
+
+
+ floatNan
+
+ uima.cas.Float
+
+
+
+
+
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/debug.xmi
new file mode 100644
index 0000000..e02d4cb
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/debug.xmi
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/typesystem.xml
new file mode 100644
index 0000000..9a8766d
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithFloatingPointSpecialValues/typesystem.xml
@@ -0,0 +1,74 @@
+
+
+
+
+ uima.tcas.DocumentAnnotation
+
+ uima.tcas.Annotation
+
+
+ language
+
+ uima.cas.String
+
+
+
+
+ SpecialValuesType
+
+ uima.cas.TOP
+
+
+ doubleZero
+
+ uima.cas.Double
+
+
+ doubleOne
+
+ uima.cas.Double
+
+
+ doublePosInfinity
+
+ uima.cas.Double
+
+
+ doubleNegInfinity
+
+ uima.cas.Double
+
+
+ doubleNan
+
+ uima.cas.Double
+
+
+ floatZero
+
+ uima.cas.Float
+
+
+ floatOne
+
+ uima.cas.Float
+
+
+ floatPosInfinity
+
+ uima.cas.Float
+
+
+ floatNegInfinity
+
+ uima.cas.Float
+
+
+ floatNan
+
+ uima.cas.Float
+
+
+
+
+
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/data.json
new file mode 100644
index 0000000..1944181
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/data.json
@@ -0,0 +1,36 @@
+{
+ "%TYPES" : { },
+ "%FEATURE_STRUCTURES" : [ {
+ "%ID" : 1,
+ "%TYPE" : "uima.cas.Sofa",
+ "sofaNum" : 1,
+ "sofaID" : "_InitialView",
+ "mimeType" : "text",
+ "sofaString" : "هذا اختبار"
+ }, {
+ "%ID" : 2,
+ "%TYPE" : "uima.tcas.Annotation",
+ "@sofa" : 1,
+ "begin" : 0,
+ "end" : 3
+ }, {
+ "%ID" : 3,
+ "%TYPE" : "uima.tcas.Annotation",
+ "@sofa" : 1,
+ "begin" : 4,
+ "end" : 10
+ }, {
+ "%ID" : 4,
+ "%TYPE" : "uima.tcas.DocumentAnnotation",
+ "@sofa" : 1,
+ "begin" : 0,
+ "end" : 10,
+ "language" : "x-unspecified"
+ } ],
+ "%VIEWS" : {
+ "_InitialView" : {
+ "%SOFA" : 1,
+ "%MEMBERS" : [ 2, 3, 4 ]
+ }
+ }
+}
\ No newline at end of file
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug-typesystem.xml
new file mode 100644
index 0000000..07e327a
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug-typesystem.xml
@@ -0,0 +1,17 @@
+
+
+
+
+ uima.tcas.DocumentAnnotation
+
+ uima.tcas.Annotation
+
+
+ language
+
+ uima.cas.String
+
+
+
+
+
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug.xmi
new file mode 100644
index 0000000..108d362
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithLeftToRightTextAndAnnotations/debug.xmi
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json
new file mode 100644
index 0000000..20d935b
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/data.json
@@ -0,0 +1,21 @@
+{
+ "%TYPES" : { },
+ "%FEATURE_STRUCTURES" : [ {
+ "%ID" : 1,
+ "%TYPE" : "uima.cas.ByteArray",
+ "%ELEMENTS" : "VGhpcyBpcyBhIHRlc3Q="
+ }, {
+ "%ID" : 2,
+ "%TYPE" : "uima.cas.Sofa",
+ "sofaNum" : 1,
+ "sofaID" : "_InitialView",
+ "mimeType" : "text/plain",
+ "@sofaArray" : 1
+ } ],
+ "%VIEWS" : {
+ "_InitialView" : {
+ "%SOFA" : 2,
+ "%MEMBERS" : [ ]
+ }
+ }
+}
\ No newline at end of file
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/debug-typesystem.xml
new file mode 100644
index 0000000..07e327a
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/debug-typesystem.xml
@@ -0,0 +1,17 @@
+
+
+
+
+ uima.tcas.DocumentAnnotation
+
+ uima.tcas.Annotation
+
+
+ language
+
+ uima.cas.String
+
+
+
+
+
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/debug.xmi
new file mode 100644
index 0000000..89075f6
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataArray/debug.xmi
@@ -0,0 +1,5 @@
+
+
+
+
+
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json
new file mode 100644
index 0000000..0b142a8
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/data.json
@@ -0,0 +1,17 @@
+{
+ "%TYPES" : { },
+ "%FEATURE_STRUCTURES" : [ {
+ "%ID" : 1,
+ "%TYPE" : "uima.cas.Sofa",
+ "sofaNum" : 1,
+ "sofaID" : "_InitialView",
+ "mimeType" : "text/plain",
+ "sofaURI" : "classpath:/ProgrammaticallyCreatedCasDataSuite/document.txt"
+ } ],
+ "%VIEWS" : {
+ "_InitialView" : {
+ "%SOFA" : 1,
+ "%MEMBERS" : [ ]
+ }
+ }
+}
\ No newline at end of file
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/debug-typesystem.xml
new file mode 100644
index 0000000..07e327a
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/debug-typesystem.xml
@@ -0,0 +1,17 @@
+
+
+
+
+ uima.tcas.DocumentAnnotation
+
+ uima.tcas.Annotation
+
+
+ language
+
+ uima.cas.String
+
+
+
+
+
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/debug.xmi
new file mode 100644
index 0000000..89966e0
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithSofaDataURI/debug.xmi
@@ -0,0 +1,4 @@
+
+
+
+
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json
new file mode 100644
index 0000000..39f5ffe
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithText/data.json
@@ -0,0 +1,24 @@
+{
+ "%TYPES" : { },
+ "%FEATURE_STRUCTURES" : [ {
+ "%ID" : 1,
+ "%TYPE" : "uima.cas.Sofa",
+ "sofaNum" : 1,
+ "sofaID" : "_InitialView",
+ "mimeType" : "text",
+ "sofaString" : "This is a test."
+ }, {
+ "%ID" : 2,
+ "%TYPE" : "uima.tcas.DocumentAnnotation",
+ "@sofa" : 1,
+ "begin" : 0,
+ "end" : 15,
+ "language" : "x-unspecified"
+ } ],
+ "%VIEWS" : {
+ "_InitialView" : {
+ "%SOFA" : 1,
+ "%MEMBERS" : [ 2 ]
+ }
+ }
+}
\ No newline at end of file
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithText/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithText/debug-typesystem.xml
new file mode 100644
index 0000000..07e327a
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithText/debug-typesystem.xml
@@ -0,0 +1,17 @@
+
+
+
+
+ uima.tcas.DocumentAnnotation
+
+ uima.tcas.Annotation
+
+
+ language
+
+ uima.cas.String
+
+
+
+
+
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithText/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithText/debug.xmi
new file mode 100644
index 0000000..943df5f
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithText/debug.xmi
@@ -0,0 +1,6 @@
+
+
+
+
+
+
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/data.json
new file mode 100644
index 0000000..a9522cf
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/data.json
@@ -0,0 +1,48 @@
+{
+ "%TYPES" : { },
+ "%FEATURE_STRUCTURES" : [ {
+ "%ID" : 1,
+ "%TYPE" : "uima.cas.Sofa",
+ "sofaNum" : 1,
+ "sofaID" : "_InitialView",
+ "mimeType" : "text",
+ "sofaString" : "This is a test"
+ }, {
+ "%ID" : 2,
+ "%TYPE" : "uima.tcas.Annotation",
+ "@sofa" : 1,
+ "begin" : 0,
+ "end" : 4
+ }, {
+ "%ID" : 3,
+ "%TYPE" : "uima.tcas.Annotation",
+ "@sofa" : 1,
+ "begin" : 5,
+ "end" : 7
+ }, {
+ "%ID" : 4,
+ "%TYPE" : "uima.tcas.Annotation",
+ "@sofa" : 1,
+ "begin" : 8,
+ "end" : 9
+ }, {
+ "%ID" : 5,
+ "%TYPE" : "uima.tcas.Annotation",
+ "@sofa" : 1,
+ "begin" : 10,
+ "end" : 14
+ }, {
+ "%ID" : 6,
+ "%TYPE" : "uima.tcas.DocumentAnnotation",
+ "@sofa" : 1,
+ "begin" : 0,
+ "end" : 14,
+ "language" : "x-unspecified"
+ } ],
+ "%VIEWS" : {
+ "_InitialView" : {
+ "%SOFA" : 1,
+ "%MEMBERS" : [ 2, 3, 4, 5, 6 ]
+ }
+ }
+}
\ No newline at end of file
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug-typesystem.xml
new file mode 100644
index 0000000..07e327a
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug-typesystem.xml
@@ -0,0 +1,17 @@
+
+
+
+
+ uima.tcas.DocumentAnnotation
+
+ uima.tcas.Annotation
+
+
+ language
+
+ uima.cas.String
+
+
+
+
+
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug.xmi
new file mode 100644
index 0000000..37c1e9b
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTextAndAnnotations/debug.xmi
@@ -0,0 +1,9 @@
+
+
+
+
+
+
+
+
+
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/data.json
new file mode 100644
index 0000000..d586738
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/data.json
@@ -0,0 +1,48 @@
+{
+ "%TYPES" : { },
+ "%FEATURE_STRUCTURES" : [ {
+ "%ID" : 1,
+ "%TYPE" : "uima.cas.Sofa",
+ "sofaNum" : 1,
+ "sofaID" : "_InitialView",
+ "mimeType" : "text",
+ "sofaString" : "這是一個測試"
+ }, {
+ "%ID" : 2,
+ "%TYPE" : "uima.tcas.Annotation",
+ "@sofa" : 1,
+ "begin" : 0,
+ "end" : 1
+ }, {
+ "%ID" : 3,
+ "%TYPE" : "uima.tcas.Annotation",
+ "@sofa" : 1,
+ "begin" : 1,
+ "end" : 2
+ }, {
+ "%ID" : 4,
+ "%TYPE" : "uima.tcas.Annotation",
+ "@sofa" : 1,
+ "begin" : 2,
+ "end" : 4
+ }, {
+ "%ID" : 5,
+ "%TYPE" : "uima.tcas.Annotation",
+ "@sofa" : 1,
+ "begin" : 4,
+ "end" : 6
+ }, {
+ "%ID" : 6,
+ "%TYPE" : "uima.tcas.DocumentAnnotation",
+ "@sofa" : 1,
+ "begin" : 0,
+ "end" : 6,
+ "language" : "x-unspecified"
+ } ],
+ "%VIEWS" : {
+ "_InitialView" : {
+ "%SOFA" : 1,
+ "%MEMBERS" : [ 2, 3, 4, 5, 6 ]
+ }
+ }
+}
\ No newline at end of file
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug-typesystem.xml
new file mode 100644
index 0000000..07e327a
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug-typesystem.xml
@@ -0,0 +1,17 @@
+
+
+
+
+ uima.tcas.DocumentAnnotation
+
+ uima.tcas.Annotation
+
+
+ language
+
+ uima.cas.String
+
+
+
+
+
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug.xmi
new file mode 100644
index 0000000..0087d72
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithTraditionalChineseTextAndAnnotations/debug.xmi
@@ -0,0 +1,10 @@
+
+
+
+
+
+
+
+
+
+
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/data.json b/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/data.json
new file mode 100644
index 0000000..56784fe
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/data.json
@@ -0,0 +1,39 @@
+{
+ "%TYPES" : { },
+ "%FEATURE_STRUCTURES" : [ {
+ "%ID" : 1,
+ "%TYPE" : "uima.cas.Sofa",
+ "sofaNum" : 1,
+ "sofaID" : "_InitialView"
+ }, {
+ "%ID" : 2,
+ "%TYPE" : "uima.tcas.Annotation",
+ "@sofa" : 1,
+ "begin" : 0,
+ "end" : 4
+ }, {
+ "%ID" : 3,
+ "%TYPE" : "uima.tcas.Annotation",
+ "@sofa" : 1,
+ "begin" : 5,
+ "end" : 7
+ }, {
+ "%ID" : 4,
+ "%TYPE" : "uima.tcas.Annotation",
+ "@sofa" : 1,
+ "begin" : 8,
+ "end" : 9
+ }, {
+ "%ID" : 5,
+ "%TYPE" : "uima.tcas.Annotation",
+ "@sofa" : 1,
+ "begin" : 10,
+ "end" : 14
+ } ],
+ "%VIEWS" : {
+ "_InitialView" : {
+ "%SOFA" : 1,
+ "%MEMBERS" : [ 2, 3, 4, 5 ]
+ }
+ }
+}
\ No newline at end of file
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug-typesystem.xml
new file mode 100644
index 0000000..07e327a
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug-typesystem.xml
@@ -0,0 +1,17 @@
+
+
+
+
+ uima.tcas.DocumentAnnotation
+
+ uima.tcas.Annotation
+
+
+ language
+
+ uima.cas.String
+
+
+
+
+
diff --git a/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug.xmi
new file mode 100644
index 0000000..37c1e9b
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/casWithoutTextButWithAnnotations/debug.xmi
@@ -0,0 +1,9 @@
+
+
+
+
+
+
+
+
+
diff --git a/tests/test_files/json/fs_as_array/ser-ref/emptyCas/data.json b/tests/test_files/json/fs_as_array/ser-ref/emptyCas/data.json
new file mode 100644
index 0000000..fcd8582
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/emptyCas/data.json
@@ -0,0 +1,9 @@
+{
+ "%TYPES" : { },
+ "%VIEWS" : {
+ "_InitialView" : {
+ "%SOFA" : 1,
+ "%MEMBERS" : [ ]
+ }
+ }
+}
\ No newline at end of file
diff --git a/tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug-typesystem.xml b/tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug-typesystem.xml
new file mode 100644
index 0000000..07e327a
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug-typesystem.xml
@@ -0,0 +1,17 @@
+
+
+
+
+ uima.tcas.DocumentAnnotation
+
+ uima.tcas.Annotation
+
+
+ language
+
+ uima.cas.String
+
+
+
+
+
diff --git a/tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug.xmi b/tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug.xmi
new file mode 100644
index 0000000..6fd88bd
--- /dev/null
+++ b/tests/test_files/json/fs_as_array/ser-ref/emptyCas/debug.xmi
@@ -0,0 +1,3 @@
+
+
+
diff --git a/tests/test_json.py b/tests/test_json.py
new file mode 100644
index 0000000..3889b11
--- /dev/null
+++ b/tests/test_json.py
@@ -0,0 +1,154 @@
+import json
+
+from cassis.typesystem import TYPE_NAME_ANNOTATION
+from tests.fixtures import *
+from tests.test_files.test_cas_generators import MultiFeatureRandomCasGenerator, MultiTypeRandomCasGenerator
+from tests.util import assert_json_equal
+
+# Root directory of the reference JSON CAS fixtures; the tests below read "data.json" from its subdirectories.
+FIXTURE_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "test_files", "json", "fs_as_array", "ser-ref")
+
+# Parametrization data shared by the fixture-driven tests: (fixture directory, expected annotations) pairs.
+# Each expected annotation is [type name, begin, end, covered text]; an optional fifth element holds the
+# expected UTF-8 encoding of the covered text, which test_unicode compares byte-for-byte.
+FIXTURES = [
+ (os.path.join(FIXTURE_DIR, "casWithSofaDataArray"), []),
+ (os.path.join(FIXTURE_DIR, "casWithSofaDataURI"), []),
+ (os.path.join(FIXTURE_DIR, "casWithFloatingPointSpecialValues"), []),
+ (os.path.join(FIXTURE_DIR, "casWithText"), [["uima.tcas.DocumentAnnotation", 0, 15, "This is a test."]]),
+ (
+ os.path.join(FIXTURE_DIR, "casWithoutTextButWithAnnotations"),
+ [
+ ["uima.tcas.Annotation", 0, 4, None],
+ ["uima.tcas.Annotation", 5, 7, None],
+ ["uima.tcas.Annotation", 8, 9, None],
+ ["uima.tcas.Annotation", 10, 14, None],
+ ],
+ ),
+ (
+ os.path.join(FIXTURE_DIR, "casWithTextAndAnnotations"),
+ [
+ ["uima.tcas.Annotation", 0, 4, "This"],
+ ["uima.tcas.Annotation", 5, 7, "is"],
+ ["uima.tcas.Annotation", 8, 9, "a"],
+ ["uima.tcas.Annotation", 10, 14, "test"],
+ ["uima.tcas.DocumentAnnotation", 0, 14, "This is a test"],
+ ],
+ ),
+ (
+ os.path.join(FIXTURE_DIR, "casWithEmojiUnicodeTextAndAnnotations"),
+ [
+ ["uima.tcas.Annotation", 0, 1, "🥳", b"\xf0\x9f\xa5\xb3"],
+ ["uima.tcas.Annotation", 2, 6, "This"],
+ [
+ "uima.tcas.Annotation",
+ 7,
+ 12,
+ "👳🏻\u200d♀️",
+ b"\xf0\x9f\x91\xb3\xf0\x9f\x8f\xbb\xe2\x80\x8d\xe2\x99\x80\xef\xb8\x8f",
+ ],
+ ["uima.tcas.Annotation", 13, 15, "is"],
+ ["uima.tcas.Annotation", 16, 17, "✆", b"\xe2\x9c\x86"],
+ ["uima.tcas.Annotation", 18, 19, "a"],
+ [
+ "uima.tcas.Annotation",
+ 20,
+ 25,
+ "🧔🏾\u200d♂️",
+ b"\xf0\x9f\xa7\x94\xf0\x9f\x8f\xbe\xe2\x80\x8d\xe2\x99\x82\xef\xb8\x8f",
+ ],
+ ["uima.tcas.Annotation", 26, 30, "test"],
+ ["uima.tcas.Annotation", 31, 32, "👻", b"\xf0\x9f\x91\xbb"],
+ ["uima.tcas.DocumentAnnotation", 0, 32, "🥳 This 👳🏻\u200d♀️ is ✆ a 🧔🏾\u200d♂️ test 👻"],
+ ],
+ ),
+ # NOTE(review): the sample text below is Arabic script; the "LeftToRight" directory name
+ # presumably mirrors the name of the upstream reference fixture - confirm before renaming.
+ (
+ os.path.join(FIXTURE_DIR, "casWithLeftToRightTextAndAnnotations"),
+ [
+ ["uima.tcas.Annotation", 0, 3, "هذا"],
+ ["uima.tcas.Annotation", 4, 10, "اختبار"],
+ ["uima.tcas.DocumentAnnotation", 0, 10, "هذا اختبار"],
+ ],
+ ),
+ (
+ os.path.join(FIXTURE_DIR, "casWithTraditionalChineseTextAndAnnotations"),
+ [
+ ["uima.tcas.Annotation", 0, 1, "這"],
+ ["uima.tcas.Annotation", 1, 2, "是"],
+ ["uima.tcas.Annotation", 2, 4, "一個"],
+ ["uima.tcas.Annotation", 4, 6, "測試"],
+ ["uima.tcas.DocumentAnnotation", 0, 6, "這是一個測試"],
+ ],
+ ),
+]
+
+
+@pytest.mark.parametrize("json_path, annotations", FIXTURES)
+def test_deserialization_serialization(json_path, annotations):
+    """Load each reference fixture, serialize it again and compare with the stored JSON.
+
+    ``annotations`` comes from the shared parametrization and is not used by this test.
+    """
+    data_file = os.path.join(json_path, "data.json")
+
+    # The expected tree is the fixture itself, parsed with the standard library only.
+    with open(data_file, "rb") as reference:
+        expected_json = json.load(reference)
+
+    with open(data_file, "rb") as serialized:
+        cas = load_cas_from_json(serialized)
+
+    assert_json_equal(cas.to_json(pretty_print=True), expected_json, sort_keys=True)
+
+
+def test_multi_type_random_serialization_deserialization():
+    """Round-trip randomly generated multi-type CASes through JSON and compare both serializations."""
+    generator = MultiTypeRandomCasGenerator()
+    for step in range(1, 11):
+        # Grow the generator's size and type count with every iteration.
+        generator.size = step * 10
+        generator.type_count = step
+        randomized_cas = generator.generate_cas(generator.generate_type_system())
+
+        annotation_count = sum(len(view.get_all_annotations()) for view in randomized_cas.views)
+        print(f"CAS size: {annotation_count}")
+
+        expected_json = randomized_cas.to_json()
+        actual_json = load_cas_from_json(expected_json).to_json()
+
+        assert_json_equal(actual_json, expected_json)
+
+
+def test_multi_feature_random_serialization_deserialization():
+    """Round-trip randomly generated multi-feature CASes through JSON and compare both serializations."""
+    generator = MultiFeatureRandomCasGenerator()
+    for step in range(1, 11):
+        # Grow the generator's size with every iteration.
+        generator.size = step * 10
+        randomized_cas = generator.generate_cas(generator.generate_type_system())
+
+        annotation_count = sum(len(view.get_all_annotations()) for view in randomized_cas.views)
+        print(f"CAS size: {annotation_count}")
+
+        expected_json = randomized_cas.to_json()
+        actual_json = load_cas_from_json(expected_json).to_json()
+
+        assert_json_equal(actual_json, expected_json)
+
+
+@pytest.mark.parametrize("json_path, annotations", FIXTURES)
+def test_unicode(json_path, annotations):
+    """Check offsets and covered text of the annotations loaded from each reference fixture.
+
+    Args:
+        json_path: Directory of the fixture; its ``data.json`` is loaded
+        annotations: Expected annotations as ``[type name, begin, end, covered text]`` lists, optionally
+            followed by the expected UTF-8 encoding of the covered text
+    """
+    with open(os.path.join(json_path, "data.json"), "rb") as f:
+        cas = load_cas_from_json(f)
+
+    # Sorting by type name only: the expected lists rely on the order in which select() yields
+    # the annotations of one type.
+    actual_annotations = [
+        [a.type.name, a.begin, a.end, a.get_covered_text()]
+        for a in sorted(cas.select(TYPE_NAME_ANNOTATION), key=lambda k: k.type.name)
+    ]
+    expected_annotations = [a[0:4] for a in annotations]
+    assert actual_annotations == expected_annotations
+
+    # Both lists have the same length here (asserted above), so zip() drops nothing.
+    for expected, actual in zip(annotations, actual_annotations):
+        expected_covered_text = expected[3]
+        actual_covered_text = actual[3]
+
+        if not expected_covered_text:
+            continue
+
+        # Dump the individual code points to make encoding problems visible in the test output.
+        for n, char in enumerate(actual_covered_text):
+            print(f"{n}: [{char}] {hex(ord(char))}")
+
+        if len(expected) >= 5:
+            expected_utf8_bytes = expected[4]
+            actual_utf8_bytes = actual_covered_text.encode("UTF-8")
+            assert actual_utf8_bytes == expected_utf8_bytes
diff --git a/tests/test_xmi.py b/tests/test_xmi.py
index 8adc317..ac290d5 100644
--- a/tests/test_xmi.py
+++ b/tests/test_xmi.py
@@ -185,7 +185,7 @@ def test_serializing_cas_to_file_path(tmpdir, xmi, typesystem_xml):
cas.to_xmi(path)
- with open(path, "r") as actual:
+ with open(path) as actual:
assert_xml_equal(actual.read(), xmi)
diff --git a/tests/util.py b/tests/util.py
index 3f6367c..1c84f89 100644
--- a/tests/util.py
+++ b/tests/util.py
@@ -1,4 +1,5 @@
import difflib
+import json
from typing import IO, Union
import lxml_asserts
@@ -39,6 +40,45 @@ def assert_xml_equal(actual: Union[IO, str], expected: Union[IO, str]):
raise e
+def assert_json_equal(
+    actual: Union[IO, str, dict, list], expected: Union[IO, str, dict, list], sort_keys: bool = False
+):
+    """Checks whether the JSON trees behind `actual` and `expected` are equal.
+
+    Both sides are normalized to indented JSON text and compared as strings. On a mismatch,
+    `actual.json`, `expected.json` and `difference.diff` are written to the current working
+    directory so that the trees can be inspected afterwards.
+
+    Args:
+        actual: The actual JSON, given as text, as an open file or as an already parsed tree
+        expected: The expected JSON, given as text, as an open file or as an already parsed tree
+        sort_keys: If `True`, object keys are sorted before comparing, so key order is ignored
+
+    Raises:
+        AssertionError: when json(actual) != json(expected)
+    """
+
+    def _as_tree(source):
+        # Text is parsed, file-like objects are read and parsed, anything else is taken as a parsed tree.
+        if isinstance(source, (str, bytes)):
+            return json.loads(source)
+        if hasattr(source, "read"):
+            return json.load(source)
+        return source
+
+    actual_json = json.dumps(_as_tree(actual), sort_keys=sort_keys, indent=2)
+    expected_json = json.dumps(_as_tree(expected), sort_keys=sort_keys, indent=2)
+
+    if actual_json == expected_json:
+        return
+
+    diff_string = "\n".join(
+        difflib.unified_diff(actual_json.splitlines(), expected_json.splitlines(), fromfile="Actual", tofile="Expected")
+    )
+
+    # For debugging purposes, the trees are saved to later inspect their contents
+    with open("actual.json", "w", encoding="utf-8") as f:
+        f.write(actual_json)
+
+    with open("expected.json", "w", encoding="utf-8") as f:
+        f.write(expected_json)
+
+    with open("difference.diff", "w", encoding="utf-8") as f:
+        f.write(diff_string)
+
+    # Raised explicitly rather than via `assert` so the check also works when Python runs with -O.
+    raise AssertionError(f"JSON trees differ:\n{diff_string}")
+
+
def _to_etree(source: Union[IO, str]) -> etree.Element:
parser = etree.XMLParser(remove_blank_text=True)