Commit e607dd5

Merge branch 'feature/168-Experimental-JSON-CAS-support' of https://github.com/dkpro/dkpro-cassis into feature/168-Experimental-JSON-CAS-support

* 'feature/168-Experimental-JSON-CAS-support' of https://github.com/dkpro/dkpro-cassis:
  No issue. Formatting.
  #215 - Ability to exclude types from cas_to_comparable_text
  #212 - Allow loading/saving XMI/typesystems from/to Path
  #211 - Serializing an FSArray without any elements breaks
  #212 - Allow loading/saving XMI/typesystems from/to Path
  #168 - Experimental JSON CAS support
  #168 - Experimental JSON CAS support
  #168 - Experimental JSON CAS support
  #209 - Parsing an array that was serialized using multipleReferencesAllowed=true fails
  - Do not execute performance "tests" when running make test
  - Update JSON reference data with new data from UIMA Java SDK, including CAS examples using emojis and other Unicode characters
  - Enabled character offset conversion on import/export in JSON (de)serializer
  #209 - Parsing an array that was serialized using multipleReferencesAllowed=true fails
reckart committed Sep 27, 2021
2 parents 053bbf7 + a77935d commit e607dd5
Showing 12 changed files with 243 additions and 88 deletions.
6 changes: 3 additions & 3 deletions cassis/json.py
@@ -1,7 +1,7 @@
import base64
import json
from collections import OrderedDict
from io import TextIOWrapper
from io import TextIOBase, TextIOWrapper

from cassis.cas import NAME_DEFAULT_SOFA, Cas, IdGenerator, Sofa, View
from cassis.typesystem import *
@@ -268,7 +268,7 @@ def serialize(
json_fs = self._serialize_feature_structure(fs)
feature_structures.append(json_fs)

if isinstance(sink, BytesIO):
if sink and not isinstance(sink, TextIOBase):
sink = TextIOWrapper(sink, encoding="utf-8", write_through=True)

if sink:
@@ -316,7 +316,7 @@ def _serialize_feature(self, json_type, feature: Feature):
json_feature[MULTIPLE_REFERENCES_ALLOWED_FIELD] = feature.multipleReferencesAllowed

if feature.elementType is not None:
json_feature[ELEMENT_TYPE_FIELD] = self._to_external_type_name(feature.elementType)
json_feature[ELEMENT_TYPE_FIELD] = self._to_external_type_name(feature.elementType.name)

return json_feature

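A minimal usage sketch for the experimental JSON CAS support (#168), based on the to_json() call that the performance tests further down also use; the sofa text is an illustrative assumption.

    from cassis import Cas

    cas = Cas()                        # empty CAS with a default type system
    cas.sofa_string = "Hello JSON CAS"

    json_str = cas.to_json()           # no sink given, so the JSON is returned as a string
    print(json_str[:120])
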
6 changes: 5 additions & 1 deletion cassis/typesystem.py
@@ -971,12 +971,13 @@ def _add_document_annotation_type(self):
# Deserializing


def load_typesystem(source: Union[IO, str]) -> TypeSystem:
def load_typesystem(source: Union[IO, str, Path]) -> TypeSystem:
"""Loads a type system from a XML source.
Args:
source: The XML source. If `source` is a string, then it is assumed to be an XML string.
If `source` is a file-like object, then the data is read from it.
If `source` is a `Path`, then load the file at the given location.
Returns:
The deserialized type system
@@ -985,6 +986,9 @@ def load_typesystem(source: Union[IO, str]) -> TypeSystem:
deserializer = TypeSystemDeserializer()
if isinstance(source, str):
return deserializer.deserialize(BytesIO(source.encode("utf-8")))
elif isinstance(source, Path):
with source.open("rb") as src:
return deserializer.deserialize(src)
else:
return deserializer.deserialize(source)

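A short sketch of the new Path support in load_typesystem (#212). The fixture path is taken from the test files touched by this commit; the get_types() listing assumes the public TypeSystem API.

    from pathlib import Path
    from cassis import load_typesystem

    ts = load_typesystem(Path("tests/test_files/typesystems/typesystem_with_collections.xml"))
    print([t.name for t in ts.get_types()])   # user-defined types from the loaded type system
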
10 changes: 7 additions & 3 deletions cassis/util.py
@@ -2,12 +2,12 @@
from collections import defaultdict
from functools import cmp_to_key
from io import IOBase, StringIO
from typing import Dict, Iterable
from typing import Dict, Iterable, Set

import attr

from cassis import Cas
from cassis.typesystem import FEATURE_BASE_NAME_SOFA, TYPE_NAME_ANNOTATION, FeatureStructure, Type
from cassis.typesystem import FEATURE_BASE_NAME_SOFA, TYPE_NAME_ANNOTATION, FeatureStructure, Type, TypeSystem, is_array

_EXCLUDED_FEATURES = {FEATURE_BASE_NAME_SOFA}
_NULL_VALUE = "<NULL>"
@@ -19,9 +19,10 @@ def cas_to_comparable_text(
seeds: Iterable[FeatureStructure] = None,
mark_indexed: bool = True,
covered_text: bool = True,
exclude_types: Set[str] = None,
) -> [str, None]:
indexed_feature_structures = _get_indexed_feature_structures(cas)
all_feature_structures_by_type = _group_feature_structures_by_type(cas._find_all_fs(seeds))
all_feature_structures_by_type = _group_feature_structures_by_type(cas._find_all_fs(seeds=seeds))
types_sorted = sorted(all_feature_structures_by_type.keys())
fs_id_to_anchor = _generate_anchors(
cas, types_sorted, all_feature_structures_by_type, indexed_feature_structures, mark_indexed=mark_indexed
@@ -32,6 +33,9 @@

csv_writer = csv.writer(out, dialect=csv.unix_dialect)
for t in types_sorted:
if exclude_types and t in exclude_types:
continue

type_ = cas.typesystem.get_type(t)

csv_writer.writerow([type_.name])
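A hedged sketch of the new exclude_types parameter (#215); the excluded type name is an arbitrary example, not something this commit prescribes.

    from cassis import Cas
    from cassis.util import cas_to_comparable_text

    cas = Cas()
    cas.sofa_string = "Example document"

    # Render a diff-friendly textual view of the CAS, skipping rows for the excluded type.
    text = cas_to_comparable_text(cas, exclude_types={"uima.tcas.DocumentAnnotation"})
    print(text)
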
46 changes: 23 additions & 23 deletions cassis/xmi.py
@@ -1,6 +1,7 @@
import warnings
from collections import defaultdict
from io import BytesIO
from pathlib import Path
from typing import IO, Dict, Iterable, List, Set, Union

import attr
@@ -47,13 +48,14 @@ class ProtoView:


def load_cas_from_xmi(
source: Union[IO, str], typesystem: TypeSystem = None, lenient: bool = False, trusted: bool = False
source: Union[IO, Path, str], typesystem: TypeSystem = None, lenient: bool = False, trusted: bool = False
) -> Cas:
"""Loads a CAS from a XMI source.
Args:
source: The XML source. If `source` is a string, then it is assumed to be an XML string.
If `source` is a file-like object, then the data is read from it.
If `source` is a `Path`, then load the file at the given location.
typesystem: The type system that belongs to this CAS. If `None`, an empty type system is provided.
lenient: If `True`, unknown Types will be ignored. If `False`, unknown Types will cause an exception.
The default is `False`.
@@ -71,6 +73,9 @@ def load_cas_from_xmi(
return deserializer.deserialize(
BytesIO(source.encode("utf-8")), typesystem=typesystem, lenient=lenient, trusted=trusted
)
if isinstance(source, Path):
with source.open("rb") as src:
return deserializer.deserialize(src, typesystem=typesystem, lenient=lenient, trusted=trusted)
else:
return deserializer.deserialize(source, typesystem=typesystem, lenient=lenient, trusted=trusted)
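Usage sketch for loading a CAS from a Path (#212); the file names are assumptions for illustration only.

    from pathlib import Path
    from cassis import load_cas_from_xmi, load_typesystem

    ts = load_typesystem(Path("typesystem.xml"))                  # hypothetical files
    cas = load_cas_from_xmi(Path("document.xmi"), typesystem=ts)
    print(cas.sofa_string)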

@@ -203,7 +208,7 @@ def deserialize(self, source: Union[IO, str], typesystem: TypeSystem, lenient: b
elif typesystem.is_primitive_array(fs.type) and feature_name == "elements":
# Separately rendered arrays (typically used with multipleReferencesAllowed = True)
fs[feature_name] = self._parse_primitive_array(fs.type, value)
elif typesystem.is_primitive_array(feature.rangeType):
elif typesystem.is_primitive_array(feature.rangeType) and not feature.multipleReferencesAllowed:
# Array feature rendered inline (multipleReferencesAllowed = False|None)
# We also end up here for array features that were rendered as child elements. No need to parse
# them again, so we check if the value is still a string (i.e. attribute value) and only then
@@ -337,19 +342,19 @@ def _parse_feature_structure(self, typesystem: TypeSystem, elem, children: Dict[
return AnnotationType(**attributes)

def _parse_primitive_array(self, type_: Type, value: str) -> List:
"""Primitive collections are serialized as white space seperated primitive values"""
"""Primitive collections are serialized as white space separated primitive values"""

# TODO: Use type name global variable here instead of hardcoded string literal
elements = value.split(" ")
type_name = type_.name
if type_name in [TYPE_NAME_FLOAT_ARRAY, TYPE_NAME_DOUBLE_ARRAY]:
return [float(e) for e in elements]
return [float(e) for e in elements] if value else []
elif type_name in [TYPE_NAME_INTEGER_ARRAY, TYPE_NAME_SHORT_ARRAY, TYPE_NAME_LONG_ARRAY]:
return [int(e) for e in elements]
return [int(e) for e in elements] if value else []
elif type_name == TYPE_NAME_BOOLEAN_ARRAY:
return [self._parse_bool(e) for e in elements]
return [self._parse_bool(e) for e in elements] if value else []
elif type_name == TYPE_NAME_BYTE_ARRAY:
return list(bytearray.fromhex(value))
return list(bytearray.fromhex(value)) if value else []
else:
raise ValueError(f"Not a primitive collection type: {type_name}")
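The "if value else []" guards above are needed because splitting an empty attribute value in Python does not yield an empty list; a quick illustration:

    elements = "".split(" ")
    print(elements)    # ['']  -- a single empty string, not zero elements
    # int("") or float("") would then raise ValueError, hence the explicit check on value.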

@@ -519,22 +524,17 @@ def _serialize_feature_structure(self, cas: Cas, root: etree.Element, fs: Featur
sofa: Sofa = fs.sofa
value = sofa._offset_converter.cassis_to_uima(value)

if (
ts.is_instance_of(feature.rangeType, TYPE_NAME_STRING_ARRAY)
and not feature.multipleReferencesAllowed
and value.elements
):
for e in value.elements:
child = etree.SubElement(elem, feature_name)
child.text = e
elif ts.is_primitive_array(feature.rangeType) and not feature.multipleReferencesAllowed and value.elements:
elem.attrib[feature_name] = self._serialize_primitive_array(feature.rangeType.name, value.elements)
elif (
feature.rangeType.name == TYPE_NAME_FS_ARRAY
and not feature.multipleReferencesAllowed
and value.elements
):
elem.attrib[feature_name] = " ".join(str(e.xmiID) for e in value.elements)
if ts.is_instance_of(feature.rangeType, TYPE_NAME_STRING_ARRAY) and not feature.multipleReferencesAllowed:
if value.elements is not None: # Compare to none to not skip if elements is empty!
for e in value.elements:
child = etree.SubElement(elem, feature_name)
child.text = e
elif ts.is_primitive_array(feature.rangeType) and not feature.multipleReferencesAllowed:
if value.elements is not None: # Compare to none to not skip if elements is empty!
elem.attrib[feature_name] = self._serialize_primitive_array(feature.rangeType.name, value.elements)
elif feature.rangeType.name == TYPE_NAME_FS_ARRAY and not feature.multipleReferencesAllowed:
if value.elements is not None: # Compare to none to not skip if elements is empty!
elem.attrib[feature_name] = " ".join(str(e.xmiID) for e in value.elements)
elif feature_name == FEATURE_BASE_NAME_SOFA:
elem.attrib[feature_name] = str(value.xmiID)
elif feature.rangeType.name == TYPE_NAME_BOOLEAN:
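The switch from truthiness checks to explicit "is not None" comparisons above matters because an empty array is falsy in Python and would previously have been skipped during serialization (#211); a minimal illustration:

    elements = []
    print(bool(elements))          # False -- "and value.elements" used to skip empty arrays
    print(elements is not None)    # True  -- the new check still serializes them
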
28 changes: 28 additions & 0 deletions tests/fixtures.py
@@ -104,6 +104,20 @@ def cas_with_empty_array_references_xmi(cas_with_empty_array_references_path):
return f.read()


# CAS with multipleReferencesAllowed=true on string array


@pytest.fixture
def cas_with_multiple_references_allowed_string_array_path():
return os.path.join(FIXTURE_DIR, "xmi", "cas_with_multiple_references_allowed_string_array.xmi")


@pytest.fixture
def cas_with_multiple_references_allowed_string_array_xmi(cas_with_multiple_references_allowed_string_array_path):
with open(cas_with_multiple_references_allowed_string_array_path, "r") as f:
return f.read()


# CAS with reserved names


@@ -273,6 +287,20 @@ def typesystem_with_collections_xml(typesystem_with_collections_path):
return f.read()


# CAS with multipleReferencesAllowed=true on string array


@pytest.fixture
def typesystem_with_multiple_references_allowed_path():
return os.path.join(FIXTURE_DIR, "typesystems", "typesystem_with_multiple_references_allowed.xml")


@pytest.fixture
def typesystem_with_multiple_references_allowed_xml(typesystem_with_multiple_references_allowed_path):
with open(typesystem_with_multiple_references_allowed_path, "r") as f:
return f.read()


# DKPro types


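A hypothetical test consuming the new fixtures above; it is not part of this commit and the assertion is purely illustrative.

    from cassis import load_cas_from_xmi, load_typesystem


    def test_multiple_references_allowed_string_array_roundtrip(
        cas_with_multiple_references_allowed_string_array_xmi,
        typesystem_with_multiple_references_allowed_xml,
    ):
        ts = load_typesystem(typesystem_with_multiple_references_allowed_xml)
        cas = load_cas_from_xmi(cas_with_multiple_references_allowed_string_array_xmi, typesystem=ts)

        # Round-tripping should now succeed for separately rendered arrays (#209).
        assert cas.to_xmi() is not None
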
20 changes: 16 additions & 4 deletions tests/performance.py
@@ -13,8 +13,12 @@

typesystem = generator.generate_type_system()
randomized_cas = generator.generate_cas(typesystem)

randomized_cas_xmi = randomized_cas.to_xmi()
randomized_cas_xmi_bytes = randomized_cas_xmi.encode("utf-8")

randomized_cas_json = randomized_cas.to_json()
randomized_cas_json_bytes = randomized_cas_json.encode("utf-8")


@pytest.mark.performance
@@ -24,7 +28,9 @@ def test_xmi_serialization_performance():
randomized_cas.to_xmi()
end = timer()

print(f"XMI: Serializing {iterations} CASes took {end - start} seconds")
print(
f"XMI: Serializing {iterations} CASes with {generator.size} each took {end - start} seconds ({len(randomized_cas_xmi_bytes)} bytes each)"
)


@pytest.mark.performance
@@ -34,7 +40,9 @@ def test_json_serialization_performance():
randomized_cas.to_json()
end = timer()

print(f"JSON: Serializing {iterations} CASes took {end - start} seconds")
print(
f"JSON: Serializing {iterations} CASes with {generator.size} each took {end - start} seconds ({len(randomized_cas_json_bytes)} bytes each)"
)


@pytest.mark.performance
@@ -44,7 +52,9 @@ def test_xmi_deserialization_performance():
load_cas_from_xmi(randomized_cas_xmi, typesystem)
end = timer()

print(f"XMI: Deserializing {iterations} CASes took {end - start} seconds")
print(
f"XMI: Deserializing {iterations} CASes with {generator.size} each took {end - start} seconds ({len(randomized_cas_xmi_bytes)} bytes each)"
)


@pytest.mark.performance
@@ -54,4 +64,6 @@ def test_json_deserialization_performance():
load_cas_from_json(randomized_cas_json, typesystem)
end = timer()

print(f"JSON: Deserializing {iterations} CASes took {end - start} seconds")
print(
f"JSON: Deserializing {iterations} CASes with {generator.size} each took {end - start} seconds ({len(randomized_cas_json_bytes)} bytes each)"
)
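
The @pytest.mark.performance marker above is what lets the regular test run skip these benchmarks; assuming the marker is registered in the project's pytest configuration, they can be selected or deselected from the command line, for example with pytest -m performance tests/performance.py to run them and pytest -m "not performance" to exclude them.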
39 changes: 37 additions & 2 deletions tests/test_files/typesystems/typesystem_with_collections.xml
@@ -7,15 +7,50 @@
<supertypeName>uima.tcas.Annotation</supertypeName>
<features>
<featureDescription>
<name>collection1</name>
<name>strings</name>
<description/>
<rangeTypeName>uima.cas.StringArray</rangeTypeName>
</featureDescription>
<featureDescription>
<name>collection2</name>
<name>shorts</name>
<description/>
<rangeTypeName>uima.cas.ShortArray</rangeTypeName>
</featureDescription>
<featureDescription>
<name>integers</name>
<description/>
<rangeTypeName>uima.cas.IntegerArray</rangeTypeName>
</featureDescription>
<featureDescription>
<name>longs</name>
<description/>
<rangeTypeName>uima.cas.LongArray</rangeTypeName>
</featureDescription>
<featureDescription>
<name>booleans</name>
<description/>
<rangeTypeName>uima.cas.BooleanArray</rangeTypeName>
</featureDescription>
<featureDescription>
<name>bytes</name>
<description/>
<rangeTypeName>uima.cas.ByteArray</rangeTypeName>
</featureDescription>
<featureDescription>
<name>floats</name>
<description/>
<rangeTypeName>uima.cas.FloatArray</rangeTypeName>
</featureDescription>
<featureDescription>
<name>doubles</name>
<description/>
<rangeTypeName>uima.cas.DoubleArray</rangeTypeName>
</featureDescription>
<featureDescription>
<name>fses</name>
<description/>
<rangeTypeName>uima.cas.FSArray</rangeTypeName>
</featureDescription>
</features>
</typeDescription>
</types>
19 changes: 19 additions & 0 deletions tests/test_files/typesystems/typesystem_with_multiple_references_allowed.xml
@@ -0,0 +1,19 @@
<?xml version="1.0" encoding="UTF-8"?>
<typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
<types>
<typeDescription>
<name>test.type</name>
<description/>
<supertypeName>uima.tcas.Annotation</supertypeName>
<features>
<featureDescription>
<name>target</name>
<description/>
<rangeTypeName>uima.cas.StringArray</rangeTypeName>
<elementType>uima.cas.String</elementType>
<multipleReferencesAllowed>true</multipleReferencesAllowed>
</featureDescription>
</features>
</typeDescription>
</types>
</typeSystemDescription>
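
A small sanity check on the new fixture above; the get_feature() lookup is an assumption about the public Type API, and the path mirrors the fixture location registered in tests/fixtures.py.

    from pathlib import Path
    from cassis import load_typesystem

    ts = load_typesystem(Path("tests/test_files/typesystems/typesystem_with_multiple_references_allowed.xml"))
    feature = ts.get_type("test.type").get_feature("target")
    print(feature.multipleReferencesAllowed)   # True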