Skip to content

Commit

Permalink
Merge branch 'master' into feature/168-Experimental-JSON-CAS-support
Browse files Browse the repository at this point in the history
* master:
  #167 - Cant serialize byte arrays (#176)
  • Loading branch information
reckart committed Aug 14, 2021
2 parents cbf086e + 9db7617 commit 20f4f66
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 27 deletions.
26 changes: 26 additions & 0 deletions cassis/typesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,17 @@
}


# Fully qualified names of the built-in UIMA array types whose elements are
# primitive values. Note that "uima.cas.StringArray" is intentionally not part
# of this set.
_PRIMITIVE_ARRAY_TYPES = {
"uima.cas.FloatArray",
"uima.cas.IntegerArray",
"uima.cas.BooleanArray",
"uima.cas.ByteArray",
"uima.cas.ShortArray",
"uima.cas.LongArray",
"uima.cas.DoubleArray",
}


def _string_to_valid_classname(name: str):
return re.sub("[^a-zA-Z0-9_]", "_", name)

Expand Down Expand Up @@ -598,6 +609,21 @@ def is_primitive_collection(self, type_name) -> bool:
else:
return self.is_primitive_collection(self.get_type(type_name).supertypeName)

def is_primitive_array(self, type_name) -> bool:
    """Tells whether `type_name` is a primitive array type or inherits from one.

    Args:
        type_name: The name of the type to query for.

    Returns:
        True if `type_name` or one of its supertypes is a primitive array type, else False.
    """
    current_name = type_name
    # Climb the supertype chain; reaching the root type means no primitive
    # array type was found on the way up.
    while current_name != TOP_TYPE_NAME:
        if current_name in _PRIMITIVE_ARRAY_TYPES:
            return True
        current_name = self.get_type(current_name).supertypeName
    return False

def subsumes(self, parent_name: str, child_name: str) -> bool:
"""Determines if the type `child_name` is a child of `parent_name`.
Expand Down
105 changes: 79 additions & 26 deletions cassis/xmi.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from lxml import etree

from cassis.cas import Cas, IdGenerator, Sofa, View
from cassis.typesystem import FeatureStructure, TypeNotFoundError, TypeSystem
from cassis.typesystem import _PRIMITIVE_ARRAY_TYPES, FeatureStructure, TypeNotFoundError, TypeSystem


@attr.s
Expand Down Expand Up @@ -158,42 +158,50 @@ def deserialize(self, source: Union[IO, str], typesystem: TypeSystem, lenient: b

for feature in t.all_features:
feature_name = feature.name
value = getattr(fs, feature_name)

if feature_name == "sofa":
value = getattr(fs, feature_name)

sofa = sofas[value]
setattr(fs, feature_name, sofa)
continue

if (
typesystem.is_primitive(feature.rangeTypeName)
or typesystem.is_primitive_collection(feature.rangeTypeName)
or typesystem.is_primitive_collection(fs.type)
if typesystem.is_instance_of(fs.type, "uima.cas.StringArray"):
# We already parsed string arrays to a Python list of string
# before, so we do not need to work more on this
continue
elif typesystem.is_primitive(feature.rangeTypeName) or typesystem.is_primitive_collection(
feature.rangeTypeName
):
# TODO: Parse feature values to their real type here, e.g. parse ints or floats
continue

# Resolve references here
value = getattr(fs, feature_name)
if value is None:
continue

# Resolve references
if typesystem.is_collection(fs.type, feature):
# A collection of references is a list of integers separated
# by single spaces, e.g. <foo:bar elements="1 2 3 42" />
targets = []
for ref in value.split():
target_id = int(ref)
elif typesystem.is_primitive_array(fs.type) and feature_name == "elements":
elements = self._parse_primitive_array(fs.type, value)
setattr(fs, feature_name, elements)
elif typesystem.is_primitive_array(feature.rangeTypeName):
elements = self._parse_primitive_array(feature.rangeTypeName, value)
setattr(fs, feature_name, elements)
else:
# Resolve references here
if value is None:
continue

# Resolve references
if typesystem.is_collection(fs.type, feature):
# A collection of references is a list of integers separated
# by single spaces, e.g. <foo:bar elements="1 2 3 42" />
targets = []
for ref in value.split():
target_id = int(ref)
target = feature_structures[target_id]
targets.append(target)
referenced_fs.add(target_id)
setattr(fs, feature_name, targets)
else:
target_id = int(value)
target = feature_structures[target_id]
targets.append(target)
referenced_fs.add(target_id)
setattr(fs, feature_name, targets)
else:
target_id = int(value)
target = feature_structures[target_id]
referenced_fs.add(target_id)
setattr(fs, feature_name, target)
setattr(fs, feature_name, target)

cas = Cas(typesystem=typesystem, lenient=lenient)
for sofa in sofas.values():
Expand Down Expand Up @@ -282,6 +290,33 @@ def _parse_feature_structure(self, typesystem: TypeSystem, elem, children: Dict[
self._max_xmi_id = max(attributes["xmiID"], self._max_xmi_id)
return AnnotationType(**attributes)

def _parse_primitive_array(self, type_name: str, value: str) -> List:
""" Primitive collections are serialized as white space seperated primitive values"""

# TODO: Use type name global variable here instead of hardcoded string literal
elements = value.split(" ")
if type_name == "uima.cas.FloatArray" or type_name == "uima.cas.DoubleArray":
return [float(e) for e in elements]
elif (
type_name == "uima.cas.IntegerArray"
or type_name == "uima.cas.ShortArray"
or type_name == "uima.cas.LongArray"
):
return [int(e) for e in elements]
elif type_name == "uima.cas.BooleanArray":
return [self._parse_bool(e) for e in elements]
elif type_name == "uima.cas.ByteArray":
return list(bytearray.fromhex(value))
else:
raise ValueError(f"Not a primitive collection: {type_name}")

def _parse_bool(self, s: str) -> bool:
if s == "true":
return True
if s == "false":
return False
raise ValueError(f"Not a boolean: {s}")

def _clear_elem(self, elem):
""" Frees XML nodes that already have been processed to save memory """
elem.clear()
Expand Down Expand Up @@ -402,6 +437,10 @@ def _serialize_feature_structure(self, cas: Cas, root: etree.Element, fs: Featur
for e in value:
child = etree.SubElement(elem, feature_name)
child.text = e
elif ts.is_primitive_array(fs.type) and feature_name == "elements":
elem.attrib[feature_name] = self._serialize_primitive_array(fs.type, value)
elif ts.is_primitive_array(feature.rangeTypeName):
elem.attrib[feature_name] = self._serialize_primitive_array(feature.rangeTypeName, value)
elif feature_name == "sofa":
elem.attrib[feature_name] = str(value.xmiID)
elif ts.is_primitive(feature.rangeTypeName):
Expand Down Expand Up @@ -429,3 +468,17 @@ def _serialize_view(self, root: etree.Element, view: View):

elem.attrib["sofa"] = str(view.sofa.xmiID)
elem.attrib["members"] = " ".join(sorted((str(x.xmiID) for x in view.get_all_annotations()), key=int))

def _serialize_primitive_array(self, type_name: str, values: List) -> str:
    """Serializes the elements of a primitive array to their string form.

    Byte arrays are rendered as two upper-case hexadecimal digits per element
    without separators. All other primitive arrays are rendered as space
    separated values, with booleans in lower case.

    Raises:
        ValueError: If `type_name` is not a primitive array type.
    """

    # TODO: Use type name global variable here instead of hardcoded string literal
    if type_name not in _PRIMITIVE_ARRAY_TYPES:
        raise ValueError(f"Not a primitive array: {type_name}")

    if type_name == "uima.cas.ByteArray":
        return "".join(f"{byte:02X}" for byte in values)

    if type_name == "uima.cas.BooleanArray":
        rendered = [str(flag).lower() for flag in values]
    else:
        rendered = [str(item) for item in values]
    return " ".join(rendered)
17 changes: 16 additions & 1 deletion tests/test_files/xmi/cas_with_collections.xmi
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,22 @@
<cas:NonEmptyFSList xmi:id="9" head="5" tail="10" />
<cas:EmptyFSList xmi:id="10"/>

<cas:FloatArray xmi:id="11" elements="0.7275637 0.054665208 0.6832234 0.0479393"/>
<cas:IntegerArray xmi:id="12" elements="1325939940 -248792245 1190043011 -1255373459 -1436456258 392236186"/>
<cas:StringArray xmi:id="13">
<elements>aaiguewilz</elements>
<elements>orarzvmgty</elements>
<elements>mkshhvglpk</elements>
<elements>ffvdpcdvbx</elements>
<elements>jsqcoqzpxb</elements>
</cas:StringArray>
<cas:BooleanArray xmi:id="14" elements="false true false"/>
<cas:ByteArray xmi:id="15" elements="42DB3064"/>
<cas:ShortArray xmi:id="16" elements="1929 13467 15132 15893"/>
<cas:LongArray xmi:id="17" elements="2516571677013944794"/>
<cas:DoubleArray xmi:id="18" elements="0.4362829094329638 0.6487936445670887 0.6959691863162578"/>

<cas:Sofa xmi:id="1" sofaNum="1" sofaID="_InitialView" mimeType="text/plain"
sofaString="Joe waited for the train . The train was late ."/>
<cas:View members="2 3 4 5 6 7 8 9 10" sofa="1"/>
<cas:View members="2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18" sofa="1"/>
</xmi:XMI>
34 changes: 34 additions & 0 deletions tests/test_typesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,40 @@ def test_is_primitive_collection(type_name: str, expected: bool):
assert typesystem.is_primitive_collection(type_name) == expected


# NOTE: This test must not be called `test_is_primitive_collection`: a test with
# that name already exists in this module, and defining it a second time rebinds
# the name so that the earlier test is never collected.
@pytest.mark.parametrize(
    "type_name, expected",
    [
        ("uima.cas.ArrayBase", False),
        ("uima.cas.FSArray", False),
        ("uima.cas.FloatArray", True),
        ("uima.cas.IntegerArray", True),
        ("uima.cas.StringArray", False),
        ("uima.cas.ListBase", False),
        ("uima.cas.FSList", False),
        ("uima.cas.EmptyFSList", False),
        ("uima.cas.NonEmptyFSList", False),
        ("uima.cas.FloatList", False),
        ("uima.cas.EmptyFloatList", False),
        ("uima.cas.NonEmptyFloatList", False),
        ("uima.cas.IntegerList", False),
        ("uima.cas.EmptyIntegerList", False),
        ("uima.cas.NonEmptyIntegerList", False),
        ("uima.cas.StringList", False),
        ("uima.cas.EmptyStringList", False),
        ("uima.cas.NonEmptyStringList", False),
        ("uima.cas.BooleanArray", True),
        ("uima.cas.ByteArray", True),
        ("uima.cas.ShortArray", True),
        ("uima.cas.LongArray", True),
        ("uima.cas.DoubleArray", True),
    ],
)
def test_is_primitive_array(type_name: str, expected: bool):
    """Checks `TypeSystem.is_primitive_array` against the built-in array and list types."""
    typesystem = TypeSystem()

    assert typesystem.is_primitive_array(type_name) == expected


@pytest.mark.parametrize(
"parent_name, child_name, expected",
[
Expand Down

0 comments on commit 20f4f66

Please sign in to comment.