diff --git a/cassis/xmi.py b/cassis/xmi.py
index 1af6223..8656c9d 100644
--- a/cassis/xmi.py
+++ b/cassis/xmi.py
@@ -7,7 +7,7 @@
from lxml import etree
from cassis.cas import Cas, IdGenerator, Sofa, View
-from cassis.typesystem import FeatureStructure, TypeNotFoundError, TypeSystem
+from cassis.typesystem import _PRIMITIVE_ARRAY_TYPES, FeatureStructure, TypeNotFoundError, TypeSystem
@attr.s
@@ -158,42 +158,50 @@ def deserialize(self, source: Union[IO, str], typesystem: TypeSystem, lenient: b
for feature in t.all_features:
feature_name = feature.name
+ value = getattr(fs, feature_name)
if feature_name == "sofa":
- value = getattr(fs, feature_name)
+
sofa = sofas[value]
setattr(fs, feature_name, sofa)
continue
- if (
- typesystem.is_primitive(feature.rangeTypeName)
- or typesystem.is_primitive_collection(feature.rangeTypeName)
- or typesystem.is_primitive_collection(fs.type)
+ if typesystem.is_instance_of(fs.type, "uima.cas.StringArray"):
+ # We already parsed string arrays to a Python list of string
+ # before, so we do not need to work more on this
+ continue
+ elif typesystem.is_primitive(feature.rangeTypeName) or typesystem.is_primitive_collection(
+ feature.rangeTypeName
):
# TODO: Parse feature values to their real type here, e.g. parse ints or floats
continue
-
- # Resolve references here
- value = getattr(fs, feature_name)
- if value is None:
- continue
-
- # Resolve references
- if typesystem.is_collection(fs.type, feature):
- # A collection of references is a list of integers separated
- # by single spaces, e.g.
- targets = []
- for ref in value.split():
- target_id = int(ref)
+ elif typesystem.is_primitive_array(fs.type) and feature_name == "elements":
+ elements = self._parse_primitive_array(fs.type, value)
+ setattr(fs, feature_name, elements)
+ elif typesystem.is_primitive_array(feature.rangeTypeName):
+ elements = self._parse_primitive_array(feature.rangeTypeName, value)
+ setattr(fs, feature_name, elements)
+ else:
+ # Resolve references here
+ if value is None:
+ continue
+
+ # Resolve references
+ if typesystem.is_collection(fs.type, feature):
+ # A collection of references is a list of integers separated
+ # by single spaces, e.g.
+ targets = []
+ for ref in value.split():
+ target_id = int(ref)
+ target = feature_structures[target_id]
+ targets.append(target)
+ referenced_fs.add(target_id)
+ setattr(fs, feature_name, targets)
+ else:
+ target_id = int(value)
target = feature_structures[target_id]
- targets.append(target)
referenced_fs.add(target_id)
- setattr(fs, feature_name, targets)
- else:
- target_id = int(value)
- target = feature_structures[target_id]
- referenced_fs.add(target_id)
- setattr(fs, feature_name, target)
+ setattr(fs, feature_name, target)
cas = Cas(typesystem=typesystem, lenient=lenient)
for sofa in sofas.values():
@@ -282,6 +290,33 @@ def _parse_feature_structure(self, typesystem: TypeSystem, elem, children: Dict[
self._max_xmi_id = max(attributes["xmiID"], self._max_xmi_id)
return AnnotationType(**attributes)
+ def _parse_primitive_array(self, type_name: str, value: str) -> List:
+ """ Primitive collections are serialized as white space seperated primitive values"""
+
+ # TODO: Use type name global variable here instead of hardcoded string literal
+ elements = value.split(" ")
+ if type_name == "uima.cas.FloatArray" or type_name == "uima.cas.DoubleArray":
+ return [float(e) for e in elements]
+ elif (
+ type_name == "uima.cas.IntegerArray"
+ or type_name == "uima.cas.ShortArray"
+ or type_name == "uima.cas.LongArray"
+ ):
+ return [int(e) for e in elements]
+ elif type_name == "uima.cas.BooleanArray":
+ return [self._parse_bool(e) for e in elements]
+ elif type_name == "uima.cas.ByteArray":
+ return list(bytearray.fromhex(value))
+ else:
+ raise ValueError(f"Not a primitive collection: {type_name}")
+
+ def _parse_bool(self, s: str) -> bool:
+ if s == "true":
+ return True
+ if s == "false":
+ return False
+ raise ValueError(f"Not a boolean: {s}")
+
def _clear_elem(self, elem):
""" Frees XML nodes that already have been processed to save memory """
elem.clear()
@@ -402,6 +437,10 @@ def _serialize_feature_structure(self, cas: Cas, root: etree.Element, fs: Featur
for e in value:
child = etree.SubElement(elem, feature_name)
child.text = e
+ elif ts.is_primitive_array(fs.type) and feature_name == "elements":
+ elem.attrib[feature_name] = self._serialize_primitive_array(fs.type, value)
+ elif ts.is_primitive_array(feature.rangeTypeName):
+ elem.attrib[feature_name] = self._serialize_primitive_array(feature.rangeTypeName, value)
elif feature_name == "sofa":
elem.attrib[feature_name] = str(value.xmiID)
elif ts.is_primitive(feature.rangeTypeName):
@@ -429,3 +468,17 @@ def _serialize_view(self, root: etree.Element, view: View):
elem.attrib["sofa"] = str(view.sofa.xmiID)
elem.attrib["members"] = " ".join(sorted((str(x.xmiID) for x in view.get_all_annotations()), key=int))
+
+ def _serialize_primitive_array(self, type_name: str, values: List) -> str:
+ """ Primitive collections are serialized as white space seperated primitive values"""
+
+ # TODO: Use type name global variable here instead of hardcoded string literal
+ if type_name not in _PRIMITIVE_ARRAY_TYPES:
+ raise ValueError(f"Not a primitive array: {type_name}")
+
+ if type_name == "uima.cas.BooleanArray":
+ return " ".join(str(e).lower() for e in values)
+ elif type_name == "uima.cas.ByteArray":
+ return "".join("{:02X}".format(x) for x in values)
+ else:
+ return " ".join(str(e) for e in values)
diff --git a/tests/test_files/xmi/cas_with_collections.xmi b/tests/test_files/xmi/cas_with_collections.xmi
index ec231a5..7c65772 100644
--- a/tests/test_files/xmi/cas_with_collections.xmi
+++ b/tests/test_files/xmi/cas_with_collections.xmi
@@ -37,7 +37,22 @@
+
+
+
+ aaiguewilz
+ orarzvmgty
+ mkshhvglpk
+ ffvdpcdvbx
+ jsqcoqzpxb
+
+
+
+
+
+
+
-
+