Skip to content

Commit

Permalink
#167 - Cant serialize byte arrays
Browse files Browse the repository at this point in the history
- Add basic support for serializing/deserializing primitive arrays
  • Loading branch information
jcklie committed Aug 14, 2021
1 parent 01cd86d commit 4da53b7
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 27 deletions.
105 changes: 79 additions & 26 deletions cassis/xmi.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from lxml import etree

from cassis.cas import Cas, IdGenerator, Sofa, View
from cassis.typesystem import FeatureStructure, TypeNotFoundError, TypeSystem
from cassis.typesystem import _PRIMITIVE_ARRAY_TYPES, FeatureStructure, TypeNotFoundError, TypeSystem


@attr.s
Expand Down Expand Up @@ -158,42 +158,50 @@ def deserialize(self, source: Union[IO, str], typesystem: TypeSystem, lenient: b

for feature in t.all_features:
feature_name = feature.name
value = getattr(fs, feature_name)

if feature_name == "sofa":
value = getattr(fs, feature_name)

sofa = sofas[value]
setattr(fs, feature_name, sofa)
continue

if (
typesystem.is_primitive(feature.rangeTypeName)
or typesystem.is_primitive_collection(feature.rangeTypeName)
or typesystem.is_primitive_collection(fs.type)
if typesystem.is_instance_of(fs.type, "uima.cas.StringArray"):
# We already parsed string arrays to a Python list of string
# before, so we do not need to work more on this
continue
elif typesystem.is_primitive(feature.rangeTypeName) or typesystem.is_primitive_collection(
feature.rangeTypeName
):
# TODO: Parse feature values to their real type here, e.g. parse ints or floats
continue

# Resolve references here
value = getattr(fs, feature_name)
if value is None:
continue

# Resolve references
if typesystem.is_collection(fs.type, feature):
# A collection of references is a list of integers separated
# by single spaces, e.g. <foo:bar elements="1 2 3 42" />
targets = []
for ref in value.split():
target_id = int(ref)
elif typesystem.is_primitive_array(fs.type) and feature_name == "elements":
elements = self._parse_primitive_array(fs.type, value)
setattr(fs, feature_name, elements)
elif typesystem.is_primitive_array(feature.rangeTypeName):
elements = self._parse_primitive_array(feature.rangeTypeName, value)
setattr(fs, feature_name, elements)
else:
# Resolve references here
if value is None:
continue

# Resolve references
if typesystem.is_collection(fs.type, feature):
# A collection of references is a list of integers separated
# by single spaces, e.g. <foo:bar elements="1 2 3 42" />
targets = []
for ref in value.split():
target_id = int(ref)
target = feature_structures[target_id]
targets.append(target)
referenced_fs.add(target_id)
setattr(fs, feature_name, targets)
else:
target_id = int(value)
target = feature_structures[target_id]
targets.append(target)
referenced_fs.add(target_id)
setattr(fs, feature_name, targets)
else:
target_id = int(value)
target = feature_structures[target_id]
referenced_fs.add(target_id)
setattr(fs, feature_name, target)
setattr(fs, feature_name, target)

cas = Cas(typesystem=typesystem, lenient=lenient)
for sofa in sofas.values():
Expand Down Expand Up @@ -282,6 +290,33 @@ def _parse_feature_structure(self, typesystem: TypeSystem, elem, children: Dict[
self._max_xmi_id = max(attributes["xmiID"], self._max_xmi_id)
return AnnotationType(**attributes)

def _parse_primitive_array(self, type_name: str, value: str) -> List:
""" Primitive collections are serialized as white space seperated primitive values"""

# TODO: Use type name global variable here instead of hardcoded string literal
elements = value.split(" ")
if type_name == "uima.cas.FloatArray" or type_name == "uima.cas.DoubleArray":
return [float(e) for e in elements]
elif (
type_name == "uima.cas.IntegerArray"
or type_name == "uima.cas.ShortArray"
or type_name == "uima.cas.LongArray"
):
return [int(e) for e in elements]
elif type_name == "uima.cas.BooleanArray":
return [self._parse_bool(e) for e in elements]
elif type_name == "uima.cas.ByteArray":
return list(bytearray.fromhex(value))
else:
raise ValueError(f"Not a primitive collection: {type_name}")

def _parse_bool(self, s: str) -> bool:
if s == "true":
return True
if s == "false":
return False
raise ValueError(f"Not a boolean: {s}")

def _clear_elem(self, elem):
""" Frees XML nodes that already have been processed to save memory """
elem.clear()
Expand Down Expand Up @@ -402,6 +437,10 @@ def _serialize_feature_structure(self, cas: Cas, root: etree.Element, fs: Featur
for e in value:
child = etree.SubElement(elem, feature_name)
child.text = e
elif ts.is_primitive_array(fs.type) and feature_name == "elements":
elem.attrib[feature_name] = self._serialize_primitive_array(fs.type, value)
elif ts.is_primitive_array(feature.rangeTypeName):
elem.attrib[feature_name] = self._serialize_primitive_array(feature.rangeTypeName, value)
elif feature_name == "sofa":
elem.attrib[feature_name] = str(value.xmiID)
elif ts.is_primitive(feature.rangeTypeName):
Expand Down Expand Up @@ -429,3 +468,17 @@ def _serialize_view(self, root: etree.Element, view: View):

elem.attrib["sofa"] = str(view.sofa.xmiID)
elem.attrib["members"] = " ".join(sorted((str(x.xmiID) for x in view.get_all_annotations()), key=int))

def _serialize_primitive_array(self, type_name: str, values: List) -> str:
    """Render the elements of a primitive array as a single attribute string.

    Byte arrays become one hex string with two upper case digits per element
    and no separator. Boolean arrays become space separated lower case
    ``str()`` renderings of their elements. All other primitive arrays become
    the space separated ``str()`` of each element.

    Raises:
        ValueError: If `type_name` is not contained in ``_PRIMITIVE_ARRAY_TYPES``.
    """

    # TODO: Use type name global variable here instead of hardcoded string literal
    if type_name not in _PRIMITIVE_ARRAY_TYPES:
        raise ValueError(f"Not a primitive array: {type_name}")

    if type_name == "uima.cas.ByteArray":
        return "".join(format(item, "02X") for item in values)

    if type_name == "uima.cas.BooleanArray":
        tokens = [str(item).lower() for item in values]
    else:
        tokens = [str(item) for item in values]
    return " ".join(tokens)
17 changes: 16 additions & 1 deletion tests/test_files/xmi/cas_with_collections.xmi
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,22 @@
<cas:NonEmptyFSList xmi:id="9" head="5" tail="10" />
<cas:EmptyFSList xmi:id="10"/>

<cas:FloatArray xmi:id="11" elements="0.7275637 0.054665208 0.6832234 0.0479393"/>
<cas:IntegerArray xmi:id="12" elements="1325939940 -248792245 1190043011 -1255373459 -1436456258 392236186"/>
<cas:StringArray xmi:id="13">
<elements>aaiguewilz</elements>
<elements>orarzvmgty</elements>
<elements>mkshhvglpk</elements>
<elements>ffvdpcdvbx</elements>
<elements>jsqcoqzpxb</elements>
</cas:StringArray>
<cas:BooleanArray xmi:id="14" elements="false true false"/>
<cas:ByteArray xmi:id="15" elements="42DB3064"/>
<cas:ShortArray xmi:id="16" elements="1929 13467 15132 15893"/>
<cas:LongArray xmi:id="17" elements="2516571677013944794"/>
<cas:DoubleArray xmi:id="18" elements="0.4362829094329638 0.6487936445670887 0.6959691863162578"/>

<cas:Sofa xmi:id="1" sofaNum="1" sofaID="_InitialView" mimeType="text/plain"
sofaString="Joe waited for the train . The train was late ."/>
<cas:View members="2 3 4 5 6 7 8 9 10" sofa="1"/>
<cas:View members="2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18" sofa="1"/>
</xmi:XMI>

0 comments on commit 4da53b7

Please sign in to comment.