Skip to content

Commit

Permalink
Merge pull request #169 from dkpro/feature/168-Experimental-JSON-CAS-…
Browse files Browse the repository at this point in the history
…support

#168 - Experimental JSON CAS support
  • Loading branch information
reckart authored Dec 12, 2021
2 parents 0b802b3 + e89ada4 commit 0753e05
Show file tree
Hide file tree
Showing 44 changed files with 1,631 additions and 11 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -221,3 +221,5 @@ expected.xml
difference.diff

xml_issue.py
actual.json
expected.json
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
PYTHON_FILES = cassis tests

test:
python -m pytest tests/
python -m pytest -m "not performance" tests/

format:
black -l 120 cassis/
Expand Down
1 change: 1 addition & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ Some features are still under development, e.g.

- Proper type checking
- XML/XMI schema validation
- UIMA JSON CAS support (the format is not yet finalized)

Installation
------------
Expand Down
2 changes: 2 additions & 0 deletions cassis/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""UIMA CAS processing library in Python."""

from .cas import Cas, Sofa, View
from .json import load_cas_from_json
from .typesystem import TypeSystem, load_dkpro_core_typesystem, load_typesystem, merge_typesystems
from .util import cas_to_comparable_text
from .xmi import load_cas_from_xmi
Expand All @@ -14,5 +15,6 @@
"load_dkpro_core_typesystem",
"merge_typesystems",
"load_cas_from_xmi",
"load_cas_from_json",
"cas_to_comparable_text",
]
77 changes: 69 additions & 8 deletions cassis/cas.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import sys
from collections import defaultdict
from io import BytesIO
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple, Union

Expand All @@ -10,10 +9,12 @@
from sortedcontainers import SortedKeyList

from cassis.typesystem import TYPE_NAME_SOFA, FeatureStructure, TypeCheckError, TypeSystem, TYPE_NAME_FS_LIST, \
TYPE_NAME_FS_ARRAY, FEATURE_BASE_NAME_HEAD
TYPE_NAME_FS_ARRAY, FEATURE_BASE_NAME_HEAD, TypeSystemMode

_validator_optional_string = validators.optional(validators.instance_of(str))

NAME_DEFAULT_SOFA = "_InitialView"


class IdGenerator:
def __init__(self, initial_id: int = 1):
Expand Down Expand Up @@ -107,6 +108,9 @@ class Sofa:
#: str: The sofa URI, it references remote sofa data
sofaURI = attr.ib(default=None, validator=_validator_optional_string)

#: str: The sofa data byte array
sofaArray = attr.ib(default=None)

#: OffsetConverter: Converts from UIMA UTF-16 based offsets to Unicode codepoint offsets and back
_offset_converter = attr.ib(factory=OffsetConverter, eq=False, hash=False)

Expand Down Expand Up @@ -543,6 +547,25 @@ def sofa_uri(self, value: str):
"""
self.get_sofa().sofaURI = value

@property
def sofa_array(self) -> Optional["FeatureStructure"]:
    """The sofa byte array, read from the sofa returned by ``get_sofa()``.

    Returns:
        The value of the sofa's ``sofaArray`` attribute. This is expected to be a ``uima.cas.ByteArray``
        feature structure, or `None` if no byte array has been set.
    """
    return self.get_sofa().sofaArray

@sofa_array.setter
def sofa_array(self, value):
    """Stores the given byte array on the sofa returned by ``get_sofa()``.

    Args:
        value: The new sofa byte array feature structure (``uima.cas.ByteArray``).
    """
    target_sofa = self.get_sofa()
    target_sofa.sofaArray = value

def to_xmi(self, path: Union[str, Path, None] = None, pretty_print: bool = False) -> Optional[str]:
"""Creates a XMI representation of this CAS.
Expand All @@ -557,19 +580,57 @@ def to_xmi(self, path: Union[str, Path, None] = None, pretty_print: bool = False
"""
from cassis.xmi import CasXmiSerializer

serializer = CasXmiSerializer()
return self._serialize(CasXmiSerializer(), path, pretty_print=pretty_print)

def to_json(
    self,
    path: Union[str, Path, None] = None,
    pretty_print: bool = False,
    ensure_ascii: bool = False,
    type_system_mode: TypeSystemMode = TypeSystemMode.FULL,
) -> Optional[str]:
    """Creates a JSON representation of this CAS.

    Args:
        path: File path, if `None` is provided the result is returned as a string
        pretty_print: `True` if the resulting JSON should be pretty-printed, else `False`
        ensure_ascii: Whether to escape non-ASCII Unicode characters or not
        type_system_mode: Whether to serialize the full type system (`FULL`), only the types used (`MINIMAL`), or
            no type system information at all (`NONE`)

    Returns:
        If `path` is None, then the JSON representation of this CAS is returned as a string. Otherwise the JSON
        is written to `path` by `_serialize`.
    """
    # Imported inside the method rather than at module level (presumably to avoid a circular import
    # between `cassis.cas` and `cassis.json` -- confirm before moving it).
    from cassis.json import CasJsonSerializer

    return self._serialize(
        CasJsonSerializer(),
        path,
        pretty_print=pretty_print,
        ensure_ascii=ensure_ascii,
        type_system_mode=type_system_mode,
    )

def _serialize(self, serializer, path: Union[str, Path, None] = None, **kwargs) -> Optional[str]:
    """Runs this CAS through the given serializer.

    Args:
        serializer: Object whose ``serialize(sink, cas, **kwargs)`` method performs the actual encoding
        path: File path, if `None` is provided the result is returned as a string
        **kwargs: Serializer-specific options, passed through unchanged to ``serializer.serialize``

    Returns:
        If `path` is None, then the data representation of this CAS as returned by ``serializer.serialize``.
        Otherwise `None`, after the data has been written to `path`.

    Raises:
        TypeError: If `path` is neither `None`, a `str` nor a `Path`
    """
    # If `path` is None, the serializer receives no sink and returns the serialized data itself
    if path is None:
        return serializer.serialize(None, self, **kwargs)

    # Reject unsupported path types before touching the file system
    if not isinstance(path, (str, Path)):
        raise TypeError(f"`path` needs to be one of [str, None, Path], but was <{type(path)}>")

    # The built-in `open` accepts `str` and `Path` alike; the sink is opened in binary mode
    with open(path, "wb") as f:
        serializer.serialize(f, self, **kwargs)
    return None

Expand Down
Loading

0 comments on commit 0753e05

Please sign in to comment.