Skip to content

Commit

Permalink
Merge pull request #303 from dkpro/feature/249-Set-sofa-string-and-document-language-in-Cas-constructor
Browse files Browse the repository at this point in the history

#249 - Set sofa string and document language in Cas constructor
  • Loading branch information
reckart authored Feb 4, 2024
2 parents 90b6bdb + b4e49e2 commit ef1d5f4
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 8 deletions.
19 changes: 17 additions & 2 deletions cassis/cas.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
TypeCheckError,
TypeSystem,
TypeSystemMode,
TYPE_NAME_DOCUMENT_ANNOTATION,
FEATURE_BASE_NAME_LANGUAGE,
)

_validator_optional_string = validators.optional(validators.instance_of(str))
Expand All @@ -32,6 +34,7 @@
def _get_size_in_utf16_bytes(c: str) -> int:
return len(c.encode("utf-16-le")) // 2


class IdGenerator:
def __init__(self, initial_id: int = 1):
self._next_id = initial_id
Expand All @@ -56,7 +59,6 @@ def __init__(self):
self._external_to_python: Union[Dict[int, int], None] = None
self._python_to_external: Union[Dict[int, int], None] = None


def create_offset_mapping(self, sofa_string: str) -> None:
if sofa_string is None:
return
Expand Down Expand Up @@ -202,7 +204,13 @@ def __init__(self, typesystem: TypeSystem):
class Cas:
"""A CAS object is a container for text (sofa) and annotations"""

def __init__(self, typesystem: TypeSystem = None, lenient: bool = False):
def __init__(
self,
typesystem: TypeSystem = None,
lenient: bool = False,
sofa_string: str = None,
sofa_mime: str = None,
):
"""Creates a CAS with the specified typesystem. If no typesystem is given, then the default one
is used which only contains UIMA-predefined types.
Expand All @@ -226,6 +234,13 @@ def __init__(self, typesystem: TypeSystem = None, lenient: bool = False):
self._add_view("_InitialView")
self._current_view: View = self._views["_InitialView"]

if sofa_string is not None:
self.sofa_string = sofa_string
if sofa_mime is not None:
self.sofa_mime = sofa_mime
else:
self.sofa_mime = "text/plain"

@property
def typesystem(self) -> TypeSystem:
return self._typesystem
Expand Down
18 changes: 14 additions & 4 deletions cassis/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@
NEGATIVE_INFINITE_VALUE_ABBR = "-Inf"


def load_cas_from_json(source: Union[IO, str], typesystem: TypeSystem = None, lenient: bool = False, merge_typesystem: bool =True) -> Cas:
def load_cas_from_json(
source: Union[IO, str], typesystem: TypeSystem = None, lenient: bool = False, merge_typesystem: bool = True
) -> Cas:
"""Loads a CAS from a JSON source.
Args:
Expand All @@ -64,7 +66,13 @@ def __init__(self):
self._max_sofa_num = 0
self._post_processors = []

def deserialize(self, source: Union[IO, str], typesystem: Optional[TypeSystem] = None, lenient: bool = False, merge_typesystem: bool =True) -> Cas:
def deserialize(
self,
source: Union[IO, str],
typesystem: Optional[TypeSystem] = None,
lenient: bool = False,
merge_typesystem: bool = True,
) -> Cas:
if isinstance(source, str):
data = json.loads(source)
else:
Expand Down Expand Up @@ -103,6 +111,7 @@ def deserialize(self, source: Union[IO, str], typesystem: Optional[TypeSystem] =
feature_structures = {}
json_feature_structures = data.get(FEATURE_STRUCTURES_FIELD)
if isinstance(json_feature_structures, list):

def parse_and_add(json_fs_):
parsed = self._parse_feature_structure(typesystem, json_fs_.get(ID_FIELD), json_fs_, feature_structures)
feature_structures[parsed.xmiID] = parsed
Expand All @@ -126,6 +135,7 @@ def parse_and_add(json_fs_):
parse_and_add(json_fs)

if isinstance(json_feature_structures, dict):

def parse_and_add(fs_id_, json_fs_):
parsed = self._parse_feature_structure(typesystem, int(fs_id_), json_fs_, feature_structures)
feature_structures[parsed.xmiID] = parsed
Expand Down Expand Up @@ -174,7 +184,7 @@ def _parse_features(self, typesystem: TypeSystem, type_name: str, json_type: Dic

range_type = json_feature[RANGE_FIELD]
element_type = json_feature.get(ELEMENT_TYPE_FIELD)
if range_type.endswith('[]'):
if range_type.endswith("[]"):
element_type = range_type[:-2]
range_type = array_type_name_for_type(element_type)
typesystem.create_feature(
Expand Down Expand Up @@ -222,7 +232,7 @@ def _parse_feature_structure(
self, typesystem: TypeSystem, fs_id: int, json_fs: Dict[str, any], feature_structures: Dict[int, any]
):
type_name = json_fs.get(TYPE_FIELD)
if type_name.endswith('[]'):
if type_name.endswith("[]"):
type_name = array_type_name_for_type(type_name)
AnnotationType = typesystem.get_type(type_name)

Expand Down
3 changes: 3 additions & 0 deletions cassis/typesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ class TypeSystemMode(Enum):
MINIMAL = auto()
NONE = auto()


def array_type_name_for_type(type_: Union[str, "Type"]) -> str:
type_name = type_ if isinstance(type_, str) else type_.name
if type_name == TYPE_NAME_BYTE:
Expand Down Expand Up @@ -238,6 +239,8 @@ def element_type_name_for_array_type(type_: Union[str, "Type"]) -> str:
if type_name == TYPE_NAME_STRING_ARRAY:
return TYPE_NAME_STRING
return TYPE_NAME_TOP


def _string_to_valid_classname(name: str):
return re.sub("[^a-zA-Z0-9_]", "_", name)

Expand Down
14 changes: 14 additions & 0 deletions tests/test_cas.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,20 @@ def test_sofa_uri_can_be_set_and_read():
assert cas.sofa_uri == "https://raw.githubusercontent.com/dkpro/dkpro-cassis/master/README.rst"


def test_sofa_string_can_be_set_using_constructor():
    """A Cas built with only ``sofa_string`` exposes that string and the MIME type ``text/plain``."""
    cas = Cas(sofa_string="I am a test sofa string!")

    assert cas.sofa_string == "I am a test sofa string!"
    assert cas.sofa_mime == "text/plain"


def test_sofa_string_and_mime_type_can_be_set_using_constructor():
    """A Cas built with ``sofa_string`` and ``sofa_mime`` exposes both values unchanged."""
    cas = Cas(sofa_string="I am a <b>test sofa string!</b>", sofa_mime="text/html")

    assert cas.sofa_string == "I am a <b>test sofa string!</b>"
    assert cas.sofa_mime == "text/html"


# Select


Expand Down
3 changes: 2 additions & 1 deletion tests/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,10 @@
(
os.path.join(ONE_WAY_DIR, "tsv3-testSimpleSlotFeature"),
[],
)
),
]


@pytest.mark.parametrize("json_path, annotations", ROUND_TRIP_FIXTURES)
def test_deserialization_serialization(json_path, annotations):
with open(os.path.join(json_path, "data.json"), "rb") as f:
Expand Down
3 changes: 2 additions & 1 deletion tests/test_typesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
TYPE_NAME_STRING_ARRAY,
TYPE_NAME_TOP,
TypeCheckError,
is_predefined, TYPE_NAME_DOCUMENT_ANNOTATION,
is_predefined,
TYPE_NAME_DOCUMENT_ANNOTATION,
)
from tests.fixtures import *
from tests.util import assert_xml_equal
Expand Down

0 comments on commit ef1d5f4

Please sign in to comment.