Skip to content

Commit

Permalink
Merge branch 'main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
reckart authored Oct 4, 2023
2 parents c47bef5 + ed6b4f5 commit 7968342
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 22 deletions.
4 changes: 2 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
dkpro-cassis
============

.. image:: https://travis-ci.org/dkpro/dkpro-cassis.svg?branch=master
:target: https://travis-ci.org/dkpro/dkpro-cassis
.. image:: https://github.com/dkpro/dkpro-cassis/actions/workflows/run_tests.yml/badge.svg
:target: https://github.com/dkpro/dkpro-cassis/actions/workflows/run_tests.yml

.. image:: https://readthedocs.org/projects/cassis/badge/?version=latest
:target: https://cassis.readthedocs.io/en/latest/?badge=latest
Expand Down
31 changes: 12 additions & 19 deletions cassis/cas.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import itertools
import sys
import warnings
from collections import defaultdict
from functools import lru_cache
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple, Union

Expand All @@ -26,6 +28,10 @@
NAME_DEFAULT_SOFA = "_InitialView"


@lru_cache(maxsize=5000)
def _get_size_in_utf16_bytes(c: str) -> int:
return len(c.encode("utf-16-le")) // 2

class IdGenerator:
def __init__(self, initial_id: int = 1):
self._next_id = initial_id
Expand All @@ -50,29 +56,16 @@ def __init__(self):
self._external_to_python: Union[Dict[int, int], None] = None
self._python_to_external: Union[Dict[int, int], None] = None

def create_offset_mapping(self, sofa_string: str):

def create_offset_mapping(self, sofa_string: str) -> None:
if sofa_string is None:
return

self._external_to_python = {0: 0}
self._python_to_external = {0: 0}

count_uima = 0
count_cassis = 0

for c in sofa_string:
size_in_utf16_bytes = len(c.encode("utf-16-le")) // 2

self._external_to_python[count_uima] = count_cassis
self._python_to_external[count_cassis] = count_uima

count_uima += size_in_utf16_bytes
count_cassis += 1
sizes_in_utf16_bytes = map(_get_size_in_utf16_bytes, sofa_string)
accumulated_sizes = [0] + list(itertools.accumulate(sizes_in_utf16_bytes))

# End offsets in UIMA are exclusive, so we also need to add
# the offset after the last char to this index
self._external_to_python[count_uima] = count_cassis
self._python_to_external[count_cassis] = count_uima
self._python_to_external = dict(zip(range(len(accumulated_sizes)), accumulated_sizes))
self._external_to_python = dict(zip(accumulated_sizes, range(len(accumulated_sizes))))

def external_to_python(self, idx: Optional[int]) -> Optional[int]:
if idx is None:
Expand Down
4 changes: 3 additions & 1 deletion cassis/xmi.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,9 @@ def deserialize(self, source: Union[IO, str], typesystem: TypeSystem, lenient: b
else:
view = cas.create_view(sofa.sofaID, xmiID=sofa.xmiID, sofaNum=sofa.sofaNum)

view.sofa_string = sofa.sofaString
# Directly set the sofaString and offsetConverter for the sofa to avoid recomputing the offset conversion (slow!) when using the setter
view.get_sofa()._sofaString = sofa.sofaString
view.get_sofa()._offset_converter = sofa._offset_converter
view.sofa_mime = sofa.mimeType

# If a sofa has no members, then UIMA might omit the view. In that case,
Expand Down

0 comments on commit 7968342

Please sign in to comment.