Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#290 - Speedup load_cas_from_xmi by improving offset mapping #291

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 12 additions & 19 deletions cassis/cas.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import itertools
import sys
import warnings
from collections import defaultdict
from functools import lru_cache
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple, Union

Expand All @@ -26,6 +28,10 @@
NAME_DEFAULT_SOFA = "_InitialView"


@lru_cache(maxsize=5000)
def _get_size_in_utf16_bytes(c: str) -> int:
return len(c.encode("utf-16-le")) // 2

class IdGenerator:
def __init__(self, initial_id: int = 1):
self._next_id = initial_id
Expand All @@ -50,29 +56,16 @@ def __init__(self):
self._external_to_python: Union[Dict[int, int], None] = None
self._python_to_external: Union[Dict[int, int], None] = None

def create_offset_mapping(self, sofa_string: str) -> None:
    """Build the lookup tables between external (UTF-16) and Python offsets.

    Python indexes strings by code point, whereas the external offsets count
    UTF-16 code units, so a character that needs a surrogate pair advances
    the external offset by two. After this call ``self._python_to_external``
    maps every Python offset to its external offset and
    ``self._external_to_python`` holds the inverse mapping.

    Args:
        sofa_string: The text to index. If it is ``None`` the method returns
            immediately and leaves both mappings untouched.
    """
    if sofa_string is None:
        return

    sizes_in_utf16_bytes = map(_get_size_in_utf16_bytes, sofa_string)

    # accumulated_sizes[i] is the external offset at which the i-th Python
    # character starts; the leading 0 is the start offset of the first one.
    accumulated_sizes = [0] + list(itertools.accumulate(sizes_in_utf16_bytes))

    # End offsets in UIMA are exclusive, we need to therefore add
    # the offset after the last char also to this index. The final entry of
    # accumulated_sizes is exactly that offset, so both tables cover it.
    self._python_to_external = dict(enumerate(accumulated_sizes))
    self._external_to_python = {
        external: python for python, external in enumerate(accumulated_sizes)
    }

def external_to_python(self, idx: Optional[int]) -> Optional[int]:
if idx is None:
Expand Down
4 changes: 3 additions & 1 deletion cassis/xmi.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,9 @@ def deserialize(self, source: Union[IO, str], typesystem: TypeSystem, lenient: b
else:
view = cas.create_view(sofa.sofaID, xmiID=sofa.xmiID, sofaNum=sofa.sofaNum)

view.sofa_string = sofa.sofaString
# Directly set the sofaString and offsetConverter for the sofa to avoid recomputing the offset conversion (slow!) when using the setter
view.get_sofa()._sofaString = sofa.sofaString
view.get_sofa()._offset_converter = sofa._offset_converter
view.sofa_mime = sofa.mimeType

# If a sofa has no members, then UIMA might omit the view. In that case,
Expand Down