Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add lenient loading and type system merging to load_cas_from_json #288

Merged
merged 3 commits into from
Oct 4, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 22 additions & 21 deletions cassis/json.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -37,7 +37,7 @@
NEGATIVE_INFINITE_VALUE_ABBR = "-Inf"


def load_cas_from_json(source: Union[IO, str], typesystem: TypeSystem = None) -> Cas:
def load_cas_from_json(source: Union[IO, str], typesystem: TypeSystem = None, lenient: bool = False, merge_typesystem: bool =True) -> Cas:
"""Loads a CAS from a JSON source.

Args:
Expand All @@ -55,7 +55,7 @@ def load_cas_from_json(source: Union[IO, str], typesystem: TypeSystem = None) ->
typesystem = TypeSystem()

deserializer = CasJsonDeserializer()
return deserializer.deserialize(source, typesystem=typesystem)
return deserializer.deserialize(source, typesystem=typesystem, lenient=lenient, merge_typesystem=merge_typesystem)


class CasJsonDeserializer:
Expand All @@ -64,7 +64,7 @@ def __init__(self):
self._max_sofa_num = 0
self._post_processors = []

def deserialize(self, source: Union[IO, str], typesystem: Optional[TypeSystem] = None) -> Cas:
def deserialize(self, source: Union[IO, str], typesystem: Optional[TypeSystem] = None, lenient: bool = False, merge_typesystem: bool =True) -> Cas:
if isinstance(source, str):
data = json.loads(source)
else:
Expand All @@ -74,30 +74,31 @@ def deserialize(self, source: Union[IO, str], typesystem: Optional[TypeSystem] =
self._max_sofa_num = 0
self._post_processors = []

json_typesystem = data.get(TYPES_FIELD)
embedded_typesystem = TypeSystem(
add_document_annotation_type=not (json_typesystem.get(FLAG_DOCUMENT_ANNOTATION))
)
if merge_typesystem:
json_typesystem = data.get(TYPES_FIELD)
embedded_typesystem = TypeSystem(
add_document_annotation_type=not (json_typesystem.get(FLAG_DOCUMENT_ANNOTATION))
)

# First, build a dependency graph to support cases where a child type is defined before its super type
type_dependencies = defaultdict(set)
for type_name, json_type in json_typesystem.items():
type_dependencies[type_name].add(json_type[SUPER_TYPE_FIELD])
# First, build a dependency graph to support cases where a child type is defined before its super type
type_dependencies = defaultdict(set)
for type_name, json_type in json_typesystem.items():
type_dependencies[type_name].add(json_type[SUPER_TYPE_FIELD])

# Second, load all the types but no features since features of a type X might be of a later loaded type Y
for type_name in toposort_flatten(type_dependencies):
if is_predefined(type_name) or embedded_typesystem.contains_type(type_name):
continue
# Second, load all the types but no features since features of a type X might be of a later loaded type Y
for type_name in toposort_flatten(type_dependencies):
if is_predefined(type_name) or embedded_typesystem.contains_type(type_name):
continue

self._parse_type(embedded_typesystem, type_name, json_typesystem[type_name])
self._parse_type(embedded_typesystem, type_name, json_typesystem[type_name])

# Now we are sure we know all the types, we can create the features
for type_name, json_type in json_typesystem.items():
self._parse_features(embedded_typesystem, type_name, json_type)
# Now we are sure we know all the types, we can create the features
for type_name, json_type in json_typesystem.items():
self._parse_features(embedded_typesystem, type_name, json_type)

typesystem = merge_typesystems(typesystem, embedded_typesystem)
typesystem = merge_typesystems(typesystem, embedded_typesystem)

cas = Cas(typesystem=typesystem)
cas = Cas(typesystem=typesystem, lenient=lenient)

feature_structures = {}
json_feature_structures = data.get(FEATURE_STRUCTURES_FIELD)
Expand Down