Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make equivalence test less strict for translations #45

Merged
merged 1 commit into from
Mar 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion src/babelon/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import logging
import os
import re
import string
from typing import Dict, List

import llm
Expand Down Expand Up @@ -222,8 +224,11 @@ def prepare_translation_for_ontology(
f"{predicate_id} value for {subject_id} is ambiguous,"
f"picking first one ({term_metadata[predicate_id]})."
)
if ontology_value != source_value:
if not _is_equivalent_string(ontology_value, source_value):
# If the translated string and the ontology literal are not equivalent, change status:
translation_value = row["translation_value"]
# Set the ontology value as the source value, so that the translation profiles are consistent
# With what is in the ontology
df_augmented.at[index, "source_value"] = ontology_value
new_translation_status = (
"CANDIDATE" if translation_value != "NOT_TRANSLATED" else "NOT_TRANSLATED"
Expand All @@ -235,6 +240,10 @@ def prepare_translation_for_ontology(
f"but {ontology_value} in the ontology."
)
output_source_changed_data.append(row)
else:
# Because `_is_equivalent_string` is a bit forgiving, we still want to replace the source value,
# so that the translation profiles are consistent
df_augmented.at[index, "source_value"] = ontology_value
else:
logging.warning(
f"{predicate_id} value for {subject_id} does not exist in ontology. "
Expand Down Expand Up @@ -285,6 +294,22 @@ def prepare_translation_for_ontology(
return df_augmented, df_output_source_changed, df_output_not_translated


def _is_equivalent_string(string1, string2):
"""Compare two strings after they are whitespace, punctuation and case normalised."""

def _normalize(s):
# Remove punctuation
s = s.translate(str.maketrans("", "", string.punctuation))
# Normalize whitespace and convert to lowercase
return re.sub(r"\s+", " ", s).strip().lower()

normalized_string1 = _normalize(string1)
normalized_string2 = _normalize(string2)

# Compare the normalized strings
return normalized_string1 == normalized_string2


def _get_metadata_for_term(ontology, term):
term_metadata = ontology.entity_metadata_map(term)
term_label = ontology.label(term)
Expand Down
20 changes: 15 additions & 5 deletions tests/test_translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,14 @@
import unittest

from dotenv import load_dotenv
from oaklib.implementations.pronto.pronto_implementation import ProntoImplementation
from oaklib.resource import OntologyResource
from oaklib import get_adapter

from babelon.translate import OpenAITranslator, prepare_translation_for_ontology, translate_profile
from babelon.translate import (
OpenAITranslator,
_is_equivalent_string,
prepare_translation_for_ontology,
translate_profile,
)
from tests.constants import _create_simple_example_for_testing
from tests.test_data import data_dir as test_data_dir
from tests.test_data import env_file
Expand Down Expand Up @@ -40,8 +44,7 @@ def test_translate_profile(self):
def test_prepare_translation_for_ontology(self):
"""Test the update method for babelon profiles."""
test_file = f"{test_data_dir}/hp-testsubset.obo"
resource = OntologyResource(slug=test_file, local=True)
ontology = ProntoImplementation(resource)
ontology = get_adapter(f"pronto:{test_file}")
terms = ["HP:0001707"]
fields = ["rdfs:label"]
df_babelon = _create_simple_example_for_testing()
Expand All @@ -59,3 +62,10 @@ def test_prepare_translation_for_ontology(self):
["HP:0001945", "HP:0001297", "HP:0001707"],
df_output_not_translated["subject_id"].tolist(),
)

def test_equivalent_string(self):
    """Test if _is_equivalent_string() catches important cases."""
    # Pairs that differ only in case, punctuation or whitespace
    equivalent_pairs = [
        ("Hello, my.", "hello my"),
        ("  Hello   my ", "hello my"),
    ]
    # Pairs whose wording differs; without these the test would still pass
    # if the helper always returned True
    different_pairs = [
        ("Hello, my.", "hello you"),
        ("Hello, my.", "hello"),
    ]

    for string1, string2 in equivalent_pairs:
        with self.subTest(string1=string1, string2=string2):
            self.assertTrue(_is_equivalent_string(string1, string2))

    for string1, string2 in different_pairs:
        with self.subTest(string1=string1, string2=string2):
            self.assertFalse(_is_equivalent_string(string1, string2))
Loading