Make equivalence test less strict for translations
matentzn committed Mar 1, 2024
1 parent e3dbe8e commit 7668989
Showing 2 changed files with 41 additions and 6 deletions.
27 changes: 26 additions & 1 deletion src/babelon/translate.py
@@ -2,6 +2,8 @@

import logging
import os
import re
import string
from typing import Dict, List

import llm
@@ -222,8 +224,11 @@ def prepare_translation_for_ontology(
f"{predicate_id} value for {subject_id} is ambiguous,"
f"picking first one ({term_metadata[predicate_id]})."
)
if ontology_value != source_value:
if not _is_equivalent_string(ontology_value, source_value):
# If the translated string and the ontology literal are not equivalent, change status:
translation_value = row["translation_value"]
# Set the ontology value as the source value, so that the translation profiles are consistent
# with what is in the ontology
df_augmented.at[index, "source_value"] = ontology_value
new_translation_status = (
"CANDIDATE" if translation_value != "NOT_TRANSLATED" else "NOT_TRANSLATED"
@@ -235,6 +240,10 @@ def prepare_translation_for_ontology(
f"but {ontology_value} in the ontology."
)
output_source_changed_data.append(row)
else:
# Because `_is_equivalent_string` is a bit forgiving, we still want to replace the source value,
# so that the translation profiles are consistent with the ontology
df_augmented.at[index, "source_value"] = ontology_value
else:
logging.warning(
f"{predicate_id} value for {subject_id} does not exist in ontology. "
Expand Down Expand Up @@ -285,6 +294,22 @@ def prepare_translation_for_ontology(
return df_augmented, df_output_source_changed, df_output_not_translated


def _is_equivalent_string(string1, string2):
"""Compare two strings after they are whitespace, punctuation and case normalised."""

def _normalize(s):
# Remove punctuation
s = s.translate(str.maketrans("", "", string.punctuation))
# Normalize whitespace and convert to lowercase
return re.sub(r"\s+", " ", s).strip().lower()

normalized_string1 = _normalize(string1)
normalized_string2 = _normalize(string2)

# Compare the normalized strings
return normalized_string1 == normalized_string2


def _get_metadata_for_term(ontology, term):
term_metadata = ontology.entity_metadata_map(term)
term_label = ontology.label(term)
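For context, here is a minimal sketch (not part of the commit) of what the relaxed comparison tolerates; the label strings are made up for illustration, and the helper is imported the same way the new test below imports it:

from babelon.translate import _is_equivalent_string

# Punctuation, repeated whitespace and letter case are ignored by the comparison.
assert _is_equivalent_string("Abnormality of the head,  neck", "abnormality of the head neck")

# Genuine wording differences still count as a mismatch.
assert not _is_equivalent_string("Abnormality of the head", "Abnormality of the neck")

Inside prepare_translation_for_ontology only the second kind of mismatch now demotes an existing translation to CANDIDATE (or leaves it NOT_TRANSLATED); in both branches the source_value is refreshed from the ontology.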
20 changes: 15 additions & 5 deletions tests/test_translate.py
@@ -4,10 +4,14 @@
import unittest

from dotenv import load_dotenv
from oaklib.implementations.pronto.pronto_implementation import ProntoImplementation
from oaklib.resource import OntologyResource
from oaklib import get_adapter

from babelon.translate import OpenAITranslator, prepare_translation_for_ontology, translate_profile
from babelon.translate import (
OpenAITranslator,
_is_equivalent_string,
prepare_translation_for_ontology,
translate_profile,
)
from tests.constants import _create_simple_example_for_testing
from tests.test_data import data_dir as test_data_dir
from tests.test_data import env_file
@@ -40,8 +44,7 @@ def test_translate_profile(self):
def test_prepare_translation_for_ontology(self):
"""Test the update method for babelon profiles."""
test_file = f"{test_data_dir}/hp-testsubset.obo"
resource = OntologyResource(slug=test_file, local=True)
ontology = ProntoImplementation(resource)
ontology = get_adapter(f"pronto:{test_file}")
terms = ["HP:0001707"]
fields = ["rdfs:label"]
df_babelon = _create_simple_example_for_testing()
@@ -59,3 +62,10 @@ def test_prepare_translation_for_ontology(self):
["HP:0001945", "HP:0001297", "HP:0001707"],
df_output_not_translated["subject_id"].tolist(),
)

def test_equivalent_string(self):
"""Test if _is_equivalent_string() catches important cases."""
string1 = "Hello, my."
string2 = "hello my"

self.assertTrue(_is_equivalent_string(string1, string2))
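Assuming the suite is run with pytest (a common runner for unittest-style tests; the actual tooling is not shown in this diff), the new case can be exercised in isolation with something like:

python -m pytest tests/test_translate.py -k test_equivalent_string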
