From e9f2b60672a0812d2804b6295d31078f2fe391c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= <raphael@bournhonesque.eu>
Date: Fri, 10 Nov 2023 11:41:14 +0100
Subject: [PATCH] feat: add ingredient parsing information

---
 robotoff/off.py                               |  72 +++++++++++++
 .../prediction/ingredient_list/__init__.py    |   2 +-
 robotoff/taxonomy.py                          |   7 +-
 robotoff/workers/tasks/import_image.py        |  50 ++++++++-
 .../workers/tasks/test_import_image.py        | 101 +++++++++++++++---
 5 files changed, 208 insertions(+), 24 deletions(-)

diff --git a/robotoff/off.py b/robotoff/off.py
index 5d67228ea6..b55eb93b1e 100644
--- a/robotoff/off.py
+++ b/robotoff/off.py
@@ -806,6 +806,78 @@ def send_image(
     return r
 
 
+def parse_ingredients(text: str, lang: str, timeout: int = 10) -> list[JSONType]:
+    """Parse ingredients text using Product Opener API.
+
+    It is only available for `off` flavor (food).
+
+    The result is a list of ingredients, each ingredient is a dict with the
+    following keys:
+
+    - id: the ingredient ID. Having an ID does not mean that the ingredient
+        is recognized, you must check if it exists in the taxonomy.
+    - text: the ingredient text (as it appears in the input ingredients list)
+    - percent_min: the minimum percentage of the ingredient in the product
+    - percent_max: the maximum percentage of the ingredient in the product
+    - percent_estimate: the estimated percentage of the ingredient in the
+        product
+    - vegan (str): optional key indicating if the ingredient is vegan
+    - vegetarian (str): optional key indicating if the ingredient is
+        vegetarian
+
+
+    :param text: the ingredients text to parse
+    :param lang: the language of the text (used for parsing) as a 2-letter code
+    :param timeout: the request timeout in seconds, defaults to 10s
+    :raises ValueError: a ValueError is raised if `text` is empty
+    :raises RuntimeError: a RuntimeError is raised if the parsing fails
+    :return: the list of parsed ingredients
+    """
+    base_url = settings.BaseURLProvider.world(ServerType.off)
+    # by using "test" as code, we don't save any information to database
+    # This endpoint is specifically designed for testing purposes
+    url = f"{base_url}/api/v3/product/test"
+
+    if len(text) == 0:
+        raise ValueError("text must be a non-empty string")
+
+    try:
+        r = http_session.patch(
+            url,
+            auth=settings._off_request_auth,
+            json={
+                "fields": "ingredients",
+                "lc": lang,
+                "tags_lc": lang,
+                "product": {
+                    "lang": lang,
+                    f"ingredients_text_{lang}": text,
+                },
+            },
+            timeout=timeout,
+        )
+    except (
+        requests.exceptions.ConnectionError,
+        requests.exceptions.SSLError,
+        requests.exceptions.Timeout,
+    ) as e:
+        raise RuntimeError(
+            f"Unable to parse ingredients: error during HTTP request: {e}"
+        )
+
+    if not r.ok:
+        raise RuntimeError(
+            f"Unable to parse ingredients (non-200 status code): {r.status_code}, {r.text}"
+        )
+
+    response_data = r.json()
+
+    if response_data.get("status") != "success":
+        raise RuntimeError(f"Unable to parse ingredients: {response_data}")
+
+    return response_data["product"]["ingredients"]
+
+
 def normalize_tag(value, lowercase=True):
     """Given a value normalize it to a tag (as in taxonomies).
 
diff --git a/robotoff/prediction/ingredient_list/__init__.py b/robotoff/prediction/ingredient_list/__init__.py
index 1f71bf01a7..0eb55789a7 100644
--- a/robotoff/prediction/ingredient_list/__init__.py
+++ b/robotoff/prediction/ingredient_list/__init__.py
@@ -36,7 +36,7 @@ class IngredientPredictionAggregatedEntity:
     raw_end: int
     # confidence score
     score: float
-    # entity text
+    # entity text (without organic or allergen mentions)
     text: str
     # language prediction of the entity text
     lang: Optional[LanguagePrediction] = None
diff --git a/robotoff/taxonomy.py b/robotoff/taxonomy.py
index 24ef7b1a02..fd5e9143a9 100644
--- a/robotoff/taxonomy.py
+++ b/robotoff/taxonomy.py
@@ -40,7 +40,7 @@ def generate_category_hierarchy(
 
 
 @cachetools.cached(cache=cachetools.TTLCache(maxsize=100, ttl=12 * 60 * 60))  # 12h
-def get_taxonomy(taxonomy_type: str, offline: bool = False) -> Taxonomy:
+def get_taxonomy(taxonomy_type: TaxonomyType | str, offline: bool = False) -> Taxonomy:
     """Return the taxonomy of type `taxonomy_type`.
 
     The taxonomy is cached in memory and locally on disk. Every 12h, we check
@@ -57,8 +57,11 @@ def get_taxonomy(taxonomy_type: str, offline: bool = False) -> Taxonomy:
     if offline:
         return Taxonomy.from_path(str(settings.TAXONOMY_PATHS[taxonomy_type]))
 
+    taxonomy_type_enum = (
+        TaxonomyType[taxonomy_type] if isinstance(taxonomy_type, str) else taxonomy_type
+    )
     return _get_taxonomy(
-        TaxonomyType[taxonomy_type],
+        taxonomy_type_enum,
         force_download=False,
         cache_dir=settings.DATA_DIR / "taxonomies",
     )
diff --git a/robotoff/workers/tasks/import_image.py b/robotoff/workers/tasks/import_image.py
index 85e503ed7f..f570e35b0a 100644
--- a/robotoff/workers/tasks/import_image.py
+++ b/robotoff/workers/tasks/import_image.py
@@ -6,6 +6,7 @@
 import elasticsearch
 from elasticsearch.helpers import BulkIndexError
 from openfoodfacts import OCRResult
+from openfoodfacts.types import TaxonomyType
 from PIL import Image
 
 from robotoff import settings
@@ -33,11 +34,12 @@
     db,
     with_db,
 )
-from robotoff.off import generate_image_url, get_source_from_url
+from robotoff.off import generate_image_url, get_source_from_url, parse_ingredients
 from robotoff.prediction import ingredient_list
 from robotoff.prediction.upc_image import UPCImageType, find_image_is_upc
 from robotoff.products import get_product_store
 from robotoff.slack import NotifierFactory
+from robotoff.taxonomy import get_taxonomy
 from robotoff.triton import generate_clip_embedding
 from robotoff.types import (
     JSONType,
@@ -122,7 +124,10 @@ def run_import_image_job(product_id: ProductIdentifier, image_url: str, ocr_url:
         enqueue_job(
             extract_ingredients_job,
             get_high_queue(product_id),
-            job_kwargs={"result_ttl": 0},
+            # We set a higher timeout, as we ask Product Opener to parse
+            # each ingredient list, which may take a while depending on
+            # the number of ingredient lists (~1s per ingredient list)
+            job_kwargs={"result_ttl": 0, "timeout": "2m"},
             product_id=product_id,
             ocr_url=ocr_url,
         )
@@ -618,15 +623,50 @@ def extract_ingredients_job(product_id: ProductIdentifier, ocr_url: str):
         logger.warning("predict_from_ocr output: %s", output)
         entities: list[
             ingredient_list.IngredientPredictionAggregatedEntity
-        ] = output.entities  # type: ignore (we know it's an
-        # aggregated entity)
+        ] = output.entities  # type: ignore
+        # (we know it's an aggregated entity, so we can ignore the type)
+
+        image_prediction_data = dataclasses.asdict(output)
+        ingredient_taxonomy = get_taxonomy(TaxonomyType.ingredient)
+
+        for entity in image_prediction_data["entities"]:
+            # This is just an extra check, we should have lang information
+            # available
+            if entity["lang"]:
+                lang_id = entity["lang"]["lang"]
+                try:
+                    # Parse ingredients using Product Opener ingredient parser,
+                    # and add it to the entity data
+                    parsed_ingredients = parse_ingredients(entity["text"], lang_id)
+                except RuntimeError as e:
+                    logger.info(
+                        "Error while parsing ingredients, skipping "
+                        "to the next ingredient list",
+                        exc_info=e,
+                    )
+                    continue
+
+                known_ingredients_n = 0
+                ingredients_n = len(parsed_ingredients)
+                for ingredient_data in parsed_ingredients:
+                    ingredient_id = ingredient_data["id"]
+                    ingredient_data["in_taxonomy"] = (
+                        ingredient_id in ingredient_taxonomy
+                    )
+                    known_ingredients_n += int(ingredient_data["in_taxonomy"])
+
+                # We use the same terminology as Product Opener
+                entity["ingredients_n"] = ingredients_n
+                entity["known_ingredients_n"] = known_ingredients_n
+                entity["unknown_ingredients_n"] = ingredients_n - known_ingredients_n
+                entity["ingredients"] = parsed_ingredients
 
         ImagePrediction.create(
             image=image_model,
             type="ner",
             model_name=ingredient_list.MODEL_NAME,
             model_version=ingredient_list.MODEL_VERSION,
-            data=dataclasses.asdict(output),
+            data=image_prediction_data,
             timestamp=datetime.datetime.utcnow(),
             max_confidence=max(entity.score for entity in entities),
         )
diff --git a/tests/integration/workers/tasks/test_import_image.py b/tests/integration/workers/tasks/test_import_image.py
index 4466be3e40..aa98a0f7ed 100644
--- a/tests/integration/workers/tasks/test_import_image.py
+++ b/tests/integration/workers/tasks/test_import_image.py
@@ -1,6 +1,3 @@
-import dataclasses
-from unittest.mock import patch
-
 import pytest
 
 from robotoff.models import ImagePrediction
@@ -8,6 +5,7 @@
     IngredientPredictionAggregatedEntity,
     IngredientPredictionOutput,
 )
+from robotoff.prediction.langid import LanguagePrediction
 from robotoff.types import ProductIdentifier, ServerType
 from robotoff.workers.tasks.import_image import extract_ingredients_job
 
@@ -25,19 +23,61 @@ def _set_up_and_tear_down(peewee_db):
         clean_db()
 
 
-@patch("robotoff.workers.tasks.import_image.ingredient_list")
 def test_extract_ingredients_job(mocker, peewee_db):
     full_text = "Best product ever!\ningredients: water, salt, sugar."
     entities = [
         IngredientPredictionAggregatedEntity(
-            start=19, end=51, score=0.9, text="water, salt, sugar."
+            start=19,
+            end=51,
+            raw_end=51,
+            score=0.9,
+            text="water, salt, sugar.",
+            lang=LanguagePrediction(lang="en", confidence=0.9),
         )
     ]
-    mocker.predict_from_ocr.return_value = IngredientPredictionOutput(
+    parsed_ingredients = [
+        {
+            "ciqual_food_code": "18066",
+            "id": "en:water",
+            "percent_estimate": 66.6666666666667,
+            "percent_max": 100,
+            "percent_min": 33.3333333333333,
+            "text": "water",
+            "vegan": "yes",
+            "vegetarian": "yes",
+        },
+        {
+            "ciqual_food_code": "11058",
+            "id": "en:salt",
+            "percent_estimate": 16.6666666666667,
+            "percent_max": 50,
+            "percent_min": 0,
+            "text": "salt",
+            "vegan": "yes",
+            "vegetarian": "yes",
+        },
+        {
+            "id": "en:sugar",
+            "percent_estimate": 16.6666666666667,
+            "percent_max": 33.3333333333333,
+            "percent_min": 0,
+            "text": "sugar",
+            "vegan": "yes",
+            "vegetarian": "yes",
+        },
+    ]
+    ingredient_list_mocker = mocker.patch(
+        "robotoff.workers.tasks.import_image.ingredient_list"
+    )
+    parse_ingredients_mocker = mocker.patch(
+        "robotoff.workers.tasks.import_image.parse_ingredients",
+        return_value=parsed_ingredients,
+    )
+    ingredient_list_mocker.predict_from_ocr.return_value = IngredientPredictionOutput(
         entities=entities, text=full_text
     )
-    mocker.MODEL_NAME = "ingredient-detection"
-    mocker.MODEL_VERSION = "ingredient-detection-1.0"
+    ingredient_list_mocker.MODEL_NAME = "ingredient-detection"
+    ingredient_list_mocker.MODEL_VERSION = "ingredient-detection-1.0"
 
     barcode = "1234567890123"
     ocr_url = "https://images.openfoodfacts.org/images/products/123/456/789/0123/1.json"
@@ -49,7 +89,8 @@ def test_extract_ingredients_job(mocker, peewee_db):
         extract_ingredients_job(
             ProductIdentifier(barcode, ServerType.off), ocr_url=ocr_url
         )
-        mocker.predict_from_ocr.assert_called_once_with(ocr_url)
+        ingredient_list_mocker.predict_from_ocr.assert_called_once_with(ocr_url)
+        parse_ingredients_mocker.assert_called_once_with("water, salt, sugar.", "en")
         image_prediction = ImagePrediction.get_or_none(
             ImagePrediction.model_name == "ingredient-detection",
             ImagePrediction.image_id == image.id,
@@ -57,7 +98,23 @@ def test_extract_ingredients_job(mocker, peewee_db):
         assert image_prediction is not None
         assert image_prediction.data == {
             "text": full_text,
-            "entities": [dataclasses.asdict(entities[0])],
+            "entities": [
+                {
+                    "end": 51,
+                    "lang": {"lang": "en", "confidence": 0.9},
+                    "text": "water, salt, sugar.",
+                    "score": 0.9,
+                    "start": 19,
+                    "raw_end": 51,
+                    "ingredients_n": 3,
+                    "known_ingredients_n": 3,
+                    "unknown_ingredients_n": 0,
+                    "ingredients": [
+                        {"in_taxonomy": True, **ingredient}
+                        for ingredient in parsed_ingredients
+                    ],
+                }
+            ],
         }
         assert image_prediction.max_confidence == 0.9
         assert image_prediction.type == "ner"
@@ -65,8 +122,13 @@ def test_extract_ingredients_job(mocker, peewee_db):
         assert image_prediction.model_version == "ingredient-detection-1.0"
 
 
-@patch("robotoff.workers.tasks.import_image.ingredient_list")
 def test_extract_ingredients_job_missing_image(mocker, peewee_db):
+    ingredient_list_mocker = mocker.patch(
+        "robotoff.workers.tasks.import_image.ingredient_list"
+    )
+    parse_ingredients_mocker = mocker.patch(
+        "robotoff.workers.tasks.import_image.parse_ingredients"
+    )
     barcode = "1234567890123"
     ocr_url = "https://images.openfoodfacts.org/images/products/123/456/789/0123/1.json"
 
@@ -74,13 +136,19 @@ def test_extract_ingredients_job_missing_image(mocker, peewee_db):
         extract_ingredients_job(
             ProductIdentifier(barcode, ServerType.off), ocr_url=ocr_url
         )
-        mocker.predict_from_ocr.assert_not_called()
+        ingredient_list_mocker.predict_from_ocr.assert_not_called()
+        parse_ingredients_mocker.assert_not_called()
 
 
-@patch("robotoff.workers.tasks.import_image.ingredient_list")
 def test_extract_ingredients_job_existing_image_prediction(mocker, peewee_db):
-    mocker.MODEL_NAME = "ingredient-detection"
-    mocker.MODEL_VERSION = "ingredient-detection-1.0"
+    ingredient_list_mocker = mocker.patch(
+        "robotoff.workers.tasks.import_image.ingredient_list"
+    )
+    parse_ingredients_mocker = mocker.patch(
+        "robotoff.workers.tasks.import_image.parse_ingredients"
+    )
+    ingredient_list_mocker.MODEL_NAME = "ingredient-detection"
+    ingredient_list_mocker.MODEL_VERSION = "ingredient-detection-1.0"
     barcode = "1234567890123"
     ocr_url = "https://images.openfoodfacts.org/images/products/123/456/789/0123/1.json"
 
@@ -96,4 +164,5 @@ def test_extract_ingredients_job_existing_image_prediction(mocker, peewee_db):
         extract_ingredients_job(
             ProductIdentifier(barcode, ServerType.off), ocr_url=ocr_url
         )
-        mocker.predict_from_ocr.assert_not_called()
+        ingredient_list_mocker.predict_from_ocr.assert_not_called()
+        parse_ingredients_mocker.assert_not_called()