From e9f2b60672a0812d2804b6295d31078f2fe391c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Fri, 10 Nov 2023 11:41:14 +0100 Subject: [PATCH] feat: add ingredient parsing information --- robotoff/off.py | 72 +++++++++++++ .../prediction/ingredient_list/__init__.py | 2 +- robotoff/taxonomy.py | 7 +- robotoff/workers/tasks/import_image.py | 50 ++++++++- .../workers/tasks/test_import_image.py | 101 +++++++++++++++--- 5 files changed, 208 insertions(+), 24 deletions(-) diff --git a/robotoff/off.py b/robotoff/off.py index 5d67228ea6..b55eb93b1e 100644 --- a/robotoff/off.py +++ b/robotoff/off.py @@ -806,6 +806,78 @@ def send_image( return r +def parse_ingredients(text: str, lang: str, timeout: int = 10) -> list[JSONType]: + """Parse ingredients text using Product Opener API. + + It is only available for `off` flavor (food). + + The result is a list of ingredients, each ingredient is a dict with the + following keys: + + - id: the ingredient ID. Having an ID does not mean that the ingredient + is recognized, you must check if it exists in the taxonomy. 
+ - text: the ingredient text (as it appears in the input ingredients list) + - percent_min: the minimum percentage of the ingredient in the product + - percent_max: the maximum percentage of the ingredient in the product + - percent_estimate: the estimated percentage of the ingredient in the + product + - vegan (bool): optional key indicating if the ingredient is vegan + - vegetarian (bool): optional key indicating if the ingredient is + vegetarian + + + :param server_type: the server type (project) to use + :param text: the ingredients text to parse + :param lang: the language of the text (used for parsing) as a 2-letter code + :param timeout: the request timeout in seconds, defaults to 10s + :raises RuntimeError: a RuntimeError is raised if the parsing fails + :return: the list of parsed ingredients + """ + base_url = settings.BaseURLProvider.world(ServerType.off) + # by using "test" as code, we don't save any information to database + # This endpoint is specifically designed for testing purposes + url = f"{base_url}/api/v3/product/test" + + if len(text) == 0: + raise ValueError("text must be a non-empty string") + + try: + r = http_session.patch( + url, + auth=settings._off_request_auth, + json={ + "fields": "ingredients", + "lc": lang, + "tags_lc": lang, + "product": { + "lang": lang, + f"ingredients_text_{lang}": text, + }, + }, + timeout=timeout, + ) + except ( + requests.exceptions.ConnectionError, + requests.exceptions.SSLError, + requests.exceptions.Timeout, + ) as e: + raise RuntimeError( + f"Unable to parse ingredients: error during HTTP request: {e}" + ) + + if not r.ok: + raise RuntimeError( + f"Unable to parse ingredients (non-200 status code): {r.status_code}, {r.text}" + ) + + response_data = r.json() + + if response_data.get("status") != "success": + raise RuntimeError(f"Unable to parse ingredients: {response_data}") + + return response_data["product"]["ingredients"] + + def normalize_tag(value, lowercase=True): """Given a value normalize it to a 
tag (as in taxonomies). diff --git a/robotoff/prediction/ingredient_list/__init__.py b/robotoff/prediction/ingredient_list/__init__.py index 1f71bf01a7..0eb55789a7 100644 --- a/robotoff/prediction/ingredient_list/__init__.py +++ b/robotoff/prediction/ingredient_list/__init__.py @@ -36,7 +36,7 @@ class IngredientPredictionAggregatedEntity: raw_end: int # confidence score score: float - # entity text + # entity text (without organic or allergen mentions) text: str # language prediction of the entity text lang: Optional[LanguagePrediction] = None diff --git a/robotoff/taxonomy.py b/robotoff/taxonomy.py index 24ef7b1a02..fd5e9143a9 100644 --- a/robotoff/taxonomy.py +++ b/robotoff/taxonomy.py @@ -40,7 +40,7 @@ def generate_category_hierarchy( @cachetools.cached(cache=cachetools.TTLCache(maxsize=100, ttl=12 * 60 * 60)) # 12h -def get_taxonomy(taxonomy_type: str, offline: bool = False) -> Taxonomy: +def get_taxonomy(taxonomy_type: TaxonomyType | str, offline: bool = False) -> Taxonomy: """Return the taxonomy of type `taxonomy_type`. The taxonomy is cached in memory and locally on disk. 
Every 12h, we check @@ -57,8 +57,11 @@ def get_taxonomy(taxonomy_type: str, offline: bool = False) -> Taxonomy: if offline: return Taxonomy.from_path(str(settings.TAXONOMY_PATHS[taxonomy_type])) + taxonomy_type_enum = ( + TaxonomyType[taxonomy_type] if isinstance(taxonomy_type, str) else taxonomy_type + ) return _get_taxonomy( - TaxonomyType[taxonomy_type], + taxonomy_type_enum, force_download=False, cache_dir=settings.DATA_DIR / "taxonomies", ) diff --git a/robotoff/workers/tasks/import_image.py b/robotoff/workers/tasks/import_image.py index 85e503ed7f..f570e35b0a 100644 --- a/robotoff/workers/tasks/import_image.py +++ b/robotoff/workers/tasks/import_image.py @@ -6,6 +6,7 @@ import elasticsearch from elasticsearch.helpers import BulkIndexError from openfoodfacts import OCRResult +from openfoodfacts.types import TaxonomyType from PIL import Image from robotoff import settings @@ -33,11 +34,12 @@ db, with_db, ) -from robotoff.off import generate_image_url, get_source_from_url +from robotoff.off import generate_image_url, get_source_from_url, parse_ingredients from robotoff.prediction import ingredient_list from robotoff.prediction.upc_image import UPCImageType, find_image_is_upc from robotoff.products import get_product_store from robotoff.slack import NotifierFactory +from robotoff.taxonomy import get_taxonomy from robotoff.triton import generate_clip_embedding from robotoff.types import ( JSONType, @@ -122,7 +124,10 @@ def run_import_image_job(product_id: ProductIdentifier, image_url: str, ocr_url: enqueue_job( extract_ingredients_job, get_high_queue(product_id), - job_kwargs={"result_ttl": 0}, + # We add a higher timeout, as we request Product Opener to + # parse ingredient lists, which may take a while depending on + # the number of ingredient lists (~1s per ingredient list) + job_kwargs={"result_ttl": 0, "timeout": "2m"}, product_id=product_id, ocr_url=ocr_url, ) @@ -618,15 +623,50 @@ def extract_ingredients_job(product_id: ProductIdentifier, ocr_url: str): 
logger.warning("predict_from_ocr output: %s", output) entities: list[ ingredient_list.IngredientPredictionAggregatedEntity - ] = output.entities # type: ignore (we know it's an - # aggregated entity) + ] = output.entities # type: ignore + # (we know it's an aggregated entity, so we can ignore the type) + + image_prediction_data = dataclasses.asdict(output) + ingredient_taxonomy = get_taxonomy(TaxonomyType.ingredient) + + for entity in image_prediction_data["entities"]: + # This is just an extra check, we should have lang information + # available + if entity["lang"]: + lang_id = entity["lang"]["lang"] + try: + # Parse ingredients using Product Opener ingredient parser, + # and add it to the entity data + parsed_ingredients = parse_ingredients(entity["text"], lang_id) + except RuntimeError as e: + logger.info( + "Error while parsing ingredients, skipping " + "to the next ingredient list", + exc_info=e, + ) + continue + + known_ingredients_n = 0 + ingredients_n = len(parsed_ingredients) + for ingredient_data in parsed_ingredients: + ingredient_id = ingredient_data["id"] + ingredient_data["in_taxonomy"] = ( + ingredient_id in ingredient_taxonomy + ) + known_ingredients_n += int(ingredient_data["in_taxonomy"]) + + # We use the same terminology as Product Opener + entity["ingredients_n"] = ingredients_n + entity["known_ingredients_n"] = known_ingredients_n + entity["unknown_ingredients_n"] = ingredients_n - known_ingredients_n + entity["ingredients"] = parsed_ingredients ImagePrediction.create( image=image_model, type="ner", model_name=ingredient_list.MODEL_NAME, model_version=ingredient_list.MODEL_VERSION, - data=dataclasses.asdict(output), + data=image_prediction_data, timestamp=datetime.datetime.utcnow(), max_confidence=max(entity.score for entity in entities), ) diff --git a/tests/integration/workers/tasks/test_import_image.py b/tests/integration/workers/tasks/test_import_image.py index 4466be3e40..aa98a0f7ed 100644 --- 
a/tests/integration/workers/tasks/test_import_image.py +++ b/tests/integration/workers/tasks/test_import_image.py @@ -1,6 +1,3 @@ -import dataclasses -from unittest.mock import patch - import pytest from robotoff.models import ImagePrediction @@ -8,6 +5,7 @@ IngredientPredictionAggregatedEntity, IngredientPredictionOutput, ) +from robotoff.prediction.langid import LanguagePrediction from robotoff.types import ProductIdentifier, ServerType from robotoff.workers.tasks.import_image import extract_ingredients_job @@ -25,19 +23,61 @@ def _set_up_and_tear_down(peewee_db): clean_db() -@patch("robotoff.workers.tasks.import_image.ingredient_list") def test_extract_ingredients_job(mocker, peewee_db): full_text = "Best product ever!\ningredients: water, salt, sugar." entities = [ IngredientPredictionAggregatedEntity( - start=19, end=51, score=0.9, text="water, salt, sugar." + start=19, + end=51, + raw_end=51, + score=0.9, + text="water, salt, sugar.", + lang=LanguagePrediction(lang="en", confidence=0.9), ) ] - mocker.predict_from_ocr.return_value = IngredientPredictionOutput( + parsed_ingredients = [ + { + "ciqual_food_code": "18066", + "id": "en:water", + "percent_estimate": 66.6666666666667, + "percent_max": 100, + "percent_min": 33.3333333333333, + "text": "water", + "vegan": "yes", + "vegetarian": "yes", + }, + { + "ciqual_food_code": "11058", + "id": "en:salt", + "percent_estimate": 16.6666666666667, + "percent_max": 50, + "percent_min": 0, + "text": "salt", + "vegan": "yes", + "vegetarian": "yes", + }, + { + "id": "en:sugar", + "percent_estimate": 16.6666666666667, + "percent_max": 33.3333333333333, + "percent_min": 0, + "text": "sugar", + "vegan": "yes", + "vegetarian": "yes", + }, + ] + ingredient_list_mocker = mocker.patch( + "robotoff.workers.tasks.import_image.ingredient_list" + ) + parse_ingredients_mocker = mocker.patch( + "robotoff.workers.tasks.import_image.parse_ingredients", + return_value=parsed_ingredients, + ) + 
ingredient_list_mocker.predict_from_ocr.return_value = IngredientPredictionOutput( entities=entities, text=full_text ) - mocker.MODEL_NAME = "ingredient-detection" - mocker.MODEL_VERSION = "ingredient-detection-1.0" + ingredient_list_mocker.MODEL_NAME = "ingredient-detection" + ingredient_list_mocker.MODEL_VERSION = "ingredient-detection-1.0" barcode = "1234567890123" ocr_url = "https://images.openfoodfacts.org/images/products/123/456/789/0123/1.json" @@ -49,7 +89,8 @@ def test_extract_ingredients_job(mocker, peewee_db): extract_ingredients_job( ProductIdentifier(barcode, ServerType.off), ocr_url=ocr_url ) - mocker.predict_from_ocr.assert_called_once_with(ocr_url) + ingredient_list_mocker.predict_from_ocr.assert_called_once_with(ocr_url) + parse_ingredients_mocker.assert_called_once_with("water, salt, sugar.", "en") image_prediction = ImagePrediction.get_or_none( ImagePrediction.model_name == "ingredient-detection", ImagePrediction.image_id == image.id, @@ -57,7 +98,23 @@ def test_extract_ingredients_job(mocker, peewee_db): assert image_prediction is not None assert image_prediction.data == { "text": full_text, - "entities": [dataclasses.asdict(entities[0])], + "entities": [ + { + "end": 51, + "lang": {"lang": "en", "confidence": 0.9}, + "text": "water, salt, sugar.", + "score": 0.9, + "start": 19, + "raw_end": 51, + "ingredients_n": 3, + "known_ingredients_n": 3, + "unknown_ingredients_n": 0, + "ingredients": [ + {"in_taxonomy": True, **ingredient} + for ingredient in parsed_ingredients + ], + } + ], } assert image_prediction.max_confidence == 0.9 assert image_prediction.type == "ner" @@ -65,8 +122,13 @@ def test_extract_ingredients_job(mocker, peewee_db): assert image_prediction.model_version == "ingredient-detection-1.0" -@patch("robotoff.workers.tasks.import_image.ingredient_list") def test_extract_ingredients_job_missing_image(mocker, peewee_db): + ingredient_list_mocker = mocker.patch( + "robotoff.workers.tasks.import_image.ingredient_list" + ) + 
parse_ingredients_mocker = mocker.patch( + "robotoff.workers.tasks.import_image.parse_ingredients" + ) barcode = "1234567890123" ocr_url = "https://images.openfoodfacts.org/images/products/123/456/789/0123/1.json" @@ -74,13 +136,19 @@ def test_extract_ingredients_job_missing_image(mocker, peewee_db): extract_ingredients_job( ProductIdentifier(barcode, ServerType.off), ocr_url=ocr_url ) - mocker.predict_from_ocr.assert_not_called() + ingredient_list_mocker.predict_from_ocr.assert_not_called() + parse_ingredients_mocker.assert_not_called() -@patch("robotoff.workers.tasks.import_image.ingredient_list") def test_extract_ingredients_job_existing_image_prediction(mocker, peewee_db): - mocker.MODEL_NAME = "ingredient-detection" - mocker.MODEL_VERSION = "ingredient-detection-1.0" + ingredient_list_mocker = mocker.patch( + "robotoff.workers.tasks.import_image.ingredient_list" + ) + parse_ingredients_mocker = mocker.patch( + "robotoff.workers.tasks.import_image.parse_ingredients" + ) + ingredient_list_mocker.MODEL_NAME = "ingredient-detection" + ingredient_list_mocker.MODEL_VERSION = "ingredient-detection-1.0" barcode = "1234567890123" ocr_url = "https://images.openfoodfacts.org/images/products/123/456/789/0123/1.json" @@ -96,4 +164,5 @@ def test_extract_ingredients_job_existing_image_prediction(mocker, peewee_db): extract_ingredients_job( ProductIdentifier(barcode, ServerType.off), ocr_url=ocr_url ) - mocker.predict_from_ocr.assert_not_called() + ingredient_list_mocker.predict_from_ocr.assert_not_called() + parse_ingredients_mocker.assert_not_called()