Skip to content

Commit

Permalink
feat: add ingredient parsing information
Browse files Browse the repository at this point in the history
  • Loading branch information
raphael0202 committed Nov 10, 2023
1 parent 8f2a4b4 commit e9f2b60
Show file tree
Hide file tree
Showing 5 changed files with 208 additions and 24 deletions.
72 changes: 72 additions & 0 deletions robotoff/off.py
Original file line number Diff line number Diff line change
Expand Up @@ -806,6 +806,78 @@ def send_image(
return r


def parse_ingredients(text: str, lang: str, timeout: int = 10) -> list[JSONType]:
    """Parse ingredients text using Product Opener API.

    It is only available for `off` flavor (food).

    The result is a list of ingredients, each ingredient is a dict with the
    following keys:

    - id: the ingredient ID. Having an ID does not mean that the ingredient
      is recognized, you must check if it exists in the taxonomy.
    - text: the ingredient text (as it appears in the input ingredients list)
    - percent_min: the minimum percentage of the ingredient in the product
    - percent_max: the maximum percentage of the ingredient in the product
    - percent_estimate: the estimated percentage of the ingredient in the
      product
    - vegan (bool): optional key indicating if the ingredient is vegan
    - vegetarian (bool): optional key indicating if the ingredient is
      vegetarian

    :param text: the ingredients text to parse
    :param lang: the language of the text (used for parsing) as a 2-letter code
    :param timeout: the request timeout in seconds, defaults to 10s
    :raises ValueError: if `text` is an empty string
    :raises RuntimeError: if the parsing fails (HTTP error, error status
        code or unsuccessful response status)
    :return: the list of parsed ingredients
    """
    # Validate input before doing any work (no point building the URL or
    # issuing a request for an empty ingredients list)
    if not text:
        raise ValueError("text must be a non-empty string")

    base_url = settings.BaseURLProvider.world(ServerType.off)
    # by using "test" as code, we don't save any information to database
    # This endpoint is specifically designed for testing purposes
    url = f"{base_url}/api/v3/product/test"

    try:
        r = http_session.patch(
            url,
            auth=settings._off_request_auth,
            json={
                "fields": "ingredients",
                "lc": lang,
                "tags_lc": lang,
                "product": {
                    "lang": lang,
                    f"ingredients_text_{lang}": text,
                },
            },
            timeout=timeout,
        )
    except (
        requests.exceptions.ConnectionError,
        requests.exceptions.SSLError,
        requests.exceptions.Timeout,
    ) as e:
        # Chain the original exception so the network failure cause is
        # preserved in tracebacks
        raise RuntimeError(
            f"Unable to parse ingredients: error during HTTP request: {e}"
        ) from e

    # `r.ok` is False for any 4xx/5xx status code, not only non-200
    if not r.ok:
        raise RuntimeError(
            "Unable to parse ingredients (error status code): "
            f"{r.status_code}, {r.text}"
        )

    response_data = r.json()

    # Product Opener API v3 reports success/failure in the "status" field
    if response_data.get("status") != "success":
        raise RuntimeError(f"Unable to parse ingredients: {response_data}")

    return response_data["product"]["ingredients"]


def normalize_tag(value, lowercase=True):
"""Given a value normalize it to a tag (as in taxonomies).
Expand Down
2 changes: 1 addition & 1 deletion robotoff/prediction/ingredient_list/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class IngredientPredictionAggregatedEntity:
raw_end: int
# confidence score
score: float
# entity text
# entity text (without organic or allergen mentions)
text: str
# language prediction of the entity text
lang: Optional[LanguagePrediction] = None
Expand Down
7 changes: 5 additions & 2 deletions robotoff/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def generate_category_hierarchy(


@cachetools.cached(cache=cachetools.TTLCache(maxsize=100, ttl=12 * 60 * 60)) # 12h
def get_taxonomy(taxonomy_type: str, offline: bool = False) -> Taxonomy:
def get_taxonomy(taxonomy_type: TaxonomyType | str, offline: bool = False) -> Taxonomy:
"""Return the taxonomy of type `taxonomy_type`.
The taxonomy is cached in memory and locally on disk. Every 12h, we check
Expand All @@ -57,8 +57,11 @@ def get_taxonomy(taxonomy_type: str, offline: bool = False) -> Taxonomy:
if offline:
return Taxonomy.from_path(str(settings.TAXONOMY_PATHS[taxonomy_type]))

taxonomy_type_enum = (
TaxonomyType[taxonomy_type] if isinstance(taxonomy_type, str) else taxonomy_type
)
return _get_taxonomy(
TaxonomyType[taxonomy_type],
taxonomy_type_enum,
force_download=False,
cache_dir=settings.DATA_DIR / "taxonomies",
)
Expand Down
50 changes: 45 additions & 5 deletions robotoff/workers/tasks/import_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import elasticsearch
from elasticsearch.helpers import BulkIndexError
from openfoodfacts import OCRResult
from openfoodfacts.types import TaxonomyType
from PIL import Image

from robotoff import settings
Expand Down Expand Up @@ -33,11 +34,12 @@
db,
with_db,
)
from robotoff.off import generate_image_url, get_source_from_url
from robotoff.off import generate_image_url, get_source_from_url, parse_ingredients
from robotoff.prediction import ingredient_list
from robotoff.prediction.upc_image import UPCImageType, find_image_is_upc
from robotoff.products import get_product_store
from robotoff.slack import NotifierFactory
from robotoff.taxonomy import get_taxonomy
from robotoff.triton import generate_clip_embedding
from robotoff.types import (
JSONType,
Expand Down Expand Up @@ -122,7 +124,10 @@ def run_import_image_job(product_id: ProductIdentifier, image_url: str, ocr_url:
enqueue_job(
extract_ingredients_job,
get_high_queue(product_id),
job_kwargs={"result_ttl": 0},
# We add a higher timeout, as we request Product Opener to
# parse ingredient list, which may take a while depending on
# the number of ingredient list (~1s per ingredient list)
job_kwargs={"result_ttl": 0, "timeout": "2m"},
product_id=product_id,
ocr_url=ocr_url,
)
Expand Down Expand Up @@ -618,15 +623,50 @@ def extract_ingredients_job(product_id: ProductIdentifier, ocr_url: str):
logger.warning("predict_from_ocr output: %s", output)
entities: list[
ingredient_list.IngredientPredictionAggregatedEntity
] = output.entities # type: ignore (we know it's an
# aggregated entity)
] = output.entities # type: ignore
# (we know it's an aggregated entity, so we can ignore the type)

image_prediction_data = dataclasses.asdict(output)
ingredient_taxonomy = get_taxonomy(TaxonomyType.ingredient)

for entity in image_prediction_data["entities"]:
# This is just an extra check, we should have lang information
# available
if entity["lang"]:
lang_id = entity["lang"]["lang"]
try:
# Parse ingredients using Product Opener ingredient parser,
# and add it to the entity data
parsed_ingredients = parse_ingredients(entity["text"], lang_id)
except RuntimeError as e:
logger.info(
"Error while parsing ingredients, skipping "
"to the next ingredient list",
exc_info=e,
)
continue

known_ingredients_n = 0
ingredients_n = len(parsed_ingredients)
for ingredient_data in parsed_ingredients:
ingredient_id = ingredient_data["id"]
ingredient_data["in_taxonomy"] = (
ingredient_id in ingredient_taxonomy
)
known_ingredients_n += int(ingredient_data["in_taxonomy"])

# We use the same terminology as Product Opener
entity["ingredients_n"] = ingredients_n
entity["known_ingredients_n"] = known_ingredients_n
entity["unknown_ingredients_n"] = ingredients_n - known_ingredients_n
entity["ingredients"] = parsed_ingredients

ImagePrediction.create(
image=image_model,
type="ner",
model_name=ingredient_list.MODEL_NAME,
model_version=ingredient_list.MODEL_VERSION,
data=dataclasses.asdict(output),
data=image_prediction_data,
timestamp=datetime.datetime.utcnow(),
max_confidence=max(entity.score for entity in entities),
)
Expand Down
101 changes: 85 additions & 16 deletions tests/integration/workers/tasks/test_import_image.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
import dataclasses
from unittest.mock import patch

import pytest

from robotoff.models import ImagePrediction
from robotoff.prediction.ingredient_list import (
IngredientPredictionAggregatedEntity,
IngredientPredictionOutput,
)
from robotoff.prediction.langid import LanguagePrediction
from robotoff.types import ProductIdentifier, ServerType
from robotoff.workers.tasks.import_image import extract_ingredients_job

Expand All @@ -25,19 +23,61 @@ def _set_up_and_tear_down(peewee_db):
clean_db()


@patch("robotoff.workers.tasks.import_image.ingredient_list")
def test_extract_ingredients_job(mocker, peewee_db):
full_text = "Best product ever!\ningredients: water, salt, sugar."
entities = [
IngredientPredictionAggregatedEntity(
start=19, end=51, score=0.9, text="water, salt, sugar."
start=19,
end=51,
raw_end=51,
score=0.9,
text="water, salt, sugar.",
lang=LanguagePrediction(lang="en", confidence=0.9),
)
]
mocker.predict_from_ocr.return_value = IngredientPredictionOutput(
parsed_ingredients = [
{
"ciqual_food_code": "18066",
"id": "en:water",
"percent_estimate": 66.6666666666667,
"percent_max": 100,
"percent_min": 33.3333333333333,
"text": "water",
"vegan": "yes",
"vegetarian": "yes",
},
{
"ciqual_food_code": "11058",
"id": "en:salt",
"percent_estimate": 16.6666666666667,
"percent_max": 50,
"percent_min": 0,
"text": "salt",
"vegan": "yes",
"vegetarian": "yes",
},
{
"id": "en:sugar",
"percent_estimate": 16.6666666666667,
"percent_max": 33.3333333333333,
"percent_min": 0,
"text": "sugar",
"vegan": "yes",
"vegetarian": "yes",
},
]
ingredient_list_mocker = mocker.patch(
"robotoff.workers.tasks.import_image.ingredient_list"
)
parse_ingredients_mocker = mocker.patch(
"robotoff.workers.tasks.import_image.parse_ingredients",
return_value=parsed_ingredients,
)
ingredient_list_mocker.predict_from_ocr.return_value = IngredientPredictionOutput(
entities=entities, text=full_text
)
mocker.MODEL_NAME = "ingredient-detection"
mocker.MODEL_VERSION = "ingredient-detection-1.0"
ingredient_list_mocker.MODEL_NAME = "ingredient-detection"
ingredient_list_mocker.MODEL_VERSION = "ingredient-detection-1.0"

barcode = "1234567890123"
ocr_url = "https://images.openfoodfacts.org/images/products/123/456/789/0123/1.json"
Expand All @@ -49,38 +89,66 @@ def test_extract_ingredients_job(mocker, peewee_db):
extract_ingredients_job(
ProductIdentifier(barcode, ServerType.off), ocr_url=ocr_url
)
mocker.predict_from_ocr.assert_called_once_with(ocr_url)
ingredient_list_mocker.predict_from_ocr.assert_called_once_with(ocr_url)
parse_ingredients_mocker.assert_called_once_with("water, salt, sugar.", "en")
image_prediction = ImagePrediction.get_or_none(
ImagePrediction.model_name == "ingredient-detection",
ImagePrediction.image_id == image.id,
)
assert image_prediction is not None
assert image_prediction.data == {
"text": full_text,
"entities": [dataclasses.asdict(entities[0])],
"entities": [
{
"end": 51,
"lang": {"lang": "en", "confidence": 0.9},
"text": "water, salt, sugar.",
"score": 0.9,
"start": 19,
"raw_end": 51,
"ingredients_n": 3,
"known_ingredients_n": 3,
"unknown_ingredients_n": 0,
"ingredients": [
{"in_taxonomy": True, **ingredient}
for ingredient in parsed_ingredients
],
}
],
}
assert image_prediction.max_confidence == 0.9
assert image_prediction.type == "ner"
assert image_prediction.model_name == "ingredient-detection"
assert image_prediction.model_version == "ingredient-detection-1.0"


@patch("robotoff.workers.tasks.import_image.ingredient_list")
def test_extract_ingredients_job_missing_image(mocker, peewee_db):
ingredient_list_mocker = mocker.patch(
"robotoff.workers.tasks.import_image.ingredient_list"
)
parse_ingredients_mocker = mocker.patch(
"robotoff.workers.tasks.import_image.parse_ingredients"
)
barcode = "1234567890123"
ocr_url = "https://images.openfoodfacts.org/images/products/123/456/789/0123/1.json"

with peewee_db:
extract_ingredients_job(
ProductIdentifier(barcode, ServerType.off), ocr_url=ocr_url
)
mocker.predict_from_ocr.assert_not_called()
ingredient_list_mocker.predict_from_ocr.assert_not_called()
parse_ingredients_mocker.assert_not_called()


@patch("robotoff.workers.tasks.import_image.ingredient_list")
def test_extract_ingredients_job_existing_image_prediction(mocker, peewee_db):
mocker.MODEL_NAME = "ingredient-detection"
mocker.MODEL_VERSION = "ingredient-detection-1.0"
ingredient_list_mocker = mocker.patch(
"robotoff.workers.tasks.import_image.ingredient_list"
)
parse_ingredients_mocker = mocker.patch(
"robotoff.workers.tasks.import_image.parse_ingredients"
)
ingredient_list_mocker.MODEL_NAME = "ingredient-detection"
ingredient_list_mocker.MODEL_VERSION = "ingredient-detection-1.0"
barcode = "1234567890123"
ocr_url = "https://images.openfoodfacts.org/images/products/123/456/789/0123/1.json"

Expand All @@ -96,4 +164,5 @@ def test_extract_ingredients_job_existing_image_prediction(mocker, peewee_db):
extract_ingredients_job(
ProductIdentifier(barcode, ServerType.off), ocr_url=ocr_url
)
mocker.predict_from_ocr.assert_not_called()
ingredient_list_mocker.predict_from_ocr.assert_not_called()
parse_ingredients_mocker.assert_not_called()

0 comments on commit e9f2b60

Please sign in to comment.