Skip to content

Commit

Permalink
feat: save logo text (extracted using OCR in DB)
Browse files Browse the repository at this point in the history
  • Loading branch information
raphael0202 committed Nov 2, 2023
1 parent e4080ca commit bf4632f
Show file tree
Hide file tree
Showing 17 changed files with 456 additions and 262 deletions.
17 changes: 17 additions & 0 deletions migrations/003_add_logo_text_field.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""Peewee migrations -- 003_add_logo_text_field.py.
"""

import peewee as pw
from peewee_migrate import Migrator


def migrate(migrator: Migrator, database: pw.Database, *, fake=False):
    """Add a nullable ``text`` field to ``logo_annotation``."""
    new_fields = {"text": pw.TextField(null=True)}
    migrator.add_fields("logo_annotation", **new_fields)


def rollback(migrator: Migrator, database: pw.Database, *, fake=False):
    """Remove the ``text`` field from ``logo_annotation``."""
    field_names = ("text",)
    migrator.remove_fields("logo_annotation", *field_names)
362 changes: 184 additions & 178 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ lark = "~1.1.4"
h5py = "~3.8.0"
opencv-contrib-python = "~4.7.0.72"
toml = "~0.10.2"
openfoodfacts = "0.1.10"
openfoodfacts = "0.1.11"
imagehash = "~4.3.1"
peewee-migrate = "~1.12.2"
diskcache = "~5.6.3"
Expand Down
2 changes: 2 additions & 0 deletions robotoff/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,8 @@ class LogoAnnotation(BaseModel):
nearest_neighbors = BinaryJSONField(null=True)
barcode = peewee.CharField(max_length=100, null=True, index=True)
source_image = peewee.TextField(null=True, index=True)
# The logo text extracted from the image using OCR
text = peewee.TextField(null=True)

class Meta:
constraints = [peewee.SQL("UNIQUE(image_prediction_id, index)")]
Expand Down
4 changes: 2 additions & 2 deletions robotoff/prediction/object_detection/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

@dataclasses.dataclass
class ObjectDetectionResult:
bounding_box: tuple
bounding_box: tuple[int, int, int, int]
score: float
label: str

Expand Down Expand Up @@ -59,7 +59,7 @@ def select(self, threshold: Optional[float] = None) -> list[ObjectDetectionResul
label_str = self.label_names[label_int]
if label_str is not None:
result = ObjectDetectionResult(
bounding_box=tuple(bounding_box.tolist()),
bounding_box=tuple(bounding_box.tolist()), # type: ignore
score=float(score),
label=label_str,
)
Expand Down
2 changes: 1 addition & 1 deletion robotoff/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from robotoff import settings
from robotoff.types import JSONType

from .image import ImageLoadingException, get_image_from_url # noqa: F401
from .image import get_image_from_url # noqa: F401
from .logger import get_logger

logger = get_logger(__name__)
Expand Down
7 changes: 5 additions & 2 deletions robotoff/utils/cache.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Callable

import requests
from diskcache import Cache

from robotoff import settings
Expand All @@ -15,7 +16,8 @@

def cache_http_request(
key: str,
func: Callable,
cache: Cache,
func: Callable[..., requests.Response | None],
cache_expire: int | None = None,
tag: str | None = None,
*args,
Expand All @@ -24,6 +26,7 @@ def cache_http_request(
"""Cache raw response (bytes) of HTTP requests.
:param key: the cache key
:param cache: the cache to use
:param func: the function to call, must return a `requests.Response` object (or None)
:param cache_expire: expiration time of the item in the cache, defaults to
None (no expiration)
Expand All @@ -33,7 +36,7 @@ def cache_http_request(
"""
# Check if the item is already cached, and use it instead of sending
# the HTTP request if it is
content_bytes = disk_cache.get(key)
content_bytes = cache.get(key)
if content_bytes is None:
r = func(*args, **kwargs)
if r is None:
Expand Down
94 changes: 94 additions & 0 deletions robotoff/utils/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import logging
from typing import Optional
from urllib.parse import urlparse

import requests
from diskcache import Cache
from requests.exceptions import ConnectionError as RequestConnectionError
from requests.exceptions import SSLError, Timeout

from robotoff import settings
from robotoff.utils.cache import cache_http_request, disk_cache

from .logger import get_logger

logger = get_logger(__name__)


class AssetLoadingException(Exception):
    """Error signalling that an asset could not be fetched or loaded.

    Raised by `get_asset_from_url` when the asset cannot be downloaded from
    its URL or when loading it failed.
    """


def get_asset_from_url(
    asset_url: str,
    error_raise: bool = True,
    session: Optional[requests.Session] = None,
) -> requests.Response | None:
    """Download an asset and return the HTTP response.

    The credentials stored in `settings._off_net_auth` are only attached
    when the URL host is `openfoodfacts.net` or one of its subdomains.

    :param asset_url: URL of the asset to fetch
    :param error_raise: if True, raises an `AssetLoadingException` when the
        request fails or the response is not OK, defaults to True. If
        False, the error is logged and None is returned.
    :param session: requests Session to use, by default no session is used.
    :return: the response, or None if an error occurred and `error_raise`
        is False.
    """
    # Compare the parsed hostname (lowercased, without port or userinfo)
    # rather than a raw netloc suffix: a look-alike host such as
    # "evilopenfoodfacts.net" must never receive the credentials, while
    # "images.openfoodfacts.net:443" must still be recognized.
    hostname = urlparse(asset_url).hostname or ""
    is_off_net = hostname == "openfoodfacts.net" or hostname.endswith(
        ".openfoodfacts.net"
    )
    auth = settings._off_net_auth if is_off_net else None

    try:
        if session:
            r = session.get(asset_url, auth=auth)
        else:
            r = requests.get(asset_url, auth=auth)
    except (RequestConnectionError, SSLError, Timeout) as e:
        error_message = "Cannot download %s"
        if error_raise:
            raise AssetLoadingException(error_message % asset_url) from e
        logger.info(error_message, asset_url, exc_info=e)
        return None

    if not r.ok:
        error_message = "Cannot download %s: HTTP %s"
        error_args = (asset_url, r.status_code)
        if error_raise:
            raise AssetLoadingException(error_message % error_args)
        # Status codes below 500 are logged at INFO level, the others at
        # WARNING level.
        logger.log(
            logging.INFO if r.status_code < 500 else logging.WARNING,
            error_message,
            *error_args,
        )
        return None

    return r


def cache_asset_from_url(
    key: str,
    cache: Cache | None = None,
    cache_expire: int | None = None,
    tag: str | None = None,
    *args,
    **kwargs,
) -> bytes | None:
    """Cache response on disk from `get_asset_from_url`.

    args and kwargs are passed to `get_asset_from_url` (for example
    `asset_url`, `error_raise` and `session`).

    :param key: the cache key
    :param cache: the cache to use, defaults to Robotoff default cache
    :param cache_expire: expiration time of the item in the cache, defaults to
        None (no expiration)
    :param tag: a tag of the item in the cache (optional), defaults to None
    :return: the response bytes or None if an error occurred while calling
        `get_asset_from_url`
    """
    # Use an identity check instead of `cache or disk_cache`: diskcache's
    # `Cache` defines `__len__`, so an empty cache supplied by the caller is
    # falsy and would be silently replaced by the default cache.
    if cache is None:
        cache = disk_cache
    return cache_http_request(
        key,
        cache,
        get_asset_from_url,
        cache_expire,
        tag,
        *args,
        **kwargs,
    )
71 changes: 11 additions & 60 deletions robotoff/utils/image.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,16 @@
import logging
from io import BytesIO
from typing import Optional
from urllib.parse import urlparse

import numpy as np
import PIL
import requests
from PIL import Image
from requests.exceptions import ConnectionError as RequestConnectionError
from requests.exceptions import SSLError, Timeout

from robotoff import settings
from robotoff.utils.download import (
AssetLoadingException,
cache_asset_from_url,
get_asset_from_url,
)

from .cache import cache_http_request
from .logger import get_logger

logger = get_logger(__name__)
Expand All @@ -34,14 +32,6 @@ def convert_image_to_array(image: Image.Image) -> np.ndarray:
return np.array(image.getdata()).reshape((im_height, im_width, 3)).astype(np.uint8)


class ImageLoadingException(Exception):
    """Error signalling that an image could not be fetched or loaded.

    Raised by `get_image_from_url` when the image cannot be downloaded from
    its URL or when loading it failed.
    """


def get_image_from_url(
image_url: str,
error_raise: bool = True,
Expand All @@ -52,7 +42,7 @@ def get_image_from_url(
"""Fetch an image from `image_url` and load it.
:param image_url: URL of the image to load
:param error_raise: if True, raises a `ImageLoadingException` if an error
:param error_raise: if True, raises an `AssetLoadingException` if an error
occurred, defaults to True. If False, None is returned if an error
occurred.
:param session: requests Session to use, by default no session is used.
Expand All @@ -63,21 +53,19 @@ def get_image_from_url(
:return: the Pillow Image or None.
"""
if use_cache:
content_bytes = cache_http_request(
content_bytes = cache_asset_from_url(
key=f"image:{image_url}",
cache_expire=cache_expire,
tag="image",
func=_get_image_from_url,
# kwargs passed to func
# kwargs passed to get_asset_from_url
image_url=image_url,
error_raise=error_raise,
session=session,
)

if content_bytes is None:
return None
else:
r = _get_image_from_url(image_url, error_raise, session)
r = get_asset_from_url(image_url, error_raise, session)
if r is None:
return None
content_bytes = r.content
Expand All @@ -87,49 +75,12 @@ def get_image_from_url(
except PIL.UnidentifiedImageError:
error_message = f"Cannot identify image {image_url}"
if error_raise:
raise ImageLoadingException(error_message)
raise AssetLoadingException(error_message)
logger.info(error_message)
except PIL.Image.DecompressionBombError:
error_message = f"Decompression bomb error for image {image_url}"
if error_raise:
raise ImageLoadingException(error_message)
raise AssetLoadingException(error_message)
logger.info(error_message)

return None


def _get_image_from_url(
    image_url: str,
    error_raise: bool = True,
    session: Optional[requests.Session] = None,
) -> requests.Response | None:
    """Download an image and return the HTTP response.

    The credentials stored in `settings._off_net_auth` are only attached
    when the URL host is `openfoodfacts.net` or one of its subdomains.

    :param image_url: URL of the image to fetch
    :param error_raise: if True, raises an `ImageLoadingException` when the
        request fails or the response is not OK, defaults to True. If
        False, the error is logged and None is returned.
    :param session: requests Session to use, by default no session is used.
    :return: the response, or None if an error occurred and `error_raise`
        is False.
    """
    # Compare the parsed hostname (lowercased, without port or userinfo)
    # rather than a raw netloc suffix: a look-alike host such as
    # "evilopenfoodfacts.net" must never receive the credentials, while
    # "images.openfoodfacts.net:443" must still be recognized.
    hostname = urlparse(image_url).hostname or ""
    is_off_net = hostname == "openfoodfacts.net" or hostname.endswith(
        ".openfoodfacts.net"
    )
    auth = settings._off_net_auth if is_off_net else None

    try:
        if session:
            r = session.get(image_url, auth=auth)
        else:
            r = requests.get(image_url, auth=auth)
    except (RequestConnectionError, SSLError, Timeout) as e:
        error_message = "Cannot download image %s"
        if error_raise:
            raise ImageLoadingException(error_message % image_url) from e
        logger.info(error_message, image_url, exc_info=e)
        return None

    if not r.ok:
        error_message = "Cannot download image %s: HTTP %s"
        error_args = (image_url, r.status_code)
        if error_raise:
            raise ImageLoadingException(error_message % error_args)
        # Status codes below 500 are logged at INFO level, the others at
        # WARNING level.
        logger.log(
            logging.INFO if r.status_code < 500 else logging.WARNING,
            error_message,
            *error_args,
        )
        return None

    return r
Loading

0 comments on commit bf4632f

Please sign in to comment.