Re-enable Amazon imports from /isbn (internetarchive#8690)
* Re-enable `/isbn` and AMZ imports

This PR re-enables AMZ imports from `/isbn` and `/api/books.json`.

See this comment for examples of how to use the endpoints and what
to expect:
internetarchive#8690 (comment)

The logic upon visiting `/isbn/[some isbn]` is now:
- attempt to fetch the book from the OL database;
- attempt to fetch the book from the `import_item` table (likely ISBNdb);
- attempt to fetch the metadata from the Amazon Products API, clean that
  metadata for import, add the result as a `staged` import in the
  `import_item` table, and then immediately import it with `load()`, by way
  of `ImportItem.import_first_staged()`.

If any of these find or create an edition, that edition is returned. A
condensed sketch of this chain follows.
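
A minimal sketch of the lookup chain, assuming the Open Library environment (`web.ctx`, `ImportItem`, `get_amazon_metadata`); the helper name `lookup_isbn` and the exact query shape are illustrative, not part of this diff — the real logic lives in `Edition.from_isbn()` below:

```python
import requests
import web

from openlibrary.core.imports import ImportItem
from openlibrary.core.vendors import get_amazon_metadata


def lookup_isbn(isbn: str, high_priority: bool = False):
    """Illustrative fallback chain; see Edition.from_isbn() for the real code."""
    # 1. Try the Open Library database itself (isbn_10 or isbn_13 field).
    query = {"type": "/type/edition", f"isbn_{len(isbn)}": isbn}
    if matches := web.ctx.site.things(query):
        return web.ctx.site.get(matches[0])

    # 2. Try a record already staged in the import_item table (likely ISBNdb).
    if edition := ImportItem.import_first_staged(identifiers=[isbn]):
        return edition

    # 3. Ask the affiliate server to fetch + stage AMZ metadata, then import it.
    try:
        get_amazon_metadata(id_=isbn, id_type="isbn", high_priority=high_priority)
        return ImportItem.import_first_staged(identifiers=[isbn])
    except requests.exceptions.RequestException:
        return None
```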

* Stop bulk imports from AMZ records

Import AMZ records as `staged`.
See internetarchive#8541

* Modify the affiliate server to accept a GET parameter, `high_priority`, at
`/isbn`. E.g., `http://localhost:31337/isbn/059035342X?high_priority=true`.

`high_priority=true` will put the ISBN straight to the front of the queue
for an AMZ Products API lookup, and attempt for three seconds to fetch
the cached AMZ data (if it becomes available), returning that data,
marshalled into a form suitable for creating an Edition, if possible.

* Use `high_priority=false` (the default) on the affiliate server to fetch AMZ
data if available, then queue for import and immediately import the item,
returning the resulting `Edition`, or `None`. Both modes are sketched below.
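
A hypothetical client-side sketch of the two modes; the host/port and ISBN are the examples above, and the `hit` key is what `_get_amazon_metadata()` reads in the vendors.py diff further down:

```python
import requests

AFFILIATE_SERVER = "http://localhost:31337"  # example host from above

# high_priority=true: jump the AMZ lookup queue and block (up to ~3 seconds)
# waiting for the cached Product API data.
blocking = requests.get(
    f"{AFFILIATE_SERVER}/isbn/059035342X", params={"high_priority": "true"}
)
blocking.raise_for_status()
print(blocking.json().get("hit"))  # edition-shaped metadata, if AMZ had it

# high_priority=false (the default): return immediately; a cache miss is
# queued for a later AMZ lookup instead of being waited on.
queued = requests.get(f"{AFFILIATE_SERVER}/isbn/059035342X")
print(queued.json())
```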

* Feature: `/api/books` will attempt to import from ISBN
- adds a `high_priority` parameter to `/api/books` to make the API try
  to import books from ISBN; see the example call after this list.
- relies on changes to `Edition.from_isbn()`, which attempt to import
  editions first from `staged` items in the `import_item` table, and
  second from Amazon via the affiliate server.
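
An illustrative call, using the dev-server URL and ISBNs from the docstring in the books/code.py diff below:

```python
import requests

resp = requests.get(
    "http://localhost:8080/api/books.json",
    params={"bibkeys": "059035342X,0312368615", "high_priority": "true"},
)
# Each bib key that was found -- or freshly imported from its ISBN -- maps to
# a small dict of info/preview/thumbnail URLs.
for bib_key, info in resp.json().items():
    print(bib_key, info.get("info_url"))
```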

---------

Co-authored-by: Mek <michael.karpeles@gmail.com>
scottbarnes and mekarpeles authored Feb 13, 2024
1 parent 5098757 commit 2f59017
Showing 8 changed files with 338 additions and 51 deletions.
28 changes: 23 additions & 5 deletions openlibrary/core/models.py
@@ -2,6 +2,7 @@
"""
from datetime import datetime, timedelta
import logging
from openlibrary.core.vendors import get_amazon_metadata

import web
import json
@@ -24,7 +25,6 @@
from openlibrary.core.imports import ImportItem
from openlibrary.core.observations import Observations
from openlibrary.core.ratings import Ratings
from openlibrary.core.vendors import create_edition_from_amazon_metadata
from openlibrary.utils import extract_numeric_id_from_olid, dateutil
from openlibrary.utils.isbn import to_isbn_13, isbn_13_to_isbn_10, canonical

@@ -373,11 +373,16 @@ def get_ia_download_link(self, suffix):
return f"https://archive.org/download/{self.ocaid}/{filename}"

@classmethod
def from_isbn(cls, isbn: str) -> "Edition | None": # type: ignore[return]
def from_isbn(cls, isbn: str, high_priority: bool = False) -> "Edition | None":
"""
Attempts to fetch an edition by ISBN, or if no edition is found, then
check the import_item table for a match, then as a last result, attempt
to import from Amazon.
:param bool high_priority: If `True`, (1) any AMZ import requests will block
until AMZ has fetched data, and (2) the AMZ request will go to
the front of the queue. If `False`, the import will simply be
queued up if the item is not in the AMZ cache, and the affiliate
server will return a promise.
:return: an open library edition for this ISBN or None.
"""
isbn = canonical(isbn)
@@ -401,10 +406,23 @@ def from_isbn(cls, isbn: str) -> "Edition | None":  # type: ignore[return]
return web.ctx.site.get(matches[0])

# Attempt to fetch the book from the import_item table
if result := ImportItem.import_first_staged(identifiers=isbns):
return result
if edition := ImportItem.import_first_staged(identifiers=isbns):
return edition

# TODO: Final step - call affiliate server, with retry code migrated there.
# Finally, try to fetch the book data from Amazon + import.
# If `high_priority=True`, then the affiliate-server, which `get_amazon_metadata()`
# uses, will block + wait until the Product API responds and the result, if any,
# is staged in `import_item`.
try:
get_amazon_metadata(
id_=isbn10 or isbn13, id_type="isbn", high_priority=high_priority
)
return ImportItem.import_first_staged(identifiers=isbns)
except requests.exceptions.ConnectionError:
logger.exception("Affiliate Server unreachable")
except requests.exceptions.HTTPError:
logger.exception(f"Affiliate Server: id {isbn10 or isbn13} not found")
return None

def is_ia_scan(self):
metadata = self.get_ia_meta_fields()
37 changes: 21 additions & 16 deletions openlibrary/core/vendors.py
@@ -2,6 +2,8 @@
import logging
import re
import time

from datetime import date
from typing import Any, Literal

import requests
@@ -280,16 +282,21 @@ def get_amazon_metadata(
id_: str,
id_type: Literal['asin', 'isbn'] = 'isbn',
resources: Any = None,
retries: int = 0,
high_priority: bool = False,
) -> dict | None:
"""Main interface to Amazon LookupItem API. Will cache results.
:param str id_: The item id: isbn (10/13), or Amazon ASIN.
:param str id_type: 'isbn' or 'asin'.
:param bool high_priority: Priority in the import queue. High priority
goes to the front of the queue.
:return: A single book item's metadata, or None.
"""
return cached_get_amazon_metadata(
id_, id_type=id_type, resources=resources, retries=retries
id_,
id_type=id_type,
resources=resources,
high_priority=high_priority,
)


@@ -307,8 +314,7 @@ def _get_amazon_metadata(
id_: str,
id_type: Literal['asin', 'isbn'] = 'isbn',
resources: Any = None,
retries: int = 0,
sleep_sec: float = 1,
high_priority: bool = False,
) -> dict | None:
"""Uses the Amazon Product Advertising API ItemLookup operation to locate a
specific book by identifier; either 'isbn' or 'asin'.
@@ -318,12 +324,10 @@
:param str id_type: 'isbn' or 'asin'.
:param Any resources: Used for AWSE Commerce Service lookup
See https://webservices.amazon.com/paapi5/documentation/get-items.html
:param int retries: Number of times to query affiliate server before returning None
:param float sleep_sec: Delay time.sleep(sleep_sec) seconds before each retry
:param bool high_priority: Priority in the import queue. High priority
goes to the front of the queue.
:return: A single book item's metadata, or None.
"""
# TMP: This is causing a bunch of duplicate imports
return None
if not affiliate_server_url:
return None

@@ -339,14 +343,15 @@
id_ = isbn

try:
r = requests.get(f'http://{affiliate_server_url}/isbn/{id_}')
priority = "true" if high_priority else "false"
r = requests.get(
f'http://{affiliate_server_url}/isbn/{id_}?high_priority={priority}'
)
r.raise_for_status()
if hit := r.json().get('hit'):
return hit
if retries <= 1:
if data := r.json().get('hit'):
return data
else:
return None
time.sleep(sleep_sec) # sleep before recursive call
return _get_amazon_metadata(id_, id_type, resources, retries - 1, sleep_sec)
except requests.exceptions.ConnectionError:
logger.exception("Affiliate Server unreachable")
except requests.exceptions.HTTPError:
@@ -415,7 +420,7 @@ def clean_amazon_metadata_for_load(metadata: dict) -> dict:


def create_edition_from_amazon_metadata(
id_: str, id_type: Literal['asin', 'isbn'] = 'isbn', retries: int = 0
id_: str, id_type: Literal['asin', 'isbn'] = 'isbn'
) -> str | None:
"""Fetches Amazon metadata by id from Amazon Product Advertising API, attempts to
create OL edition from metadata, and returns the resulting edition key `/key/OL..M`
@@ -426,7 +431,7 @@ def create_edition_from_amazon_metadata(
:return: Edition key '/key/OL..M' or None
"""

md = get_amazon_metadata(id_, id_type=id_type, retries=retries)
md = get_amazon_metadata(id_, id_type=id_type)

if md and md.get('product_group') == 'Book':
with accounts.RunAs('ImportBot'):
30 changes: 28 additions & 2 deletions openlibrary/plugins/books/code.py
@@ -13,14 +13,40 @@


class books_json(delegate.page):
"""
Endpoint for mapping bib keys (e.g. ISBN, LCCN) to certain links associated
with Open Library editions, such as the thumbnail URL.
- `bibkeys` is expected to be a comma-separated string of ISBNs, LCCNs, etc.
- `'high_priority=true'` will attempt to import an edition from a supplied ISBN
if no matching edition is found. If not `high_priority`, then missed bib_keys
are queued for lookup on the affiliate-server, and any responses are `staged`
in `import_item`.
Example call:
http://localhost:8080/api/books.json?bibkeys=059035342X,0312368615&high_priority=true
Returns a JSONified dictionary of the form:
{"059035342X": {
"bib_key": "059035342X",
"info_url": "http://localhost:8080/books/OL43M/Harry_Potter_and_the_Sorcerer's_Stone",
"preview": "noview",
"preview_url": "https://archive.org/details/lccn_078073006991",
"thumbnail_url": "https://covers.openlibrary.org/b/id/21-S.jpg"
},
"0312368615": {...}
}
"""

path = "/api/books"

@jsonapi
def GET(self):
i = web.input(bibkeys='', callback=None, details="false")
i = web.input(bibkeys='', callback=None, details="false", high_priority="false")
if web.ctx.path.endswith('.json'):
i.format = 'json'
return dynlinks.dynlinks(i.bibkeys.split(","), i)
i.high_priority = i.get("high_priority") == "true"
return dynlinks.dynlinks(bib_keys=i.bibkeys.split(","), options=i)


class read_singleget(delegate.page):
76 changes: 69 additions & 7 deletions openlibrary/plugins/books/dynlinks.py
@@ -1,8 +1,10 @@
from typing import Any
import json
import sys
from collections.abc import Hashable, Iterable, Mapping
from collections.abc import Generator, Hashable, Iterable, Mapping

import web
from openlibrary.core.models import Edition

from openlibrary.plugins.openlibrary.processors import urlsafe
from openlibrary.core import helpers as h
@@ -426,7 +428,7 @@ def process_doc_for_viewapi(bib_key, page):
return d


def format_result(result, options):
def format_result(result: dict, options: web.storage) -> str:
"""Format result as js or json.
>>> format_result({'x': 1}, {})
@@ -447,20 +449,80 @@ def format_result(result, options):
return "var _OLBookInfo = %s;" % json_data


def dynlinks(bib_keys, options):
def is_isbn(bib_key: str) -> bool:
"""Return True if the bib_key is ostensibly an ISBN (i.e. 10 or 13 characters)."""
return len(bib_key) in {10, 13}


def get_missed_isbn_bib_keys(
bib_keys: Iterable[str], found_records: dict
) -> Generator[str, None, None]:
"""
Return a Generator[str, None, None] with all ISBN bib_keys not in `found_records`.
"""
return (
bib_key
for bib_key in bib_keys
if bib_key not in found_records and is_isbn(bib_key)
)


def get_isbn_editiondict_map(
isbns: Iterable, high_priority: bool = False
) -> dict[str, Any]:
"""
Attempts to import items from their ISBN, returning a mapping of possibly
imported edition_dicts in the following form:
{isbn_string: edition_dict, ...}
"""
# Get a mapping of ISBNs to new Editions (or `None`)
isbn_edition_map = {
isbn: Edition.from_isbn(isbn=isbn, high_priority=high_priority)
for isbn in isbns
}

# Convert editions to dicts, dropping ISBNs for which no edition was created.
return {
isbn: edition.dict() for isbn, edition in isbn_edition_map.items() if edition
}


def dynlinks(bib_keys: Iterable[str], options: web.storage) -> str:
"""
Return a JSONified dictionary of bib_keys (e.g. ISBN, LCCN) and select URLs
associated with the corresponding edition, if any.
If a bib key is an ISBN, options.high_priority=True, and no edition is found,
an import is attempted with high priority; otherwise missed bib_keys are queued
for lookup via the affiliate-server and responses are `staged` in `import_item`.
Example return value for a bib key of the ISBN "1452303886":
'{"1452303886": {"bib_key": "1452303886", "info_url": '
'"http://localhost:8080/books/OL24630277M/Fires_of_Prophecy_The_Morcyth_Saga_Book_Two", '
'"preview": "restricted", "preview_url": '
'"https://archive.org/details/978-1-4523-0388-8"}}'
"""
# for backward-compatibility
if options.get("details", "").lower() == "true":
options["jscmd"] = "details"

try:
result = query_docs(bib_keys)
result = process_result(result, options.get('jscmd'))
edition_dicts = query_docs(bib_keys)
# For any ISBN bib_keys without hits, attempt to import+use immediately if
# `high_priority`. Otherwise, queue them for lookup via the AMZ Products
# API and process whatever editions were found in existing data.
if missed_isbns := get_missed_isbn_bib_keys(bib_keys, edition_dicts):
new_editions = get_isbn_editiondict_map(
isbns=missed_isbns, high_priority=options.get("high_priority")
)
edition_dicts.update(new_editions)
edition_dicts = process_result(edition_dicts, options.get('jscmd'))
except:
print("Error in processing Books API", file=sys.stderr)
register_exception()

result = {}
return format_result(result, options)
edition_dicts = {}
return format_result(edition_dicts, options)


if __name__ == "__main__":
26 changes: 25 additions & 1 deletion openlibrary/plugins/openlibrary/code.py
@@ -2,6 +2,7 @@
Open Library Plugin.
"""

from urllib.parse import parse_qs, urlparse, urlencode, urlunparse
import requests
import web
import json
@@ -461,10 +462,33 @@ def GET(self):
return web.ok('OK')


def remove_high_priority(query: str) -> str:
"""
Remove `high_priority=true` and `high_priority=false` from query parameters,
as the API expects to pass URL parameters through to another query, and
these may interfere with that query.
>>> remove_high_priority('high_priority=true&v=1')
'v=1'
"""
query_params = parse_qs(query)
query_params.pop("high_priority", None)
new_query = urlencode(query_params, doseq=True)
return new_query


class isbn_lookup(delegate.page):
path = r'/(?:isbn|ISBN)/([0-9xX-]+)'

def GET(self, isbn):
input = web.input(high_priority=False)

high_priority = input.get("high_priority") == "true"
if "high_priority" in web.ctx.env.get('QUERY_STRING'):
web.ctx.env['QUERY_STRING'] = remove_high_priority(
web.ctx.env.get('QUERY_STRING')
)

# Preserve the url type (e.g. `.json`) and query params
ext = ''
if web.ctx.encoding and web.ctx.path.endswith('.' + web.ctx.encoding):
@@ -473,7 +497,7 @@ def GET(self, isbn):
ext += '?' + web.ctx.env['QUERY_STRING']

try:
if ed := Edition.from_isbn(isbn):
if ed := Edition.from_isbn(isbn=isbn, high_priority=high_priority):
return web.found(ed.key + ext)
except Exception as e:
logger.error(e)
1 change: 1 addition & 0 deletions openlibrary/plugins/upstream/borrow.py
@@ -17,6 +17,7 @@
from infogami.utils.view import public, render_template, add_flash_message
from infogami.infobase.utils import parse_datetime

from openlibrary.core import models
from openlibrary.core import stats
from openlibrary.core import lending
from openlibrary.core import vendors