From 19a15661a67f3c073b4b329ca86a63a344a09a21 Mon Sep 17 00:00:00 2001 From: pidgezero-one Date: Mon, 2 Dec 2024 21:04:03 -0500 Subject: [PATCH 01/11] author identifiers in import --- openlibrary/catalog/add_book/load_book.py | 112 +++++++++++++----- .../catalog/add_book/tests/test_load_book.py | 74 +++++++++++- openlibrary/core/models.py | 31 ++++- 3 files changed, 182 insertions(+), 35 deletions(-) diff --git a/openlibrary/catalog/add_book/load_book.py b/openlibrary/catalog/add_book/load_book.py index 67e86fc7e4a..30e3e9b7941 100644 --- a/openlibrary/catalog/add_book/load_book.py +++ b/openlibrary/catalog/add_book/load_book.py @@ -4,6 +4,7 @@ from openlibrary.catalog.utils import author_dates_match, flip_name, key_int from openlibrary.core.helpers import extract_year +from openlibrary.utils import uniq, extract_numeric_id_from_olid if TYPE_CHECKING: from openlibrary.plugins.upstream.models import Author @@ -144,8 +145,56 @@ def walk_redirects(obj, seen): seen.add(obj['key']) return obj - # Try for an 'exact' (case-insensitive) name match, but fall back to alternate_names, - # then last name with identical birth and death dates (that are not themselves `None` or ''). + def get_redirected_authors(authors: list["Author"]): + if any(a.type.key != '/type/author' for a in authors): + seen: set[dict] = set() + all_authors = [ + walk_redirects(a, seen) for a in authors if a['key'] not in seen + ] + return all_authors + return authors + + # Look for OL ID first. + if key := author.get("key"): + if reply := list(web.ctx.site.things({"type": "/type/author", "key~": key})): + # Always match on OL ID, even if remote identifiers don't match. + return get_redirected_authors([web.ctx.site.get(k) for k in reply]) + # Try other identifiers next. + if identifiers := author.get("identifiers"): + queries = [] + matched_authors = [] + # Get all the authors that match any incoming identifier. + for identifier, val in identifiers.items(): + queries.append({"type": "/type/author", f"remote_ids.{identifier}~": val}) + for query in queries: + if reply := list(web.ctx.site.things(query)): + matched_authors.extend( + get_redirected_authors([web.ctx.site.get(k) for k in reply]) + ) + matched_authors = uniq(matched_authors) + # The match is whichever one has the most identifiers in common AND does not have more conflicts than matched identifiers. + highest_matches = 0 + selected_match = None + for a in matched_authors: + try: + _, matches = a.merge_remote_ids(identifiers) + if matches > highest_matches: + selected_match = a + highest_matches = matches + elif matches == highest_matches and matches > 0: + # Prioritize the lower OL ID when matched identifiers are equal + selected_match = ( + a + if extract_numeric_id_from_olid(a.key) < extract_numeric_id_from_olid(selected_match.key) + else selected_match + ) + except: + # Reject if too many conflicts + # TODO: raise a flag to librarians here? + pass + if highest_matches > 0 and selected_match is not None: + return [selected_match] + # Fall back to name/date matching, which we did before introducing identifiers. name = author["name"].replace("*", r"\*") queries = [ {"type": "/type/author", "name~": name}, @@ -157,29 +206,11 @@ def walk_redirects(obj, seen): "death_date~": f"*{extract_year(author.get('death_date', '')) or -1}*", }, # Use `-1` to ensure an empty string from extract_year doesn't match empty dates. ] + things = [] for query in queries: if reply := list(web.ctx.site.things(query)): + things = get_redirected_authors([web.ctx.site.get(k) for k in reply]) break - - authors = [web.ctx.site.get(k) for k in reply] - if any(a.type.key != '/type/author' for a in authors): - seen: set[dict] = set() - authors = [walk_redirects(a, seen) for a in authors if a['key'] not in seen] - return authors - - -def find_entity(author: dict[str, Any]) -> "Author | None": - """ - Looks for an existing Author record in OL - and returns it if found. - - :param dict author: Author import dict {"name": "Some One"} - :return: Existing Author record if found, or None. - """ - assert isinstance(author, dict) - things = find_author(author) - if author.get('entity_type', 'person') != 'person': - return things[0] if things else None match = [] seen = set() for a in things: @@ -187,7 +218,6 @@ def find_entity(author: dict[str, Any]) -> "Author | None": if key in seen: continue seen.add(key) - orig_key = key assert a.type.key == '/type/author' if 'birth_date' in author and 'birth_date' not in a: continue @@ -197,10 +227,27 @@ def find_entity(author: dict[str, Any]) -> "Author | None": continue match.append(a) if not match: - return None + return [] if len(match) == 1: - return match[0] - return pick_from_matches(author, match) + return [match[0]] + return [pick_from_matches(author, match)] + + +def find_entity(author: dict[str, Any]) -> "Author | None": + """ + Looks for an existing Author record in OL + and returns it if found. + + :param dict author: Author import dict {"name": "Some One"} + :return: Existing Author record if found, or None. + """ + assert isinstance(author, dict) + things = find_author(author) + if "identifiers" in author: + for index, t in enumerate(things): + t.remote_ids, _ = t.merge_remote_ids(author["identifiers"]) + things[index] = t + return things[0] if things else None def remove_author_honorifics(name: str) -> str: @@ -248,9 +295,20 @@ def import_author(author: dict[str, Any], eastern=False) -> "Author | dict[str, new['death_date'] = author['death_date'] return new a = {'type': {'key': '/type/author'}} - for f in 'name', 'title', 'personal_name', 'birth_date', 'death_date', 'date': + for f in ( + 'name', + 'title', + 'personal_name', + 'birth_date', + 'death_date', + 'date', + 'remote_ids', + ): if f in author: a[f] = author[f] + # Import record hitting endpoing should list external IDs under "identifiers", but needs to be "remote_ids" when going into the DB. + if "identifiers" in author: + a["remote_ids"] = author["identifiers"] return a diff --git a/openlibrary/catalog/add_book/tests/test_load_book.py b/openlibrary/catalog/add_book/tests/test_load_book.py index f53b1153566..3e4c49beb1b 100644 --- a/openlibrary/catalog/add_book/tests/test_load_book.py +++ b/openlibrary/catalog/add_book/tests/test_load_book.py @@ -137,9 +137,77 @@ def test_author_wildcard_match_with_no_matches_creates_author_with_wildcard( new_author_name = import_author(author) assert author["name"] == new_author_name["name"] - def test_first_match_priority_name_and_dates(self, mock_site): + def test_first_match_ol_key(self, mock_site): """ - Highest priority match is name, birth date, and death date. + Highest priority match is OL key. + """ + self.add_three_existing_authors(mock_site) + + # Author with VIAF + author = { + "name": "William H. Brewer", + "key": "/authors/OL3A", + "type": {"key": "/type/author"}, + "remote_ids": {"viaf": "12345678"}, + } + + # Another author with VIAF + author_different_key = { + "name": "William Brewer", + "key": "/authors/OL4A", + "type": {"key": "/type/author"}, + "remote_ids": {"viaf": "87654321"}, + } + + mock_site.save(author) + mock_site.save(author_different_key) + + # Look for exact match on OL ID, regardless of other fields. + # We ideally shouldn't ever have a case where different authors have the same VIAF, but this demonstrates priority. + searched_author = { + "name": "William H. Brewer", + "key": "/authors/OL4A", + "identifiers": {"viaf": "12345678"}, + } + found = import_author(searched_author) + assert found.key == author_different_key["key"] + + def test_second_match_remote_identifier(self, mock_site): + """ + Next highest priority match is any other remote identifier, such as VIAF, Goodreads ID, Amazon ID, etc. + """ + self.add_three_existing_authors(mock_site) + + # Author with VIAF + author = { + "name": "William H. Brewer", + "key": "/authors/OL3A", + "type": {"key": "/type/author"}, + "remote_ids": {"viaf": "12345678"}, + } + + # Another author with VIAF + author_different_viaf = { + "name": "William Brewer", + "key": "/authors/OL4A", + "type": {"key": "/type/author"}, + "remote_ids": {"viaf": "87654321"}, + } + + mock_site.save(author) + mock_site.save(author_different_viaf) + + # Look for exact match on VIAF, regardless of name field. + searched_author = { + "name": "William Brewer", + "identifiers": {"viaf": "12345678"}, + } + found = import_author(searched_author) + assert found.key == author["key"] + + def test_third_match_priority_name_and_dates(self, mock_site): + """ + Next highest priority match is name, birth date, and death date. """ self.add_three_existing_authors(mock_site) @@ -202,7 +270,7 @@ def test_non_matching_birth_death_creates_new_author(self, mock_site): assert isinstance(found, dict) assert found["death_date"] == searched_and_not_found_author["death_date"] - def test_second_match_priority_alternate_names_and_dates(self, mock_site): + def test_match_priority_alternate_names_and_dates(self, mock_site): """ Matching, as a unit, alternate name, birth date, and death date, get second match priority. diff --git a/openlibrary/core/models.py b/openlibrary/core/models.py index df78d4f407c..78866784dc0 100644 --- a/openlibrary/core/models.py +++ b/openlibrary/core/models.py @@ -2,6 +2,10 @@ """ import json +import requests +import re +import web +from typing import Any, TypedDict import logging from collections import defaultdict from dataclasses import dataclass, field @@ -9,8 +13,6 @@ from typing import Any, TypedDict from urllib.parse import urlencode -import requests -import web from infogami.infobase import client @@ -30,10 +32,10 @@ from openlibrary.core.imports import ImportItem from openlibrary.core.observations import Observations from openlibrary.core.ratings import Ratings -from openlibrary.core.vendors import get_amazon_metadata -from openlibrary.core.wikidata import WikidataEntity, get_wikidata_entity from openlibrary.utils import extract_numeric_id_from_olid -from openlibrary.utils.isbn import canonical, isbn_13_to_isbn_10, to_isbn_13 +from openlibrary.utils.isbn import to_isbn_13, isbn_13_to_isbn_10, canonical +from openlibrary.core.wikidata import WikidataEntity, get_wikidata_entity, REMOTE_IDS +from openlibrary.core.vendors import get_amazon_metadata from ..accounts import OpenLibraryAccount # noqa: F401 side effects may be needed from ..plugins.upstream.utils import get_coverstore_public_url, get_coverstore_url @@ -807,6 +809,25 @@ def get_edition_count(self): def get_lists(self, limit=50, offset=0, sort=True): return self._get_lists(limit=limit, offset=offset, sort=sort) + def merge_remote_ids( + self, incoming_ids: dict[str, str] + ) -> tuple[dict[str, str], int]: + output = {**self.remote_ids} + if len(incoming_ids.items()) == 0: + return output, 0 + matches = 0 + conflicts = 0 + for identifier in REMOTE_IDS: + if identifier in output and identifier in incoming_ids: + if output[identifier] != incoming_ids[identifier]: + conflicts = conflicts + 1 + else: + output[identifier] = incoming_ids[identifier] + matches = matches + 1 + if conflicts > matches: + raise Exception("wikidata json conflicts with existing remote ids") + return output, matches + class User(Thing): DEFAULT_PREFERENCES = { From 5912f75021b621a26dbd61b730edc1d933a5bca1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 3 Dec 2024 02:11:29 +0000 Subject: [PATCH 02/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- openlibrary/catalog/add_book/load_book.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openlibrary/catalog/add_book/load_book.py b/openlibrary/catalog/add_book/load_book.py index 30e3e9b7941..e69c78e4c0b 100644 --- a/openlibrary/catalog/add_book/load_book.py +++ b/openlibrary/catalog/add_book/load_book.py @@ -185,7 +185,8 @@ def get_redirected_authors(authors: list["Author"]): # Prioritize the lower OL ID when matched identifiers are equal selected_match = ( a - if extract_numeric_id_from_olid(a.key) < extract_numeric_id_from_olid(selected_match.key) + if extract_numeric_id_from_olid(a.key) + < extract_numeric_id_from_olid(selected_match.key) else selected_match ) except: From 8305af5158e576f8fa3b6aff65d1e45f263d2922 Mon Sep 17 00:00:00 2001 From: pidgezero-one Date: Mon, 2 Dec 2024 21:13:29 -0500 Subject: [PATCH 03/11] this wasnt supposed to be here --- openlibrary/core/models.py | 31 +++++-------------------------- 1 file changed, 5 insertions(+), 26 deletions(-) diff --git a/openlibrary/core/models.py b/openlibrary/core/models.py index 78866784dc0..df78d4f407c 100644 --- a/openlibrary/core/models.py +++ b/openlibrary/core/models.py @@ -2,10 +2,6 @@ """ import json -import requests -import re -import web -from typing import Any, TypedDict import logging from collections import defaultdict from dataclasses import dataclass, field @@ -13,6 +9,8 @@ from typing import Any, TypedDict from urllib.parse import urlencode +import requests +import web from infogami.infobase import client @@ -32,10 +30,10 @@ from openlibrary.core.imports import ImportItem from openlibrary.core.observations import Observations from openlibrary.core.ratings import Ratings -from openlibrary.utils import extract_numeric_id_from_olid -from openlibrary.utils.isbn import to_isbn_13, isbn_13_to_isbn_10, canonical -from openlibrary.core.wikidata import WikidataEntity, get_wikidata_entity, REMOTE_IDS from openlibrary.core.vendors import get_amazon_metadata +from openlibrary.core.wikidata import WikidataEntity, get_wikidata_entity +from openlibrary.utils import extract_numeric_id_from_olid +from openlibrary.utils.isbn import canonical, isbn_13_to_isbn_10, to_isbn_13 from ..accounts import OpenLibraryAccount # noqa: F401 side effects may be needed from ..plugins.upstream.utils import get_coverstore_public_url, get_coverstore_url @@ -809,25 +807,6 @@ def get_edition_count(self): def get_lists(self, limit=50, offset=0, sort=True): return self._get_lists(limit=limit, offset=offset, sort=sort) - def merge_remote_ids( - self, incoming_ids: dict[str, str] - ) -> tuple[dict[str, str], int]: - output = {**self.remote_ids} - if len(incoming_ids.items()) == 0: - return output, 0 - matches = 0 - conflicts = 0 - for identifier in REMOTE_IDS: - if identifier in output and identifier in incoming_ids: - if output[identifier] != incoming_ids[identifier]: - conflicts = conflicts + 1 - else: - output[identifier] = incoming_ids[identifier] - matches = matches + 1 - if conflicts > matches: - raise Exception("wikidata json conflicts with existing remote ids") - return output, matches - class User(Thing): DEFAULT_PREFERENCES = { From 280d715b295bc89156a34b6dad80beba35d39595 Mon Sep 17 00:00:00 2001 From: pidgezero-one Date: Mon, 2 Dec 2024 21:14:27 -0500 Subject: [PATCH 04/11] this was supposed to be here --- .../plugins/importapi/import_edition_builder.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/openlibrary/plugins/importapi/import_edition_builder.py b/openlibrary/plugins/importapi/import_edition_builder.py index 579ddc8d75e..b6bfbee2716 100644 --- a/openlibrary/plugins/importapi/import_edition_builder.py +++ b/openlibrary/plugins/importapi/import_edition_builder.py @@ -100,10 +100,15 @@ def add_list(self, key, val): self.edition_dict[key] = [val] def add_author(self, key, val): - # We don't know birth_date or death_date. - # Should name and personal_name be the same value? - author_dict = {'personal_name': val, 'name': val, 'entity_type': 'person'} - self.add_list('authors', author_dict) + if isinstance(val, dict): + author_dict = val + if "name" in author_dict: + author_dict['personal_name'] = author_dict['name'] + self.add_list('authors', author_dict) + else: + self.add_list( + 'authors', {'personal_name': val, 'name': val, 'entity_type': 'person'} + ) def add_illustrator(self, key, val): self.add_list('contributions', val + ' (Illustrator)') @@ -151,3 +156,4 @@ def add(self, key, val, restrict_keys=True): add_func(new_key, val) else: self.add_string(key, val) + From a3b8412e6ee0f8ec97ba0f65526daeafcc840070 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 3 Dec 2024 02:15:11 +0000 Subject: [PATCH 05/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- openlibrary/plugins/importapi/import_edition_builder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/openlibrary/plugins/importapi/import_edition_builder.py b/openlibrary/plugins/importapi/import_edition_builder.py index b6bfbee2716..b66146039ec 100644 --- a/openlibrary/plugins/importapi/import_edition_builder.py +++ b/openlibrary/plugins/importapi/import_edition_builder.py @@ -156,4 +156,3 @@ def add(self, key, val, restrict_keys=True): add_func(new_key, val) else: self.add_string(key, val) - From 810dc33487f33aa734e9a047694e279a90d5a5d3 Mon Sep 17 00:00:00 2001 From: pidgezero-one Date: Mon, 2 Dec 2024 21:17:46 -0500 Subject: [PATCH 06/11] notes --- openlibrary/catalog/add_book/load_book.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/openlibrary/catalog/add_book/load_book.py b/openlibrary/catalog/add_book/load_book.py index e69c78e4c0b..24172501f25 100644 --- a/openlibrary/catalog/add_book/load_book.py +++ b/openlibrary/catalog/add_book/load_book.py @@ -177,6 +177,7 @@ def get_redirected_authors(authors: list["Author"]): selected_match = None for a in matched_authors: try: + # merge_remote_ids will be in #10092 _, matches = a.merge_remote_ids(identifiers) if matches > highest_matches: selected_match = a @@ -246,6 +247,7 @@ def find_entity(author: dict[str, Any]) -> "Author | None": things = find_author(author) if "identifiers" in author: for index, t in enumerate(things): + # merge_remote_ids will be in #10092 t.remote_ids, _ = t.merge_remote_ids(author["identifiers"]) things[index] = t return things[0] if things else None From 9d54ff7a69dd51090e5d7a0fadbf50230a21eb69 Mon Sep 17 00:00:00 2001 From: pidgezero-one Date: Mon, 2 Dec 2024 22:07:42 -0500 Subject: [PATCH 07/11] no more try/catch --- openlibrary/catalog/add_book/load_book.py | 31 ++++++++++------------- 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/openlibrary/catalog/add_book/load_book.py b/openlibrary/catalog/add_book/load_book.py index 24172501f25..76994ae4ed4 100644 --- a/openlibrary/catalog/add_book/load_book.py +++ b/openlibrary/catalog/add_book/load_book.py @@ -176,24 +176,19 @@ def get_redirected_authors(authors: list["Author"]): highest_matches = 0 selected_match = None for a in matched_authors: - try: - # merge_remote_ids will be in #10092 - _, matches = a.merge_remote_ids(identifiers) - if matches > highest_matches: - selected_match = a - highest_matches = matches - elif matches == highest_matches and matches > 0: - # Prioritize the lower OL ID when matched identifiers are equal - selected_match = ( - a - if extract_numeric_id_from_olid(a.key) - < extract_numeric_id_from_olid(selected_match.key) - else selected_match - ) - except: - # Reject if too many conflicts - # TODO: raise a flag to librarians here? - pass + # merge_remote_ids will be in #10092 + _, matches = a.merge_remote_ids(identifiers) + if matches > highest_matches: + selected_match = a + highest_matches = matches + elif matches == highest_matches and matches > 0: + # Prioritize the lower OL ID when matched identifiers are equal + selected_match = ( + a + if extract_numeric_id_from_olid(a.key) + < extract_numeric_id_from_olid(selected_match.key) + else selected_match + ) if highest_matches > 0 and selected_match is not None: return [selected_match] # Fall back to name/date matching, which we did before introducing identifiers. From 0be86b7f856ac8f403fca7b6e17ae43bddc0650f Mon Sep 17 00:00:00 2001 From: pidgezero-one Date: Mon, 2 Dec 2024 22:46:48 -0500 Subject: [PATCH 08/11] precommits --- openlibrary/catalog/add_book/load_book.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/openlibrary/catalog/add_book/load_book.py b/openlibrary/catalog/add_book/load_book.py index 76994ae4ed4..1d4bf0755a9 100644 --- a/openlibrary/catalog/add_book/load_book.py +++ b/openlibrary/catalog/add_book/load_book.py @@ -4,7 +4,7 @@ from openlibrary.catalog.utils import author_dates_match, flip_name, key_int from openlibrary.core.helpers import extract_year -from openlibrary.utils import uniq, extract_numeric_id_from_olid +from openlibrary.utils import extract_numeric_id_from_olid, uniq if TYPE_CHECKING: from openlibrary.plugins.upstream.models import Author @@ -155,10 +155,9 @@ def get_redirected_authors(authors: list["Author"]): return authors # Look for OL ID first. - if key := author.get("key"): - if reply := list(web.ctx.site.things({"type": "/type/author", "key~": key})): - # Always match on OL ID, even if remote identifiers don't match. - return get_redirected_authors([web.ctx.site.get(k) for k in reply]) + if (key := author.get("key")) and (reply := list(web.ctx.site.things({"type": "/type/author", "key~": key}))): + # Always match on OL ID, even if remote identifiers don't match. + return get_redirected_authors([web.ctx.site.get(k) for k in reply]) # Try other identifiers next. if identifiers := author.get("identifiers"): queries = [] @@ -181,7 +180,7 @@ def get_redirected_authors(authors: list["Author"]): if matches > highest_matches: selected_match = a highest_matches = matches - elif matches == highest_matches and matches > 0: + elif matches == highest_matches and matches > 0 and selected_match is not None: # Prioritize the lower OL ID when matched identifiers are equal selected_match = ( a From f57f9973aab471bb6b2955f97cd89d5bd98cbba9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 3 Dec 2024 03:47:53 +0000 Subject: [PATCH 09/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- openlibrary/catalog/add_book/load_book.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/openlibrary/catalog/add_book/load_book.py b/openlibrary/catalog/add_book/load_book.py index 1d4bf0755a9..1d1e1153442 100644 --- a/openlibrary/catalog/add_book/load_book.py +++ b/openlibrary/catalog/add_book/load_book.py @@ -155,7 +155,9 @@ def get_redirected_authors(authors: list["Author"]): return authors # Look for OL ID first. - if (key := author.get("key")) and (reply := list(web.ctx.site.things({"type": "/type/author", "key~": key}))): + if (key := author.get("key")) and ( + reply := list(web.ctx.site.things({"type": "/type/author", "key~": key})) + ): # Always match on OL ID, even if remote identifiers don't match. return get_redirected_authors([web.ctx.site.get(k) for k in reply]) # Try other identifiers next. @@ -180,7 +182,11 @@ def get_redirected_authors(authors: list["Author"]): if matches > highest_matches: selected_match = a highest_matches = matches - elif matches == highest_matches and matches > 0 and selected_match is not None: + elif ( + matches == highest_matches + and matches > 0 + and selected_match is not None + ): # Prioritize the lower OL ID when matched identifiers are equal selected_match = ( a From 82b198c5bfef6050d87c55814bf67e62666c1104 Mon Sep 17 00:00:00 2001 From: pidgezero-one Date: Thu, 5 Dec 2024 17:56:14 -0500 Subject: [PATCH 10/11] ? --- openlibrary/core/models.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/openlibrary/core/models.py b/openlibrary/core/models.py index df78d4f407c..9eb41c4c473 100644 --- a/openlibrary/core/models.py +++ b/openlibrary/core/models.py @@ -34,6 +34,7 @@ from openlibrary.core.wikidata import WikidataEntity, get_wikidata_entity from openlibrary.utils import extract_numeric_id_from_olid from openlibrary.utils.isbn import canonical, isbn_13_to_isbn_10, to_isbn_13 +from openlibrary.plugins.upstream.utils import get_author_config from ..accounts import OpenLibraryAccount # noqa: F401 side effects may be needed from ..plugins.upstream.utils import get_coverstore_public_url, get_coverstore_url @@ -806,7 +807,28 @@ def get_edition_count(self): def get_lists(self, limit=50, offset=0, sort=True): return self._get_lists(limit=limit, offset=offset, sort=sort) - + + def merge_remote_ids( + self, incoming_ids: dict[str, str] + ) -> tuple[dict[str, str], int]: + output = {**self.remote_ids} + if len(incoming_ids.items()) == 0: + return output, -1 + matches = 0 + conflicts = 0 + for id in get_author_config(): + identifier: str = id.name + if identifier in output and identifier in incoming_ids: + if output[identifier] != incoming_ids[identifier]: + conflicts = conflicts + 1 + else: + output[identifier] = incoming_ids[identifier] + matches = matches + 1 + if conflicts > matches: + # This means that the identifiers we already have for this author have too many conflicts with whichever identifiers we're trying to merge into it. + # TODO: Raise this to librarians, somehow. + return self.remote_ids, -1 + return output, matches class User(Thing): DEFAULT_PREFERENCES = { From 87d9f25b0b5d05cb1d1970da99ff1d373bb3d920 Mon Sep 17 00:00:00 2001 From: pidgezero-one Date: Thu, 12 Dec 2024 08:59:44 -0500 Subject: [PATCH 11/11] re: slack convo, go ahead and import this and use get_author_config over hardcoded IDs --- openlibrary/catalog/add_book/load_book.py | 2 +- openlibrary/core/models.py | 48 ++++++++++++----------- 2 files changed, 27 insertions(+), 23 deletions(-) diff --git a/openlibrary/catalog/add_book/load_book.py b/openlibrary/catalog/add_book/load_book.py index 1d1e1153442..9286ed2ba1b 100644 --- a/openlibrary/catalog/add_book/load_book.py +++ b/openlibrary/catalog/add_book/load_book.py @@ -155,7 +155,7 @@ def get_redirected_authors(authors: list["Author"]): return authors # Look for OL ID first. - if (key := author.get("key")) and ( + if (key := author.get("ol_id")) and ( reply := list(web.ctx.site.things({"type": "/type/author", "key~": key})) ): # Always match on OL ID, even if remote identifiers don't match. diff --git a/openlibrary/core/models.py b/openlibrary/core/models.py index 9eb41c4c473..0c92a9b5dd1 100644 --- a/openlibrary/core/models.py +++ b/openlibrary/core/models.py @@ -32,9 +32,9 @@ from openlibrary.core.ratings import Ratings from openlibrary.core.vendors import get_amazon_metadata from openlibrary.core.wikidata import WikidataEntity, get_wikidata_entity +from openlibrary.plugins.upstream.utils import get_author_config from openlibrary.utils import extract_numeric_id_from_olid from openlibrary.utils.isbn import canonical, isbn_13_to_isbn_10, to_isbn_13 -from openlibrary.plugins.upstream.utils import get_author_config from ..accounts import OpenLibraryAccount # noqa: F401 side effects may be needed from ..plugins.upstream.utils import get_coverstore_public_url, get_coverstore_url @@ -807,28 +807,32 @@ def get_edition_count(self): def get_lists(self, limit=50, offset=0, sort=True): return self._get_lists(limit=limit, offset=offset, sort=sort) - + def merge_remote_ids( - self, incoming_ids: dict[str, str] - ) -> tuple[dict[str, str], int]: - output = {**self.remote_ids} - if len(incoming_ids.items()) == 0: - return output, -1 - matches = 0 - conflicts = 0 - for id in get_author_config(): - identifier: str = id.name - if identifier in output and identifier in incoming_ids: - if output[identifier] != incoming_ids[identifier]: - conflicts = conflicts + 1 - else: - output[identifier] = incoming_ids[identifier] - matches = matches + 1 - if conflicts > matches: - # This means that the identifiers we already have for this author have too many conflicts with whichever identifiers we're trying to merge into it. - # TODO: Raise this to librarians, somehow. - return self.remote_ids, -1 - return output, matches + self, incoming_ids: dict[str, str] + ) -> tuple[dict[str, str], int]: + """Returns the author's remote IDs merged with a given remote IDs object, as well as a count for how many IDs had conflicts. + If incoming_ids is empty, or if there are more conflicts than matches, no merge will be attempted, and the output will be (author.remote_ids, -1). + """ + output = {**self.remote_ids} + if len(incoming_ids.items()) == 0: + return output, -1 + # Count + matches = 0 + conflicts = 0 + for id in get_author_config(): + identifier: str = id.name + if identifier in output and identifier in incoming_ids: + if output[identifier] != incoming_ids[identifier]: + conflicts = conflicts + 1 + else: + output[identifier] = incoming_ids[identifier] + matches = matches + 1 + if conflicts > matches: + # TODO: Raise this to librarians, somehow. + return self.remote_ids, -1 + return output, matches + class User(Thing): DEFAULT_PREFERENCES = {