From 19a15661a67f3c073b4b329ca86a63a344a09a21 Mon Sep 17 00:00:00 2001
From: pidgezero-one <s.kischak@gmail.com>
Date: Mon, 2 Dec 2024 21:04:03 -0500
Subject: [PATCH 01/11] author identifiers in import

---
 openlibrary/catalog/add_book/load_book.py     | 112 +++++++++++++-----
 .../catalog/add_book/tests/test_load_book.py  |  74 +++++++++++-
 openlibrary/core/models.py                    |  31 ++++-
 3 files changed, 182 insertions(+), 35 deletions(-)

diff --git a/openlibrary/catalog/add_book/load_book.py b/openlibrary/catalog/add_book/load_book.py
index 67e86fc7e4a..30e3e9b7941 100644
--- a/openlibrary/catalog/add_book/load_book.py
+++ b/openlibrary/catalog/add_book/load_book.py
@@ -4,6 +4,7 @@
 
 from openlibrary.catalog.utils import author_dates_match, flip_name, key_int
 from openlibrary.core.helpers import extract_year
+from openlibrary.utils import uniq, extract_numeric_id_from_olid
 
 if TYPE_CHECKING:
     from openlibrary.plugins.upstream.models import Author
@@ -144,8 +145,56 @@ def walk_redirects(obj, seen):
             seen.add(obj['key'])
         return obj
 
-    # Try for an 'exact' (case-insensitive) name match, but fall back to alternate_names,
-    # then last name with identical birth and death dates (that are not themselves `None` or '').
+    def get_redirected_authors(authors: list["Author"]):
+        if any(a.type.key != '/type/author' for a in authors):
+            seen: set[dict] = set()
+            all_authors = [
+                walk_redirects(a, seen) for a in authors if a['key'] not in seen
+            ]
+            return all_authors
+        return authors
+
+    # Look for OL ID first.
+    if key := author.get("key"):
+        if reply := list(web.ctx.site.things({"type": "/type/author", "key~": key})):
+            # Always match on OL ID, even if remote identifiers don't match.
+            return get_redirected_authors([web.ctx.site.get(k) for k in reply])
+    # Try other identifiers next.
+    if identifiers := author.get("identifiers"):
+        queries = []
+        matched_authors = []
+        # Get all the authors that match any incoming identifier.
+        for identifier, val in identifiers.items():
+            queries.append({"type": "/type/author", f"remote_ids.{identifier}~": val})
+        for query in queries:
+            if reply := list(web.ctx.site.things(query)):
+                matched_authors.extend(
+                    get_redirected_authors([web.ctx.site.get(k) for k in reply])
+                )
+        matched_authors = uniq(matched_authors)
+        # The match is whichever one has the most identifiers in common AND does not have more conflicts than matched identifiers.
+        highest_matches = 0
+        selected_match = None
+        for a in matched_authors:
+            try:
+                _, matches = a.merge_remote_ids(identifiers)
+                if matches > highest_matches:
+                    selected_match = a
+                    highest_matches = matches
+                elif matches == highest_matches and matches > 0:
+                    # Prioritize the lower OL ID when matched identifiers are equal
+                    selected_match = (
+                        a
+                        if extract_numeric_id_from_olid(a.key) < extract_numeric_id_from_olid(selected_match.key)
+                        else selected_match
+                    )
+            except:
+                # Reject if too many conflicts
+                # TODO: raise a flag to librarians here?
+                pass
+        if highest_matches > 0 and selected_match is not None:
+            return [selected_match]
+    # Fall back to name/date matching, which we did before introducing identifiers.
     name = author["name"].replace("*", r"\*")
     queries = [
         {"type": "/type/author", "name~": name},
@@ -157,29 +206,11 @@ def walk_redirects(obj, seen):
             "death_date~": f"*{extract_year(author.get('death_date', '')) or -1}*",
         },  # Use `-1` to ensure an empty string from extract_year doesn't match empty dates.
     ]
+    things = []
     for query in queries:
         if reply := list(web.ctx.site.things(query)):
+            things = get_redirected_authors([web.ctx.site.get(k) for k in reply])
             break
-
-    authors = [web.ctx.site.get(k) for k in reply]
-    if any(a.type.key != '/type/author' for a in authors):
-        seen: set[dict] = set()
-        authors = [walk_redirects(a, seen) for a in authors if a['key'] not in seen]
-    return authors
-
-
-def find_entity(author: dict[str, Any]) -> "Author | None":
-    """
-    Looks for an existing Author record in OL
-    and returns it if found.
-
-    :param dict author: Author import dict {"name": "Some One"}
-    :return: Existing Author record if found, or None.
-    """
-    assert isinstance(author, dict)
-    things = find_author(author)
-    if author.get('entity_type', 'person') != 'person':
-        return things[0] if things else None
     match = []
     seen = set()
     for a in things:
@@ -187,7 +218,6 @@ def find_entity(author: dict[str, Any]) -> "Author | None":
         if key in seen:
             continue
         seen.add(key)
-        orig_key = key
         assert a.type.key == '/type/author'
         if 'birth_date' in author and 'birth_date' not in a:
             continue
@@ -197,10 +227,27 @@ def find_entity(author: dict[str, Any]) -> "Author | None":
             continue
         match.append(a)
     if not match:
-        return None
+        return []
     if len(match) == 1:
-        return match[0]
-    return pick_from_matches(author, match)
+        return [match[0]]
+    return [pick_from_matches(author, match)]
+
+
+def find_entity(author: dict[str, Any]) -> "Author | None":
+    """
+    Looks for an existing Author record in OL
+    and returns it if found.
+
+    :param dict author: Author import dict {"name": "Some One"}
+    :return: Existing Author record if found, or None.
+    """
+    assert isinstance(author, dict)
+    things = find_author(author)
+    if "identifiers" in author:
+        for index, t in enumerate(things):
+            t.remote_ids, _ = t.merge_remote_ids(author["identifiers"])
+            things[index] = t
+    return things[0] if things else None
 
 
 def remove_author_honorifics(name: str) -> str:
@@ -248,9 +295,20 @@ def import_author(author: dict[str, Any], eastern=False) -> "Author | dict[str,
             new['death_date'] = author['death_date']
         return new
     a = {'type': {'key': '/type/author'}}
-    for f in 'name', 'title', 'personal_name', 'birth_date', 'death_date', 'date':
+    for f in (
+        'name',
+        'title',
+        'personal_name',
+        'birth_date',
+        'death_date',
+        'date',
+        'remote_ids',
+    ):
         if f in author:
             a[f] = author[f]
+    # Import record hitting endpoing should list external IDs under "identifiers", but needs to be "remote_ids" when going into the DB.
+    if "identifiers" in author:
+        a["remote_ids"] = author["identifiers"]
     return a
 
 
diff --git a/openlibrary/catalog/add_book/tests/test_load_book.py b/openlibrary/catalog/add_book/tests/test_load_book.py
index f53b1153566..3e4c49beb1b 100644
--- a/openlibrary/catalog/add_book/tests/test_load_book.py
+++ b/openlibrary/catalog/add_book/tests/test_load_book.py
@@ -137,9 +137,77 @@ def test_author_wildcard_match_with_no_matches_creates_author_with_wildcard(
         new_author_name = import_author(author)
         assert author["name"] == new_author_name["name"]
 
-    def test_first_match_priority_name_and_dates(self, mock_site):
+    def test_first_match_ol_key(self, mock_site):
         """
-        Highest priority match is name, birth date, and death date.
+        Highest priority match is OL key.
+        """
+        self.add_three_existing_authors(mock_site)
+
+        # Author with VIAF
+        author = {
+            "name": "William H. Brewer",
+            "key": "/authors/OL3A",
+            "type": {"key": "/type/author"},
+            "remote_ids": {"viaf": "12345678"},
+        }
+
+        # Another author with VIAF
+        author_different_key = {
+            "name": "William Brewer",
+            "key": "/authors/OL4A",
+            "type": {"key": "/type/author"},
+            "remote_ids": {"viaf": "87654321"},
+        }
+
+        mock_site.save(author)
+        mock_site.save(author_different_key)
+
+        # Look for exact match on OL ID, regardless of other fields.
+        # We ideally shouldn't ever have a case where different authors have the same VIAF, but this demonstrates priority.
+        searched_author = {
+            "name": "William H. Brewer",
+            "key": "/authors/OL4A",
+            "identifiers": {"viaf": "12345678"},
+        }
+        found = import_author(searched_author)
+        assert found.key == author_different_key["key"]
+
+    def test_second_match_remote_identifier(self, mock_site):
+        """
+        Next highest priority match is any other remote identifier, such as VIAF, Goodreads ID, Amazon ID, etc.
+        """
+        self.add_three_existing_authors(mock_site)
+
+        # Author with VIAF
+        author = {
+            "name": "William H. Brewer",
+            "key": "/authors/OL3A",
+            "type": {"key": "/type/author"},
+            "remote_ids": {"viaf": "12345678"},
+        }
+
+        # Another author with VIAF
+        author_different_viaf = {
+            "name": "William Brewer",
+            "key": "/authors/OL4A",
+            "type": {"key": "/type/author"},
+            "remote_ids": {"viaf": "87654321"},
+        }
+
+        mock_site.save(author)
+        mock_site.save(author_different_viaf)
+
+        # Look for exact match on VIAF, regardless of name field.
+        searched_author = {
+            "name": "William Brewer",
+            "identifiers": {"viaf": "12345678"},
+        }
+        found = import_author(searched_author)
+        assert found.key == author["key"]
+
+    def test_third_match_priority_name_and_dates(self, mock_site):
+        """
+        Next highest priority match is name, birth date, and death date.
         """
         self.add_three_existing_authors(mock_site)
 
@@ -202,7 +270,7 @@ def test_non_matching_birth_death_creates_new_author(self, mock_site):
         assert isinstance(found, dict)
         assert found["death_date"] == searched_and_not_found_author["death_date"]
 
-    def test_second_match_priority_alternate_names_and_dates(self, mock_site):
+    def test_match_priority_alternate_names_and_dates(self, mock_site):
         """
         Matching, as a unit, alternate name, birth date, and death date, get
         second match priority.
diff --git a/openlibrary/core/models.py b/openlibrary/core/models.py
index df78d4f407c..78866784dc0 100644
--- a/openlibrary/core/models.py
+++ b/openlibrary/core/models.py
@@ -2,6 +2,10 @@
 """
 
 import json
+import requests
+import re
+import web
+from typing import Any, TypedDict
 import logging
 from collections import defaultdict
 from dataclasses import dataclass, field
@@ -9,8 +13,6 @@
 from typing import Any, TypedDict
 from urllib.parse import urlencode
 
-import requests
-import web
 
 from infogami.infobase import client
 
@@ -30,10 +32,10 @@
 from openlibrary.core.imports import ImportItem
 from openlibrary.core.observations import Observations
 from openlibrary.core.ratings import Ratings
-from openlibrary.core.vendors import get_amazon_metadata
-from openlibrary.core.wikidata import WikidataEntity, get_wikidata_entity
 from openlibrary.utils import extract_numeric_id_from_olid
-from openlibrary.utils.isbn import canonical, isbn_13_to_isbn_10, to_isbn_13
+from openlibrary.utils.isbn import to_isbn_13, isbn_13_to_isbn_10, canonical
+from openlibrary.core.wikidata import WikidataEntity, get_wikidata_entity, REMOTE_IDS
+from openlibrary.core.vendors import get_amazon_metadata
 
 from ..accounts import OpenLibraryAccount  # noqa: F401 side effects may be needed
 from ..plugins.upstream.utils import get_coverstore_public_url, get_coverstore_url
@@ -807,6 +809,25 @@ def get_edition_count(self):
     def get_lists(self, limit=50, offset=0, sort=True):
         return self._get_lists(limit=limit, offset=offset, sort=sort)
 
+    def merge_remote_ids(
+        self, incoming_ids: dict[str, str]
+    ) -> tuple[dict[str, str], int]:
+        output = {**self.remote_ids}
+        if len(incoming_ids.items()) == 0:
+            return output, 0
+        matches = 0
+        conflicts = 0
+        for identifier in REMOTE_IDS:
+            if identifier in output and identifier in incoming_ids:
+                if output[identifier] != incoming_ids[identifier]:
+                    conflicts = conflicts + 1
+                else:
+                    output[identifier] = incoming_ids[identifier]
+                    matches = matches + 1
+        if conflicts > matches:
+            raise Exception("wikidata json conflicts with existing remote ids")
+        return output, matches
+
 
 class User(Thing):
     DEFAULT_PREFERENCES = {

From 5912f75021b621a26dbd61b730edc1d933a5bca1 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 3 Dec 2024 02:11:29 +0000
Subject: [PATCH 02/11] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 openlibrary/catalog/add_book/load_book.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/openlibrary/catalog/add_book/load_book.py b/openlibrary/catalog/add_book/load_book.py
index 30e3e9b7941..e69c78e4c0b 100644
--- a/openlibrary/catalog/add_book/load_book.py
+++ b/openlibrary/catalog/add_book/load_book.py
@@ -185,7 +185,8 @@ def get_redirected_authors(authors: list["Author"]):
                     # Prioritize the lower OL ID when matched identifiers are equal
                     selected_match = (
                         a
-                        if extract_numeric_id_from_olid(a.key) < extract_numeric_id_from_olid(selected_match.key)
+                        if extract_numeric_id_from_olid(a.key)
+                        < extract_numeric_id_from_olid(selected_match.key)
                         else selected_match
                     )
             except:

From 8305af5158e576f8fa3b6aff65d1e45f263d2922 Mon Sep 17 00:00:00 2001
From: pidgezero-one <s.kischak@gmail.com>
Date: Mon, 2 Dec 2024 21:13:29 -0500
Subject: [PATCH 03/11] this wasnt supposed to be here

---
 openlibrary/core/models.py | 31 +++++--------------------------
 1 file changed, 5 insertions(+), 26 deletions(-)

diff --git a/openlibrary/core/models.py b/openlibrary/core/models.py
index 78866784dc0..df78d4f407c 100644
--- a/openlibrary/core/models.py
+++ b/openlibrary/core/models.py
@@ -2,10 +2,6 @@
 """
 
 import json
-import requests
-import re
-import web
-from typing import Any, TypedDict
 import logging
 from collections import defaultdict
 from dataclasses import dataclass, field
@@ -13,6 +9,8 @@
 from typing import Any, TypedDict
 from urllib.parse import urlencode
 
+import requests
+import web
 
 from infogami.infobase import client
 
@@ -32,10 +30,10 @@
 from openlibrary.core.imports import ImportItem
 from openlibrary.core.observations import Observations
 from openlibrary.core.ratings import Ratings
-from openlibrary.utils import extract_numeric_id_from_olid
-from openlibrary.utils.isbn import to_isbn_13, isbn_13_to_isbn_10, canonical
-from openlibrary.core.wikidata import WikidataEntity, get_wikidata_entity, REMOTE_IDS
 from openlibrary.core.vendors import get_amazon_metadata
+from openlibrary.core.wikidata import WikidataEntity, get_wikidata_entity
+from openlibrary.utils import extract_numeric_id_from_olid
+from openlibrary.utils.isbn import canonical, isbn_13_to_isbn_10, to_isbn_13
 
 from ..accounts import OpenLibraryAccount  # noqa: F401 side effects may be needed
 from ..plugins.upstream.utils import get_coverstore_public_url, get_coverstore_url
@@ -809,25 +807,6 @@ def get_edition_count(self):
     def get_lists(self, limit=50, offset=0, sort=True):
         return self._get_lists(limit=limit, offset=offset, sort=sort)
 
-    def merge_remote_ids(
-        self, incoming_ids: dict[str, str]
-    ) -> tuple[dict[str, str], int]:
-        output = {**self.remote_ids}
-        if len(incoming_ids.items()) == 0:
-            return output, 0
-        matches = 0
-        conflicts = 0
-        for identifier in REMOTE_IDS:
-            if identifier in output and identifier in incoming_ids:
-                if output[identifier] != incoming_ids[identifier]:
-                    conflicts = conflicts + 1
-                else:
-                    output[identifier] = incoming_ids[identifier]
-                    matches = matches + 1
-        if conflicts > matches:
-            raise Exception("wikidata json conflicts with existing remote ids")
-        return output, matches
-
 
 class User(Thing):
     DEFAULT_PREFERENCES = {

From 280d715b295bc89156a34b6dad80beba35d39595 Mon Sep 17 00:00:00 2001
From: pidgezero-one <s.kischak@gmail.com>
Date: Mon, 2 Dec 2024 21:14:27 -0500
Subject: [PATCH 04/11] this was supposed to be here

---
 .../plugins/importapi/import_edition_builder.py    | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/openlibrary/plugins/importapi/import_edition_builder.py b/openlibrary/plugins/importapi/import_edition_builder.py
index 579ddc8d75e..b6bfbee2716 100644
--- a/openlibrary/plugins/importapi/import_edition_builder.py
+++ b/openlibrary/plugins/importapi/import_edition_builder.py
@@ -100,10 +100,15 @@ def add_list(self, key, val):
             self.edition_dict[key] = [val]
 
     def add_author(self, key, val):
-        # We don't know birth_date or death_date.
-        # Should name and personal_name be the same value?
-        author_dict = {'personal_name': val, 'name': val, 'entity_type': 'person'}
-        self.add_list('authors', author_dict)
+        if isinstance(val, dict):
+            author_dict = val
+            if "name" in author_dict:
+                author_dict['personal_name'] = author_dict['name']
+            self.add_list('authors', author_dict)
+        else:
+            self.add_list(
+                'authors', {'personal_name': val, 'name': val, 'entity_type': 'person'}
+            )
 
     def add_illustrator(self, key, val):
         self.add_list('contributions', val + ' (Illustrator)')
@@ -151,3 +156,4 @@ def add(self, key, val, restrict_keys=True):
             add_func(new_key, val)
         else:
             self.add_string(key, val)
+

From a3b8412e6ee0f8ec97ba0f65526daeafcc840070 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 3 Dec 2024 02:15:11 +0000
Subject: [PATCH 05/11] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 openlibrary/plugins/importapi/import_edition_builder.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/openlibrary/plugins/importapi/import_edition_builder.py b/openlibrary/plugins/importapi/import_edition_builder.py
index b6bfbee2716..b66146039ec 100644
--- a/openlibrary/plugins/importapi/import_edition_builder.py
+++ b/openlibrary/plugins/importapi/import_edition_builder.py
@@ -156,4 +156,3 @@ def add(self, key, val, restrict_keys=True):
             add_func(new_key, val)
         else:
             self.add_string(key, val)
-

From 810dc33487f33aa734e9a047694e279a90d5a5d3 Mon Sep 17 00:00:00 2001
From: pidgezero-one <s.kischak@gmail.com>
Date: Mon, 2 Dec 2024 21:17:46 -0500
Subject: [PATCH 06/11] notes

---
 openlibrary/catalog/add_book/load_book.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/openlibrary/catalog/add_book/load_book.py b/openlibrary/catalog/add_book/load_book.py
index e69c78e4c0b..24172501f25 100644
--- a/openlibrary/catalog/add_book/load_book.py
+++ b/openlibrary/catalog/add_book/load_book.py
@@ -177,6 +177,7 @@ def get_redirected_authors(authors: list["Author"]):
         selected_match = None
         for a in matched_authors:
             try:
+                # merge_remote_ids will be in #10092
                 _, matches = a.merge_remote_ids(identifiers)
                 if matches > highest_matches:
                     selected_match = a
@@ -246,6 +247,7 @@ def find_entity(author: dict[str, Any]) -> "Author | None":
     things = find_author(author)
     if "identifiers" in author:
         for index, t in enumerate(things):
+            # merge_remote_ids will be in #10092
             t.remote_ids, _ = t.merge_remote_ids(author["identifiers"])
             things[index] = t
     return things[0] if things else None

From 9d54ff7a69dd51090e5d7a0fadbf50230a21eb69 Mon Sep 17 00:00:00 2001
From: pidgezero-one <s.kischak@gmail.com>
Date: Mon, 2 Dec 2024 22:07:42 -0500
Subject: [PATCH 07/11] no more try/catch

---
 openlibrary/catalog/add_book/load_book.py | 31 ++++++++++-------------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/openlibrary/catalog/add_book/load_book.py b/openlibrary/catalog/add_book/load_book.py
index 24172501f25..76994ae4ed4 100644
--- a/openlibrary/catalog/add_book/load_book.py
+++ b/openlibrary/catalog/add_book/load_book.py
@@ -176,24 +176,19 @@ def get_redirected_authors(authors: list["Author"]):
         highest_matches = 0
         selected_match = None
         for a in matched_authors:
-            try:
-                # merge_remote_ids will be in #10092
-                _, matches = a.merge_remote_ids(identifiers)
-                if matches > highest_matches:
-                    selected_match = a
-                    highest_matches = matches
-                elif matches == highest_matches and matches > 0:
-                    # Prioritize the lower OL ID when matched identifiers are equal
-                    selected_match = (
-                        a
-                        if extract_numeric_id_from_olid(a.key)
-                        < extract_numeric_id_from_olid(selected_match.key)
-                        else selected_match
-                    )
-            except:
-                # Reject if too many conflicts
-                # TODO: raise a flag to librarians here?
-                pass
+            # merge_remote_ids will be in #10092
+            _, matches = a.merge_remote_ids(identifiers)
+            if matches > highest_matches:
+                selected_match = a
+                highest_matches = matches
+            elif matches == highest_matches and matches > 0:
+                # Prioritize the lower OL ID when matched identifiers are equal
+                selected_match = (
+                    a
+                    if extract_numeric_id_from_olid(a.key)
+                    < extract_numeric_id_from_olid(selected_match.key)
+                    else selected_match
+                )
         if highest_matches > 0 and selected_match is not None:
             return [selected_match]
     # Fall back to name/date matching, which we did before introducing identifiers.

From 0be86b7f856ac8f403fca7b6e17ae43bddc0650f Mon Sep 17 00:00:00 2001
From: pidgezero-one <s.kischak@gmail.com>
Date: Mon, 2 Dec 2024 22:46:48 -0500
Subject: [PATCH 08/11] precommits

---
 openlibrary/catalog/add_book/load_book.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/openlibrary/catalog/add_book/load_book.py b/openlibrary/catalog/add_book/load_book.py
index 76994ae4ed4..1d4bf0755a9 100644
--- a/openlibrary/catalog/add_book/load_book.py
+++ b/openlibrary/catalog/add_book/load_book.py
@@ -4,7 +4,7 @@
 
 from openlibrary.catalog.utils import author_dates_match, flip_name, key_int
 from openlibrary.core.helpers import extract_year
-from openlibrary.utils import uniq, extract_numeric_id_from_olid
+from openlibrary.utils import extract_numeric_id_from_olid, uniq
 
 if TYPE_CHECKING:
     from openlibrary.plugins.upstream.models import Author
@@ -155,10 +155,9 @@ def get_redirected_authors(authors: list["Author"]):
         return authors
 
     # Look for OL ID first.
-    if key := author.get("key"):
-        if reply := list(web.ctx.site.things({"type": "/type/author", "key~": key})):
-            # Always match on OL ID, even if remote identifiers don't match.
-            return get_redirected_authors([web.ctx.site.get(k) for k in reply])
+    if (key := author.get("key")) and (reply := list(web.ctx.site.things({"type": "/type/author", "key~": key}))):
+        # Always match on OL ID, even if remote identifiers don't match.
+        return get_redirected_authors([web.ctx.site.get(k) for k in reply])
     # Try other identifiers next.
     if identifiers := author.get("identifiers"):
         queries = []
@@ -181,7 +180,7 @@ def get_redirected_authors(authors: list["Author"]):
             if matches > highest_matches:
                 selected_match = a
                 highest_matches = matches
-            elif matches == highest_matches and matches > 0:
+            elif matches == highest_matches and matches > 0 and selected_match is not None:
                 # Prioritize the lower OL ID when matched identifiers are equal
                 selected_match = (
                     a

From f57f9973aab471bb6b2955f97cd89d5bd98cbba9 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 3 Dec 2024 03:47:53 +0000
Subject: [PATCH 09/11] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 openlibrary/catalog/add_book/load_book.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/openlibrary/catalog/add_book/load_book.py b/openlibrary/catalog/add_book/load_book.py
index 1d4bf0755a9..1d1e1153442 100644
--- a/openlibrary/catalog/add_book/load_book.py
+++ b/openlibrary/catalog/add_book/load_book.py
@@ -155,7 +155,9 @@ def get_redirected_authors(authors: list["Author"]):
         return authors
 
     # Look for OL ID first.
-    if (key := author.get("key")) and (reply := list(web.ctx.site.things({"type": "/type/author", "key~": key}))):
+    if (key := author.get("key")) and (
+        reply := list(web.ctx.site.things({"type": "/type/author", "key~": key}))
+    ):
         # Always match on OL ID, even if remote identifiers don't match.
         return get_redirected_authors([web.ctx.site.get(k) for k in reply])
     # Try other identifiers next.
@@ -180,7 +182,11 @@ def get_redirected_authors(authors: list["Author"]):
             if matches > highest_matches:
                 selected_match = a
                 highest_matches = matches
-            elif matches == highest_matches and matches > 0 and selected_match is not None:
+            elif (
+                matches == highest_matches
+                and matches > 0
+                and selected_match is not None
+            ):
                 # Prioritize the lower OL ID when matched identifiers are equal
                 selected_match = (
                     a

From 82b198c5bfef6050d87c55814bf67e62666c1104 Mon Sep 17 00:00:00 2001
From: pidgezero-one <s.kischak@gmail.com>
Date: Thu, 5 Dec 2024 17:56:14 -0500
Subject: [PATCH 10/11] ?

---
 openlibrary/core/models.py | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/openlibrary/core/models.py b/openlibrary/core/models.py
index df78d4f407c..9eb41c4c473 100644
--- a/openlibrary/core/models.py
+++ b/openlibrary/core/models.py
@@ -34,6 +34,7 @@
 from openlibrary.core.wikidata import WikidataEntity, get_wikidata_entity
 from openlibrary.utils import extract_numeric_id_from_olid
 from openlibrary.utils.isbn import canonical, isbn_13_to_isbn_10, to_isbn_13
+from openlibrary.plugins.upstream.utils import get_author_config
 
 from ..accounts import OpenLibraryAccount  # noqa: F401 side effects may be needed
 from ..plugins.upstream.utils import get_coverstore_public_url, get_coverstore_url
@@ -806,7 +807,28 @@ def get_edition_count(self):
 
     def get_lists(self, limit=50, offset=0, sort=True):
         return self._get_lists(limit=limit, offset=offset, sort=sort)
-
+    
+    def merge_remote_ids(
+            self, incoming_ids: dict[str, str]
+        ) -> tuple[dict[str, str], int]:
+            output = {**self.remote_ids}
+            if len(incoming_ids.items()) == 0:
+                return output, -1
+            matches = 0
+            conflicts = 0
+            for id in get_author_config():
+                identifier: str = id.name
+                if identifier in output and identifier in incoming_ids:
+                    if output[identifier] != incoming_ids[identifier]:
+                        conflicts = conflicts + 1
+                    else:
+                        output[identifier] = incoming_ids[identifier]
+                        matches = matches + 1
+            if conflicts > matches:
+                # This means that the identifiers we already have for this author have too many conflicts with whichever identifiers we're trying to merge into it.
+                # TODO: Raise this to librarians, somehow.
+                return self.remote_ids, -1
+            return output, matches
 
 class User(Thing):
     DEFAULT_PREFERENCES = {

From 87d9f25b0b5d05cb1d1970da99ff1d373bb3d920 Mon Sep 17 00:00:00 2001
From: pidgezero-one <s.kischak@gmail.com>
Date: Thu, 12 Dec 2024 08:59:44 -0500
Subject: [PATCH 11/11] re: slack convo, go ahead and import this and use
 get_author_config over hardcoded IDs

---
 openlibrary/catalog/add_book/load_book.py |  2 +-
 openlibrary/core/models.py                | 48 ++++++++++++-----------
 2 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/openlibrary/catalog/add_book/load_book.py b/openlibrary/catalog/add_book/load_book.py
index 1d1e1153442..9286ed2ba1b 100644
--- a/openlibrary/catalog/add_book/load_book.py
+++ b/openlibrary/catalog/add_book/load_book.py
@@ -155,7 +155,7 @@ def get_redirected_authors(authors: list["Author"]):
         return authors
 
     # Look for OL ID first.
-    if (key := author.get("key")) and (
+    if (key := author.get("ol_id")) and (
         reply := list(web.ctx.site.things({"type": "/type/author", "key~": key}))
     ):
         # Always match on OL ID, even if remote identifiers don't match.
diff --git a/openlibrary/core/models.py b/openlibrary/core/models.py
index 9eb41c4c473..0c92a9b5dd1 100644
--- a/openlibrary/core/models.py
+++ b/openlibrary/core/models.py
@@ -32,9 +32,9 @@
 from openlibrary.core.ratings import Ratings
 from openlibrary.core.vendors import get_amazon_metadata
 from openlibrary.core.wikidata import WikidataEntity, get_wikidata_entity
+from openlibrary.plugins.upstream.utils import get_author_config
 from openlibrary.utils import extract_numeric_id_from_olid
 from openlibrary.utils.isbn import canonical, isbn_13_to_isbn_10, to_isbn_13
-from openlibrary.plugins.upstream.utils import get_author_config
 
 from ..accounts import OpenLibraryAccount  # noqa: F401 side effects may be needed
 from ..plugins.upstream.utils import get_coverstore_public_url, get_coverstore_url
@@ -807,28 +807,32 @@ def get_edition_count(self):
 
     def get_lists(self, limit=50, offset=0, sort=True):
         return self._get_lists(limit=limit, offset=offset, sort=sort)
-    
+
     def merge_remote_ids(
-            self, incoming_ids: dict[str, str]
-        ) -> tuple[dict[str, str], int]:
-            output = {**self.remote_ids}
-            if len(incoming_ids.items()) == 0:
-                return output, -1
-            matches = 0
-            conflicts = 0
-            for id in get_author_config():
-                identifier: str = id.name
-                if identifier in output and identifier in incoming_ids:
-                    if output[identifier] != incoming_ids[identifier]:
-                        conflicts = conflicts + 1
-                    else:
-                        output[identifier] = incoming_ids[identifier]
-                        matches = matches + 1
-            if conflicts > matches:
-                # This means that the identifiers we already have for this author have too many conflicts with whichever identifiers we're trying to merge into it.
-                # TODO: Raise this to librarians, somehow.
-                return self.remote_ids, -1
-            return output, matches
+        self, incoming_ids: dict[str, str]
+    ) -> tuple[dict[str, str], int]:
+        """Returns the author's remote IDs merged with a given remote IDs object, as well as a count for how many IDs had conflicts.
+        If incoming_ids is empty, or if there are more conflicts than matches, no merge will be attempted, and the output will be (author.remote_ids, -1).
+        """
+        output = {**self.remote_ids}
+        if len(incoming_ids.items()) == 0:
+            return output, -1
+        # Count
+        matches = 0
+        conflicts = 0
+        for id in get_author_config():
+            identifier: str = id.name
+            if identifier in output and identifier in incoming_ids:
+                if output[identifier] != incoming_ids[identifier]:
+                    conflicts = conflicts + 1
+                else:
+                    output[identifier] = incoming_ids[identifier]
+                    matches = matches + 1
+        if conflicts > matches:
+            # TODO: Raise this to librarians, somehow.
+            return self.remote_ids, -1
+        return output, matches
+
 
 class User(Thing):
     DEFAULT_PREFERENCES = {