Prevent rate limit errors in wayback machine API (#339)

The Wayback Machine Save API only allows a limited number of requests within a timespan. This introduces several changes to avoid rate limit errors:
- There will be at most 1 attempt to create a new snapshot
- If a new snapshot could not be created, then attempt to use the latest existing snapshot
- Bulk snapshot updates (bookmark import, load missing snapshots after login) will only attempt to load the latest snapshot instead of creating new ones
sissbruecker · Sep 10, 2022 · 1b35d5b · 1b35d5b
1 parent 6420ec1
commit 1b35d5b
Show file tree

Hide file tree

Showing 4 changed files with 363 additions and 187 deletions.
diff --git a/bookmarks/services/tasks.py b/bookmarks/services/tasks.py
@@ -5,8 +5,9 @@
 from django.conf import settings
 from django.contrib.auth import get_user_model
 from django.contrib.auth.models import User
-from waybackpy.exceptions import WaybackError
+from waybackpy.exceptions import WaybackError, TooManyRequestsError, NoCDXRecordFound
 
+import bookmarks.services.wayback
 from bookmarks.models import Bookmark, UserProfile
 from bookmarks.services.website_loader import DEFAULT_USER_AGENT
 
@@ -26,6 +27,32 @@ def create_web_archive_snapshot(user: User, bookmark: Bookmark, force_update: bo
         _create_web_archive_snapshot_task(bookmark.id, force_update)
 
 
+def _load_newest_snapshot(bookmark: Bookmark):
+    """
+    Point the bookmark at the newest snapshot returned by the customized Wayback Machine CDX API.
+
+    The bookmark is only modified and saved when a snapshot is returned. NoCDXRecordFound and
+    WaybackError raised during the lookup are logged and not re-raised.
+    """
+    try:
+        logger.debug(f'Load existing snapshot for bookmark. url={bookmark.url}')
+        newest = bookmarks.services.wayback.CustomWaybackMachineCDXServerAPI(bookmark.url).newest()
+
+        # Nothing to store when the lookup yields no snapshot
+        if not newest:
+            return
+
+        bookmark.web_archive_snapshot_url = newest.archive_url
+        bookmark.save()
+        logger.debug(f'Using newest snapshot. url={bookmark.url} from={newest.datetime_timestamp}')
+
+    except NoCDXRecordFound:
+        # Handled separately from WaybackError so that a missing snapshot gets its own log message
+        logger.error(f'Could not find any snapshots for bookmark. url={bookmark.url}')
+    except WaybackError as wayback_error:
+        logger.error(f'Failed to load existing snapshot. url={bookmark.url}', exc_info=wayback_error)
+
+
+def _create_snapshot(bookmark: Bookmark):
+    """
+    Request a new Wayback Machine snapshot for the bookmark URL and store its archive URL on the bookmark.
+
+    The save API is created with max_tries=1. Exceptions raised while saving are not handled here
+    and propagate to the caller.
+    """
+    logger.debug(f'Create new snapshot for bookmark. url={bookmark.url}...')
+
+    # Single attempt only (max_tries=1)
+    save_api = waybackpy.WaybackMachineSaveAPI(bookmark.url, DEFAULT_USER_AGENT, max_tries=1)
+    save_api.save()
+
+    # Persist the URL of the snapshot that was just created
+    bookmark.web_archive_snapshot_url = save_api.archive_url
+    bookmark.save()
+
+    logger.debug(f'Successfully created new snapshot for bookmark:. url={bookmark.url}')
+
+
 @background()
 def _create_web_archive_snapshot_task(bookmark_id: int, force_update: bool):
     try:
@@ -37,19 +64,31 @@ def _create_web_archive_snapshot_task(bookmark_id: int, force_update: bool):
     if bookmark.web_archive_snapshot_url and not force_update:
         return
 
-    logger.debug(f'Create web archive link for bookmark: {bookmark}...')
+    # Create new snapshot
+    try:
+        _create_snapshot(bookmark)
+        return
+    except TooManyRequestsError:
+        logger.error(
+            f'Failed to create snapshot due to rate limiting, trying to load newest snapshot as fallback. url={bookmark.url}')
+    except WaybackError:
+        logger.error(f'Failed to create snapshot, trying to load newest snapshot as fallback. url={bookmark.url}')
+
+    # Load the newest snapshot as fallback
+    _load_newest_snapshot(bookmark)
 
-    archive = waybackpy.WaybackMachineSaveAPI(bookmark.url, DEFAULT_USER_AGENT)
 
+@background()
+def _load_web_archive_snapshot_task(bookmark_id: int):
+    """
+    Load the newest existing snapshot for the bookmark with the given id.
+
+    Does nothing if the bookmark does not exist or already has a snapshot URL. Unlike
+    _create_web_archive_snapshot_task, this never attempts to create a new snapshot.
+    """
     try:
-        archive.save()
-    except WaybackError as error:
-        logger.exception(f'Error creating web archive link for bookmark: {bookmark}...', exc_info=error)
-        raise
-
-    bookmark.web_archive_snapshot_url = archive.archive_url
-    bookmark.save()
-    logger.debug(f'Successfully created web archive link for bookmark: {bookmark}...')
+        bookmark = Bookmark.objects.get(id=bookmark_id)
+    except Bookmark.DoesNotExist:
+        return
+    # Skip if the bookmark already has a snapshot URL
+    if bookmark.web_archive_snapshot_url:
+        return
+    # Load the newest existing snapshot (lookup only, no new snapshot is created)
+    _load_newest_snapshot(bookmark)
 
 
 def schedule_bookmarks_without_snapshots(user: User):
@@ -63,4 +102,6 @@ def _schedule_bookmarks_without_snapshots_task(user_id: int):
     bookmarks_without_snapshots = Bookmark.objects.filter(web_archive_snapshot_url__exact='', owner=user)
 
     for bookmark in bookmarks_without_snapshots:
-        _create_web_archive_snapshot_task(bookmark.id, False)
+        # To prevent rate limit errors from the Wayback API only try to load the latest snapshots instead of creating
+        # new ones when processing bookmarks in bulk
+        _load_web_archive_snapshot_task(bookmark.id)
diff --git a/bookmarks/services/wayback.py b/bookmarks/services/wayback.py
@@ -0,0 +1,40 @@
+import time
+from typing import Dict
+
+import waybackpy
+import waybackpy.utils
+from waybackpy.exceptions import NoCDXRecordFound
+
+
+class CustomWaybackMachineCDXServerAPI(waybackpy.WaybackMachineCDXServerAPI):
+    """
+    WaybackMachineCDXServerAPI variant that works around some issues with retrieving the newest snapshot.
+    See https://github.com/akamhy/waybackpy/issues/176
+    """
+
+    def newest(self):
+        """
+        Return the first snapshot of a query sorted by closeness to the current time.
+
+        Raises NoCDXRecordFound when the query does not yield any snapshot.
+        """
+        # Configure the query to look for the snapshots closest to "now"
+        now = int(time.time())
+        self.closest = waybackpy.utils.unix_timestamp_to_wayback_timestamp(now)
+        self.sort = 'closest'
+        self.limit = -5
+
+        # Only the first record of the result is of interest
+        first_snapshot = next(iter(self.snapshots()), None)
+        if first_snapshot:
+            return first_snapshot
+
+        raise NoCDXRecordFound(
+            "Wayback Machine's CDX server did not return any records "
+            "for the query. The URL may not have any archives "
+            " on the Wayback Machine or the URL may have been recently "
+            "archived and is still not available on the CDX server."
+        )
+
+    def add_payload(self, payload: Dict[str, str]) -> None:
+        """
+        Extend the payload filled in by the base class with the fastLatest query param.
+        """
+        super().add_payload(payload)
+        # This API is only used to get the latest snapshot, and using fastLatest makes searching for
+        # latest snapshots faster
+        payload['fastLatest'] = 'true'