Skip to content

Commit

Permalink
Prevent rate limit errors in wayback machine API (#339)
Browse files Browse the repository at this point in the history
The Wayback Machine Save API only allows a limited number of requests within a timespan. This introduces several changes to avoid rate limit errors:
- At most one attempt is made to create a new snapshot
- If a new snapshot cannot be created, fall back to the latest existing snapshot
- Bulk snapshot updates (bookmark import, load missing snapshots after login) will only attempt to load the latest snapshot instead of creating new ones
  • Loading branch information
sissbruecker authored Sep 10, 2022
1 parent 6420ec1 commit 1b35d5b
Show file tree
Hide file tree
Showing 4 changed files with 363 additions and 187 deletions.
65 changes: 53 additions & 12 deletions bookmarks/services/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
from django.conf import settings
from django.contrib.auth import get_user_model
from django.contrib.auth.models import User
from waybackpy.exceptions import WaybackError
from waybackpy.exceptions import WaybackError, TooManyRequestsError, NoCDXRecordFound

import bookmarks.services.wayback
from bookmarks.models import Bookmark, UserProfile
from bookmarks.services.website_loader import DEFAULT_USER_AGENT

Expand All @@ -26,6 +27,32 @@ def create_web_archive_snapshot(user: User, bookmark: Bookmark, force_update: bo
_create_web_archive_snapshot_task(bookmark.id, force_update)


def _load_newest_snapshot(bookmark: Bookmark):
    """
    Look up the newest existing snapshot for the bookmark's URL through the customized CDX server
    API and, when one is returned, store its archive URL on the bookmark and save the bookmark.

    This function never requests a new snapshot. ``NoCDXRecordFound`` and any other
    ``WaybackError`` are logged as errors and are not re-raised.
    """
    try:
        logger.debug(f'Load existing snapshot for bookmark. url={bookmark.url}')
        newest = bookmarks.services.wayback.CustomWaybackMachineCDXServerAPI(bookmark.url).newest()

        # Nothing usable came back: leave the bookmark untouched
        if not newest:
            return

        bookmark.web_archive_snapshot_url = newest.archive_url
        bookmark.save()
        logger.debug(f'Using newest snapshot. url={bookmark.url} from={newest.datetime_timestamp}')

    except NoCDXRecordFound:
        # Handled before the more general WaybackError so the log message is specific
        logger.error(f'Could not find any snapshots for bookmark. url={bookmark.url}')
    except WaybackError as error:
        logger.error(f'Failed to load existing snapshot. url={bookmark.url}', exc_info=error)


def _create_snapshot(bookmark: Bookmark):
    """
    Request a new snapshot of the bookmark's URL through the Wayback Machine save API and store
    the resulting archive URL on the bookmark.

    The save API is configured with ``max_tries=1``. Exceptions raised by the save request are
    not handled here and propagate to the caller.
    """
    logger.debug(f'Create new snapshot for bookmark. url={bookmark.url}...')

    save_api = waybackpy.WaybackMachineSaveAPI(
        bookmark.url,
        DEFAULT_USER_AGENT,
        max_tries=1,
    )
    save_api.save()

    # Persist the URL of the freshly created snapshot
    bookmark.web_archive_snapshot_url = save_api.archive_url
    bookmark.save()

    logger.debug(f'Successfully created new snapshot for bookmark:. url={bookmark.url}')


@background()
def _create_web_archive_snapshot_task(bookmark_id: int, force_update: bool):
try:
Expand All @@ -37,19 +64,31 @@ def _create_web_archive_snapshot_task(bookmark_id: int, force_update: bool):
if bookmark.web_archive_snapshot_url and not force_update:
return

logger.debug(f'Create web archive link for bookmark: {bookmark}...')
# Create new snapshot
try:
_create_snapshot(bookmark)
return
except TooManyRequestsError:
logger.error(
f'Failed to create snapshot due to rate limiting, trying to load newest snapshot as fallback. url={bookmark.url}')
except WaybackError:
logger.error(f'Failed to create snapshot, trying to load newest snapshot as fallback. url={bookmark.url}')

# Load the newest snapshot as fallback
_load_newest_snapshot(bookmark)

archive = waybackpy.WaybackMachineSaveAPI(bookmark.url, DEFAULT_USER_AGENT)

@background()
def _load_web_archive_snapshot_task(bookmark_id: int):
try:
archive.save()
except WaybackError as error:
logger.exception(f'Error creating web archive link for bookmark: {bookmark}...', exc_info=error)
raise

bookmark.web_archive_snapshot_url = archive.archive_url
bookmark.save()
logger.debug(f'Successfully created web archive link for bookmark: {bookmark}...')
bookmark = Bookmark.objects.get(id=bookmark_id)
except Bookmark.DoesNotExist:
return
# Skip if snapshot exists
if bookmark.web_archive_snapshot_url:
return
# Load the newest snapshot
_load_newest_snapshot(bookmark)


def schedule_bookmarks_without_snapshots(user: User):
Expand All @@ -63,4 +102,6 @@ def _schedule_bookmarks_without_snapshots_task(user_id: int):
bookmarks_without_snapshots = Bookmark.objects.filter(web_archive_snapshot_url__exact='', owner=user)

for bookmark in bookmarks_without_snapshots:
_create_web_archive_snapshot_task(bookmark.id, False)
# To prevent rate limit errors from the Wayback API only try to load the latest snapshots instead of creating
# new ones when processing bookmarks in bulk
_load_web_archive_snapshot_task(bookmark.id)
40 changes: 40 additions & 0 deletions bookmarks/services/wayback.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import time
from typing import Dict

import waybackpy
import waybackpy.utils
from waybackpy.exceptions import NoCDXRecordFound


class CustomWaybackMachineCDXServerAPI(waybackpy.WaybackMachineCDXServerAPI):
    """
    Variant of WaybackMachineCDXServerAPI that works around some issues with retrieving the newest snapshot.
    See https://github.com/akamhy/waybackpy/issues/176
    """

    def newest(self):
        """
        Return the first snapshot yielded by a query sorted by closeness to the current time.

        Raises NoCDXRecordFound when the query yields no snapshot.
        """
        # Configure the query: closest to "now", sorted by closeness, limit of -5
        now = int(time.time())
        self.closest = waybackpy.utils.unix_timestamp_to_wayback_timestamp(now)
        self.sort = 'closest'
        self.limit = -5

        # Only the first yielded record is of interest
        first_snapshot = next(iter(self.snapshots()), None)
        if first_snapshot:
            return first_snapshot

        raise NoCDXRecordFound(
            "Wayback Machine's CDX server did not return any records "
            "for the query. The URL may not have any archives "
            " on the Wayback Machine or the URL may have been recently "
            "archived and is still not available on the CDX server."
        )

    def add_payload(self, payload: Dict[str, str]) -> None:
        """Let the parent class fill in the payload, then add the ``fastLatest`` query param."""
        super().add_payload(payload)
        # This API is only used to get the latest snapshot, and fastLatest makes searching for
        # latest snapshots faster
        payload['fastLatest'] = 'true'
Loading

0 comments on commit 1b35d5b

Please sign in to comment.