-
Notifications
You must be signed in to change notification settings - Fork 300
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Prevent rate limit errors in wayback machine API (#339)
The Wayback Machine Save API only allows a limited number of requests within a given timespan. This commit introduces several changes to avoid rate limit errors:
- At most one attempt is made to create a new snapshot.
- If a new snapshot could not be created, the latest existing snapshot is used instead.
- Bulk snapshot updates (bookmark import, loading missing snapshots after login) only attempt to load the latest snapshot instead of creating new ones.
- Loading branch information
1 parent
6420ec1
commit 1b35d5b
Showing
4 changed files
with
363 additions
and
187 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import time | ||
from typing import Dict | ||
|
||
import waybackpy | ||
import waybackpy.utils | ||
from waybackpy.exceptions import NoCDXRecordFound | ||
|
||
|
||
class CustomWaybackMachineCDXServerAPI(waybackpy.WaybackMachineCDXServerAPI):
    """
    CDX server client tailored to looking up the most recent snapshot of a URL.

    Works around some issues with retrieving the newest snapshot in waybackpy.
    See https://github.com/akamhy/waybackpy/issues/176
    """

    def newest(self):
        """
        Return the first snapshot produced by a "closest to now" CDX query.

        Before querying, this overwrites ``closest``, ``sort`` and ``limit`` on
        the instance, so those settings persist after the call.

        Raises:
            NoCDXRecordFound: if the query produces no (truthy) snapshot.
        """
        now = int(time.time())
        self.closest = waybackpy.utils.unix_timestamp_to_wayback_timestamp(now)
        self.sort = 'closest'
        # NOTE(review): a negative limit presumably asks the CDX server for the
        # last N records; see the issue linked in the class docstring - confirm.
        self.limit = -5

        # Only the first yielded snapshot is of interest; the rest is never consumed.
        latest = next(iter(self.snapshots()), None)
        if latest:
            return latest

        message = (
            "Wayback Machine's CDX server did not return any records "
            + "for the query. The URL may not have any archives "
            + " on the Wayback Machine or the URL may have been recently "
            + "archived and is still not available on the CDX server."
        )
        raise NoCDXRecordFound(message)

    def add_payload(self, payload: Dict[str, str]) -> None:
        """
        Fill ``payload`` via the parent implementation, then force ``fastLatest``.

        This API is only used to get the latest snapshot, and ``fastLatest``
        makes searching for latest snapshots faster. The dict is mutated in place.
        """
        super().add_payload(payload)
        payload.update({'fastLatest': 'true'})
Oops, something went wrong.