Skip to content

Commit

Permalink
Handle both old and new field names for versions
Browse files Browse the repository at this point in the history
This needs to be in place before doing edgi-govdata-archiving/web-monitoring-db#776. This doesn't update the fields we *send*. The DB will initially be backwards compatible with the current import format, so we can ship this first, *then* upgrade the DB without anything breaking.
  • Loading branch information
Mr0grog committed Jun 6, 2021
1 parent 9583014 commit 5818099
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 3 deletions.
2 changes: 1 addition & 1 deletion web_monitoring/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,7 +560,7 @@ def _load_known_versions(client, start_date, end_date):
chunk_size=1000)
# Limit to latest 500,000 results for sanity/time/memory
limited_versions = islice(versions, 500_000)
- cache = set(_version_cache_key(v["capture_time"], v["capture_url"])
+ cache = set(_version_cache_key(v["capture_time"], v.get("url", v.get("capture_url")))
for v in limited_versions)
logger.debug(f' Found {len(cache)} known versions')
return cache
Expand Down
6 changes: 4 additions & 2 deletions web_monitoring/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -1033,9 +1033,11 @@ def get_version_content(self, version_id):
content : bytes
"""
db_result = self.get_version(version_id)
- content_uri = db_result['data']['uri']
+ # TODO: remove fallback once API migration is done:
+ # https://github.com/edgi-govdata-archiving/web-monitoring-db/issues/776
+ content_url = db_result['data'].get('body_url', db_result['data'].get('uri'))
  # override the session-level "accept: json" header
- response = self.request(GET, content_uri, headers={'accept': None})
+ response = self.request(GET, content_url, headers={'accept': None})
if response.headers.get('Content-Type', '').startswith('text/'):
return response.text
else:
Expand Down

0 comments on commit 5818099

Please sign in to comment.