From 581809957d4126be2e928b2877946ae6884e0432 Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Sat, 5 Jun 2021 15:13:44 -0700 Subject: [PATCH] Handle both old and new field names for versions This needs to be in place before doing edgi-govdata-archiving/web-monitoring-db#776. This doesn't update the fields we *send*. The DB will initially be backwards compatible with the current import format, so we can ship this first, *then* upgrade the DB without anything breaking. --- web_monitoring/cli/cli.py | 2 +- web_monitoring/db.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/web_monitoring/cli/cli.py b/web_monitoring/cli/cli.py index dfd41347f..6f505694b 100644 --- a/web_monitoring/cli/cli.py +++ b/web_monitoring/cli/cli.py @@ -560,7 +560,7 @@ def _load_known_versions(client, start_date, end_date): chunk_size=1000) # Limit to latest 500,000 results for sanity/time/memory limited_versions = islice(versions, 500_000) - cache = set(_version_cache_key(v["capture_time"], v["capture_url"]) + cache = set(_version_cache_key(v["capture_time"], v.get("url", v.get("capture_url"))) for v in limited_versions) logger.debug(f' Found {len(cache)} known versions') return cache diff --git a/web_monitoring/db.py b/web_monitoring/db.py index 1c20c6322..61d0e98c3 100644 --- a/web_monitoring/db.py +++ b/web_monitoring/db.py @@ -1033,9 +1033,11 @@ def get_version_content(self, version_id): content : bytes """ db_result = self.get_version(version_id) - content_uri = db_result['data']['uri'] + # TODO: remove fallback once API migration is done: + # https://github.com/edgi-govdata-archiving/web-monitoring-db/issues/776 + content_url = db_result['data'].get('body_url', db_result['data'].get('uri')) # override the session-level "accept: json" header - response = self.request(GET, content_uri, headers={'accept': None}) + response = self.request(GET, content_url, headers={'accept': None}) if response.headers.get('Content-Type', '').startswith('text/'): return response.text else: