From fb4c92abf296d0a2517d227f3f5a6c5026737700 Mon Sep 17 00:00:00 2001 From: Stephen Rosen Date: Thu, 27 Jun 2024 21:36:31 -0500 Subject: [PATCH] Fix cache handling bug with validation callback The validation_callback invocation in CacheDownloader forced redownloads even on a cache hit, in order to pass the response content to a parser. After experimenting with several solutions, the easiest is to abstract out `validation_callback` into an arbitrary `response_ok` call which gets passed into the request fetching loop. `response_ok` is then a wrapper over the validation callback which can intentionally answer "yes" eagerly if a response evaluates to a cache hit -- a response which is a '200 Ok' and contains a Last-Modified header which scans as a hit should be treated as 'ok' regardless of its content. In addition to the code changes, a regression test is included, which is demonstrated to fail on the current release. --- CHANGELOG.rst | 3 ++ src/check_jsonschema/cachedownloader.py | 38 +++++++++++++++++-------- tests/unit/test_cachedownloader.py | 36 +++++++++++++++++++++++ 3 files changed, 65 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index d61cdbf72..c40179693 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -9,6 +9,9 @@ Unreleased ---------- .. vendor-insert-here +- Fix an ordering bug which caused caching to be ineffective, resulting in + repeated downloads of remote schemas even when the cache was populated. + Thanks :user:`alex1701c` for reporting! (:issue:`453`) 0.28.6 ------ diff --git a/src/check_jsonschema/cachedownloader.py b/src/check_jsonschema/cachedownloader.py index a31913097..268db55ed 100644 --- a/src/check_jsonschema/cachedownloader.py +++ b/src/check_jsonschema/cachedownloader.py @@ -60,23 +60,18 @@ def _compute_default_cache_dir(self) -> str | None: return cache_dir - def _get_request(self) -> requests.Response: + def _get_request( + self, *, response_ok: t.Callable[[requests.Response], bool] + ) -> requests.Response: try: - # do manual retries, rather than using urllib3 retries, to make it trivially - # testable with 'responses' r: requests.Response | None = None for _attempt in range(3): r = requests.get(self._file_url, stream=True) - if r.ok: - if self._validation_callback is not None: - try: - self._validation_callback(r.content) - except ValueError: - continue + if r.ok and response_ok(r): return r assert r is not None raise FailedDownloadError( - f"got responses with status={r.status_code}, retries exhausted" + f"got response with status={r.status_code}, retries exhausted" ) except requests.RequestException as e: raise FailedDownloadError("encountered error during download") from e @@ -113,12 +108,31 @@ def _write(self, dest: str, response: requests.Response) -> None: shutil.copy(fp.name, dest) os.remove(fp.name) + def _validate(self, response: requests.Response) -> bool: + if not self._validation_callback: + return True + + try: + self._validation_callback(response.content) + return True + except ValueError: + return False + def _download(self) -> str: assert self._cache_dir os.makedirs(self._cache_dir, exist_ok=True) dest = os.path.join(self._cache_dir, self._filename) - response = self._get_request() + def check_response_for_download(r: requests.Response) -> bool: + # if the response indicates a cache hit, treat it as valid + # this ensures that we short-circuit any further evaluation immediately on + # a hit + if self._cache_hit(dest, r): + return True + # we now know it's not a hit, so validate the content (forces download) + return self._validate(r) + + response = self._get_request(response_ok=check_response_for_download) # check to see if we have a file which matches the connection # only download if we do not (cache miss, vs hit) if not self._cache_hit(dest, response): @@ -129,7 +143,7 @@ def _download(self) -> str: @contextlib.contextmanager def open(self) -> t.Iterator[t.IO[bytes]]: if (not self._cache_dir) or self._disable_cache: - yield io.BytesIO(self._get_request().content) + yield io.BytesIO(self._get_request(response_ok=self._validate).content) else: with open(self._download(), "rb") as fp: yield fp diff --git a/tests/unit/test_cachedownloader.py b/tests/unit/test_cachedownloader.py index f26e08527..32098db91 100644 --- a/tests/unit/test_cachedownloader.py +++ b/tests/unit/test_cachedownloader.py @@ -268,3 +268,39 @@ def fake_mktime(*args): # at the end, the file always exists on disk assert f.exists() + + +def test_cachedownloader_validation_is_not_invoked_on_hit(monkeypatch, tmp_path): + """ + Regression test for https://github.com/python-jsonschema/check-jsonschema/issues/453 + + This was a bug in which the validation callback was invoked eagerly, even on a cache + hit. As a result, cache hits did not demonstrate their expected performance gain. + """ + # 1: construct some perfectly good data (it doesn't really matter what it is) + add_default_response() + # 2: put equivalent data on disk + f = tmp_path / "schema1.json" + f.write_text("{}") + + # 3: construct a validator which marks that it ran in a variable + validator_ran = False + + def dummy_validate_bytes(data): + nonlocal validator_ran + validator_ran = True + + # construct a downloader pointed at the schema and file, expecting a cache hit + # and use the above validation method + cd = CacheDownloader( + "https://example.com/schema1.json", + filename=str(f), + cache_dir=str(tmp_path), + validation_callback=dummy_validate_bytes, + ) + + # read data from the downloader + with cd.open() as fp: + assert fp.read() == b"{}" + # assert that the validator was not run + assert validator_ran is False