Skip to content

Commit

Permalink
Merge pull request #454 from python-jsonschema/bugfix-cache-perf
Browse files Browse the repository at this point in the history
Fix cache handling bug with validation callback
  • Loading branch information
sirosen authored Jun 28, 2024
2 parents ca8d0fd + fb4c92a commit 53793e2
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 12 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ Unreleased
----------

.. vendor-insert-here
- Fix an ordering bug which caused caching to be ineffective, resulting in
repeated downloads of remote schemas even when the cache was populated.
Thanks :user:`alex1701c` for reporting! (:issue:`453`)

0.28.6
------
Expand Down
38 changes: 26 additions & 12 deletions src/check_jsonschema/cachedownloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,23 +60,18 @@ def _compute_default_cache_dir(self) -> str | None:

return cache_dir

def _get_request(self) -> requests.Response:
def _get_request(
self, *, response_ok: t.Callable[[requests.Response], bool]
) -> requests.Response:
try:
# do manual retries, rather than using urllib3 retries, to make it trivially
# testable with 'responses'
r: requests.Response | None = None
for _attempt in range(3):
r = requests.get(self._file_url, stream=True)
if r.ok:
if self._validation_callback is not None:
try:
self._validation_callback(r.content)
except ValueError:
continue
if r.ok and response_ok(r):
return r
assert r is not None
raise FailedDownloadError(
f"got responses with status={r.status_code}, retries exhausted"
f"got response with status={r.status_code}, retries exhausted"
)
except requests.RequestException as e:
raise FailedDownloadError("encountered error during download") from e
Expand Down Expand Up @@ -113,12 +108,31 @@ def _write(self, dest: str, response: requests.Response) -> None:
shutil.copy(fp.name, dest)
os.remove(fp.name)

def _validate(self, response: requests.Response) -> bool:
    """
    Run the configured validation callback against a response body.

    :param response: the response whose ``content`` is handed to the callback
    :returns: ``True`` when no callback is configured or the callback accepts
        the content; ``False`` when the callback raises ``ValueError``
    """
    # compare against None explicitly: a callable object which happens to be
    # falsy (e.g. one defining ``__len__`` or ``__bool__``) must still be run
    if self._validation_callback is None:
        return True

    try:
        # reading ``.content`` is what forces the body to be downloaded
        self._validation_callback(response.content)
    except ValueError:
        return False
    return True

def _download(self) -> str:
assert self._cache_dir
os.makedirs(self._cache_dir, exist_ok=True)
dest = os.path.join(self._cache_dir, self._filename)

response = self._get_request()
def check_response_for_download(r: requests.Response) -> bool:
# if the response indicates a cache hit, treat it as valid
# this ensures that we short-circuit any further evaluation immediately on
# a hit
if self._cache_hit(dest, r):
return True
# we now know it's not a hit, so validate the content (forces download)
return self._validate(r)

response = self._get_request(response_ok=check_response_for_download)
# check to see if we have a file which matches the connection
# only download if we do not (cache miss, vs hit)
if not self._cache_hit(dest, response):
Expand All @@ -129,7 +143,7 @@ def _download(self) -> str:
@contextlib.contextmanager
def open(self) -> t.Iterator[t.IO[bytes]]:
if (not self._cache_dir) or self._disable_cache:
yield io.BytesIO(self._get_request().content)
yield io.BytesIO(self._get_request(response_ok=self._validate).content)
else:
with open(self._download(), "rb") as fp:
yield fp
36 changes: 36 additions & 0 deletions tests/unit/test_cachedownloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,3 +268,39 @@ def fake_mktime(*args):

# at the end, the file always exists on disk
assert f.exists()


def test_cachedownloader_validation_is_not_invoked_on_hit(monkeypatch, tmp_path):
    """
    A cache hit must not trigger the validation callback.

    Regression test for
    https://github.com/python-jsonschema/check-jsonschema/issues/453
    in which the validation callback ran eagerly, even on a cache hit, so
    cache hits did not deliver their expected performance gain.
    """
    # set up the default response data; its exact content does not matter
    add_default_response()

    # place equivalent data on disk where the downloader will look for it
    cached_file = tmp_path / "schema1.json"
    cached_file.write_text("{}")

    # every invocation of the validation callback is recorded here
    callback_calls = []

    def recording_validator(data):
        callback_calls.append(data)

    # point a downloader at the schema and the on-disk file, expecting a
    # cache hit, with the recording callback installed
    downloader = CacheDownloader(
        "https://example.com/schema1.json",
        filename=str(cached_file),
        cache_dir=str(tmp_path),
        validation_callback=recording_validator,
    )

    # reading through the downloader should yield the on-disk data
    with downloader.open() as stream:
        assert stream.read() == b"{}"

    # the callback must never have been called
    assert not callback_calls

0 comments on commit 53793e2

Please sign in to comment.