Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix cache handling bug with validation callback #454

Merged
merged 1 commit into from
Jun 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ Unreleased
----------

.. vendor-insert-here
- Fix an ordering bug which caused caching to be ineffective, resulting in
repeated downloads of remote schemas even when the cache was populated.
Thanks :user:`alex1701c` for reporting! (:issue:`453`)

0.28.6
------
Expand Down
38 changes: 26 additions & 12 deletions src/check_jsonschema/cachedownloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,23 +60,18 @@ def _compute_default_cache_dir(self) -> str | None:

return cache_dir

def _get_request(self) -> requests.Response:
def _get_request(
self, *, response_ok: t.Callable[[requests.Response], bool]
) -> requests.Response:
try:
# do manual retries, rather than using urllib3 retries, to make it trivially
# testable with 'responses'
r: requests.Response | None = None
for _attempt in range(3):
r = requests.get(self._file_url, stream=True)
if r.ok:
if self._validation_callback is not None:
try:
self._validation_callback(r.content)
except ValueError:
continue
if r.ok and response_ok(r):
return r
assert r is not None
raise FailedDownloadError(
f"got responses with status={r.status_code}, retries exhausted"
f"got response with status={r.status_code}, retries exhausted"
)
except requests.RequestException as e:
raise FailedDownloadError("encountered error during download") from e
Expand Down Expand Up @@ -113,12 +108,31 @@ def _write(self, dest: str, response: requests.Response) -> None:
shutil.copy(fp.name, dest)
os.remove(fp.name)

def _validate(self, response: requests.Response) -> bool:
    """Check the downloaded content against the configured validation callback.

    Returns True when no callback is configured, or when the callback accepts
    the response body without raising. A ``ValueError`` from the callback
    marks the content as invalid (False).
    """
    callback = self._validation_callback
    if not callback:
        # nothing to validate against -- treat any content as acceptable
        return True

    try:
        callback(response.content)
    except ValueError:
        return False
    return True

def _download(self) -> str:
assert self._cache_dir
os.makedirs(self._cache_dir, exist_ok=True)
dest = os.path.join(self._cache_dir, self._filename)

response = self._get_request()
def check_response_for_download(r: requests.Response) -> bool:
# if the response indicates a cache hit, treat it as valid
# this ensures that we short-circuit any further evaluation immediately on
# a hit
if self._cache_hit(dest, r):
return True
# we now know it's not a hit, so validate the content (forces download)
return self._validate(r)

response = self._get_request(response_ok=check_response_for_download)
# check to see if we have a file which matches the connection
# only download if we do not (cache miss, vs hit)
if not self._cache_hit(dest, response):
Expand All @@ -129,7 +143,7 @@ def _download(self) -> str:
@contextlib.contextmanager
def open(self) -> t.Iterator[t.IO[bytes]]:
if (not self._cache_dir) or self._disable_cache:
yield io.BytesIO(self._get_request().content)
yield io.BytesIO(self._get_request(response_ok=self._validate).content)
else:
with open(self._download(), "rb") as fp:
yield fp
36 changes: 36 additions & 0 deletions tests/unit/test_cachedownloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,3 +268,39 @@ def fake_mktime(*args):

# at the end, the file always exists on disk
assert f.exists()


def test_cachedownloader_validation_is_not_invoked_on_hit(monkeypatch, tmp_path):
    """
    Regression test for https://github.com/python-jsonschema/check-jsonschema/issues/453

    The validation callback used to run eagerly, even when the cached copy
    already matched the remote content. That made cache hits pay the full
    validation cost and erased their expected performance gain.
    """
    # register a remote response, then write equivalent data to the on-disk
    # cache so that the downloader sees a cache hit
    add_default_response()
    cached_file = tmp_path / "schema1.json"
    cached_file.write_text("{}")

    # a validation callback that records every invocation
    calls = []

    def recording_validator(data):
        calls.append(data)

    # point a downloader at the schema URL and the pre-populated cache file,
    # wired up with the recording callback
    downloader = CacheDownloader(
        "https://example.com/schema1.json",
        filename=str(cached_file),
        cache_dir=str(tmp_path),
        validation_callback=recording_validator,
    )

    # reading through the downloader should serve the cached bytes...
    with downloader.open() as fp:
        assert fp.read() == b"{}"
    # ...without ever invoking the validator (the cache hit short-circuits it)
    assert calls == []
Loading