Skip to content

Commit

Permalink
Use requests stream and shutil.copyfileobj to constrain memory usage …
Browse files Browse the repository at this point in the history
…during resource copy (#236)

* Use requests stream and shutil.copyfileobj to constrain memory usage

- When copying resources from a remote origin over HTTP(S) prefer
  to stream the response body and copy chunks into the destination
  file insead of loading the entire file into memory first before
  writing.

* Fix format/linting

* Fix ruff import re-ordering

* Remove with; use finally to close response

* Add unit test for resource_copy from https origin

- Add requests_mock dependency so that we can mock
  an https origin with an mp4 file.
  • Loading branch information
whargrove authored May 14, 2023
1 parent 0982830 commit 703d7f1
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 9 deletions.
14 changes: 14 additions & 0 deletions cdp_backend/tests/utils/test_file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import imageio
import pytest
import requests_mock

from cdp_backend.utils import file_utils
from cdp_backend.utils.file_utils import (
Expand Down Expand Up @@ -114,6 +115,19 @@ def test_resource_copy(tmpdir, example_video: Path) -> None: # type: ignore
resource_copy(str(example_video), save_path)


def test_resource_copy_https(tmpdir, example_video: Path) -> None: # type: ignore
with requests_mock.Mocker() as mock:
example_video_file = imageio.read(example_video)
mock.get("https://example.com/example_video.mp4", body=example_video_file)
dest = tmpdir / "example_video.mp4"
saved_path = resource_copy(
"https://example.com/example_video.mp4",
dest,
)
assert saved_path == dest
example_video_file.close()


# Type ignore because changing tmpdir typing
def test_hash_file_contents(tmpdir) -> None: # type: ignore
test_file = Path(tmpdir) / "a.txt"
Expand Down
25 changes: 16 additions & 9 deletions cdp_backend/utils/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import math
import random
import re
import shutil
from hashlib import sha256
from pathlib import Path
from uuid import uuid4
Expand Down Expand Up @@ -225,21 +226,27 @@ def resource_copy( # noqa: C901
)
return str(dst)

# Set custom timeout for http resources
# Common case: http(s) URI
if uri.startswith("http"):
# The verify=False is passed to any http URIs
# It was added because it's very common for SSL certs to be bad
# See: https://github.com/CouncilDataProject/cdp-scrapers/pull/85
# And: https://github.com/CouncilDataProject/seattle/runs/5957646032

with open(dst, "wb") as open_dst:
open_dst.write(
requests.get(
uri,
verify=False,
timeout=1800,
).content
)
# Use stream=True to avoid downloading the entire file into memory
# See: https://github.com/CouncilDataProject/cdp-backend/issues/235
try:
# This response must be closed after the copy is done. But using
# `with requests.get() as response` fails mypy type checking.
# See: https://requests.readthedocs.io/en/latest/user/advanced/#body-content-workflow
response = requests.get(uri, stream=True, verify=False, timeout=1800)
response.raise_for_status()
with open(dst, "wb") as open_dst:
shutil.copyfileobj(
response.raw, open_dst, length=64 * 1024 * 1024 # 64MB chunks
)
finally:
response.close()

else:
# TODO: Add explicit use of GCS credentials until public read is fixed
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ test = [
# Extras
"networkx>=2.5",
"pydot>=1.4",
"requests-mock>=1.10.0"
]
docs = [
# Sphinx + Doc Gen + Styling
Expand Down

0 comments on commit 703d7f1

Please sign in to comment.