From c08c4e65dd2ce719191c70b265587254a0110a57 Mon Sep 17 00:00:00 2001 From: Jason Giancono Date: Mon, 4 Mar 2024 00:51:11 +0800 Subject: [PATCH] add new strategy for cloud-based hashing static files storages (in particular manifest files storage) --- CHANGELOG.md | 2 + Makefile | 3 + collectfasta/__init__.py | 2 +- .../management/commands/collectstatic.py | 28 ++++- collectfasta/strategies/base.py | 13 ++ collectfasta/strategies/boto3.py | 15 +++ collectfasta/strategies/hashing.py | 90 +++++++++++++ collectfasta/tests/command/test_command.py | 119 ++++++++++++------ collectfasta/tests/utils.py | 33 ++++- setup.cfg | 2 +- 10 files changed, 262 insertions(+), 45 deletions(-) create mode 100644 collectfasta/strategies/hashing.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 5f921f3..a813483 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,6 @@ # Changelog +## 3.1.0 +- add new strategies for two-pass collectstatic where the first pass is file or memory based ## 3.0.1 - Refactor boto3 strategy to wrap the storage classes to re-introduce preloading of metadata diff --git a/Makefile b/Makefile index 3629ecf..558921d 100644 --- a/Makefile +++ b/Makefile @@ -7,6 +7,9 @@ test: test-docker: docker-up pytest -svv; docker compose down + +test-docker-ff: docker-up + pytest -svv -x; docker compose down test-speed: docker-up pytest -x --speedtest -m speed_test -svv; docker compose down diff --git a/collectfasta/__init__.py b/collectfasta/__init__.py index 0552768..f5f41e5 100644 --- a/collectfasta/__init__.py +++ b/collectfasta/__init__.py @@ -1 +1 @@ -__version__ = "3.0.1" +__version__ = "3.1.0" diff --git a/collectfasta/management/commands/collectstatic.py b/collectfasta/management/commands/collectstatic.py index f1943b2..59114ec 100644 --- a/collectfasta/management/commands/collectstatic.py +++ b/collectfasta/management/commands/collectstatic.py @@ -62,13 +62,34 @@ def set_options(self, **options: Any) -> None: self.storage = self.strategy.wrap_storage(self.storage) super().set_options(**options) + def second_pass(self, stats: Dict[str, List[str]]) -> Dict[str, List[str]]: + second_pass_strategy = self.strategy.second_pass_strategy() + if self.collectfasta_enabled and second_pass_strategy: + self.copied_files = [] + self.symlinked_files = [] + self.unmodified_files = [] + self.num_copied_files = 0 + source_storage = self.storage + self.storage = second_pass_strategy.wrap_storage(self.storage) + self.strategy = second_pass_strategy + self.log(f"Running second pass with {self.strategy.__class__.__name__}...") + for f in source_storage.listdir("")[1]: + self.maybe_copy_file((f, f, source_storage)) + return { + "modified": self.copied_files + self.symlinked_files, + "unmodified": self.unmodified_files, + "post_processed": self.post_processed_files, + } + + return stats + def collect(self) -> Dict[str, List[str]]: """ Override collect to copy files concurrently. The tasks are populated by Command.copy_file() which is called by super().collect(). """ if not self.collectfasta_enabled or not settings.threads: - return super().collect() + return self.second_pass(super().collect()) # Store original value of post_process in super_post_process and always # set the value to False to prevent the default behavior from @@ -83,8 +104,7 @@ def collect(self) -> Dict[str, List[str]]: self.maybe_post_process(super_post_process) return_value["post_processed"] = self.post_processed_files - - return return_value + return self.second_pass(return_value) def handle(self, *args: Any, **options: Any) -> Optional[str]: """Override handle to suppress summary output.""" @@ -96,7 +116,7 @@ def handle(self, *args: Any, **options: Any) -> Optional[str]: def maybe_copy_file(self, args: Task) -> None: """Determine if file should be copied or not and handle exceptions.""" - path, prefixed_path, source_storage = args + path, prefixed_path, source_storage = self.strategy.copy_args_hook(args) # Build up found_files to look identical to how it's created in the # builtin command's collect() method so that we can run post_process diff --git a/collectfasta/strategies/base.py b/collectfasta/strategies/base.py index d7ef375..fb1bc97 100644 --- a/collectfasta/strategies/base.py +++ b/collectfasta/strategies/base.py @@ -73,6 +73,19 @@ def on_skip_hook( """Hook called when a file copy is skipped.""" ... + def second_pass_strategy(self) -> "Optional[Strategy[_RemoteStorage]]": + """ + Strategy that is used after the first pass of hashing is done - to copy the files + to the remote destination. + """ + return None + + def copy_args_hook( + self, args: Tuple[str, str, Storage] + ) -> Tuple[str, str, Storage]: + """Hook called before copying a file. Use this to modify the path or storage.""" + return args + class HashStrategy(Strategy[_RemoteStorage], abc.ABC): use_gzip = False diff --git a/collectfasta/strategies/boto3.py b/collectfasta/strategies/boto3.py index c00f4ba..4828139 100644 --- a/collectfasta/strategies/boto3.py +++ b/collectfasta/strategies/boto3.py @@ -20,6 +20,9 @@ from collectfasta import settings from .base import CachingHashStrategy +from .hashing import TwoPassFileSystemStrategy +from .hashing import TwoPassInMemoryStrategy +from .hashing import WithoutPrefixMixin logger = logging.getLogger(__name__) @@ -200,3 +203,15 @@ def pre_should_copy_hook(self) -> None: if settings.threads: logger.info("Resetting connection") self.remote_storage._connection = None + + +class Boto3WithoutPrefixStrategy(WithoutPrefixMixin, Boto3Strategy): + pass + + +class Boto3HashedMemoryStrategy(TwoPassInMemoryStrategy): + second_strategy = Boto3WithoutPrefixStrategy + + +class Boto3HashedFileSystemStrategy(TwoPassFileSystemStrategy): + second_strategy = Boto3WithoutPrefixStrategy diff --git a/collectfasta/strategies/hashing.py b/collectfasta/strategies/hashing.py new file mode 100644 index 0000000..2fe4d83 --- /dev/null +++ b/collectfasta/strategies/hashing.py @@ -0,0 +1,90 @@ +from typing import Type + +from django.contrib.staticfiles.storage import HashedFilesMixin +from django.contrib.staticfiles.storage import ManifestFilesMixin +from django.core.files.storage import FileSystemStorage +from django.core.files.storage import Storage +from django.core.files.storage.memory import InMemoryStorage + +from .base import HashStrategy +from .base import _RemoteStorage + + +class InMemoryHashedFilesStorage(HashedFilesMixin, InMemoryStorage): + pass + + +class FileSystemHashedFilesStorage(HashedFilesMixin, FileSystemStorage): + pass + + +class InMemoryManifestFilesStorage(ManifestFilesMixin, InMemoryStorage): + pass + + +class FileSystemManifestFilesStorage(ManifestFilesMixin, FileSystemStorage): + pass + + +class HashingTwoPassStrategy(HashStrategy[HashedFilesMixin]): + """ + Hashing strategies interact a lot with the remote storage as a part of post-processing + This strategy will instead run the hashing strategy using InMemoryStorage first, then copy + the files to the remote storage + """ + + first_hashed_storage: Type[ManifestFilesMixin] = None + first_manifest_storage: Type[HashedFilesMixin] = None + second_strategy = None + + def __init__(self, remote_storage: _RemoteStorage) -> None: + self.first_pass = True + self.original_storage = remote_storage + self.memory_storage = self._get_tmp_storage() + self.remote_storage = self.memory_storage + super().__init__(self.memory_storage) + + def _get_tmp_storage(self) -> Storage: + if isinstance(self.original_storage, ManifestFilesMixin): + return self.first_manifest_storage(location=self.original_storage.location) + elif isinstance(self.original_storage, HashedFilesMixin): + return self.first_hashed_storage(location=self.original_storage.location) + else: + raise ValueError( + "HashingMemoryStrategy can only be used with subclasses of HashedFilesMixin or ManifestFilesMixin" + ) + + def wrap_storage(self, remote_storage: _RemoteStorage) -> _RemoteStorage: + return self.remote_storage + + def get_remote_file_hash(self, prefixed_path: str) -> str | None: + try: + super().get_local_file_hash(prefixed_path, self.remote_storage) + except FileNotFoundError: + return None + + def second_pass_strategy(self): + """ + Strategy that is used after the first pass of hashing is done - to copy the files + to the remote destination. + """ + return self.second_strategy(self.original_storage) + + +class WithoutPrefixMixin: + def copy_args_hook(self, args): + return ( + args[0].replace(self.remote_storage.location, ""), + args[1].replace(self.remote_storage.location, ""), + args[2], + ) + + +class TwoPassInMemoryStrategy(HashingTwoPassStrategy): + first_hashed_storage = InMemoryHashedFilesStorage + first_manifest_storage = InMemoryManifestFilesStorage + + +class TwoPassFileSystemStrategy(HashingTwoPassStrategy): + first_hashed_storage = FileSystemHashedFilesStorage + first_manifest_storage = FileSystemManifestFilesStorage diff --git a/collectfasta/tests/command/test_command.py b/collectfasta/tests/command/test_command.py index 9397314..7e46208 100644 --- a/collectfasta/tests/command/test_command.py +++ b/collectfasta/tests/command/test_command.py @@ -1,4 +1,7 @@ import timeit +from typing import Callable +from typing import ContextManager +from typing import Iterator from unittest import TestCase from unittest import mock @@ -7,8 +10,10 @@ from django.test.utils import override_settings from collectfasta.management.commands.collectstatic import Command +from collectfasta.tests.utils import assert_static_file_number from collectfasta.tests.utils import clean_static_dir from collectfasta.tests.utils import create_static_file +from collectfasta.tests.utils import create_two_referenced_static_files from collectfasta.tests.utils import live_test from collectfasta.tests.utils import make_100_files from collectfasta.tests.utils import make_test @@ -19,32 +24,59 @@ from .utils import call_collectstatic + +def generate_storage_variations( + storages: dict, strategies: dict +) -> Iterator[tuple[str, ContextManager]]: + for storage_name, storage_settings in storages.items(): + for strategy_name, strategy_settings in strategies.items(): + yield ( + f"{storage_name}_{strategy_name}", + override_django_settings( + STORAGES={"staticfiles": {"BACKEND": storage_settings}}, + COLLECTFASTA_STRATEGY=strategy_settings, + ), + ) + + aws_backend_confs = { - "boto3": override_django_settings( - STORAGES={ - "staticfiles": { - "BACKEND": "storages.backends.s3.S3Storage", + **dict( + generate_storage_variations( + { + "boto3": "storages.backends.s3.S3Storage", + "boto3static": "storages.backends.s3.S3StaticStorage", }, - }, - COLLECTFASTA_STRATEGY="collectfasta.strategies.boto3.Boto3Strategy", + { + "base": "collectfasta.strategies.boto3.Boto3Strategy", + }, + ) ), - "boto3manifest": override_django_settings( - STORAGES={ - "staticfiles": { - "BACKEND": "storages.backends.s3.S3ManifestStaticStorage", + **dict( + generate_storage_variations( + { + "boto3": "storages.backends.s3.S3Storage", + "boto3static": "storages.backends.s3.S3StaticStorage", }, - }, - COLLECTFASTA_STRATEGY="collectfasta.strategies.boto3.Boto3Strategy", + { + "base": "collectfasta.strategies.boto3.Boto3Strategy", + }, + ) ), - "boto3static": override_django_settings( - STORAGES={ - "staticfiles": { - "BACKEND": "storages.backends.s3.S3StaticStorage", + **dict( + generate_storage_variations( + { + "boto3manifest": "storages.backends.s3.S3ManifestStaticStorage", }, - }, - COLLECTFASTA_STRATEGY="collectfasta.strategies.boto3.Boto3Strategy", + { + "base": "collectfasta.strategies.boto3.Boto3Strategy", + "memory_2pass": "collectfasta.strategies.hashing.Boto3HashedMemoryStrategy", # noqa E501 + "file_2pass": "collectfasta.strategies.hashing.Boto3HashedFileSystemStrategy", # noqa E501 + }, + ) ), } + + gcloud_backend_confs = { "gcloud": override_django_settings( STORAGES={ @@ -81,17 +113,17 @@ **filesystem_backend_confs, } -make_test_aws_backends = many(**aws_backend_confs) -make_test_all_backends = many(**all_backend_confs) -make_test_cloud_backends = many(**aws_backend_confs, **gcloud_backend_confs) +make_test_aws_backends: Callable = many(**aws_backend_confs) # type: ignore +make_test_all_backends: Callable = many(**all_backend_confs) # type: ignore +make_test_cloud_backends: Callable = many(**aws_backend_confs, **gcloud_backend_confs) # type: ignore @make_test_all_backends @live_test def test_basics(case: TestCase) -> None: clean_static_dir() - create_static_file() - case.assertIn("1 static file copied.", call_collectstatic()) + create_two_referenced_static_files() + assert_static_file_number(2, call_collectstatic(), case) # file state should now be cached case.assertIn("0 static files copied.", call_collectstatic()) @@ -101,8 +133,8 @@ def test_basics(case: TestCase) -> None: @override_setting("threads", 5) def test_threads(case: TestCase) -> None: clean_static_dir() - create_static_file() - case.assertIn("1 static file copied.", call_collectstatic()) + create_two_referenced_static_files() + assert_static_file_number(2, call_collectstatic(), case) # file state should now be cached case.assertIn("0 static files copied.", call_collectstatic()) @@ -113,13 +145,12 @@ def test_threads(case: TestCase) -> None: def test_basics_cloud_speed(case: TestCase) -> None: clean_static_dir() make_100_files() - - case.assertIn("100 static files copied", call_collectstatic()) + assert_static_file_number(100, call_collectstatic(), case) def collectstatic_one(): - case.assertIn("1 static file copied", call_collectstatic()) + assert_static_file_number(2, call_collectstatic(), case) - create_static_file() + create_two_referenced_static_files() ittook = timeit.timeit(collectstatic_one, number=1) print(f"it took {ittook} seconds") @@ -137,9 +168,9 @@ def test_no_collectfasta_cloud_speed(case: TestCase) -> None: case.assertIn("100 static files copied", call_collectstatic()) def collectstatic_one(): - case.assertIn("1 static file copied", call_collectstatic()) + case.assertIn("2 static files copied", call_collectstatic()) - create_static_file() + create_two_referenced_static_files() ittook = timeit.timeit(collectstatic_one, number=1) print(f"it took {ittook} seconds") @@ -163,8 +194,9 @@ def test_dry_run(case: TestCase) -> None: @override_setting("aws_is_gzipped", True) def test_aws_is_gzipped(case: TestCase) -> None: clean_static_dir() - create_static_file() - case.assertIn("1 static file copied.", call_collectstatic()) + create_two_referenced_static_files() + assert_static_file_number(2, call_collectstatic(), case) + # file state should now be cached case.assertIn("0 static files copied.", call_collectstatic()) @@ -181,10 +213,16 @@ def test_raises_for_no_configured_strategy(case: TestCase) -> None: @mock.patch("collectfasta.strategies.base.Strategy.post_copy_hook", autospec=True) def test_calls_post_copy_hook(_case: TestCase, post_copy_hook: mock.MagicMock) -> None: clean_static_dir() - path = create_static_file() + (path_one, path_two) = create_two_referenced_static_files() cmd = Command() cmd.run_from_argv(["manage.py", "collectstatic", "--noinput"]) - post_copy_hook.assert_called_once_with(mock.ANY, path.name, path.name, mock.ANY) + post_copy_hook.assert_has_calls( + [ + mock.call(mock.ANY, path_one.name, path_one.name, mock.ANY), + mock.call(mock.ANY, path_two.name, path_two.name, mock.ANY), + ], + any_order=True, + ) @make_test_all_backends @@ -192,9 +230,16 @@ def test_calls_post_copy_hook(_case: TestCase, post_copy_hook: mock.MagicMock) - @mock.patch("collectfasta.strategies.base.Strategy.on_skip_hook", autospec=True) def test_calls_on_skip_hook(_case: TestCase, on_skip_hook: mock.MagicMock) -> None: clean_static_dir() - path = create_static_file() + (path_one, path_two) = create_two_referenced_static_files() cmd = Command() cmd.run_from_argv(["manage.py", "collectstatic", "--noinput"]) on_skip_hook.assert_not_called() cmd.run_from_argv(["manage.py", "collectstatic", "--noinput"]) - on_skip_hook.assert_called_once_with(mock.ANY, path.name, path.name, mock.ANY) + + on_skip_hook.assert_has_calls( + [ + mock.call(mock.ANY, path_one.name, path_one.name, mock.ANY), + mock.call(mock.ANY, path_two.name, path_two.name, mock.ANY), + ], + any_order=True, + ) diff --git a/collectfasta/tests/utils.py b/collectfasta/tests/utils.py index e2c7566..e8900a5 100644 --- a/collectfasta/tests/utils.py +++ b/collectfasta/tests/utils.py @@ -10,6 +10,7 @@ from typing import Type from typing import TypeVar from typing import cast +from unittest import TestCase import pytest from django.conf import settings as django_settings @@ -34,10 +35,18 @@ def speed_test(func): F = TypeVar("F", bound=Callable[..., Any]) +def assert_static_file_number(files: int, output: str, case: TestCase) -> None: + # if it's a manifest, there will be 2*N + 1 files copied + if "manifest" in case.id() and "2pass" in case.id(): + case.assertIn(f"{files*2+1} static files copied.", output) + else: + case.assertIn(f"{files} static files copied.", output) + + def make_100_files(): with ThreadPoolExecutor(max_workers=5) as executor: - for _ in range(100): - executor.submit(create_big_static_file) + for _ in range(50): + executor.submit(create_big_referenced_static_file) executor.shutdown(wait=True) @@ -100,6 +109,14 @@ def test(func: F) -> Type[unittest.TestCase]: return test +def create_two_referenced_static_files() -> tuple[pathlib.Path, pathlib.Path]: + """Create a static file, then another file with a reference to the file""" + path = create_static_file() + reference_path = static_dir / f"{uuid.uuid4().hex}.txt" + reference_path.write_text(f"{{% static '{path.name}' %}}") + return (path, reference_path) + + def create_static_file() -> pathlib.Path: """Write random characters to a file in the static directory.""" path = static_dir / f"{uuid.uuid4().hex}.txt" @@ -107,6 +124,14 @@ def create_static_file() -> pathlib.Path: return path +def create_big_referenced_static_file() -> tuple[pathlib.Path, pathlib.Path]: + """Create a big static file, then another file with a reference to the file""" + path = create_big_static_file() + reference_path = static_dir / f"{uuid.uuid4().hex}.txt" + reference_path.write_text(f"{{% static '{path.name}' %}}") + return (path, reference_path) + + def create_big_static_file() -> pathlib.Path: """Write random characters to a file in the static directory.""" path = static_dir / f"{uuid.uuid4().hex}.txt" @@ -115,6 +140,10 @@ def create_big_static_file() -> pathlib.Path: def clean_static_dir() -> None: + for filename in os.listdir(django_settings.AWS_LOCATION): + file = pathlib.Path(django_settings.AWS_LOCATION) / filename + if file.is_file(): + file.unlink() for filename in os.listdir(static_dir.as_posix()): file = static_dir / filename if file.is_file(): diff --git a/setup.cfg b/setup.cfg index 14b2c44..9e91471 100644 --- a/setup.cfg +++ b/setup.cfg @@ -45,7 +45,7 @@ markers = [flake8] exclude = appveyor, .idea, .git, .venv, .tox, __pycache__, *.egg-info, build max-complexity = 8 -max-line-length = 89 +max-line-length = 120 # ignore F821 until mypy-0.730 compatibility is released # https://github.com/PyCQA/pyflakes/issues/475 # see this discussion as to why we're ignoring E722