Skip to content

Commit

Permalink
add new strategy for cloud-based hashing static files storages (in pa…
Browse files Browse the repository at this point in the history
…rticular manifest files storage)
  • Loading branch information
jasongi committed Mar 3, 2024
1 parent 429bb1e commit a62f2dd
Show file tree
Hide file tree
Showing 10 changed files with 262 additions and 45 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# Changelog
## 3.1.0
- Add new strategies for two-pass collectstatic where the first pass is file-based or memory-based

## 3.0.1
- Refactor boto3 strategy to wrap the storage classes to re-introduce preloading of metadata
Expand Down
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ test:

test-docker: docker-up
pytest -svv; docker compose down

test-docker-ff: docker-up
pytest -svv -x; docker compose down

test-speed: docker-up
pytest -x --speedtest -m speed_test -svv; docker compose down
Expand Down
2 changes: 1 addition & 1 deletion collectfasta/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "3.0.1"
__version__ = "3.1.0"
28 changes: 24 additions & 4 deletions collectfasta/management/commands/collectstatic.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,34 @@ def set_options(self, **options: Any) -> None:
self.storage = self.strategy.wrap_storage(self.storage)
super().set_options(**options)

def second_pass(self, stats: Dict[str, List[str]]) -> Dict[str, List[str]]:
    """
    Run the optional second copy pass and return the resulting statistics.

    The current strategy is asked for a second-pass strategy. When
    collectfasta is disabled, or the strategy does not provide one, ``stats``
    is returned untouched. Otherwise the command switches to the second-pass
    strategy and its wrapped storage, and every file reported by
    ``listdir("")[1]`` on the first-pass storage is handed to
    ``maybe_copy_file``.

    NOTE(review): ``listdir("")[1]`` is the file list of the storage root
    only; confirm that files in sub-directories are not expected here.
    """
    next_strategy = self.strategy.second_pass_strategy()
    if not (self.collectfasta_enabled and next_strategy):
        return stats

    # Start the bookkeeping from scratch so the returned statistics describe
    # the second pass only.
    self.copied_files = []
    self.symlinked_files = []
    self.unmodified_files = []
    self.num_copied_files = 0

    # The storage used so far becomes the *source* of the second pass.
    first_pass_storage = self.storage
    self.storage = next_strategy.wrap_storage(self.storage)
    self.strategy = next_strategy
    self.log(f"Running second pass with {self.strategy.__class__.__name__}...")

    root_files = first_pass_storage.listdir("")[1]
    for file_name in root_files:
        self.maybe_copy_file((file_name, file_name, first_pass_storage))

    second_pass_stats = {
        "modified": self.copied_files + self.symlinked_files,
        "unmodified": self.unmodified_files,
        "post_processed": self.post_processed_files,
    }
    return second_pass_stats

def collect(self) -> Dict[str, List[str]]:
"""
Override collect to copy files concurrently. The tasks are populated by
Command.copy_file() which is called by super().collect().
"""
if not self.collectfasta_enabled or not settings.threads:
return super().collect()
return self.second_pass(super().collect())

# Store original value of post_process in super_post_process and always
# set the value to False to prevent the default behavior from
Expand All @@ -83,8 +104,7 @@ def collect(self) -> Dict[str, List[str]]:

self.maybe_post_process(super_post_process)
return_value["post_processed"] = self.post_processed_files

return return_value
return self.second_pass(return_value)

def handle(self, *args: Any, **options: Any) -> Optional[str]:
"""Override handle to suppress summary output."""
Expand All @@ -96,7 +116,7 @@ def handle(self, *args: Any, **options: Any) -> Optional[str]:

def maybe_copy_file(self, args: Task) -> None:
"""Determine if file should be copied or not and handle exceptions."""
path, prefixed_path, source_storage = args
path, prefixed_path, source_storage = self.strategy.copy_args_hook(args)

# Build up found_files to look identical to how it's created in the
# builtin command's collect() method so that we can run post_process
Expand Down
13 changes: 13 additions & 0 deletions collectfasta/strategies/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,19 @@ def on_skip_hook(
"""Hook called when a file copy is skipped."""
...

def second_pass_strategy(self) -> "Optional[Strategy[_RemoteStorage]]":
    """
    Return the strategy to use once the first (hashing) pass has finished.

    That strategy is the one used to copy the files to the remote
    destination. This default implementation returns ``None``, i.e. it does
    not provide a second-pass strategy.
    """
    return None

def copy_args_hook(
    self, args: Tuple[str, str, Storage]
) -> Tuple[str, str, Storage]:
    """
    Hook called before copying a file.

    Receives the copy task as a ``(path, prefixed_path, source_storage)``
    tuple and returns the tuple that should actually be used. Override it to
    modify the path or storage; this default implementation hands the task
    back unchanged.
    """
    return args


class HashStrategy(Strategy[_RemoteStorage], abc.ABC):
use_gzip = False
Expand Down
15 changes: 15 additions & 0 deletions collectfasta/strategies/boto3.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
from collectfasta import settings

from .base import CachingHashStrategy
from .hashing import TwoPassFileSystemStrategy
from .hashing import TwoPassInMemoryStrategy
from .hashing import WithoutPrefixMixin

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -200,3 +203,15 @@ def pre_should_copy_hook(self) -> None:
if settings.threads:
logger.info("Resetting connection")
self.remote_storage._connection = None


class Boto3WithoutPrefixStrategy(WithoutPrefixMixin, Boto3Strategy):
    """``Boto3Strategy`` whose ``copy_args_hook`` comes from ``WithoutPrefixMixin``."""


class Boto3HashedMemoryStrategy(TwoPassInMemoryStrategy):
    """Two-pass in-memory strategy whose second pass uses ``Boto3WithoutPrefixStrategy``."""

    second_strategy = Boto3WithoutPrefixStrategy


class Boto3HashedFileSystemStrategy(TwoPassFileSystemStrategy):
    """Two-pass file system strategy whose second pass uses ``Boto3WithoutPrefixStrategy``."""

    second_strategy = Boto3WithoutPrefixStrategy
90 changes: 90 additions & 0 deletions collectfasta/strategies/hashing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from typing import Optional
from typing import Type

from django.contrib.staticfiles.storage import HashedFilesMixin
from django.contrib.staticfiles.storage import ManifestFilesMixin
from django.core.files.storage import FileSystemStorage
from django.core.files.storage import Storage
from django.core.files.storage.memory import InMemoryStorage

from .base import HashStrategy
from .base import _RemoteStorage


class InMemoryHashedFilesStorage(HashedFilesMixin, InMemoryStorage):
    """``InMemoryStorage`` combined with ``HashedFilesMixin``."""


class FileSystemHashedFilesStorage(HashedFilesMixin, FileSystemStorage):
    """``FileSystemStorage`` combined with ``HashedFilesMixin``."""


class InMemoryManifestFilesStorage(ManifestFilesMixin, InMemoryStorage):
    """``InMemoryStorage`` combined with ``ManifestFilesMixin``."""


class FileSystemManifestFilesStorage(ManifestFilesMixin, FileSystemStorage):
    """``FileSystemStorage`` combined with ``ManifestFilesMixin``."""


class HashingTwoPassStrategy(HashStrategy[HashedFilesMixin]):
    """
    Two-pass strategy for hashing (and manifest) static files storages.

    Hashing storages interact a lot with the remote storage as a part of
    post-processing. This strategy instead runs the first pass against a
    temporary storage (see ``first_hashed_storage`` and
    ``first_manifest_storage``) and then hands over to ``second_strategy``,
    which copies the files to the original storage.

    Concrete subclasses must set the three class attributes below.
    """

    # Temporary storage class used when the original storage is a
    # HashedFilesMixin (but not a ManifestFilesMixin).
    first_hashed_storage: Optional[Type[HashedFilesMixin]] = None
    # Temporary storage class used when the original storage is a
    # ManifestFilesMixin.
    first_manifest_storage: Optional[Type[ManifestFilesMixin]] = None
    # Strategy class instantiated with the original storage for the second pass.
    second_strategy = None

    def __init__(self, remote_storage: _RemoteStorage) -> None:
        """
        Remember ``remote_storage`` as the final destination and make the
        temporary storage act as this strategy's "remote" storage.

        Raises:
            ValueError: if ``remote_storage`` is neither a
                ``ManifestFilesMixin`` nor a ``HashedFilesMixin``.
        """
        self.first_pass = True
        self.original_storage = remote_storage
        self.memory_storage = self._get_tmp_storage()
        self.remote_storage = self.memory_storage
        super().__init__(self.memory_storage)

    def _get_tmp_storage(self) -> Storage:
        """
        Build the temporary first-pass storage matching the original storage.

        The temporary storage is created with the original storage's
        ``location``.
        """
        # ManifestFilesMixin is tested first: in Django it is itself a
        # HashedFilesMixin, so the order of these checks matters.
        if isinstance(self.original_storage, ManifestFilesMixin):
            return self.first_manifest_storage(location=self.original_storage.location)
        if isinstance(self.original_storage, HashedFilesMixin):
            return self.first_hashed_storage(location=self.original_storage.location)
        raise ValueError(
            "HashingMemoryStrategy can only be used with subclasses of HashedFilesMixin or ManifestFilesMixin"
        )

    def wrap_storage(self, remote_storage: _RemoteStorage) -> _RemoteStorage:
        """Return the temporary storage; the ``remote_storage`` argument is ignored."""
        return self.remote_storage

    def get_remote_file_hash(self, prefixed_path: str) -> Optional[str]:
        """
        Return the hash of ``prefixed_path`` in the temporary storage.

        The "remote" storage of the first pass is the temporary storage, so
        the hash is computed with ``get_local_file_hash``. Returns ``None``
        when the file does not exist there yet.
        """
        try:
            # The computed hash must be returned; discarding it made this
            # method report every file as missing.
            return super().get_local_file_hash(prefixed_path, self.remote_storage)
        except FileNotFoundError:
            return None

    def second_pass_strategy(self):
        """
        Strategy that is used after the first pass of hashing is done - to copy the files
        to the remote destination.

        It is a ``second_strategy`` instance built around the original storage.
        """
        return self.second_strategy(self.original_storage)


class WithoutPrefixMixin:
    """
    Strategy mixin that strips the remote storage's ``location`` prefix from
    the paths of a copy task.

    The class it is mixed into must provide ``self.remote_storage`` with a
    ``location`` attribute.
    """

    def _without_location(self, path):
        """Return ``path`` without a leading ``remote_storage.location``."""
        location = self.remote_storage.location
        # Nothing to strip for an empty/None location or an unrelated path.
        if not location or not path.startswith(location):
            return path
        remainder = path[len(location):]
        # Only strip whole path components: with location "static" the path
        # "static/app.css" is prefixed, "staticfiles.json" is not.
        on_boundary = (
            not remainder
            or location.endswith(("/", "\\"))
            or remainder.startswith(("/", "\\"))
        )
        return remainder if on_boundary else path

    def copy_args_hook(self, args):
        """
        Return the copy task with the location prefix removed from both paths.

        ``args`` is a ``(path, prefixed_path, source_storage)`` tuple; the
        storage is passed through unchanged. Only a leading prefix is removed;
        occurrences of the location elsewhere in a path are left alone.
        """
        return (
            self._without_location(args[0]),
            self._without_location(args[1]),
            args[2],
        )


class TwoPassInMemoryStrategy(HashingTwoPassStrategy):
    """Two-pass strategy whose first pass runs against the in-memory storages."""

    first_manifest_storage = InMemoryManifestFilesStorage
    first_hashed_storage = InMemoryHashedFilesStorage


class TwoPassFileSystemStrategy(HashingTwoPassStrategy):
    """Two-pass strategy whose first pass runs against the file system storages."""

    first_manifest_storage = FileSystemManifestFilesStorage
    first_hashed_storage = FileSystemHashedFilesStorage
Loading

0 comments on commit a62f2dd

Please sign in to comment.