Skip to content

Commit

Permalink
add new strategy for cloud-based hashing static files storages (in pa…
Browse files Browse the repository at this point in the history
…rticular manifest files storage)
  • Loading branch information
jasongi committed Mar 3, 2024
1 parent 429bb1e commit 66b8c8b
Show file tree
Hide file tree
Showing 11 changed files with 246 additions and 47 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# Changelog
## 3.1.0
- Add new strategies for two-pass collectstatic, where the first pass is file- or memory-based

## 3.0.1
- Refactor boto3 strategy to wrap the storage classes to re-introduce preloading of metadata
Expand Down
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ test:
test-docker: docker-up
pytest -svv; docker compose down

test-docker-ff: docker-up
pytest -svv -x; docker compose down

test-speed: docker-up
pytest -x --speedtest -m speed_test -svv; docker compose down

Expand Down
2 changes: 1 addition & 1 deletion collectfasta/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "3.0.1"
__version__ = "3.1.0"
28 changes: 24 additions & 4 deletions collectfasta/management/commands/collectstatic.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,34 @@ def set_options(self, **options: Any) -> None:
self.storage = self.strategy.wrap_storage(self.storage)
super().set_options(**options)

def second_pass(self, stats: Dict[str, List[str]]) -> Dict[str, List[str]]:
    """
    Run the strategy's second pass, copying first-pass output to its destination.

    If collectfasta is disabled, or the active strategy does not provide a
    second-pass strategy, ``stats`` is returned unchanged. Otherwise the
    per-run counters are reset, the command switches to the second-pass
    strategy and its wrapped storage, and every file found in the storage
    used for the first pass is handed to ``maybe_copy_file``.

    Returns the statistics dict for the second pass (``modified``,
    ``unmodified`` and ``post_processed``), or ``stats`` when no second pass
    was run.
    """
    second_pass_strategy = self.strategy.second_pass_strategy()
    if not (self.collectfasta_enabled and second_pass_strategy):
        return stats

    # Start the bookkeeping from scratch so that the returned statistics
    # describe the second pass only.
    self.copied_files = []
    self.symlinked_files = []
    self.unmodified_files = []
    self.num_copied_files = 0

    # The storage written to by the first pass becomes the copy source.
    source_storage = self.storage
    self.storage = second_pass_strategy.wrap_storage(self.storage)
    self.strategy = second_pass_strategy
    self.log(f"Running second pass with {self.strategy.__class__.__name__}...")

    # listdir() returns (directories, files) for a single level only, so the
    # tree has to be walked explicitly; otherwise files that live in
    # subdirectories would never be copied.
    pending = [""]
    while pending:
        directory = pending.pop()
        subdirectories, filenames = source_storage.listdir(directory)
        for filename in filenames:
            path = f"{directory}/{filename}" if directory else filename
            self.maybe_copy_file((path, path, source_storage))
        pending.extend(
            f"{directory}/{name}" if directory else name for name in subdirectories
        )

    return {
        "modified": self.copied_files + self.symlinked_files,
        "unmodified": self.unmodified_files,
        "post_processed": self.post_processed_files,
    }

def collect(self) -> Dict[str, List[str]]:
"""
Override collect to copy files concurrently. The tasks are populated by
Command.copy_file() which is called by super().collect().
"""
if not self.collectfasta_enabled or not settings.threads:
return super().collect()
return self.second_pass(super().collect())

# Store original value of post_process in super_post_process and always
# set the value to False to prevent the default behavior from
Expand All @@ -83,8 +104,7 @@ def collect(self) -> Dict[str, List[str]]:

self.maybe_post_process(super_post_process)
return_value["post_processed"] = self.post_processed_files

return return_value
return self.second_pass(return_value)

def handle(self, *args: Any, **options: Any) -> Optional[str]:
"""Override handle to suppress summary output."""
Expand All @@ -96,7 +116,7 @@ def handle(self, *args: Any, **options: Any) -> Optional[str]:

def maybe_copy_file(self, args: Task) -> None:
"""Determine if file should be copied or not and handle exceptions."""
path, prefixed_path, source_storage = args
path, prefixed_path, source_storage = self.strategy.copy_args_hook(args)

# Build up found_files to look identical to how it's created in the
# builtin command's collect() method so that we can run post_process
Expand Down
13 changes: 13 additions & 0 deletions collectfasta/strategies/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,19 @@ def on_skip_hook(
"""Hook called when a file copy is skipped."""
...

def second_pass_strategy(self) -> "Optional[Strategy[_RemoteStorage]]":
    """
    Return the strategy to use once the first (hashing) pass is finished, i.e.
    the one that copies the files to the remote destination.

    The default is ``None``: no second pass is requested.
    """
    return None

def copy_args_hook(
    self, args: Tuple[str, str, Storage]
) -> Tuple[str, str, Storage]:
    """
    Hook invoked before a file is copied.

    Override it to rewrite the path or storage of the task; the default
    implementation hands ``args`` back untouched.
    """
    return args


class HashStrategy(Strategy[_RemoteStorage], abc.ABC):
use_gzip = False
Expand Down
15 changes: 15 additions & 0 deletions collectfasta/strategies/boto3.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
from collectfasta import settings

from .base import CachingHashStrategy
from .hashing import TwoPassFileSystemStrategy
from .hashing import TwoPassInMemoryStrategy
from .hashing import WithoutPrefixMixin

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -200,3 +203,15 @@ def pre_should_copy_hook(self) -> None:
if settings.threads:
logger.info("Resetting connection")
self.remote_storage._connection = None


class Boto3WithoutPrefixStrategy(WithoutPrefixMixin, Boto3Strategy):
    """Boto3Strategy combined with WithoutPrefixMixin's ``copy_args_hook``."""


class Boto3ManifestMemoryStrategy(TwoPassInMemoryStrategy):
    """TwoPassInMemoryStrategy whose second pass uses Boto3WithoutPrefixStrategy."""

    second_strategy = Boto3WithoutPrefixStrategy


class Boto3ManifestFileSystemStrategy(TwoPassFileSystemStrategy):
    """TwoPassFileSystemStrategy whose second pass uses Boto3WithoutPrefixStrategy."""

    second_strategy = Boto3WithoutPrefixStrategy
83 changes: 83 additions & 0 deletions collectfasta/strategies/hashing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from typing import Optional
from typing import Type

from django.contrib.staticfiles.storage import ManifestFilesMixin
from django.core.files.storage import FileSystemStorage
from django.core.files.storage import Storage
from django.core.files.storage.memory import InMemoryStorage

from .base import HashStrategy
from .base import Strategy
from .base import _RemoteStorage


class InMemoryManifestFilesStorage(ManifestFilesMixin, InMemoryStorage):  # type: ignore
    """InMemoryStorage combined with ManifestFilesMixin."""


class FileSystemManifestFilesStorage(ManifestFilesMixin, FileSystemStorage):  # type: ignore
    """FileSystemStorage combined with ManifestFilesMixin."""


class HashingTwoPassStrategy(HashStrategy[Storage]):
    """
    Run the hashing pass against a temporary storage, then copy to the remote.

    Hashing strategies interact a lot with the remote storage as a part of
    post-processing. This strategy instead runs the first pass against a
    temporary storage built from ``first_manifest_storage`` (in-memory or
    file-system based, depending on the subclass) and then hands over to
    ``second_strategy`` to copy the files to the remote storage.
    """

    # Storage class instantiated for the first pass; subclasses must set it.
    first_manifest_storage: Optional[Type[ManifestFilesMixin]] = None
    # Strategy class instantiated for the second pass; subclasses must set it.
    second_strategy: Optional[Type[Strategy]] = None

    def __init__(self, remote_storage: _RemoteStorage) -> None:
        """Remember the real storage and point the strategy at a temporary one."""
        self.first_pass = True
        self.original_storage = remote_storage
        self.memory_storage = self._get_tmp_storage()
        # During the first pass the temporary storage plays the "remote" role.
        self.remote_storage = self.memory_storage
        super().__init__(self.memory_storage)

    def _get_tmp_storage(self) -> Storage:
        """
        Build the temporary first-pass storage at the original storage's location.

        Raises:
            ValueError: if the original storage is not a ManifestFilesMixin.
            NotImplementedError: if ``first_manifest_storage`` has not been set.
        """
        if not isinstance(self.original_storage, ManifestFilesMixin):
            raise ValueError(
                f"{type(self).__name__} can only be used with storages that "
                "subclass ManifestFilesMixin"
            )
        if self.first_manifest_storage is None:
            # Fail with an explicit message rather than an opaque
            # "'NoneType' object is not callable".
            raise NotImplementedError(
                "first_manifest_storage must be set to a valid storage class"
            )
        return self.first_manifest_storage(location=self.original_storage.location)  # type: ignore

    def wrap_storage(self, remote_storage: Storage) -> Storage:
        """Ignore ``remote_storage`` and return the temporary first-pass storage."""
        return self.remote_storage

    def get_remote_file_hash(self, prefixed_path: str) -> Optional[str]:
        """Hash ``prefixed_path`` in the temporary storage; None if it is missing."""
        try:
            return super().get_local_file_hash(prefixed_path, self.remote_storage)
        except FileNotFoundError:
            return None

    def second_pass_strategy(self):
        """
        Strategy that is used after the first pass of hashing is done - to copy
        the files to the remote destination.

        Raises:
            NotImplementedError: if ``second_strategy`` has not been set.
        """
        if self.second_strategy is None:
            raise NotImplementedError(
                "second_strategy must be set to a valid strategy class"
            )
        return self.second_strategy(self.original_storage)


class WithoutPrefixMixin:
    """Strategy mixin that strips the remote storage location from task paths."""

    def copy_args_hook(self, args):
        """
        Return ``args`` with a leading ``remote_storage.location`` removed from
        the path and the prefixed path; the source storage is passed through.

        Only a leading occurrence is removed. Paths that merely contain the
        location somewhere in the middle are left intact.
        """
        location = self.remote_storage.location  # type: ignore

        def strip_location(path):
            # An empty location is a prefix of every path; nothing to strip.
            if location and path.startswith(location):
                return path[len(location):]
            return path

        return (
            strip_location(args[0]),
            strip_location(args[1]),
            args[2],
        )


class TwoPassInMemoryStrategy(HashingTwoPassStrategy):
    """Two-pass strategy whose first pass uses InMemoryManifestFilesStorage."""

    first_manifest_storage = InMemoryManifestFilesStorage


class TwoPassFileSystemStrategy(HashingTwoPassStrategy):
    """Two-pass strategy whose first pass uses FileSystemManifestFilesStorage."""

    first_manifest_storage = FileSystemManifestFilesStorage
Loading

0 comments on commit 66b8c8b

Please sign in to comment.