Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Write build artifacts to (cloud) storage from build servers #5549

Merged
merged 9 commits into from
May 9, 2019
Merged
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ logs/*
media/dash
media/epub
media/export
media/html
media/htmlzip
media/json
media/man
Expand Down
10 changes: 10 additions & 0 deletions docs/settings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,16 @@ Default: :djangosetting:`ALLOW_ADMIN`
Whether to include `django.contrib.admin` in the URL's.


RTD_BUILD_MEDIA_STORAGE
-----------------------

Default: ``None``

Use this storage class to upload build artifacts to cloud storage (S3, Azure storage).
This should be a dotted path to the relevant class (eg. ``'path.to.MyBuildMediaStorage'``).
This class should mixin :class:`readthedocs.builds.storage.BuildMediaStorageMixin`.


ELASTICSEARCH_DSL
-----------------

Expand Down
105 changes: 105 additions & 0 deletions readthedocs/builds/storage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import logging
from pathlib import Path

from django.core.files.storage import FileSystemStorage
from storages.utils import safe_join, get_available_overwrite_name


log = logging.getLogger(__name__)


class BuildMediaStorageMixin:

"""
A mixin for Storage classes needed to write build artifacts

This adds and modifies some functionality to Django's File Storage API.
By default, classes mixing this in will now overwrite files by default instead
of finding an available name.
This mixin also adds convenience methods to copy and delete entire directories.

See: https://docs.djangoproject.com/en/1.11/ref/files/storage
"""

@staticmethod
def _dirpath(path):
"""It may just be Azure, but for listdir to work correctly, the path must end in '/'"""
path = str(path)
if not path.endswith('/'):
path += '/'

return path

def get_available_name(self, name, max_length=None):
"""
Overrides Django's storage implementation to always return the passed name (overwrite)

By default, Django will not overwrite files even if the same name is specified.
This changes that functionality so that the default is to use the same name and overwrite
rather than modify the path to not clobber files.
"""
return get_available_overwrite_name(name, max_length=max_length)

def delete_directory(self, path):
"""
Delete all files under a certain path from storage

Many storage backends (S3, Azure storage) don't care about "directories".
The directory effectively doesn't exist if there are no files in it.
However, in these backends, there is no "rmdir" operation so you have to recursively
delete all files.

:param path: the path to the directory to remove
"""
log.debug('Deleting directory %s from media storage', path)
folders, files = self.listdir(self._dirpath(path))
for folder_name in folders:
if folder_name:
# Recursively delete the subdirectory
self.delete_directory(safe_join(path, folder_name))
for filename in files:
if filename:
self.delete(safe_join(path, filename))

def copy_directory(self, source, destination):
"""
Copy a directory recursively to storage

:param source: the source path on the local disk
:param destination: the destination path in storage
"""
log.debug('Copying source directory %s to media storage at %s', source, destination)
source = Path(source)
for filepath in source.iterdir():
sub_destination = safe_join(destination, filepath.name)
if filepath.is_dir():
# Recursively copy the subdirectory
self.copy_directory(filepath, sub_destination)
elif filepath.is_file():
with filepath.open('rb') as fd:
self.save(sub_destination, fd)


class BuildMediaFileSystemStorage(BuildMediaStorageMixin, FileSystemStorage):

"""Storage subclass that writes build artifacts under MEDIA_ROOT"""

def get_available_name(self, name, max_length=None):
"""
A hack to overwrite by default with the FileSystemStorage

After upgrading to Django 2.2, this method can be removed
because subclasses can set OS_OPEN_FLAGS such that FileSystemStorage._save
will properly overwrite files.
See: https://github.com/django/django/pull/8476
"""
name = super().get_available_name(name, max_length=max_length)
if self.exists(name):
self.delete(name)
return name

def listdir(self, path):
"""Return empty lists for nonexistent directories (as cloud storages do)"""
if not self.exists(path):
return [], []
return super().listdir(path)
66 changes: 9 additions & 57 deletions readthedocs/builds/syncers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
import shutil

from django.conf import settings
from django.core.exceptions import SuspiciousFileOperation
from django.core.files.storage import get_storage_class

from readthedocs.core.utils import safe_makedirs
Expand All @@ -31,6 +30,15 @@ def copy(cls, path, target, is_file=False, **kwargs):
raise NotImplementedError


class NullSyncer:

"""A syncer that doesn't actually do anything"""

@classmethod
def copy(cls, path, target, is_file=False, **kwargs):
pass # noqa


class LocalSyncer(BaseSyncer):

@classmethod
Expand Down Expand Up @@ -169,62 +177,6 @@ def copy(cls, path, target, host, is_file=False, **kwargs): # pylint: disable=a
)


class SelectiveStorageRemotePuller(RemotePuller):

"""
Like RemotePuller but certain files are copied via Django's storage system.

If a file with extensions specified by ``extensions`` is copied, it will be copied to storage
and the original is removed.

See: https://docs.djangoproject.com/en/1.11/ref/settings/#std:setting-DEFAULT_FILE_STORAGE
"""

extensions = ('.pdf', '.epub', '.zip')

@classmethod
def get_storage_path(cls, path):
"""
Gets the path to the file within the storage engine.

For example, if the path was $MEDIA_ROOT/pdfs/latest.pdf
the storage_path is 'pdfs/latest.pdf'

:raises: SuspiciousFileOperation if the path isn't under settings.MEDIA_ROOT
"""
path = os.path.normpath(path)
if not path.startswith(settings.MEDIA_ROOT):
raise SuspiciousFileOperation

path = path.replace(settings.MEDIA_ROOT, '').lstrip('/')
return path

@classmethod
def copy(cls, path, target, host, is_file=False, **kwargs): # pylint: disable=arguments-differ
RemotePuller.copy(path, target, host, is_file, **kwargs)

if getattr(storage, 'write_build_media', False):
# This is a sanity check for the case where
# storage is backed by the local filesystem
# In that case, removing the original target file locally
# would remove the file from storage as well

if is_file and os.path.exists(target) and \
any([target.lower().endswith(ext) for ext in cls.extensions]):
log.info('Selective Copy %s to media storage', target)

try:
storage_path = cls.get_storage_path(target)

if storage.exists(storage_path):
storage.delete(storage_path)

with open(target, 'rb') as fd:
storage.save(storage_path, fd)
except Exception:
log.exception('Storage access failed for file. Not failing build.')


class Syncer(SettingsOverrideObject):
_default_class = LocalSyncer
_override_setting = 'FILE_SYNCER'
16 changes: 11 additions & 5 deletions readthedocs/projects/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -510,23 +510,29 @@ def get_subproject_urls(self):
return [(proj.child.slug, proj.child.get_docs_url())
for proj in self.subprojects.all()]

def get_storage_path(self, type_, version_slug=LATEST):
def get_storage_path(self, type_, version_slug=LATEST, include_file=True):
"""
Get a path to a build artifact for use with Django's storage system.

:param type_: Media content type, ie - 'pdf', 'htmlzip'
:param version_slug: Project version slug for lookup
:param include_file: Include file name in return
:return: the path to an item in storage
(can be used with ``storage.url`` to get the URL)
"""
extension = type_.replace('htmlzip', 'zip')
return '{}/{}/{}/{}.{}'.format(
folder_path = '{}/{}/{}'.format(
type_,
self.slug,
version_slug,
self.slug,
extension,
)
if include_file:
extension = type_.replace('htmlzip', 'zip')
return '{}/{}.{}'.format(
folder_path,
self.slug,
extension,
)
return folder_path

def get_production_media_path(self, type_, version_slug, include_file=True):
"""
Expand Down
Loading