Search indexing with storage #5854

Merged · 11 commits · merged Aug 8, 2019
4 changes: 2 additions & 2 deletions docs/development/settings.rst
@@ -97,11 +97,11 @@ Whether to include `django.contrib.admin` in the URL's.
 RTD_BUILD_MEDIA_STORAGE
 -----------------------
 
-Default: ``None``
+Default: ``readthedocs.builds.storage.BuildMediaFileSystemStorage``
 
 Use this storage class to upload build artifacts to cloud storage (S3, Azure storage).
 This should be a dotted path to the relevant class (eg. ``'path.to.MyBuildMediaStorage'``).
-This class should mixin :class:`readthedocs.builds.storage.BuildMediaStorageMixin`.
+Your class should mixin :class:`readthedocs.builds.storage.BuildMediaStorageMixin`.
 
 
 ELASTICSEARCH_DSL
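For illustration, a custom backend satisfying this contract might look like the sketch below. The `MyBuildMediaStorage` name, the `myproject.storage` module path, and the django-storages S3 backend are assumptions for the example, not part of this PR:

# myproject/storage.py -- hypothetical custom build-media backend
from readthedocs.builds.storage import BuildMediaStorageMixin
from storages.backends.s3boto3 import S3Boto3Storage  # assumes django-storages is installed


class MyBuildMediaStorage(BuildMediaStorageMixin, S3Boto3Storage):
    bucket_name = 'my-build-media'  # illustrative bucket name


# settings.py -- point the setting at the dotted path of that class
RTD_BUILD_MEDIA_STORAGE = 'myproject.storage.MyBuildMediaStorage'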
23 changes: 20 additions & 3 deletions readthedocs/builds/storage.py
@@ -64,10 +64,10 @@ def delete_directory(self, path):
         for folder_name in folders:
             if folder_name:
                 # Recursively delete the subdirectory
-                self.delete_directory(safe_join(path, folder_name))
+                self.delete_directory(self.join(path, folder_name))
         for filename in files:
             if filename:
-                self.delete(safe_join(path, filename))
+                self.delete(self.join(path, filename))
 
     def copy_directory(self, source, destination):
         """
@@ -79,14 +79,31 @@ def copy_directory(self, source, destination):
         log.debug('Copying source directory %s to media storage at %s', source, destination)
         source = Path(source)
         for filepath in source.iterdir():
-            sub_destination = safe_join(destination, filepath.name)
+            sub_destination = self.join(destination, filepath.name)
             if filepath.is_dir():
                 # Recursively copy the subdirectory
                 self.copy_directory(filepath, sub_destination)
             elif filepath.is_file():
                 with filepath.open('rb') as fd:
                     self.save(sub_destination, fd)
 
+    def join(self, directory, filepath):
+        return safe_join(directory, filepath)
+
+    def walk(self, top):
+        if top in ('', '/'):
+            raise SuspiciousFileOperation('Iterating all storage cannot be right')
+
+        log.debug('Walking %s in media storage', top)
+        folders, files = self.listdir(self._dirpath(top))
+
+        yield top, folders, files
+
+        for folder_name in folders:
+            if folder_name:
+                # Recursively walk the subdirectory
+                yield from self.walk(self.join(top, folder_name))
+
 
 class BuildMediaFileSystemStorage(BuildMediaStorageMixin, FileSystemStorage):
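The new `walk()` generator mirrors `os.walk()` but goes through the storage API (`listdir()` via `_dirpath()`), so the same traversal works for local and remote backends alike. A minimal usage sketch, assuming a configured backend; the 'json/myproject/latest' prefix is illustrative, not a path taken from this PR:

from django.conf import settings
from django.core.files.storage import get_storage_class

# Instantiate whatever backend RTD_BUILD_MEDIA_STORAGE points at,
# the same way get_processed_json() does in the models.py diff below.
storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()

# Each iteration yields (top, folders, files), just like os.walk(),
# recursing with storage.join() instead of os.path.join().
for top, folders, files in storage.walk('json/myproject/latest'):
    for filename in files:
        print(storage.join(top, filename))

Note that `walk('')` and `walk('/')` deliberately raise `SuspiciousFileOperation`, so a caller cannot iterate the entire bucket by accident.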
46 changes: 28 additions & 18 deletions readthedocs/projects/models.py
@@ -1271,26 +1271,36 @@ def get_processed_json(self):
         Both lead to `foo/index.html`
         https://github.com/rtfd/readthedocs.org/issues/5368
         """
-        fjson_paths = []
-        basename = os.path.splitext(self.path)[0]
-        fjson_paths.append(basename + '.fjson')
-        if basename.endswith('/index'):
-            new_basename = re.sub(r'\/index$', '', basename)
-            fjson_paths.append(new_basename + '.fjson')
-
-        full_json_path = self.project.get_production_media_path(
-            type_='json', version_slug=self.version.slug, include_file=False
-        )
-        try:
-            for fjson_path in fjson_paths:
-                file_path = os.path.join(full_json_path, fjson_path)
-                if os.path.exists(file_path):
-                    return process_file(file_path)
-        except Exception:
+        file_path = None
+
+        if settings.RTD_BUILD_MEDIA_STORAGE:
+            storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()
+
+            fjson_paths = []
+            basename = os.path.splitext(self.path)[0]
+            fjson_paths.append(basename + '.fjson')
+            if basename.endswith('/index'):
+                new_basename = re.sub(r'\/index$', '', basename)
+                fjson_paths.append(new_basename + '.fjson')
+
+            storage_path = self.project.get_storage_path(
+                type_='json', version_slug=self.version.slug, include_file=False
+            )
+            try:
+                for fjson_path in fjson_paths:
+                    file_path = storage.join(storage_path, fjson_path)
+                    if storage.exists(file_path):
+                        return process_file(file_path)
+            except Exception:
+                log.warning(
+                    'Unhandled exception during search processing file: %s',
+                    file_path,
+                )
+        else:
             log.warning(
-                'Unhandled exception during search processing file: %s',
-                file_path,
+                'Skipping HTMLFile processing because of no storage backend'
             )
 
         return {
             'path': file_path,
             'title': '',

A team member commented inline on the storage lookup loop: "I believe we should be able to get away from this soon. We should be starting to store the proper path of a file after readthedocs/readthedocs-sphinx-ext#62 is merged."
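The two candidate ``.fjson`` paths exist because Sphinx may build ``foo.rst`` to either ``foo.html`` or ``foo/index.html`` (see issue #5368 linked in the docstring). A small standalone sketch of that fan-out; `candidate_fjson_paths` is a hypothetical helper that mirrors the loop above:

import os
import re

def candidate_fjson_paths(path):
    # Mirrors the candidate-path logic in get_processed_json() above.
    basename = os.path.splitext(path)[0]
    paths = [basename + '.fjson']
    if basename.endswith('/index'):
        # 'foo/index.html' may correspond to a 'foo.fjson' artifact,
        # so also try the name with the trailing '/index' stripped.
        paths.append(re.sub(r'\/index$', '', basename) + '.fjson')
    return paths

print(candidate_fjson_paths('guides/install/index.html'))
# ['guides/install/index.fjson', 'guides/install.fjson']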