Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Index path with original path name #5785

Merged
merged 12 commits into from
Jun 18, 2019
10 changes: 5 additions & 5 deletions readthedocs/projects/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1231,19 +1231,19 @@ def get_processed_json(self):
Both lead to `foo/index.html`
https://github.com/rtfd/readthedocs.org/issues/5368
"""
paths = []
fjson_paths = []
basename = os.path.splitext(self.path)[0]
paths.append(basename + '.fjson')
fjson_paths.append(basename + '.fjson')
if basename.endswith('/index'):
new_basename = re.sub(r'\/index$', '', basename)
paths.append(new_basename + '.fjson')
fjson_paths.append(new_basename + '.fjson')

full_json_path = self.project.get_production_media_path(
type_='json', version_slug=self.version.slug, include_file=False
)
try:
for path in paths:
file_path = os.path.join(full_json_path, path)
for fjson_path in fjson_paths:
file_path = os.path.join(full_json_path, fjson_path)
if os.path.exists(file_path):
return process_file(file_path)
except Exception:
Expand Down
4 changes: 2 additions & 2 deletions readthedocs/search/api.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import logging
from pprint import pformat

from rest_framework import generics
from rest_framework import serializers
from rest_framework import generics, serializers
from rest_framework.exceptions import ValidationError
from rest_framework.pagination import PageNumberPagination

from readthedocs.search.faceted_search import PageSearch
from readthedocs.search.utils import get_project_list_or_404


log = logging.getLogger(__name__)


Expand Down
3 changes: 2 additions & 1 deletion readthedocs/search/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ class PageDocument(RTDDocTypeMixin, DocType):
project = fields.KeywordField(attr='project.slug')
version = fields.KeywordField(attr='version.slug')
path = fields.KeywordField(attr='processed_json.path')
full_path = fields.KeywordField(attr='path')

# Searchable content
title = fields.TextField(attr='processed_json.title')
Expand Down Expand Up @@ -153,7 +154,7 @@ def faceted_search(

def get_queryset(self):
"""Overwrite default queryset to filter certain files to index."""
queryset = super(PageDocument, self).get_queryset()
queryset = super().get_queryset()

# Do not index files that belong to non sphinx project
# Also do not index certain files
Expand Down
25 changes: 14 additions & 11 deletions readthedocs/search/parse_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,38 +59,41 @@ def generate_sections_from_pyquery(body):
}


def process_file(filename):
"""Read a file from disk and parse it into a structured dict."""
def process_file(fjson_filename):
"""Read the fjson file from disk and parse it into a structured dict."""
try:
with codecs.open(filename, encoding='utf-8', mode='r') as f:
with codecs.open(fjson_filename, encoding='utf-8', mode='r') as f:
file_contents = f.read()
except IOError:
log.info('Unable to read file: %s', filename)
return None
log.info('Unable to read file: %s', fjson_filename)
raise
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Won't this cause all indexing to fail if a single file is missing?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see we're catching it at a higher level, 👍

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We were returning None, but we expect to always have a dict.

This is caught by https://github.com/stsewd/readthedocs.org/blob/96a85fa8af3cac8b139bdf99598d119eae0e0163/readthedocs/projects/models.py#L1244-L1260

Which returns a default dict

data = json.loads(file_contents)
sections = []
path = ''
title = ''
body_content = ''

if 'current_page_name' in data:
path = data['current_page_name']
else:
log.info('Unable to index file due to no name %s', filename)
return None
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we return none from this function, get_processed_json is going to return None too, we always expect a dict from here. And get_processed_json has a default https://github.com/stsewd/readthedocs.org/blob/a2e0e3f5e442b0072a4b7cbac9efadbe2a41c224/readthedocs/projects/models.py#L1254-L1261

if 'body' in data and data['body']:
log.info('Unable to index file due to no name %s', fjson_filename)

if data.get('body'):
body = PyQuery(data['body'])
body_content = body.text().replace('¶', '')
sections.extend(generate_sections_from_pyquery(body))
else:
log.info('Unable to index content for: %s', filename)
log.info('Unable to index content for: %s', fjson_filename)

if 'title' in data:
title = data['title']
if title.startswith('<'):
title = PyQuery(data['title']).text()
else:
log.info('Unable to index title for: %s', filename)
log.info('Unable to index title for: %s', fjson_filename)

return {
'headers': process_headers(data, filename),
'headers': process_headers(data, fjson_filename),
'content': body_content,
'path': path,
'title': title,
Expand Down
2 changes: 1 addition & 1 deletion readthedocs/templates/search/elastic_search.html
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ <h3>

{% elif 'page' in result.meta.index %}

<a href="{% doc_url result.project|get_project result.version result.path %}?highlight={{ query }}">
<a href="{% doc_url result.project|get_project result.version result.full_path %}?highlight={{ query }}">
{{ result.project }} - {{ result.title }}
</a>
{% for fragment in result.meta.highlight.content %}
Expand Down