Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add elasticsearch backend and search api #493

Merged
merged 5 commits into from
Oct 27, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pip_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ django-profiles==0.2
django-secure==0.1.2
django==1.4.8
docutils==0.8.1
elasticsearch==0.4.3
github2==0.5.2
httplib2==0.7.2
mercurial==2.6.3
Expand Down
49 changes: 45 additions & 4 deletions readthedocs/restapi/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,14 @@
from django.conf import settings

from distlib.version import UnsupportedVersionError
from rest_framework import decorators
from rest_framework import permissions
from rest_framework import viewsets
from rest_framework import decorators, permissions, viewsets, status
from rest_framework.renderers import JSONPRenderer, JSONRenderer, BrowsableAPIRenderer
from rest_framework.response import Response

from betterversion.better import version_windows, BetterVersion
from builds.models import Version
from projects.models import Project, EmailHook
from search.indexes import Page as PageIndex, Project as ProjectIndex
from djangome import views as djangome

from .serializers import ProjectSerializer
Expand All @@ -29,7 +28,7 @@ def valid_versions(self, request, **kwargs):
"""
project = get_object_or_404(Project, pk=kwargs['pk'])
if not project.num_major or not project.num_minor or not project.num_point:
return Response({'error': 'Project does not support point version control.'})
return Response({'error': 'Project does not support point version control'}, status=status.HTTP_400_BAD_REQUEST)
versions = []
for ver in project.versions.all():
try:
Expand Down Expand Up @@ -177,3 +176,45 @@ def quick_search(request):
value = ':'.join(data.split(':')[6:])
ret_dict[key] = value
return Response({"results": ret_dict})

@decorators.api_view(['GET'])
@decorators.permission_classes((permissions.AllowAny,))
@decorators.renderer_classes((JSONRenderer, JSONPRenderer, BrowsableAPIRenderer))
def search(request):
    """
    Perform a search against Elasticsearch.

    Query parameters:
        project: optional project id. When given, a Page search is run
            scoped to that project; otherwise a Project search is run.
        version: version slug used to filter Page results ('latest' by
            default).
        q: the user's query string.

    Returns a Response of {'results': <raw ES search response>}.
    """
    project_id = request.GET.get('project', None)
    version_slug = request.GET.get('version', 'latest')
    query = request.GET.get('q', None)

    if project_id:
        # This is a search within a project -- do a Page search.
        # NOTE: both term filters must apply. A single dict with a repeated
        # 'term' key would silently drop the project filter (the second key
        # wins in a Python dict literal), so combine them with an 'and'
        # filter instead.
        body = {
            'filter': {
                'and': [
                    {'term': {'project': project_id}},
                    {'term': {'version': version_slug}},
                ],
            },
            'query': {
                'bool': {
                    'should': [
                        {'match': {'title': {'query': query, 'boost': 10}}},
                        {'match': {'headers': {'query': query, 'boost': 5}}},
                        {'match': {'content': {'query': query}}},
                    ]
                }
            }
        }
        # Route to the project's shard so parent/child docs stay together.
        results = PageIndex().search(body, routing=project_id)

    else:
        body = {
            'query': {
                'bool': {
                    'should': [
                        {'match': {'name': {'query': query, 'boost': 10}}},
                        {'match': {'description': {'query': query}}},
                    ]
                }
            }
        }
        results = ProjectIndex().search(body)

    return Response({'results': results})
Empty file added readthedocs/search/__init__.py
Empty file.
267 changes: 267 additions & 0 deletions readthedocs/search/indexes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,267 @@
"""
Search indexing classes to index into Elasticsearch.

Django settings that should be defined:

`ES_HOSTS`: A list of hosts where Elasticsearch lives. E.g.
['192.168.1.1:9200', '192.168.2.1:9200']

`ES_DEFAULT_NUM_REPLICAS`: An integer of the number of replicas.

`ES_DEFAULT_NUM_SHARDS`: An integer of the number of shards.


TODO: Handle page removal case in Page.

"""
import datetime

from elasticsearch import Elasticsearch, exceptions
from elasticsearch.helpers import bulk_index

from django.conf import settings


class Index(object):
    """
    Base class to define some common methods across indexes.

    Subclasses set `_type` and implement `get_mapping` and
    `extract_document`.
    """
    # The _index and _type define the URL path to Elasticsearch, e.g.:
    # http://localhost:9200/{_index}/{_type}/_search
    _index = 'readthedocs'
    _type = None

    def __init__(self):
        self.es = Elasticsearch(settings.ES_HOSTS)

    def get_settings(self, settings_override=None):
        """
        Returns settings to be passed to ES create_index.

        If `settings_override` is provided, this will use `settings_override`
        to override the defaults defined here.

        """
        default_settings = {
            'number_of_replicas': settings.ES_DEFAULT_NUM_REPLICAS,
            'number_of_shards': settings.ES_DEFAULT_NUM_SHARDS,
            'refresh_interval': '5s',
            'store.compress.tv': True,
            'store.compress.stored': True,
            'analysis': self.get_analysis(),
        }
        if settings_override:
            default_settings.update(settings_override)

        return default_settings

    def get_analysis(self):
        """
        Returns the analysis dict to be used in settings for create_index.

        For languages that ES supports we define either the minimal or light
        stemming, which isn't as aggressive as the snowball stemmer. We also
        define the stopwords for that language.

        For all languages we've customized we're using the ICU plugin.

        """
        analyzers = {}
        filters = {}

        # The default is used for fields that need ICU but are composed of
        # many languages.
        analyzers['default_icu'] = {
            'type': 'custom',
            'tokenizer': 'icu_tokenizer',
            'filter': ['word_delimiter', 'icu_folding', 'icu_normalizer'],
        }

        # Customize the word_delimiter filter to set various options.
        # NOTE(review): 'custom_word_delimiter' is defined but the analyzer
        # above references the stock 'word_delimiter' filter -- confirm
        # whether the custom filter was meant to be used there.
        filters['custom_word_delimiter'] = {
            'type': 'word_delimiter',
            'preserve_original': True,
        }

        return {
            'analyzer': analyzers,
            'filter': filters,
        }

    def timestamped_index(self):
        """Return `_index` suffixed with the current timestamp, e.g.
        'readthedocs-20131027120000', for use with aliased reindexing."""
        return '{0}-{1}'.format(
            self._index, datetime.datetime.now().strftime('%Y%m%d%H%M%S'))

    def create_index(self, index=None):
        """
        Creates index.

        This uses `get_settings` and `get_mappings` to define the index.

        """
        index = index or self._index
        body = {
            'settings': self.get_settings(),
        }
        self.es.indices.create(index=index, body=body)

    def put_mapping(self, index=None):
        """Register this type's mapping (from `get_mapping`) on the index."""
        index = index or self._index
        self.es.indices.put_mapping(index, self._type, self.get_mapping())

    def bulk_index(self, data, index=None, chunk_size=500, parent=None):
        """
        Given a list of documents, uses Elasticsearch bulk indexing.

        For each doc this calls `extract_document`, then indexes.

        `chunk_size` defaults to the elasticsearch lib's default. Override per
        your document size as needed.

        """
        index = index or self._index
        docs = []
        for d in data:
            source = self.extract_document(d)
            doc = {
                '_index': index,
                '_type': self._type,
                '_id': source['id'],
                '_source': source,
            }
            if parent:
                doc['_parent'] = parent
            docs.append(doc)

        bulk_index(self.es, docs, chunk_size=chunk_size)

    def index_document(self, data, index=None, parent=None):
        """Index a single document extracted from `data`."""
        index = index or self._index
        doc = self.extract_document(data)
        self.es.index(index=index, doc_type=self._type, body=doc, id=doc['id'],
                      parent=parent)

    def get_mapping(self):
        """
        Returns the mapping for this _index and _type.
        """
        # Fix: `raise NotImplemented` raised the NotImplemented singleton,
        # which is not an exception class; NotImplementedError is correct.
        raise NotImplementedError

    def extract_document(self, data):
        """
        Extracts the Elasticsearch document for this object instance.

        Fix: the signature was `(self, pk, obj)`, which contradicted both
        subclasses and the single-argument calls in `bulk_index` and
        `index_document`; aligned to `(self, data)`.
        """
        raise NotImplementedError

    def update_aliases(self, new_index, delete=True):
        """
        Points `_index` to `new_index` and deletes `_index` if delete=True.

        The ES `update_aliases` is atomic.
        """
        old_index = None

        # Get current alias, if any.
        try:
            aliases = self.es.indices.get_alias(name=self._index)
            if aliases:
                # next(iter(...)) works for both Py2 dicts and Py3 views,
                # unlike `aliases.keys()[0]`.
                old_index = next(iter(aliases))
        except exceptions.NotFoundError:
            pass

        actions = []
        if old_index:
            actions.append({'remove': {'index': old_index,
                                       'alias': self._index}})
        actions.append({'add': {'index': new_index, 'alias': self._index}})

        self.es.indices.update_aliases(body={'actions': actions})

        # Delete old index if any and if specified.
        if delete and old_index:
            self.es.indices.delete(index=old_index)

    def search(self, body, **kwargs):
        """Run an ES search with `body` against this index and type."""
        return self.es.search(index=self._index, doc_type=self._type,
                              body=body, **kwargs)


class Project(Index):
    """Index of project-level documents."""

    _type = 'project'

    def get_mapping(self):
        """Return the Elasticsearch mapping for the project type."""
        field_defs = {
            'id': {'type': 'long'},
            'name': {'type': 'string', 'analyzer': 'default_icu'},
            'slug': {'type': 'string', 'index': 'not_analyzed'},
            'description': {'type': 'string',
                            'analyzer': 'default_icu'},
            'lang': {'type': 'string', 'index': 'not_analyzed'},
            'author': {'type': 'string', 'analyzer': 'default_icu'},
            'url': {'type': 'string', 'index': 'not_analyzed'},
        }
        return {
            self._type: {
                # Disable _all field to reduce index size.
                '_all': {'enabled': False},
                # Add a boost field to enhance relevancy of a document.
                '_boost': {'name': '_boost', 'null_value': 1.0},
                'properties': field_defs,
            }
        }

    def extract_document(self, data):
        """Build the ES source dict for a project from `data`."""
        wanted = ('id', 'name', 'description', 'author', 'url')
        doc = dict((field, data.get(field, '')) for field in wanted)

        # Add project boost.
        doc['_boost'] = data.get('_boost', 1.0)

        return doc


class Page(Index):
    """Index of page-level documents, parented to their project."""

    _type = 'page'
    _parent = 'project'

    def get_mapping(self):
        """Return the Elasticsearch mapping for the page type."""
        field_defs = {
            'id': {'type': 'string', 'index': 'not_analyzed'},
            'project': {'type': 'long'},
            'title': {'type': 'string', 'analyzer': 'default_icu'},
            'headers': {'type': 'string', 'analyzer': 'default_icu'},
            'version': {'type': 'string', 'index': 'not_analyzed'},
            'path': {'type': 'string', 'index': 'not_analyzed'},
            'content': {'type': 'string', 'analyzer': 'default_icu'},
        }
        return {
            self._type: {
                # Disable _all field to reduce index size.
                '_all': {'enabled': False},
                # Add a boost field to enhance relevancy of a document.
                '_boost': {'name': '_boost', 'null_value': 1.0},
                # Associate a page with a project.
                '_parent': {'type': self._parent},
                'properties': field_defs,
            }
        }

    def extract_document(self, data):
        """Build the ES source dict for a page from `data`."""
        wanted = ('id', 'project', 'title', 'headers', 'version', 'path',
                  'content')
        doc = dict((field, data.get(field, '')) for field in wanted)

        # Add page boost.
        doc['_boost'] = data.get('_boost', 1.0)

        return doc
4 changes: 4 additions & 0 deletions readthedocs/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,10 @@
},
}

# Elasticsearch settings.
ES_HOSTS = ['127.0.0.1:9200']
ES_DEFAULT_NUM_REPLICAS = 0
ES_DEFAULT_NUM_SHARDS = 5

AUTH_PROFILE_MODULE = "core.UserProfile"
SOUTH_TESTS_MIGRATE = False
Expand Down
1 change: 1 addition & 0 deletions readthedocs/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@
url(r'^api/v2/', include(router.urls)),
url(r'^api/v2/footer_html/$', 'restapi.views.footer_html', name='footer_html'),
url(r'^api/v2/quick_search/$', 'restapi.views.quick_search', name='quick_search'),
url(r'^api/v2/search/$', 'restapi.views.search', name='search'),
url(r'^api-auth/', include('rest_framework.urls', namespace='rest_framework')),
url(r'^feeds/new/$',
NewProjectsFeed(),
Expand Down