Skip to content

Commit

Permalink
Merge pull request #493 from robhudson/elasticsearch
Browse files Browse the repository at this point in the history
Add elasticsearch backend and search api
  • Loading branch information
ericholscher committed Oct 27, 2013
2 parents f0d95a7 + b592289 commit 9b0bde3
Show file tree
Hide file tree
Showing 6 changed files with 318 additions and 4 deletions.
1 change: 1 addition & 0 deletions pip_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ django-profiles==0.2
django-secure==0.1.2
django==1.4.8
docutils==0.8.1
elasticsearch==0.4.3
github2==0.5.2
httplib2==0.7.2
mercurial==2.6.3
Expand Down
49 changes: 45 additions & 4 deletions readthedocs/restapi/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,14 @@
from django.conf import settings

from distlib.version import UnsupportedVersionError
from rest_framework import decorators
from rest_framework import permissions
from rest_framework import viewsets
from rest_framework import decorators, permissions, viewsets, status
from rest_framework.renderers import JSONPRenderer, JSONRenderer, BrowsableAPIRenderer
from rest_framework.response import Response

from betterversion.better import version_windows, BetterVersion
from builds.models import Version
from projects.models import Project, EmailHook
from search.indexes import Page as PageIndex, Project as ProjectIndex
from djangome import views as djangome

from .serializers import ProjectSerializer
Expand All @@ -29,7 +28,7 @@ def valid_versions(self, request, **kwargs):
"""
project = get_object_or_404(Project, pk=kwargs['pk'])
if not project.num_major or not project.num_minor or not project.num_point:
return Response({'error': 'Project does not support point version control.'})
return Response({'error': 'Project does not support point version control'}, status=status.HTTP_400_BAD_REQUEST)
versions = []
for ver in project.versions.all():
try:
Expand Down Expand Up @@ -177,3 +176,45 @@ def quick_search(request):
value = ':'.join(data.split(':')[6:])
ret_dict[key] = value
return Response({"results": ret_dict})

@decorators.api_view(['GET'])
@decorators.permission_classes((permissions.AllowAny,))
@decorators.renderer_classes((JSONRenderer, JSONPRenderer, BrowsableAPIRenderer))
def search(request):
    """
    Search Elasticsearch for projects, or for pages within one project.

    Query parameters:
        project: optional project id; when present, search that project's
            pages instead of the project index.
        version: version slug used to filter page results (default 'latest').
        q: the user's search terms.

    Returns a Response of the form {'results': <raw ES response>}.
    """
    project_id = request.GET.get('project', None)
    version_slug = request.GET.get('version', 'latest')
    query = request.GET.get('q', None)

    if project_id:
        # This is a search within a project -- do a Page search.
        #
        # Both term filters must apply, so combine them with an `and`
        # filter. (A dict literal with the key 'term' twice would silently
        # keep only the last entry, dropping the project filter.)
        body = {
            'filter': {
                'and': [
                    {'term': {'project': project_id}},
                    {'term': {'version': version_slug}},
                ],
            },
            'query': {
                'bool': {
                    'should': [
                        {'match': {'title': {'query': query, 'boost': 10}}},
                        {'match': {'headers': {'query': query, 'boost': 5}}},
                        {'match': {'content': {'query': query}}},
                    ]
                }
            }
        }
        # Route by project id so the query hits the shard holding that
        # project's pages (pages are indexed with the same routing).
        results = PageIndex().search(body, routing=project_id)

    else:
        body = {
            'query': {
                'bool': {
                    'should': [
                        {'match': {'name': {'query': query, 'boost': 10}}},
                        {'match': {'description': {'query': query}}},
                    ]
                }
            }
        }
        results = ProjectIndex().search(body)

    return Response({'results': results})
Empty file added readthedocs/search/__init__.py
Empty file.
267 changes: 267 additions & 0 deletions readthedocs/search/indexes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,267 @@
"""
Search indexing classes to index into Elasticsearch.
Django settings that should be defined:
`ES_HOSTS`: A list of hosts where Elasticsearch lives. E.g.
['192.168.1.1:9200', '192.168.2.1:9200']
`ES_DEFAULT_NUM_REPLICAS`: An integer of the number of replicas.
`ES_DEFAULT_NUM_SHARDS`: An integer of the number of shards.
TODO: Handle page removal case in Page.
"""
import datetime

from elasticsearch import Elasticsearch, exceptions
from elasticsearch.helpers import bulk_index

from django.conf import settings


class Index(object):
    """
    Base class to define some common methods across indexes.
    """
    # The _index and _type define the URL path to Elasticsearch, e.g.:
    # http://localhost:9200/{_index}/{_type}/_search
    _index = 'readthedocs'
    _type = None

    def __init__(self):
        # ES_HOSTS is a list of 'host:port' strings from Django settings.
        self.es = Elasticsearch(settings.ES_HOSTS)

    def get_settings(self, settings_override=None):
        """
        Returns settings to be passed to ES create_index.

        If `settings_override` is provided, this will use `settings_override`
        to override the defaults defined here.
        """
        default_settings = {
            'number_of_replicas': settings.ES_DEFAULT_NUM_REPLICAS,
            'number_of_shards': settings.ES_DEFAULT_NUM_SHARDS,
            'refresh_interval': '5s',
            'store.compress.tv': True,
            'store.compress.stored': True,
            'analysis': self.get_analysis(),
        }
        if settings_override:
            default_settings.update(settings_override)

        return default_settings

    def get_analysis(self):
        """
        Returns the analysis dict to be used in settings for create_index.

        For languages that ES supports we define either the minimal or light
        stemming, which isn't as aggressive as the snowball stemmer. We also
        define the stopwords for that language.

        For all languages we've customized we're using the ICU plugin.
        """
        analyzers = {}
        filters = {}

        # The default is used for fields that need ICU but are composed of
        # many languages.
        analyzers['default_icu'] = {
            'type': 'custom',
            'tokenizer': 'icu_tokenizer',
            'filter': ['word_delimiter', 'icu_folding', 'icu_normalizer'],
        }

        # Customize the word_delimiter filter to set various options.
        filters['custom_word_delimiter'] = {
            'type': 'word_delimiter',
            'preserve_original': True,
        }

        return {
            'analyzer': analyzers,
            'filter': filters,
        }

    def timestamped_index(self):
        """Return a unique, timestamped physical index name for this alias."""
        return '{0}-{1}'.format(
            self._index, datetime.datetime.now().strftime('%Y%m%d%H%M%S'))

    def create_index(self, index=None):
        """
        Creates index.

        This uses `get_settings` and `get_mappings` to define the index.
        """
        index = index or self._index
        body = {
            'settings': self.get_settings(),
        }
        self.es.indices.create(index=index, body=body)

    def put_mapping(self, index=None):
        """Register this type's mapping (from `get_mapping`) on `index`."""
        index = index or self._index
        self.es.indices.put_mapping(index, self._type, self.get_mapping())

    def bulk_index(self, data, index=None, chunk_size=500, parent=None):
        """
        Given a list of documents, uses Elasticsearch bulk indexing.

        For each doc this calls `extract_document`, then indexes.

        `chunk_size` defaults to the elasticsearch lib's default. Override per
        your document size as needed.
        """
        index = index or self._index
        docs = []
        for d in data:
            source = self.extract_document(d)
            doc = {
                '_index': index,
                '_type': self._type,
                '_id': source['id'],
                '_source': source,
            }
            if parent:
                doc['_parent'] = parent
            docs.append(doc)

        bulk_index(self.es, docs, chunk_size=chunk_size)

    def index_document(self, data, index=None, parent=None):
        """Index a single document built by `extract_document`."""
        index = index or self._index
        doc = self.extract_document(data)
        self.es.index(index=index, doc_type=self._type, body=doc, id=doc['id'],
                      parent=parent)

    def get_mapping(self):
        """
        Returns the mapping for this _index and _type.
        """
        # Fixed: `raise NotImplemented` raises TypeError (NotImplemented is a
        # constant, not an exception); NotImplementedError is the correct one.
        raise NotImplementedError

    def extract_document(self, data):
        """
        Extracts the Elasticsearch document for this object instance.
        """
        # Signature fixed to match the callers (bulk_index/index_document pass
        # a single `data` argument) and both subclasses' overrides.
        raise NotImplementedError

    def update_aliases(self, new_index, delete=True):
        """
        Points `_index` to `new_index` and deletes `_index` if delete=True.

        The ES `update_aliases` is atomic.
        """
        old_index = None

        # Get current alias, if any.
        try:
            aliases = self.es.indices.get_alias(name=self._index)
            if aliases:
                # list(aliases) works on both Python 2 and 3; keys()[0]
                # breaks on Python 3, where keys() returns a view.
                old_index = list(aliases)[0]
        except exceptions.NotFoundError:
            pass

        actions = []
        if old_index:
            actions.append({'remove': {'index': old_index,
                                       'alias': self._index}})
        actions.append({'add': {'index': new_index, 'alias': self._index}})

        self.es.indices.update_aliases(body={'actions': actions})

        # Delete old index if any and if specified.
        if delete and old_index:
            self.es.indices.delete(index=old_index)

    def search(self, body, **kwargs):
        """Search this index/type; extra kwargs (e.g. routing) pass to ES."""
        return self.es.search(index=self._index, doc_type=self._type,
                              body=body, **kwargs)


class Project(Index):
    """Index of Read the Docs projects (one document per project)."""

    _type = 'project'

    def get_mapping(self):
        """
        Return the Elasticsearch mapping for the 'project' type.
        """
        properties = {
            'id': {'type': 'long'},
            'name': {'type': 'string', 'analyzer': 'default_icu'},
            'slug': {'type': 'string', 'index': 'not_analyzed'},
            'description': {'type': 'string', 'analyzer': 'default_icu'},
            'lang': {'type': 'string', 'index': 'not_analyzed'},
            'author': {'type': 'string', 'analyzer': 'default_icu'},
            'url': {'type': 'string', 'index': 'not_analyzed'},
        }
        return {
            self._type: {
                # Disable _all field to reduce index size.
                '_all': {'enabled': False},
                # Add a boost field to enhance relevancy of a document.
                '_boost': {'name': '_boost', 'null_value': 1.0},
                'properties': properties,
            }
        }

    def extract_document(self, data):
        """
        Build the ES document dict from `data`, a dict of project fields.

        Missing fields default to ''; `_boost` defaults to 1.0.
        """
        fields = ('id', 'name', 'description', 'author', 'url')
        doc = dict((field, data.get(field, '')) for field in fields)
        # Add project boost.
        doc['_boost'] = data.get('_boost', 1.0)
        return doc


class Page(Index):
    """Index of documentation pages, parented to their project."""

    _type = 'page'
    _parent = 'project'

    def get_mapping(self):
        """
        Return the Elasticsearch mapping for the 'page' type.
        """
        properties = {
            'id': {'type': 'string', 'index': 'not_analyzed'},
            'project': {'type': 'long'},
            'title': {'type': 'string', 'analyzer': 'default_icu'},
            'headers': {'type': 'string', 'analyzer': 'default_icu'},
            'version': {'type': 'string', 'index': 'not_analyzed'},
            'path': {'type': 'string', 'index': 'not_analyzed'},
            'content': {'type': 'string', 'analyzer': 'default_icu'},
        }
        return {
            self._type: {
                # Disable _all field to reduce index size.
                '_all': {'enabled': False},
                # Add a boost field to enhance relevancy of a document.
                '_boost': {'name': '_boost', 'null_value': 1.0},
                # Associate a page with a project.
                '_parent': {'type': self._parent},
                'properties': properties,
            }
        }

    def extract_document(self, data):
        """
        Build the ES document dict from `data`, a dict of page fields.

        Missing fields default to ''; `_boost` defaults to 1.0.
        """
        fields = ('id', 'project', 'title', 'headers', 'version', 'path',
                  'content')
        doc = dict((field, data.get(field, '')) for field in fields)
        # Add page boost.
        doc['_boost'] = data.get('_boost', 1.0)
        return doc
4 changes: 4 additions & 0 deletions readthedocs/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,10 @@
},
}

# Elasticsearch settings.
ES_HOSTS = ['127.0.0.1:9200']
ES_DEFAULT_NUM_REPLICAS = 0
ES_DEFAULT_NUM_SHARDS = 5

AUTH_PROFILE_MODULE = "core.UserProfile"
SOUTH_TESTS_MIGRATE = False
Expand Down
1 change: 1 addition & 0 deletions readthedocs/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@
url(r'^api/v2/', include(router.urls)),
url(r'^api/v2/footer_html/$', 'restapi.views.footer_html', name='footer_html'),
url(r'^api/v2/quick_search/$', 'restapi.views.quick_search', name='quick_search'),
url(r'^api/v2/search/$', 'restapi.views.search', name='search'),
url(r'^api-auth/', include('rest_framework.urls', namespace='rest_framework')),
url(r'^feeds/new/$',
NewProjectsFeed(),
Expand Down

0 comments on commit 9b0bde3

Please sign in to comment.