Merge pull request #493 from robhudson/elasticsearch

Add elasticsearch backend and search api
readthedocs · Oct 27, 2013 · 9b0bde3 · 9b0bde3
2 parents f0d95a7 + b592289
commit 9b0bde3
Show file tree

Hide file tree

Showing 6 changed files with 318 additions and 4 deletions.
diff --git a/pip_requirements.txt b/pip_requirements.txt
@@ -14,6 +14,7 @@ django-profiles==0.2
 django-secure==0.1.2
 django==1.4.8
 docutils==0.8.1
+elasticsearch==0.4.3
 github2==0.5.2
 httplib2==0.7.2
 mercurial==2.6.3

diff --git a/readthedocs/restapi/views.py b/readthedocs/restapi/views.py
@@ -3,15 +3,14 @@
 from django.conf import settings
 
 from distlib.version import UnsupportedVersionError
-from rest_framework import decorators
-from rest_framework import permissions
-from rest_framework import viewsets
+from rest_framework import decorators, permissions, viewsets, status
 from rest_framework.renderers import JSONPRenderer, JSONRenderer, BrowsableAPIRenderer
 from rest_framework.response import Response
 
 from betterversion.better import version_windows, BetterVersion 
 from builds.models import Version
 from projects.models import Project, EmailHook
+from search.indexes import Page as PageIndex, Project as ProjectIndex
 from djangome import views as djangome
 
 from .serializers import ProjectSerializer
@@ -29,7 +28,7 @@ def valid_versions(self, request, **kwargs):
         """
         project = get_object_or_404(Project, pk=kwargs['pk'])
         if not project.num_major or not project.num_minor or not project.num_point:
-            return Response({'error': 'Project does not support point version control.'})
+            return Response({'error': 'Project does not support point version control'}, status=status.HTTP_400_BAD_REQUEST)
         versions = []
         for ver in project.versions.all():
             try:
@@ -177,3 +176,45 @@ def quick_search(request):
             value = ':'.join(data.split(':')[6:])
             ret_dict[key] = value
     return Response({"results": ret_dict})
+
+@decorators.api_view(['GET'])
+@decorators.permission_classes((permissions.AllowAny,))
+@decorators.renderer_classes((JSONRenderer, JSONPRenderer, BrowsableAPIRenderer))
+def search(request):
+    """
+    Search API endpoint backed by Elasticsearch.
+
+    GET parameters:
+
+        `q`: The search term (required).
+
+        `project`: If given, pages of that project are searched; otherwise
+                   projects themselves are searched.
+
+        `version`: Version slug used to filter page results. Defaults to
+                   'latest'. Only used together with `project`.
+
+    Responds with ``{'results': <raw Elasticsearch response>}``, or with a
+    400 and an ``error`` message when `q` is missing or empty.
+
+    """
+    project_id = request.GET.get('project', None)
+    version_slug = request.GET.get('version', 'latest')
+    query = request.GET.get('q', None)
+
+    if not query:
+        # Without a term every `match` clause below would carry a null query.
+        return Response({'error': 'Search term required. Use the "q" GET arg to search.'},
+                        status=status.HTTP_400_BAD_REQUEST)
+
+    if project_id:
+        # This is a search within a project -- do a Page search.
+        body = {
+            # Both terms must be wrapped in an `and` filter: two 'term' keys
+            # in a single dict literal collapse into one, which silently
+            # dropped the project restriction.
+            'filter': {
+                'and': [
+                    {'term': {'project': project_id}},
+                    {'term': {'version': version_slug}},
+                ]
+            },
+            'query': {
+                'bool': {
+                    'should': [
+                        {'match': {'title': {'query': query, 'boost': 10}}},
+                        {'match': {'headers': {'query': query, 'boost': 5}}},
+                        {'match': {'content': {'query': query}}},
+                    ]
+                }
+            }
+        }
+        results = PageIndex().search(body, routing=project_id)
+
+    else:
+        # No project given -- search across projects by name/description.
+        body = {
+            'query': {
+                'bool': {
+                    'should': [
+                        {'match': {'name': {'query': query, 'boost': 10}}},
+                        {'match': {'description': {'query': query}}},
+                    ]
+                }
+            }
+        }
+        results = ProjectIndex().search(body)
+
+    return Response({'results': results})
diff --git a/readthedocs/search/__init__.py b/readthedocs/search/__init__.py
diff --git a/readthedocs/search/indexes.py b/readthedocs/search/indexes.py
@@ -0,0 +1,267 @@
+"""
+Search indexing classes to index into Elasticsearch.
+
+Django settings that should be defined:
+
+    `ES_HOSTS`: A list of hosts where Elasticsearch lives. E.g.
+                ['192.168.1.1:9200', '192.168.2.1:9200']
+
+    `ES_DEFAULT_NUM_REPLICAS`: An integer of the number of replicas.
+
+    `ES_DEFAULT_NUM_SHARDS`: An integer of the number of shards.
+
+
+TODO: Handle page removal case in Page.
+
+"""
+import datetime
+
+from elasticsearch import Elasticsearch, exceptions
+from elasticsearch.helpers import bulk_index
+
+from django.conf import settings
+
+
+class Index(object):
+    """
+    Base class to define some common methods across indexes.
+
+    Subclasses set `_type` and implement `get_mapping` and
+    `extract_document`.
+    """
+    # The _index and _type define the URL path to Elasticsearch, e.g.:
+    #   http://localhost:9200/{_index}/{_type}/_search
+    _index = 'readthedocs'
+    _type = None
+
+    def __init__(self):
+        # The client is built from the `ES_HOSTS` Django setting.
+        self.es = Elasticsearch(settings.ES_HOSTS)
+
+    def get_settings(self, settings_override=None):
+        """
+        Returns settings to be passed to ES create_index.
+
+        If `settings_override` is provided, this will use `settings_override`
+        to override the defaults defined here.
+
+        """
+        default_settings = {
+            'number_of_replicas': settings.ES_DEFAULT_NUM_REPLICAS,
+            'number_of_shards': settings.ES_DEFAULT_NUM_SHARDS,
+            'refresh_interval': '5s',
+            'store.compress.tv': True,
+            'store.compress.stored': True,
+            'analysis': self.get_analysis(),
+        }
+        if settings_override:
+            default_settings.update(settings_override)
+
+        return default_settings
+
+    def get_analysis(self):
+        """
+        Returns the analysis dict to be used in settings for create_index.
+
+        For languages that ES supports we define either the minimal or light
+        stemming, which isn't as aggressive as the snowball stemmer. We also
+        define the stopwords for that language.
+
+        For all languages we've customized we're using the ICU plugin.
+
+        """
+        analyzers = {}
+        filters = {}
+
+        # The default is used for fields that need ICU but are composed of
+        # many languages.
+        # NOTE(review): this references the stock 'word_delimiter' filter,
+        # not the 'custom_word_delimiter' defined below -- confirm which one
+        # is intended before changing the analysis chain.
+        analyzers['default_icu'] = {
+            'type': 'custom',
+            'tokenizer': 'icu_tokenizer',
+            'filter': ['word_delimiter', 'icu_folding', 'icu_normalizer'],
+        }
+
+        # Customize the word_delimiter filter to set various options.
+        filters['custom_word_delimiter'] = {
+            'type': 'word_delimiter',
+            'preserve_original': True,
+        }
+
+        return {
+            'analyzer': analyzers,
+            'filter': filters,
+        }
+
+    def timestamped_index(self):
+        """
+        Returns `_index` suffixed with the current local time
+        (YYYYmmddHHMMSS), e.g. 'readthedocs-20131027120000'.
+        """
+        return '{0}-{1}'.format(
+            self._index, datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
+
+    def create_index(self, index=None):
+        """
+        Creates index.
+
+        This uses `get_settings` to define the index. Mappings are not sent
+        here; apply them separately with `put_mapping`.
+
+        """
+        index = index or self._index
+        body = {
+            'settings': self.get_settings(),
+        }
+        self.es.indices.create(index=index, body=body)
+
+    def put_mapping(self, index=None):
+        """
+        Sends the mapping from `get_mapping` for `_type` to `index`
+        (defaults to `_index`).
+        """
+        index = index or self._index
+        self.es.indices.put_mapping(index, self._type, self.get_mapping())
+
+    def bulk_index(self, data, index=None, chunk_size=500, parent=None):
+        """
+        Given a list of documents, uses Elasticsearch bulk indexing.
+
+        For each doc this calls `extract_document`, then indexes.
+
+        `chunk_size` defaults to the elasticsearch lib's default. Override per
+        your document size as needed.
+
+        If `parent` is given it is set as `_parent` on every document.
+
+        """
+        index = index or self._index
+        docs = []
+        for d in data:
+            source = self.extract_document(d)
+            doc = {
+                '_index': index,
+                '_type': self._type,
+                '_id': source['id'],
+                '_source': source,
+            }
+            if parent:
+                doc['_parent'] = parent
+            docs.append(doc)
+
+        bulk_index(self.es, docs, chunk_size=chunk_size)
+
+    def index_document(self, data, index=None, parent=None):
+        """
+        Indexes a single document, using the extracted document's `id` as
+        the Elasticsearch id.
+        """
+        index = index or self._index
+        doc = self.extract_document(data)
+        self.es.index(index=index, doc_type=self._type, body=doc, id=doc['id'],
+                      parent=parent)
+
+    def get_mapping(self):
+        """
+        Returns the mapping for this _index and _type.
+
+        Must be implemented by subclasses.
+        """
+        # `NotImplemented` is a comparison sentinel, not an exception;
+        # raising it fails with a TypeError instead of the intended error.
+        raise NotImplementedError
+
+    def extract_document(self, data):
+        """
+        Extracts the Elasticsearch document for this object instance.
+
+        Must be implemented by subclasses. The signature matches how
+        `bulk_index` and `index_document` call it: one `data` argument.
+        """
+        raise NotImplementedError
+
+    def update_aliases(self, new_index, delete=True):
+        """
+        Points `_index` to `new_index` and deletes `_index` if delete=True.
+
+        The ES `update_aliases` is atomic.
+        """
+        old_index = None
+
+        # Get current alias, if any.
+        try:
+            aliases = self.es.indices.get_alias(name=self._index)
+        except exceptions.NotFoundError:
+            aliases = None
+        if aliases:
+            # Take the first index name without indexing into `keys()`,
+            # which is not subscriptable when it is a view.
+            old_index = next(iter(aliases))
+
+        actions = []
+        if old_index:
+            actions.append({'remove': {'index': old_index,
+                                       'alias': self._index}})
+        actions.append({'add': {'index': new_index, 'alias': self._index}})
+
+        self.es.indices.update_aliases(body={'actions': actions})
+
+        # Delete old index if any and if specified. Never delete the index
+        # the alias was just pointed at.
+        if delete and old_index and old_index != new_index:
+            self.es.indices.delete(index=old_index)
+
+    def search(self, body, **kwargs):
+        """
+        Runs `body` as a search against `_index`/`_type` and returns the
+        client's response. Extra keyword arguments are passed through.
+        """
+        return self.es.search(index=self._index, doc_type=self._type,
+                              body=body, **kwargs)
+
+
+class Project(Index):
+    """
+    Index definition for the `project` document type.
+    """
+
+    _type = 'project'
+
+    def get_mapping(self):
+        """
+        Returns the mapping for the `project` type.
+        """
+        mapping = {
+            self._type: {
+                # Disable _all field to reduce index size.
+                '_all': {'enabled': False},
+                # Add a boost field to enhance relevancy of a document.
+                '_boost': {'name': '_boost', 'null_value': 1.0},
+                'properties': {
+                    'id': {'type': 'long'},
+                    'name': {'type': 'string', 'analyzer': 'default_icu'},
+                    'slug': {'type': 'string', 'index': 'not_analyzed'},
+                    'description': {'type': 'string',
+                                    'analyzer': 'default_icu'},
+                    'lang': {'type': 'string', 'index': 'not_analyzed'},
+                    'author': {'type': 'string', 'analyzer': 'default_icu'},
+                    'url': {'type': 'string', 'index': 'not_analyzed'},
+                }
+            }
+        }
+
+        return mapping
+
+    def extract_document(self, data):
+        """
+        Builds the document to index from the dict-like `data`.
+
+        Each mapped field is read with `data.get`, defaulting to '' when
+        missing; `_boost` defaults to 1.0.
+        """
+        # Copy every field declared in the mapping. `slug` and `lang` were
+        # mapped but never extracted, so they were never indexed.
+        attrs = ('id', 'name', 'slug', 'description', 'lang', 'author', 'url')
+        doc = dict((attr, data.get(attr, '')) for attr in attrs)
+
+        # Add project boost.
+        doc['_boost'] = data.get('_boost', 1.0)
+
+        return doc
+
+
+class Page(Index):
+    """
+    Index definition for the `page` document type, a child of `project`.
+    """
+
+    _type = 'page'
+    _parent = 'project'
+
+    def get_mapping(self):
+        """
+        Returns the mapping for the `page` type.
+        """
+        properties = {
+            'id': {'type': 'string', 'index': 'not_analyzed'},
+            'project': {'type': 'long'},
+            'title': {'type': 'string', 'analyzer': 'default_icu'},
+            'headers': {'type': 'string', 'analyzer': 'default_icu'},
+            'version': {'type': 'string', 'index': 'not_analyzed'},
+            'path': {'type': 'string', 'index': 'not_analyzed'},
+            'content': {'type': 'string', 'analyzer': 'default_icu'},
+        }
+
+        return {
+            self._type: {
+                # Disable _all field to reduce index size.
+                '_all': {'enabled': False},
+                # Add a boost field to enhance relevancy of a document.
+                '_boost': {'name': '_boost', 'null_value': 1.0},
+                # Associate a page with a project.
+                '_parent': {'type': self._parent},
+                'properties': properties,
+            }
+        }
+
+    def extract_document(self, data):
+        """
+        Builds the document to index from the dict-like `data`.
+
+        Missing fields default to ''; `_boost` defaults to 1.0.
+        """
+        field_names = ('id', 'project', 'title', 'headers', 'version', 'path',
+                       'content')
+        document = dict((name, data.get(name, '')) for name in field_names)
+
+        # Add page boost.
+        document['_boost'] = data.get('_boost', 1.0)
+
+        return document
diff --git a/readthedocs/settings/base.py b/readthedocs/settings/base.py
@@ -173,6 +173,10 @@
     },
 }
 
+# Elasticsearch settings.
+# Hosts ('host:port') handed to the Elasticsearch client in search.indexes.
+ES_HOSTS = ['127.0.0.1:9200']
+# Replica and shard counts used in the index settings built by
+# search.indexes.Index.get_settings.
+ES_DEFAULT_NUM_REPLICAS = 0
+ES_DEFAULT_NUM_SHARDS = 5
 
 AUTH_PROFILE_MODULE = "core.UserProfile"
 SOUTH_TESTS_MIGRATE = False

diff --git a/readthedocs/urls.py b/readthedocs/urls.py
@@ -103,6 +103,7 @@
     url(r'^api/v2/', include(router.urls)),
     url(r'^api/v2/footer_html/$', 'restapi.views.footer_html', name='footer_html'),
     url(r'^api/v2/quick_search/$', 'restapi.views.quick_search', name='quick_search'),
+    url(r'^api/v2/search/$', 'restapi.views.search', name='search'),
     url(r'^api-auth/', include('rest_framework.urls', namespace='rest_framework')),
     url(r'^feeds/new/$',
         NewProjectsFeed(),