diff --git a/pip_requirements.txt b/pip_requirements.txt
index 98da547a6b1..5de7610e149 100644
--- a/pip_requirements.txt
+++ b/pip_requirements.txt
@@ -14,6 +14,7 @@ django-profiles==0.2
 django-secure==0.1.2
 django==1.4.8
 docutils==0.8.1
+elasticsearch==0.4.3
 github2==0.5.2
 httplib2==0.7.2
 mercurial==2.6.3
diff --git a/readthedocs/restapi/views.py b/readthedocs/restapi/views.py
index d44397d1a9e..a2d7267f839 100644
--- a/readthedocs/restapi/views.py
+++ b/readthedocs/restapi/views.py
@@ -3,15 +3,14 @@
 from django.conf import settings
 from distlib.version import UnsupportedVersionError
-from rest_framework import decorators
-from rest_framework import permissions
-from rest_framework import viewsets
+from rest_framework import decorators, permissions, viewsets, status
 from rest_framework.renderers import JSONPRenderer, JSONRenderer, BrowsableAPIRenderer
 from rest_framework.response import Response
 
 from betterversion.better import version_windows, BetterVersion
 from builds.models import Version
 from projects.models import Project, EmailHook
+from search.indexes import Page as PageIndex, Project as ProjectIndex
 from djangome import views as djangome
 
 from .serializers import ProjectSerializer
@@ -29,7 +28,7 @@ def valid_versions(self, request, **kwargs):
         """
         project = get_object_or_404(Project, pk=kwargs['pk'])
         if not project.num_major or not project.num_minor or not project.num_point:
-            return Response({'error': 'Project does not support point version control.'})
+            return Response({'error': 'Project does not support point version control'}, status=status.HTTP_400_BAD_REQUEST)
         versions = []
         for ver in project.versions.all():
             try:
@@ -177,3 +176,52 @@ def quick_search(request):
         value = ':'.join(data.split(':')[6:])
         ret_dict[key] = value
     return Response({"results": ret_dict})
+
+@decorators.api_view(['GET'])
+@decorators.permission_classes((permissions.AllowAny,))
+@decorators.renderer_classes((JSONRenderer, JSONPRenderer, BrowsableAPIRenderer))
+def search(request):
+    project_id = request.GET.get('project', None)
+    version_slug = request.GET.get('version', 'latest')
+    query = request.GET.get('q', None)
+
+    if not query:
+        return Response({'error': 'Search term required. Use the "q" GET arg to search.'},
+                        status=status.HTTP_400_BAD_REQUEST)
+
+    if project_id:
+        # This is a search within a project -- do a Page search.
+        body = {
+            'filter': {
+                # 'and' combines both term filters; duplicate dict keys would drop one.
+                'and': [
+                    {'term': {'project': project_id}},
+                    {'term': {'version': version_slug}},
+                ]
+            },
+            'query': {
+                'bool': {
+                    'should': [
+                        {'match': {'title': {'query': query, 'boost': 10}}},
+                        {'match': {'headers': {'query': query, 'boost': 5}}},
+                        {'match': {'content': {'query': query}}},
+                    ]
+                }
+            }
+        }
+        results = PageIndex().search(body, routing=project_id)
+
+    else:
+        body = {
+            'query': {
+                'bool': {
+                    'should': [
+                        {'match': {'name': {'query': query, 'boost': 10}}},
+                        {'match': {'description': {'query': query}}},
+                    ]
+                }
+            }
+        }
+        results = ProjectIndex().search(body)
+
+    return Response({'results': results})
diff --git a/readthedocs/search/__init__.py b/readthedocs/search/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
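
For context, a rough sketch of how a client might call the new endpoint once it is routed (see the urls.py hunk below). The host, project id, and search term here are made up; the code is Python 2 to match the codebase:

    import json
    import urllib2

    # Project-scoped search hits the Page index; drop the `project`
    # param to search across projects instead.
    url = ('http://localhost:8000/api/v2/search/'
           '?q=install&project=42&version=latest')
    results = json.loads(urllib2.urlopen(url).read())

    # The view returns the raw Elasticsearch response, so hits live
    # under results['results']['hits']['hits'].
    for hit in results['results']['hits']['hits']:
        print(hit['_source']['title'])
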
+ +""" +import datetime + +from elasticsearch import Elasticsearch, exceptions +from elasticsearch.helpers import bulk_index + +from django.conf import settings + + +class Index(object): + """ + Base class to define some common methods across indexes. + """ + # The _index and _type define the URL path to Elasticsearch, e.g.: + # http://localhost:9200/{_index}/{_type}/_search + _index = 'readthedocs' + _type = None + + def __init__(self): + self.es = Elasticsearch(settings.ES_HOSTS) + + def get_settings(self, settings_override=None): + """ + Returns settings to be passed to ES create_index. + + If `settings_override` is provided, this will use `settings_override` + to override the defaults defined here. + + """ + default_settings = { + 'number_of_replicas': settings.ES_DEFAULT_NUM_REPLICAS, + 'number_of_shards': settings.ES_DEFAULT_NUM_SHARDS, + 'refresh_interval': '5s', + 'store.compress.tv': True, + 'store.compress.stored': True, + 'analysis': self.get_analysis(), + } + if settings_override: + default_settings.update(settings_override) + + return default_settings + + def get_analysis(self): + """ + Returns the analysis dict to be used in settings for create_index. + + For languages that ES supports we define either the minimal or light + stemming, which isn't as aggresive as the snowball stemmer. We also + define the stopwords for that language. + + For all languages we've customized we're using the ICU plugin. + + """ + analyzers = {} + filters = {} + + # The default is used for fields that need ICU but are composed of + # many languages. + analyzers['default_icu'] = { + 'type': 'custom', + 'tokenizer': 'icu_tokenizer', + 'filter': ['word_delimiter', 'icu_folding', 'icu_normalizer'], + } + + # Customize the word_delimiter filter to set various options. + filters['custom_word_delimiter'] = { + 'type': 'word_delimiter', + 'preserve_original': True, + } + + return { + 'analyzer': analyzers, + 'filter': filters, + } + + def timestamped_index(self): + return '{0}-{1}'.format( + self._index, datetime.datetime.now().strftime('%Y%m%d%H%M%S')) + + def create_index(self, index=None): + """ + Creates index. + + This uses `get_settings` and `get_mappings` to define the index. + + """ + index = index or self._index + body = { + 'settings': self.get_settings(), + } + self.es.indices.create(index=index, body=body) + + def put_mapping(self, index=None): + index = index or self._index + self.es.indices.put_mapping(index, self._type, self.get_mapping()) + + def bulk_index(self, data, index=None, chunk_size=500, parent=None): + """ + Given a list of documents, uses Elasticsearch bulk indexing. + + For each doc this calls `extract_document`, then indexes. + + `chunk_size` defaults to the elasticsearch lib's default. Override per + your document size as needed. + + """ + index = index or self._index + docs = [] + for d in data: + source = self.extract_document(d) + doc = { + '_index': index, + '_type': self._type, + '_id': source['id'], + '_source': source, + } + if parent: + doc['_parent'] = parent + docs.append(doc) + + bulk_index(self.es, docs, chunk_size=chunk_size) + + def index_document(self, data, index=None, parent=None): + index = index or self._index + doc = self.extract_document(data) + self.es.index(index=index, doc_type=self._type, body=doc, id=doc['id'], + parent=parent) + + def get_mapping(self): + """ + Returns the mapping for this _index and _type. + """ + raise NotImplemented + + def extract_document(self, pk, obj): + """ + Extracts the Elasticsearch document for this object instance. 
+ """ + raise NotImplemented + + def update_aliases(self, new_index, delete=True): + """ + Points `_index` to `new_index` and deletes `_index` if delete=True. + + The ES `update_aliases` is atomic. + """ + old_index = None + + # Get current alias, if any. + try: + aliases = self.es.indices.get_alias(name=self._index) + if aliases and aliases.keys(): + old_index = aliases.keys()[0] + except exceptions.NotFoundError: + pass + + actions = [] + if old_index: + actions.append({'remove': {'index': old_index, + 'alias': self._index}}) + actions.append({'add': {'index': new_index, 'alias': self._index}}) + + self.es.indices.update_aliases(body={'actions': actions}) + + # Delete old index if any and if specified. + if delete and old_index: + self.es.indices.delete(index=old_index) + + def search(self, body, **kwargs): + return self.es.search(index=self._index, doc_type=self._type, + body=body, **kwargs) + + +class Project(Index): + + _type = 'project' + + def get_mapping(self): + mapping = { + self._type: { + # Disable _all field to reduce index size. + '_all': {'enabled': False}, + # Add a boost field to enhance relevancy of a document. + '_boost': {'name': '_boost', 'null_value': 1.0}, + 'properties': { + 'id': {'type': 'long'}, + 'name': {'type': 'string', 'analyzer': 'default_icu'}, + 'slug': {'type': 'string', 'index': 'not_analyzed'}, + 'description': {'type': 'string', + 'analyzer': 'default_icu'}, + 'lang': {'type': 'string', 'index': 'not_analyzed'}, + 'author': {'type': 'string', 'analyzer': 'default_icu'}, + 'url': {'type': 'string', 'index': 'not_analyzed'}, + } + } + } + + return mapping + + def extract_document(self, data): + doc = {} + + attrs = ('id', 'name', 'description', 'author', 'url') + for attr in attrs: + doc[attr] = data.get(attr, '') + + # Add project boost. + doc['_boost'] = data.get('_boost', 1.0) + + return doc + + +class Page(Index): + + _type = 'page' + _parent = 'project' + + def get_mapping(self): + mapping = { + self._type: { + # Disable _all field to reduce index size. + '_all': {'enabled': False}, + # Add a boost field to enhance relevancy of a document. + '_boost': {'name': '_boost', 'null_value': 1.0}, + # Associate a page with a project. + '_parent': {'type': self._parent}, + 'properties': { + 'id': {'type': 'string', 'index': 'not_analyzed'}, + 'project': {'type': 'long'}, + 'title': {'type': 'string', 'analyzer': 'default_icu'}, + 'headers': {'type': 'string', 'analyzer': 'default_icu'}, + 'version': {'type': 'string', 'index': 'not_analyzed'}, + 'path': {'type': 'string', 'index': 'not_analyzed'}, + 'content': {'type': 'string', 'analyzer': 'default_icu'}, + } + } + } + + return mapping + + def extract_document(self, data): + doc = {} + + attrs = ('id', 'project', 'title', 'headers', 'version', 'path', + 'content') + for attr in attrs: + doc[attr] = data.get(attr, '') + + # Add page boost. + doc['_boost'] = data.get('_boost', 1.0) + + return doc diff --git a/readthedocs/settings/base.py b/readthedocs/settings/base.py index 1db6d179791..1bd63c0e4d4 100644 --- a/readthedocs/settings/base.py +++ b/readthedocs/settings/base.py @@ -173,6 +173,10 @@ }, } +# Elasticsearch settings. 
diff --git a/readthedocs/settings/base.py b/readthedocs/settings/base.py
index 1db6d179791..1bd63c0e4d4 100644
--- a/readthedocs/settings/base.py
+++ b/readthedocs/settings/base.py
@@ -173,6 +173,10 @@
     },
 }
 
+# Elasticsearch settings.
+ES_HOSTS = ['127.0.0.1:9200']
+ES_DEFAULT_NUM_REPLICAS = 0
+ES_DEFAULT_NUM_SHARDS = 5
 AUTH_PROFILE_MODULE = "core.UserProfile"
 SOUTH_TESTS_MIGRATE = False
diff --git a/readthedocs/urls.py b/readthedocs/urls.py
index 1f9f22d4ae6..434d7f13523 100644
--- a/readthedocs/urls.py
+++ b/readthedocs/urls.py
@@ -103,6 +103,7 @@
     url(r'^api/v2/', include(router.urls)),
     url(r'^api/v2/footer_html/$', 'restapi.views.footer_html', name='footer_html'),
     url(r'^api/v2/quick_search/$', 'restapi.views.quick_search', name='quick_search'),
+    url(r'^api/v2/search/$', 'restapi.views.search', name='search'),
    url(r'^api-auth/', include('rest_framework.urls', namespace='rest_framework')),
    url(r'^feeds/new/$', NewProjectsFeed(),
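
A deployment note on the settings hunk above: `ES_DEFAULT_NUM_REPLICAS = 0` keeps a single-node dev cluster green (replicas can never be assigned on one node), and `ES_HOSTS` points at localhost. A production override might look like the following, via whatever settings-override mechanism the deployment already uses; the hostnames and values are placeholders:

    ES_HOSTS = ['es1.example.com:9200', 'es2.example.com:9200']
    ES_DEFAULT_NUM_REPLICAS = 1
    ES_DEFAULT_NUM_SHARDS = 5
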