Merge pull request #5086 from rtfd/humitos/custom-robots-txt
Support custom robots.txt
humitos authored Jan 16, 2019
2 parents 1f8443c + 3e4b1a4 commit f06271b
Showing 4 changed files with 161 additions and 1 deletion.
46 changes: 46 additions & 0 deletions docs/faq.rst
@@ -230,3 +230,49 @@ What commit of Read the Docs is in production?
----------------------------------------------

We deploy readthedocs.org from the `rel` branch in our GitHub repository. You can see the latest commits that have been deployed by looking on GitHub: https://github.com/rtfd/readthedocs.org/commits/rel


How can I avoid search results having a deprecated version of my docs?
---------------------------------------------------------------------

If readers search for something related to your docs in Google, the results will probably point at the most relevant version of your documentation.
That version may already be deprecated, and you may want to stop Google from indexing it
and have it suggest the latest (or a newer) version instead.

To accomplish this, you can add a ``robots.txt`` file to the root of your documentation so that it is served at the root URL of your project
(for example, https://yourproject.readthedocs.io/robots.txt).


Minimal example of ``robots.txt``
+++++++++++++++++++++++++++++++++

::

    User-agent: *
    Disallow: /en/deprecated-version/
    Disallow: /en/2.0/

.. note::

    See `Google's docs`_ for its full syntax.

This file has to be served as-is at ``/robots.txt``.
Depending on whether you are using Sphinx or MkDocs, you will need a different configuration for this.


Sphinx
~~~~~~

Sphinx uses the `html_extra_path`_ option to add static files to the output.
You need to create a ``robots.txt`` file and put it in the path defined in ``html_extra_path``.
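
For example, a minimal ``conf.py`` sketch, assuming a hypothetical ``_extra`` directory next to ``conf.py`` that holds the file::

    # conf.py
    # Everything inside ``_extra/`` (for example ``_extra/robots.txt``) is
    # copied verbatim to the root of the built HTML output.
    html_extra_path = ['_extra']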


MkDocs
~~~~~~

MkDocs needs the ``robots.txt`` file to be placed inside the directory defined by the `docs_dir`_ configuration option.
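
For example, assuming the default ``docs_dir`` value of ``docs``, a layout like this would serve the file at the root of the built site::

    mkdocs.yml
    docs/
        index.md
        robots.txt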


.. _Google's docs: https://support.google.com/webmasters/answer/6062608
.. _html_extra_path: https://www.sphinx-doc.org/en/master/usage/configuration.html#confval-html_extra_path
.. _docs_dir: https://www.mkdocs.org/user-guide/configuration/#docs_dir
6 changes: 5 additions & 1 deletion readthedocs/core/urls/subdomain.py
@@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-

"""URL configurations for subdomains."""
from __future__ import absolute_import

@@ -10,7 +12,7 @@

from readthedocs.core.views.serve import (
    redirect_page_with_filename,
    redirect_project_slug, serve_docs
    redirect_project_slug, serve_docs, robots_txt,
)
from readthedocs.core.views import (
    server_error_500,
@@ -22,6 +24,8 @@
handler404 = server_error_404

subdomain_urls = [
    url(r'robots\.txt$', robots_txt, name='robots_txt'),

    url(r'^(?:|projects/(?P<subproject_slug>{project_slug})/)'
        r'page/(?P<filename>.*)$'.format(**pattern_opts),
        redirect_page_with_filename,
46 changes: 46 additions & 0 deletions readthedocs/core/views/serve.py
@@ -223,3 +223,49 @@ def _serve_symlink_docs(request, project, privacy_level, filename=''):

    raise Http404(
        'File not found. Tried these files: %s' % ','.join(files_tried))


@map_project_slug
def robots_txt(request, project):
    """
    Serve the user's custom ``/robots.txt``.

    If the user added a ``robots.txt`` to the "default version" of the project,
    we serve it directly.
    """
    # Use the ``robots.txt`` file from the default version configured
    version_slug = project.get_default_version()
    version = project.versions.get(slug=version_slug)

    no_serve_robots_txt = any([
        # If the project is private or,
        project.privacy_level == constants.PRIVATE,
        # the default version is private or,
        version.privacy_level == constants.PRIVATE,
        # the default version is not active or,
        not version.active,
        # the default version is not built
        not version.built,
    ])
    if no_serve_robots_txt:
        # ... we return a 404
        raise Http404()

    filename = resolve_path(
        project,
        version_slug=version_slug,
        filename='robots.txt',
        subdomain=True,  # subdomain will make it a "full" path without a URL prefix
    )

    # os.path.join() would discard ``basepath`` if given an "absolute" filename,
    # so strip the leading slash
    if filename[0] == '/':
        filename = filename[1:]

    basepath = PublicSymlink(project).project_root
    fullpath = os.path.join(basepath, filename)

    if os.path.exists(fullpath):
        with open(fullpath) as robots_txt_file:
            return HttpResponse(
                robots_txt_file.read(),
                content_type='text/plain',
            )

    return HttpResponse('User-agent: *\nAllow: /\n', content_type='text/plain')
64 changes: 64 additions & 0 deletions readthedocs/rtd_tests/tests/test_doc_serving.py
@@ -2,13 +2,17 @@

from __future__ import absolute_import, unicode_literals, division, print_function
import mock
from mock import patch, mock_open
import django_dynamic_fixture as fixture
import pytest
import six

from django.contrib.auth.models import User
from django.test import TestCase
from django.test.utils import override_settings
from django.http import Http404
from django.conf import settings
from django.urls import reverse

from readthedocs.rtd_tests.base import RequestFactoryTestMixin
from readthedocs.projects import constants
@@ -77,6 +81,28 @@ def test_private_files_not_found(self):
        self.assertTrue('private_web_root' in str(exc.exception))
        self.assertTrue('public_web_root' not in str(exc.exception))

    @override_settings(
        PYTHON_MEDIA=False,
        USE_SUBDOMAIN=True,
        PUBLIC_DOMAIN='readthedocs.io',
        ROOT_URLCONF=settings.SUBDOMAIN_URLCONF,
    )
    def test_robots_txt(self):
        self.public.versions.update(active=True, built=True)
        response = self.client.get(
            reverse('robots_txt'),
            HTTP_HOST='private.readthedocs.io',
        )
        self.assertEqual(response.status_code, 404)

        self.client.force_login(self.eric)
        response = self.client.get(
            reverse('robots_txt'),
            HTTP_HOST='private.readthedocs.io',
        )
        # Private projects/versions always return 404 for robots.txt
        self.assertEqual(response.status_code, 404)


@override_settings(SERVE_DOCS=[constants.PRIVATE, constants.PUBLIC])
class TestPublicDocs(BaseDocServing):
@@ -110,3 +136,41 @@ def test_both_files_not_found(self):
            _serve_symlink_docs(request, project=self.private, filename='/en/latest/usage.html', privacy_level='public')
        self.assertTrue('private_web_root' not in str(exc.exception))
        self.assertTrue('public_web_root' in str(exc.exception))

    @override_settings(
        PYTHON_MEDIA=False,
        USE_SUBDOMAIN=True,
        PUBLIC_DOMAIN='readthedocs.io',
        ROOT_URLCONF=settings.SUBDOMAIN_URLCONF,
    )
    def test_default_robots_txt(self):
        self.public.versions.update(active=True, built=True)
        response = self.client.get(
            reverse('robots_txt'),
            HTTP_HOST='public.readthedocs.io',
        )
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.content, b'User-agent: *\nAllow: /\n')

    @override_settings(
        PYTHON_MEDIA=False,
        USE_SUBDOMAIN=True,
        PUBLIC_DOMAIN='readthedocs.io',
        ROOT_URLCONF=settings.SUBDOMAIN_URLCONF,
    )
    @patch(
        'builtins.open',
        new_callable=mock_open,
        read_data='My own robots.txt',
    )
    @patch('readthedocs.core.views.serve.os')
    @pytest.mark.skipif(six.PY2, reason='In Python2 the mock is __builtins__.open')
    def test_custom_robots_txt(self, os_mock, open_mock):
        os_mock.path.exists.return_value = True
        self.public.versions.update(active=True, built=True)
        response = self.client.get(
            reverse('robots_txt'),
            HTTP_HOST='public.readthedocs.io',
        )
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.content, b'My own robots.txt')
