Merge pull request #5086 from rtfd/humitos/custom-robots-txt
Support custom robots.txt
humitos authored Jan 16, 2019
2 parents 1f8443c + 3e4b1a4 commit f06271b
Showing 4 changed files with 161 additions and 1 deletion.
46 changes: 46 additions & 0 deletions docs/faq.rst
@@ -230,3 +230,49 @@ What commit of Read the Docs is in production?
----------------------------------------------

We deploy readthedocs.org from the `rel` branch in our GitHub repository. You can see the latest commits that have been deployed by looking on GitHub: https://github.com/rtfd/readthedocs.org/commits/rel


How can I avoid search results having a deprecated version of my docs?
---------------------------------------------------------------------

If readers search for something related to your docs in Google, the results will probably point at the most relevant version of your documentation.
That version may already be deprecated, and you may want to stop Google from indexing it
and have it suggest the latest (or a newer) version instead.

To accomplish this, you can add a ``robots.txt`` file to the root of your documentation so that it is served at the root URL of your project
(for example, https://yourproject.readthedocs.io/robots.txt).


Minimal example of ``robots.txt``
+++++++++++++++++++++++++++++++++

::

    User-agent: *
    Disallow: /en/deprecated-version/
    Disallow: /en/2.0/

.. note::

    See `Google's docs`_ for its full syntax.

This file has to be served as-is at ``/robots.txt``.
Depending on whether you are using Sphinx or MkDocs, you will need a different configuration for this.


Sphinx
~~~~~~

Sphinx uses the `html_extra_path`_ option to add static files to the output.
You need to create a ``robots.txt`` file and put it in the path defined in ``html_extra_path``.
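
For example, a minimal ``conf.py`` sketch, assuming a hypothetical ``_extra`` directory next to ``conf.py`` that holds the file::

    # conf.py
    # Everything inside ``_extra/`` (for example ``_extra/robots.txt``) is
    # copied verbatim to the root of the built HTML output.
    html_extra_path = ['_extra']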


MkDocs
~~~~~~

MkDocs needs the ``robots.txt`` file to be placed inside the directory defined by the `docs_dir`_ configuration option.
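
For example, assuming the default ``docs_dir`` value of ``docs``, a layout like this would serve the file at the root of the built site::

    mkdocs.yml
    docs/
        index.md
        robots.txt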


.. _Google's docs: https://support.google.com/webmasters/answer/6062608
.. _html_extra_path: https://www.sphinx-doc.org/en/master/usage/configuration.html#confval-html_extra_path
.. _docs_dir: https://www.mkdocs.org/user-guide/configuration/#docs_dir
6 changes: 5 additions & 1 deletion readthedocs/core/urls/subdomain.py
@@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-

"""URL configurations for subdomains."""
from __future__ import absolute_import

@@ -10,7 +12,7 @@

from readthedocs.core.views.serve import (
    redirect_page_with_filename,
    redirect_project_slug, serve_docs
    redirect_project_slug, serve_docs, robots_txt,
)
from readthedocs.core.views import (
    server_error_500,
@@ -22,6 +24,8 @@
handler404 = server_error_404

subdomain_urls = [
    url(r'robots\.txt$', robots_txt, name='robots_txt'),

    url(r'^(?:|projects/(?P<subproject_slug>{project_slug})/)'
        r'page/(?P<filename>.*)$'.format(**pattern_opts),
        redirect_page_with_filename,
46 changes: 46 additions & 0 deletions readthedocs/core/views/serve.py
@@ -223,3 +223,49 @@ def _serve_symlink_docs(request, project, privacy_level, filename=''):

    raise Http404(
        'File not found. Tried these files: %s' % ','.join(files_tried))


@map_project_slug
def robots_txt(request, project):
    """
    Serve the user's custom ``/robots.txt``.

    If the user added a ``robots.txt`` to the "default version" of the project,
    we serve it directly.
    """
    # Use the ``robots.txt`` file from the default version configured
    version_slug = project.get_default_version()
    version = project.versions.get(slug=version_slug)

    no_serve_robots_txt = any([
        # If the project is private or,
        project.privacy_level == constants.PRIVATE,
        # the default version is private or,
        version.privacy_level == constants.PRIVATE,
        # the default version is not active or,
        not version.active,
        # the default version is not built
        not version.built,
    ])
    if no_serve_robots_txt:
        # ... we return a 404
        raise Http404()

    filename = resolve_path(
        project,
        version_slug=version_slug,
        filename='robots.txt',
        subdomain=True,  # subdomain will make it a "full" path without a URL prefix
    )

    # os.path.join() would discard ``basepath`` if given an "absolute" filename,
    # so strip the leading slash
    if filename[0] == '/':
        filename = filename[1:]

    basepath = PublicSymlink(project).project_root
    fullpath = os.path.join(basepath, filename)

    if os.path.exists(fullpath):
        with open(fullpath) as robots_txt_file:
            return HttpResponse(
                robots_txt_file.read(),
                content_type='text/plain',
            )

    return HttpResponse('User-agent: *\nAllow: /\n', content_type='text/plain')
64 changes: 64 additions & 0 deletions readthedocs/rtd_tests/tests/test_doc_serving.py
@@ -2,13 +2,17 @@

from __future__ import absolute_import, unicode_literals, division, print_function
import mock
from mock import patch, mock_open
import django_dynamic_fixture as fixture
import pytest
import six

from django.contrib.auth.models import User
from django.test import TestCase
from django.test.utils import override_settings
from django.http import Http404
from django.conf import settings
from django.urls import reverse

from readthedocs.rtd_tests.base import RequestFactoryTestMixin
from readthedocs.projects import constants
@@ -77,6 +81,28 @@ def test_private_files_not_found(self):
        self.assertTrue('private_web_root' in str(exc.exception))
        self.assertTrue('public_web_root' not in str(exc.exception))

    @override_settings(
        PYTHON_MEDIA=False,
        USE_SUBDOMAIN=True,
        PUBLIC_DOMAIN='readthedocs.io',
        ROOT_URLCONF=settings.SUBDOMAIN_URLCONF,
    )
    def test_robots_txt(self):
        self.public.versions.update(active=True, built=True)
        response = self.client.get(
            reverse('robots_txt'),
            HTTP_HOST='private.readthedocs.io',
        )
        self.assertEqual(response.status_code, 404)

        self.client.force_login(self.eric)
        response = self.client.get(
            reverse('robots_txt'),
            HTTP_HOST='private.readthedocs.io',
        )
        # Private projects/versions always return 404 for robots.txt
        self.assertEqual(response.status_code, 404)


@override_settings(SERVE_DOCS=[constants.PRIVATE, constants.PUBLIC])
class TestPublicDocs(BaseDocServing):
@@ -110,3 +136,41 @@ def test_both_files_not_found(self):
            _serve_symlink_docs(request, project=self.private, filename='/en/latest/usage.html', privacy_level='public')
        self.assertTrue('private_web_root' not in str(exc.exception))
        self.assertTrue('public_web_root' in str(exc.exception))

    @override_settings(
        PYTHON_MEDIA=False,
        USE_SUBDOMAIN=True,
        PUBLIC_DOMAIN='readthedocs.io',
        ROOT_URLCONF=settings.SUBDOMAIN_URLCONF,
    )
    def test_default_robots_txt(self):
        self.public.versions.update(active=True, built=True)
        response = self.client.get(
            reverse('robots_txt'),
            HTTP_HOST='public.readthedocs.io',
        )
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.content, b'User-agent: *\nAllow: /\n')

    @override_settings(
        PYTHON_MEDIA=False,
        USE_SUBDOMAIN=True,
        PUBLIC_DOMAIN='readthedocs.io',
        ROOT_URLCONF=settings.SUBDOMAIN_URLCONF,
    )
    @patch(
        'builtins.open',
        new_callable=mock_open,
        read_data='My own robots.txt',
    )
    @patch('readthedocs.core.views.serve.os')
    @pytest.mark.skipif(six.PY2, reason='In Python2 the mock is __builtins__.open')
    def test_custom_robots_txt(self, os_mock, open_mock):
        os_mock.path.exists.return_value = True
        self.public.versions.update(active=True, built=True)
        response = self.client.get(
            reverse('robots_txt'),
            HTTP_HOST='public.readthedocs.io',
        )
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.content, b'My own robots.txt')
