diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index 6bd96da716..b0e5094ac0 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -5,9 +5,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Blei's LDA-C format. -""" +"""Corpus in Blei's LDA-C format.""" from __future__ import with_statement @@ -19,30 +17,44 @@ from six.moves import xrange -logger = logging.getLogger('gensim.corpora.bleicorpus') +logger = logging.getLogger(__name__) class BleiCorpus(IndexedCorpus): - """ - Corpus in Blei's LDA-C format. + """Corpus in Blei's LDA-C format. The corpus is represented as two files: one describing the documents, and another describing the mapping between words and their ids. Each document is one line:: - N fieldId1:fieldValue1 fieldId2:fieldValue2 ... fieldIdN:fieldValueN + N fieldId1:fieldValue1 fieldId2:fieldValue2 ... fieldIdN:fieldValueN + + + The vocabulary is a file with words, one word per line; word at line K has an implicit `id=K`. - The vocabulary is a file with words, one word per line; word at line K has an - implicit ``id=K``. """ def __init__(self, fname, fname_vocab=None): """ - Initialize the corpus from a file. - `fname_vocab` is the file with vocabulary; if not specified, it defaults to - `fname.vocab`. + Parameters + ---------- + fname : str + Path to corpus. + fname_vocab : str, optional + Vocabulary file. If `fname_vocab` is None, the following locations are tried, in order: + + * `fname`.vocab + * `fname`/vocab.txt + * `fname_without_ext`.vocab + * `fname_folder`/vocab.txt + + Raises + ------ + IOError + If the vocabulary file doesn't exist. + """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s", fname) @@ -67,8 +79,13 @@ def __init__(self, fname, fname_vocab=None): self.id2word = dict(enumerate(words)) def __iter__(self): - """ - Iterate over the corpus, returning one sparse vector at a time. + """Iterate over the corpus, returning one sparse (BoW) vector at a time. + + Yields + ------ + list of (int, float) + Document's BoW representation. + """ lineno = -1 with utils.smart_open(self.fname) as fin: @@ -77,6 +94,19 @@ def __iter__(self): self.length = lineno + 1 def line2doc(self, line): + """Convert a line in Blei's LDA-C format to a document (BoW representation). + + Parameters + ---------- + line : str + Line in Blei's LDA-C format. + + Returns + ------- + list of (int, float) + Document's BoW representation. + + """ parts = utils.to_unicode(line).split() if int(parts[0]) != len(parts) - 1: raise ValueError("invalid format in %s: %s" % (self.fname, repr(line))) @@ -86,14 +116,28 @@ def line2doc(self, line): @staticmethod def save_corpus(fname, corpus, id2word=None, metadata=False): - """ - Save a corpus in the LDA-C format. - - There are actually two files saved: `fname` and `fname.vocab`, where - `fname.vocab` is the vocabulary file. + """Save a corpus in the LDA-C format. + + Notes + ----- + There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file. + + Parameters + ---------- + fname : str + Path to output file. + corpus : iterable of iterable of (int, float) + Input corpus in BoW format. + id2word : dict of (int, str), optional + Mapping id -> word for `corpus`. + metadata : bool, optional + This parameter is ignored. + + Returns + ------- + list of int + Offsets for each line in file (in bytes). - This function is automatically called by `BleiCorpus.serialize`; don't - call it directly, call `serialize` instead.
""" if id2word is None: logger.info("no word id mapping provided; initializing from corpus") @@ -121,8 +165,19 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): return offsets def docbyoffset(self, offset): - """ - Return the document stored at file position `offset`. + """Get document corresponding to `offset`. + Offset can be given from :meth:`~gensim.corpora.bleicorpus.BleiCorpus.save_corpus`. + + Parameters + ---------- + offset : int + Position of the document in the file (in bytes). + + Returns + ------- + list of (int, float) + Document in BoW format. + """ with utils.smart_open(self.fname) as f: f.seek(offset) diff --git a/gensim/corpora/csvcorpus.py b/gensim/corpora/csvcorpus.py index 969437e571..16a88a93e9 100644 --- a/gensim/corpora/csvcorpus.py +++ b/gensim/corpora/csvcorpus.py @@ -4,10 +4,7 @@ # Copyright (C) 2013 Zygmunt Zając # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Corpus in CSV format. - -""" +"""Corpus in CSV format.""" from __future__ import with_statement @@ -18,22 +15,28 @@ from gensim import interfaces, utils -logger = logging.getLogger('gensim.corpora.csvcorpus') +logger = logging.getLogger(__name__) class CsvCorpus(interfaces.CorpusABC): - """ - Corpus in CSV format. The CSV delimiter, headers etc. are guessed automatically - based on the file content. + """Corpus in CSV format. + Notes + ----- + The CSV delimiter, headers etc. are guessed automatically based on the file content. All row values are expected to be ints/floats. """ def __init__(self, fname, labels): """ - Initialize the corpus from a file. - `labels` = are class labels present in the input file? => skip the first column + + Parameters + ---------- + fname : str + Path to corpus. + labels : bool + If True - ignore first column (class labels). """ logger.info("loading corpus from %s", fname) @@ -48,8 +51,12 @@ def __init__(self, fname, labels): logger.info("sniffed CSV delimiter=%r, headers=%s", self.dialect.delimiter, self.headers) def __iter__(self): - """ - Iterate over the corpus, returning one sparse vector at a time. + """Iterate over the corpus, returning one BoW vector at a time. + + Yields + ------ + list of (int, float) + Document in BoW format. """ reader = csv.reader(utils.smart_open(self.fname), self.dialect) diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py index 559081b886..c4e58cb95a 100644 --- a/gensim/corpora/indexedcorpus.py +++ b/gensim/corpora/indexedcorpus.py @@ -5,17 +5,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Indexed corpus is a mechanism for random-accessing corpora. - -While the standard corpus interface in gensim allows iterating over corpus with -`for doc in corpus: pass`, indexed corpus allows accessing the documents with -`corpus[docno]` (in O(1) look-up time). - -This functionality is achieved by storing an extra file (by default named the same -as the corpus file plus '.index' suffix) that stores the byte offset of the beginning -of each document. -""" +"""Base Indexed Corpus class.""" import logging import six @@ -24,24 +14,44 @@ from gensim import interfaces, utils -logger = logging.getLogger('gensim.corpora.indexedcorpus') +logger = logging.getLogger(__name__) class IndexedCorpus(interfaces.CorpusABC): + """Indexed corpus is a mechanism for random-accessing corpora. + + While the standard corpus interface in gensim allows iterating over corpus, + we'll show it with :class:`~gensim.corpora.mmcorpus.MmCorpus`. 
 + + >>> from gensim.corpora import MmCorpus + >>> from gensim.test.utils import datapath + >>> + >>> corpus = MmCorpus(datapath('testcorpus.mm')) + >>> for doc in corpus: + ... pass + + :class:`~gensim.corpora.indexedcorpus.IndexedCorpus` additionally allows accessing documents by index + in :math:`O(1)` look-up time. + + >>> document_index = 3 + >>> doc = corpus[document_index] + + Notes + ----- + This functionality is achieved by storing an extra file (by default named `fname.index`) + that stores the byte offset of the beginning of each document. + + """ + def __init__(self, fname, index_fname=None): """ - Initialize this abstract base class, by loading a previously saved index - from `index_fname` (or `fname.index` if `index_fname` is not set). - This index will allow subclasses to support the `corpus[docno]` syntax - (random access to document #`docno` in O(1)). - - >>> # save corpus in SvmLightCorpus format with an index - >>> corpus = [[(1, 0.5)], [(0, 1.0), (1, 2.0)]] - >>> gensim.corpora.SvmLightCorpus.serialize('testfile.svmlight', corpus) - >>> # load back as a document stream (*not* plain Python list) - >>> corpus_with_random_access = gensim.corpora.SvmLightCorpus('tstfile.svmlight') - >>> print(corpus_with_random_access[1]) - [(0, 1.0), (1, 2.0)] + + Parameters + ---------- + fname : str + Path to corpus. + index_fname : str, optional + Path to the index file; if not provided, `fname.index` is used. """ try: @@ -58,25 +68,38 @@ def __init__(self, fname, index_fname=None): @classmethod def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False): - """ - Iterate through the document stream `corpus`, saving the documents to `fname` - and recording byte offset of each document. Save the resulting index - structure to file `index_fname` (or `fname`.index is not set). - - This relies on the underlying corpus class `serializer` providing (in - addition to standard iteration): + """Serialize corpus with offset metadata, allowing documents to be accessed by index after loading. + + Parameters + ---------- + fname : str + Path to output file. + corpus : iterable of iterable of (int, float) + Corpus in BoW format. + id2word : dict of (int, str), optional + Mapping id -> word. + index_fname : str, optional + Where to save the resulting index; if None, the index is stored to `fname`.index. + progress_cnt : int, optional + Number of documents after which progress info is printed. + labels : iterable, optional + Class labels, passed through to `save_corpus` for corpus formats that support them (e.g. SVMlight). + metadata : bool, optional + If True, write out document metadata (e.g. article titles) to an accompanying pickle file. + + Examples + -------- + >>> from gensim.corpora import MmCorpus + >>> from gensim.test.utils import get_tmpfile + >>> + >>> corpus = [[(1, 0.3), (2, 0.1)], [(1, 0.1)], [(2, 0.3)]] + >>> output_fname = get_tmpfile("test.mm") + >>> + >>> MmCorpus.serialize(output_fname, corpus) + >>> mm = MmCorpus(output_fname) # `mm` document stream now has random access + >>> print(mm[1]) # retrieve document no. 1 by its index + [(1, 0.1)] - * `save_corpus` method that returns a sequence of byte offsets, one for - each saved document, - * the `docbyoffset(offset)` method, which returns a document - positioned at `offset` bytes within the persistent storage (file). - * metadata if set to true will ensure that serialize will write out article titles to a pickle file.
- - Example: - - >>> MmCorpus.serialize('test.mm', corpus) - >>> mm = MmCorpus('test.mm') # `mm` document stream now has random access - >>> print(mm[42]) # retrieve document no. 42, etc. """ if getattr(corpus, 'fname', None) == fname: raise ValueError("identical input vs. output corpus filename, refusing to serialize: %s" % fname) @@ -107,9 +130,17 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, utils.pickle(offsets, index_fname) def __len__(self): - """ - Return the index length if the corpus is indexed. Otherwise, make a pass - over self to calculate the corpus length and cache this number. + """Get the index length. + + Notes + ----- + If the corpus is not indexed, make a pass over it to count the documents and cache this value. + + Returns + ------- + int + Length of the index (number of documents). + """ if self.index is not None: return len(self.index) @@ -119,11 +150,34 @@ def __len__(self): return self.length def __getitem__(self, docno): + """Get document by `docno` index. + + Parameters + ---------- + docno : {int, iterable of int} + Document number, or a slice/list/numpy array of document numbers. + + Returns + ------- + list of (int, float) + If `docno` is an int, the document in BoW format. + + :class:`~gensim.utils.SlicedCorpus` + If `docno` is an iterable of int, several documents in BoW format + wrapped into a :class:`~gensim.utils.SlicedCorpus`. + + Raises + ------ + RuntimeError + If the index doesn't exist. + + """ if self.index is None: raise RuntimeError("Cannot call corpus[docid] without an index") if isinstance(docno, (slice, list, numpy.ndarray)): return utils.SlicedCorpus(self, docno) elif isinstance(docno, six.integer_types + (numpy.integer,)): return self.docbyoffset(self.index[docno]) + # TODO: no `docbyoffset` method, should be defined in this class else: raise ValueError('Unrecognised value for docno, use either a single integer, a slice or a numpy.ndarray') diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py index c19aa321e2..459274cfae 100644 --- a/gensim/corpora/svmlightcorpus.py +++ b/gensim/corpora/svmlightcorpus.py @@ -5,9 +5,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Corpus in SVMlight format. -""" +"""Corpus in SVMlight format.""" from __future__ import with_statement @@ -18,17 +16,16 @@ from gensim.corpora import IndexedCorpus -logger = logging.getLogger('gensim.corpora.svmlightcorpus') +logger = logging.getLogger(__name__) class SvmLightCorpus(IndexedCorpus): - """ - Corpus in SVMlight format. + """Corpus in SVMlight format. Quoting http://svmlight.joachims.org/: - The input file contains the training examples. The first lines - may contain comments and are ignored if they start with #. Each of the following - lines represents one training example and is of the following format:: + The input file contains the training examples. The first lines may contain comments and are ignored + if they start with #. Each of the following lines represents one training example + and is of the following format:: <line> .=. <target> <feature>:<value> <feature>:<value> ... <feature>:<value> # <info> <target> .=. +1 | -1 | 0 | <float> <feature> .=. <integer> | "qid" <value> .=. <float> <info> .=. <string> The "qid" feature (used for SVMlight ranking), if present, is ignored. - Although not mentioned in the specification above, SVMlight also expect its - feature ids to be 1-based (counting starts at 1). We convert features to 0-base - internally by decrementing all ids when loading a SVMlight input file, and - increment them again when saving as SVMlight.
+ Notes + ----- + Although not mentioned in the specification above, SVMlight also expects its feature ids to be 1-based + (counting starts at 1). We convert features to 0-base internally by decrementing all ids when loading a SVMlight + input file, and increment them again when saving as SVMlight. """ def __init__(self, fname, store_labels=True): """ - Initialize the corpus from a file. - Although vector labels (~SVM target class) are not used in gensim in any way, - they are parsed and stored in `self.labels` for convenience. Set `store_labels=False` - to skip storing these labels (e.g. if there are too many vectors to store - the self.labels array in memory). + Parameters + ---------- + fname : str + Path to corpus. + store_labels : bool, optional + Whether to store labels (~SVM target class). They are not used in gensim in any way, + but are stored in `self.labels` for convenience by default. """ IndexedCorpus.__init__(self, fname) @@ -64,8 +64,13 @@ def __init__(self, fname, store_labels=True): self.labels = [] def __iter__(self): - """ - Iterate over the corpus, returning one sparse vector at a time. + """Iterate over the corpus, returning one sparse (BoW) vector at a time. + + Yields + ------ + list of (int, float) + Document in BoW format. + """ lineno = -1 self.labels = [] @@ -80,14 +85,29 @@ def __iter__(self): @staticmethod def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): - """ - Save a corpus in the SVMlight format. + """Save a corpus in the SVMlight format. + + The SVMlight `<target>` class tag is taken from the `labels` array, or set to 0 for all documents + if `labels` is not supplied. + + Parameters + ---------- + fname : str + Path to output file. + corpus : iterable of iterable of (int, float) + Corpus in BoW format. + id2word : dict of (int, str), optional + Mapping id -> word. + labels : list or False + SVMlight `<target>` class tags, or False if not present. + metadata : bool + This argument is ignored. + + Returns + ------- + list of int + Offsets for each line in file (in bytes). - The SVMlight `<target>` class tag is taken from the `labels` array, or set - to 0 for all documents if `labels` is not supplied. - - This function is automatically called by `SvmLightCorpus.serialize`; don't - call it directly, call `serialize` instead. """ logger.info("converting corpus to SVMlight format: %s", fname) @@ -100,16 +120,37 @@ def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): return offsets def docbyoffset(self, offset): - """ - Return the document stored at file position `offset`. + """Get the document stored at file position `offset`. + + Parameters + ---------- + offset : int + Document's position. + + Returns + ------- + list of (int, float) + Document in BoW format. + """ with utils.smart_open(self.fname) as f: f.seek(offset) return self.line2doc(f.readline())[0] + # TODO: it breaks if line2doc returns None def line2doc(self, line): - """ - Create a document from a single line (string) in SVMlight format + """Get a document from a single line in SVMlight format. + This method is the inverse of :meth:`~gensim.corpora.svmlightcorpus.SvmLightCorpus.doc2line`. + + Parameters + ---------- + line : str + Line in SVMlight format. + + Returns + ------- + (list of (int, float), str) + Document in BoW format and target class label. + """ line = utils.to_unicode(line) line = line[: line.find('#')].strip() @@ -125,8 +166,21 @@ def line2doc(self, line): @staticmethod def doc2line(doc, label=0): - """ - Output the document in SVMlight format, as a string. Inverse function to `line2doc`.
+ """Convert BoW representation of document in SVMlight format. + This method inverse of :meth:`~gensim.corpora.svmlightcorpus.SvmLightCorpus.line2doc`. + + Parameters + ---------- + doc : list of (int, float) + Document in BoW format. + label : int, optional + Document label (if provided). + + Returns + ------- + str + `doc` in SVMlight format. + """ pairs = ' '.join("%i:%s" % (termid + 1, termval) for termid, termval in doc) # +1 to convert 0-base to 1-base return "%s %s\n" % (label, pairs) diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py old mode 100755 new mode 100644 index 7148b90884..cd57f20109 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -7,15 +7,19 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Construct a corpus from a Wikipedia (or other MediaWiki-based) database dump. +"""Construct a corpus from a Wikipedia (or other MediaWiki-based) database dump. + +Notes +----- +If you have the `pattern` package installed, this module will use a fancy lemmatization to get a lemma +of each token (instead of plain alphabetic tokenizer). The package is available at [1]_ . -If you have the `pattern` package installed, this module will use a fancy -lemmatization to get a lemma of each token (instead of plain alphabetic -tokenizer). The package is available at https://github.com/clips/pattern . +See :mod:`~gensim.scripts.make_wiki` for a canned (example) script based on this module. + +References +---------- +.. [1] https://github.com/clips/pattern -See scripts/process_wiki.py for a canned (example) script based on this -module. """ @@ -34,45 +38,74 @@ logger = logging.getLogger(__name__) -# ignore articles shorter than ARTICLE_MIN_WORDS characters (after full preprocessing) ARTICLE_MIN_WORDS = 50 +"""Ignore shorter articles (after full preprocessing).""" # default thresholds for lengths of individual tokens TOKEN_MIN_LEN = 2 TOKEN_MAX_LEN = 15 -RE_P0 = re.compile(r'', re.DOTALL | re.UNICODE) # comments -RE_P1 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE) # footnotes -RE_P2 = re.compile(r'(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$', re.UNICODE) # links to languages -RE_P3 = re.compile(r'{{([^}{]*)}}', re.DOTALL | re.UNICODE) # template -RE_P4 = re.compile(r'{{([^}]*)}}', re.DOTALL | re.UNICODE) # template -RE_P5 = re.compile(r'\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE) # remove URL, keep description -RE_P6 = re.compile(r'\[([^][]*)\|([^][]*)\]', re.DOTALL | re.UNICODE) # simplify links, keep description -RE_P7 = re.compile(r'\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of images -RE_P8 = re.compile(r'\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of files -RE_P9 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE) # outside links -RE_P10 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE) # math content -RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE) # all other tags -RE_P12 = re.compile(r'\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE) # table formatting -RE_P13 = re.compile(r'\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE) # table cell formatting -RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE) # categories -RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE) # Remove File and Image template - - -# MediaWiki namespaces (https://www.mediawiki.org/wiki/Manual:Namespace) that -# ought to be ignored +RE_P0 = re.compile(r'', re.DOTALL | re.UNICODE) +"""Comments.""" +RE_P1 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE) 
+"""Footnotes.""" +RE_P2 = re.compile(r'(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$', re.UNICODE) +"""Links to languages.""" +RE_P3 = re.compile(r'{{([^}{]*)}}', re.DOTALL | re.UNICODE) +"""Template.""" +RE_P4 = re.compile(r'{{([^}]*)}}', re.DOTALL | re.UNICODE) +"""Template.""" +RE_P5 = re.compile(r'\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE) +"""Remove URL, keep description.""" +RE_P6 = re.compile(r'\[([^][]*)\|([^][]*)\]', re.DOTALL | re.UNICODE) +"""Simplify links, keep description.""" +RE_P7 = re.compile(r'\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) +"""Keep description of images.""" +RE_P8 = re.compile(r'\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) +"""Keep description of files.""" +RE_P9 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE) +"""External links.""" +RE_P10 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE) +"""Math content.""" +RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE) +"""All other tags.""" +RE_P12 = re.compile(r'\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE) +"""Table formatting.""" +RE_P13 = re.compile(r'\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE) +"""Table cell formatting.""" +RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE) +"""Categories.""" +RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE) +"""Remove File and Image templates.""" + IGNORED_NAMESPACES = [ 'Wikipedia', 'Category', 'File', 'Portal', 'Template', 'MediaWiki', 'User', 'Help', 'Book', 'Draft', 'WikiProject', 'Special', 'Talk' ] +"""MediaWiki namespaces [2]_ that ought to be ignored. + +References +---------- +.. [2] https://www.mediawiki.org/wiki/Manual:Namespace + +""" def filter_wiki(raw): - """ - Filter out wiki mark-up from `raw`, leaving only text. `raw` is either unicode - or utf-8 encoded string. + """Filter out wiki markup from `raw`, leaving only text. + + Parameters + ---------- + raw : str + Unicode or utf-8 encoded string. + + Returns + ------- + str + `raw` without markup. + """ # parsing of the wiki markup is not perfect, but sufficient for our purposes # contributions to improving this code are welcome :) @@ -82,6 +115,19 @@ def filter_wiki(raw): def remove_markup(text): + """Filter out wiki markup from `text`, leaving only text. + + Parameters + ---------- + text : str + String containing markup. + + Returns + ------- + str + `text` without markup. + + """ text = re.sub(RE_P2, '', text) # remove the last list (=languages) # the wiki markup is recursive (markup inside markup etc) # instead of writing a recursive grammar, here we deal with that by removing @@ -119,12 +165,24 @@ def remove_markup(text): def remove_template(s): """Remove template wikimedia markup. - Return a copy of `s` with all the wikimedia markup template removed. See - http://meta.wikimedia.org/wiki/Help:Template for wikimedia templates - details. + Parameters + ---------- + s : str + String containing markup template. + + Returns + ------- + str + Сopy of `s` with all the wikimedia markup template removed. See [4]_ for wikimedia templates details. + + Notes + ----- + Since template can be nested, it is difficult remove them using regular expressions. + + References + ---------- + .. [4] http://meta.wikimedia.org/wiki/Help:Template - Note: Since template can be nested, it is difficult remove them using - regular expresssions. """ # Find the start and end position of each template by finding the opening @@ -157,9 +215,20 @@ def remove_template(s): def remove_file(s): """Remove the 'File:' and 'Image:' markup, keeping the file caption. 
- Return a copy of `s` with all the 'File:' and 'Image:' markup replaced by - their corresponding captions. See http://www.mediawiki.org/wiki/Help:Images - for the markup details. + Parameters + ---------- + s : str + String containing 'File:' and 'Image:' markup. + + Returns + ------- + str + Copy of `s` with all the 'File:' and 'Image:' markup replaced by their corresponding captions. [3]_ + + References + ---------- + .. [3] http://www.mediawiki.org/wiki/Help:Images + """ # The regex RE_P15 match a File: or Image: markup for match in re.finditer(RE_P15, s): @@ -170,13 +239,26 @@ def remove_file(s): def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True): - """ - Tokenize a piece of text from wikipedia. The input string `content` is assumed - to be mark-up free (see `filter_wiki()`). + """Tokenize a piece of text from wikipedia. Set `token_min_len`, `token_max_len` as character length (not bytes!) thresholds for individual tokens. - Return list of tokens as utf8 bytestrings. + Parameters + ---------- + content : str + String without markup (see :func:`~gensim.corpora.wikicorpus.filter_wiki`). + token_min_len : int + Minimal token length. + token_max_len : int + Maximal token length. + lower : bool + If True, convert `content` to lower case. + + Returns + ------- + list of str + List of tokens from `content`. + """ # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.) return [ @@ -186,7 +268,19 @@ def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, def get_namespace(tag): - """Returns the namespace of tag.""" + """Get the namespace of a tag. + + Parameters + ---------- + tag : str + Namespace or tag. + + Returns + ------- + str + Matched namespace or tag. + + """ m = re.match("^{(.*?)}", tag) namespace = m.group(1) if m else "" if not namespace.startswith("http://www.mediawiki.org/xml/export-"): @@ -198,10 +292,19 @@ def get_namespace(tag): def extract_pages(f, filter_namespaces=False): - """ - Extract pages from a MediaWiki database dump = open file-like object `f`. + """Extract pages from a MediaWiki database dump. - Return an iterable over (str, str, str) which generates (title, content, pageid) triplets. + Parameters + ---------- + f : file + File-like object. + filter_namespaces : list of str or bool + Namespaces that will be extracted. + + Yields + ------ + tuple of (str or None, str, str) + Title, text and page id. """ elems = (elem for _, elem in iterparse(f, events=("end",))) @@ -247,12 +350,35 @@ def extract_pages(f, filter_namespaces=False): def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True): - """ - Parse a wikipedia article, returning its content as a list of tokens - (utf8-encoded strings). + """Parse a wikipedia article, extract all tokens. + + Notes + ----- + Set the `tokenizer_func` parameter (default is :func:`~gensim.corpora.wikicorpus.tokenize`) for languages + like Japanese or Thai to perform better tokenization. + The `tokenizer_func` needs to take 4 parameters: (text: str, token_min_len: int, token_max_len: int, lower: bool). + + Parameters + ---------- + args : (str, bool, str, int) + Article text, lemmatize flag (if True, :func:`~gensim.utils.lemmatize` will be used), article title, + page identifier. + tokenizer_func : function + Function for tokenization (default is :func:`~gensim.corpora.wikicorpus.tokenize`).
 + Needs to have interface: + tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str. + token_min_len : int + Minimal token length. + token_max_len : int + Maximal token length. + lower : bool + If True, convert article text to lower case. + + Returns + ------- + (list of str, str, int) + List of tokens from article, title and page id. - Set `tokenizer_func` (defaults to `tokenize`) parameter for languages like japanese or thai to perform better - tokenization. The `tokenizer_func` needs to take 4 parameters: (text, token_min_len, token_max_len, lower). """ text, lemmatize, title, pageid = args text = filter_wiki(text) @@ -264,13 +390,36 @@ def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN, def init_to_ignore_interrupt(): - """Should only be used when master is prepared to handle termination of child processes.""" + """Enable ignoring of keyboard interrupts (SIGINT) in child processes. + + Warnings + -------- + Should only be used when master is prepared to handle termination of + child processes. + + """ signal.signal(signal.SIGINT, signal.SIG_IGN) def _process_article(args): - """Should not be called explicitly. Use `process_article` instead.""" + """Same as :func:`~gensim.corpora.wikicorpus.process_article`, but with args in list format. + Parameters + ---------- + args : [(str, bool, str, int), (function, int, int, bool)] + First element is the same as `args` from :func:`~gensim.corpora.wikicorpus.process_article`, + second element contains the tokenizer function, minimal token length, maximal token length and lowercase flag. + + Returns + ------- + (list of str, str, int) + List of tokens from article, title and page id. + + Warnings + -------- + Should not be called explicitly. Use :func:`~gensim.corpora.wikicorpus.process_article` instead. + + """ tokenizer_func, token_min_len, token_max_len, lower = args[-1] args = args[:-1] @@ -281,17 +430,33 @@ def _process_article(args): class WikiCorpus(TextCorpus): - """ - Treat a wikipedia articles dump (<LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2 - or <LANG>wiki-latest-pages-articles.xml.bz2) as a (read-only) corpus. + """Treat a wikipedia articles dump as a **read-only** corpus. + + Supported dump formats: - The documents are extracted on-the-fly, so that the whole (massive) dump - can stay compressed on disk. + * <LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2 + * <LANG>wiki-latest-pages-articles.xml.bz2 - **Note:** "multistream" archives are *not* supported in Python 2 due to - `limitations in the core bz2 library + The documents are extracted on-the-fly, so that the whole (massive) dump can stay compressed on disk. + + Notes + ----- + Dumps for English wikipedia can be found `here <https://dumps.wikimedia.org/enwiki/>`_. + + Attributes + ---------- + metadata : bool + Whether to write article titles to the serialized corpus. + + Warnings + -------- + "Multistream" archives are *not* supported in Python 2 due to `limitations in the core bz2 library `_. + Examples + -------- + >>> from gensim.corpora import WikiCorpus, MmCorpus + >>> >>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h >>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format and mapping @@ -299,26 +464,37 @@ class WikiCorpus(TextCorpus): def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True): - """ - Initialize the corpus.
Unless a dictionary is provided, this scans the - corpus once, to determine its vocabulary. - - If `pattern` package is installed, use fancier shallow parsing to get - token lemmas. Otherwise, use simple regexp tokenization. You can override - this automatic logic by forcing the `lemmatize` parameter explicitly. - self.metadata if set to true will ensure that serialize will write out article titles to a pickle file. - - Set `article_min_tokens` as a min threshold for article token count (defaults to 50). Any article below this is - ignored. - - Set `tokenizer_func` (defaults to `tokenize`) with a custom function reference to control tokenization else use - the default regexp tokenization. Set this parameter for languages like japanese or thai to perform better - tokenization. The `tokenizer_func` needs to take 4 parameters: (text, token_min_len, token_max_len, lower). The - parameter values are as configured on the class instance by default. - - Set `lower` to control if everything should be converted to lowercase or not (default True). - - Set `token_min_len`, `token_max_len` as thresholds for token lengths that are returned (default to 2 and 15). + """Initialize the corpus. + + Unless a dictionary is provided, this scans the corpus once, + to determine its vocabulary. + + Parameters + ---------- + fname : str + Path to file with wikipedia dump. + processes : int, optional + Number of processes to run; defaults to **number of CPUs - 1**. + lemmatize : bool + Whether to use lemmatization instead of simple regexp tokenization. + Defaults to `True` if the *pattern* package is installed. + dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional + Dictionary. If not provided, the corpus is scanned once to determine its vocabulary + (**this takes a really long time**). + filter_namespaces : tuple of str + Namespaces to consider. + tokenizer_func : function, optional + Function that will be used for tokenization. By default, use :func:`~gensim.corpora.wikicorpus.tokenize`. + Needs to support the interface: + tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str. + article_min_tokens : int, optional + Minimum number of tokens in an article. Articles with fewer tokens are ignored. + token_min_len : int, optional + Minimal token length. + token_max_len : int, optional + Maximal token length. + lower : bool, optional + If True, convert all text to lower case. """ self.fname = fname @@ -336,18 +512,23 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction self.dictionary = dictionary or Dictionary(self.get_texts()) def get_texts(self): - """ - Iterate over the dump, returning text version of each article as a list - of tokens. + """Iterate over the dump, yielding a list of tokens for each article. - Only articles of sufficient length are returned (short articles & redirects - etc are ignored). This is controlled by `article_min_tokens` on the class instance. - - Note that this iterates over the **texts**; if you want vectors, just use - the standard corpus interface instead of this function:: + Notes + ----- + This iterates over the **texts**. If you want vectors, just use the standard corpus interface + instead of this method: >>> for vec in wiki_corpus: >>> print(vec) + + Yields + ------ + list of str + If `metadata` is False, yield only the list of tokens extracted from the article. + (list of str, (int, str)) + List of tokens (extracted from the article), page id and article title otherwise. + """ articles, articles_all = 0, 0
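A minimal usage sketch (not part of the patch) tying together the APIs documented above: `serialize()` writing both the corpus file and its byte-offset index, O(1) random access via `corpus[docno]`, and the SVMlight `doc2line()`/`line2doc()` round-trip. The tiny corpus and the temporary file names are illustrative only.

import os
import tempfile

from gensim.corpora import BleiCorpus, SvmLightCorpus

# A tiny corpus in BoW format: one list of (word_id, weight) pairs per document.
corpus = [[(0, 1.0), (1, 2.0)], [(1, 0.5)], [(0, 3.0), (2, 1.0)]]

tmp_dir = tempfile.mkdtemp()
svm_path = os.path.join(tmp_dir, 'corpus.svmlight')
blei_path = os.path.join(tmp_dir, 'corpus.lda-c')

# serialize() calls save_corpus() internally and stores the returned byte offsets
# in 'corpus.svmlight.index', which is what makes corpus[docno] work after loading.
SvmLightCorpus.serialize(svm_path, corpus)
loaded = SvmLightCorpus(svm_path)
print(len(loaded))    # 3, read from the index
print(loaded[1])      # [(1, 0.5)], O(1) lookup via docbyoffset()

# doc2line() and line2doc() are inverses of each other (modulo the 0/1-based id shift).
line = SvmLightCorpus.doc2line(corpus[0])    # '0 1:1.0 2:2.0\n'
print(loaded.line2doc(line))                 # ([(0, 1.0), (1, 2.0)], '0')

# BleiCorpus.serialize() additionally writes the vocabulary to 'corpus.lda-c.vocab'.
BleiCorpus.serialize(blei_path, corpus)
print(BleiCorpus(blei_path)[2])              # [(0, 3.0), (2, 1.0)]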