
Fix documentation for various modules #2096

Merged 34 commits on Jun 22, 2018
Changes from 11 commits
Commits
6504f9d
doc fixes
piskvorky Jun 20, 2018
329c9de
doc fixes to matutils
piskvorky Jun 20, 2018
0214fc3
docsim doc fixes
piskvorky Jun 20, 2018
a68dedf
doc fixes to interfaces module
piskvorky Jun 20, 2018
6dcbcaf
doc fixes to Dictionary
piskvorky Jun 20, 2018
c0c4f90
doc fixes to MatrixMarket classes
piskvorky Jun 20, 2018
0780f3c
doc fixes to WikiCorpus
piskvorky Jun 20, 2018
0e94769
Merge branch 'develop' into docstring_fixes
piskvorky Jun 20, 2018
bc68562
minor code style changes in HashDictionary
piskvorky Jun 20, 2018
2737a53
fixing TfidfModel bugs + docs
piskvorky Jun 20, 2018
25bc583
fixes to phrases docs
piskvorky Jun 20, 2018
1e886de
fix PEP8
menshikh-iv Jun 20, 2018
4a2db73
fix documentation building
menshikh-iv Jun 20, 2018
5fcdf2c
cleanup mmcorpus-related
menshikh-iv Jun 20, 2018
d8d055a
cleanup dictionary
menshikh-iv Jun 20, 2018
d0e8417
cleanup hashdictionary
menshikh-iv Jun 20, 2018
489b4cf
cleanup wikicorpus
menshikh-iv Jun 20, 2018
b8c3f4b
cleanup interfaces
menshikh-iv Jun 20, 2018
a2c5ff3
cleanup matutils
menshikh-iv Jun 20, 2018
034206e
rename smartirs signature
piskvorky Jun 20, 2018
cc0fa64
Merge branch 'docstring_fixes' of https://github.com/RaRe-Technologie…
menshikh-iv Jun 20, 2018
3646c90
minor docs style fixes
piskvorky Jun 20, 2018
31849c1
Merge branch 'docstring_fixes' of https://github.com/RaRe-Technologie…
menshikh-iv Jun 20, 2018
2040093
regenerate *.c for mmreader (after last Radim fix)
menshikh-iv Jun 20, 2018
aa27a5f
fix bool parameters
menshikh-iv Jun 21, 2018
191af45
regenerate _mmreader.c again
menshikh-iv Jun 21, 2018
1686544
cleanup phrases
menshikh-iv Jun 21, 2018
64580f3
cleanup utils
menshikh-iv Jun 21, 2018
86c0190
Fix paper for phrases according to #2098, catch by @davidchall
menshikh-iv Jun 21, 2018
b1353bf
cleanup docsim
menshikh-iv Jun 21, 2018
3bb51a2
- cleanup tfidfmodel
menshikh-iv Jun 21, 2018
27b0e66
typo fix
piskvorky Jun 21, 2018
e2c72fa
add back smartirs tests
piskvorky Jun 21, 2018
a569b76
retrying saved test files
piskvorky Jun 21, 2018
1,292 changes: 744 additions & 548 deletions gensim/corpora/_mmreader.c

Large diffs are not rendered by default.

42 changes: 20 additions & 22 deletions gensim/corpora/dictionary.py
Expand Up @@ -4,7 +4,6 @@
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""This module implements the concept of a Dictionary -- a mapping between words and their integer ids."""

from __future__ import with_statement
Expand All @@ -30,8 +29,10 @@
class Dictionary(utils.SaveLoad, Mapping):
"""Dictionary encapsulates the mapping between normalized words and their integer ids.

Notable instance attributes
---------------------------
Notable instance attributes:

Attributes
----------
token2id : dict of (str, int)
token -> tokenId.
id2token : dict of (int, str)
Expand Down Expand Up @@ -60,13 +61,10 @@ def __init__(self, documents=None, prune_at=2000000):
Examples
--------
>>> from gensim.corpora import Dictionary
>>>
>>> texts = [['human', 'interface', 'computer']]

>>> # initialize a Dictionary
>>> dct = Dictionary(texts)

>>> # add more document (extend the vocabulary)
>>> dct.add_documents([["cat", "say", "meow"], ["dog"]])
>>> dct = Dictionary(texts) # initialize a Dictionary
>>> dct.add_documents([["cat", "say", "meow"], ["dog"]]) # add more document (extend the vocabulary)
>>> dct.doc2bow(["dog", "computer", "non_existent_word"])
[(0, 1), (6, 1)]

Expand Down Expand Up @@ -181,7 +179,6 @@ def add_documents(self, documents, prune_at=2000000):
>>> from gensim.corpora import Dictionary
>>>
>>> corpus = ["máma mele maso".split(), "ema má máma".split()]

>>> dct = Dictionary(corpus)
>>> len(dct)
5
Expand Down Expand Up @@ -213,9 +210,9 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
document : list of str
Input document.
allow_update : bool, optional
Update self, by adding new tokens from `document` and updating internal corpus statistics?
Update self, by adding new tokens from `document` and updating internal corpus statistics.
return_missing : bool, optional
Return missing tokens? Missing tokens are tokens present in `document` but not in self.
If True - return missing tokens (tokens present in `document` but not in self) with frequencies.

Return
------
Expand Down Expand Up @@ -270,10 +267,7 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
return result

def doc2idx(self, document, unknown_word_index=-1):
"""Convert `document` (a list of words) into a list of indexes = list of `token_id`s.

Notes
-----
"""Convert `document` (a list of words) into a list of indexes = list of `token_id`.
Replace all unknown words i.e, words not in the dictionary with the index as set via `unknown_word_index`.

Parameters
Expand Down Expand Up @@ -328,7 +322,8 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=N
#. After (1) and (2), keep only the first `keep_n` most frequent tokens (or keep all if `keep_n=None`).

After the pruning, resulting gaps in word ids are shrunk.
Due to this gap shrinking, **the same word may have a different word id before and after the call to this function!**
Due to this gap shrinking, **the same word may have a different word id before and after the call
to this function!**

Examples
--------
Expand Down Expand Up @@ -463,7 +458,7 @@ def save_as_text(self, fname, sort_by_word=True):
fname : str
Path to output file.
sort_by_word : bool, optional
Sort words in lexicographical order before writing them out?
Sort words in lexicographical order before writing them out.
Owner Author:
-1: bool should be a question, to make it clear what positive answer means (True).

Contributor:
Fixed


Notes
-----
Expand All @@ -483,6 +478,7 @@ def save_as_text(self, fname, sort_by_word=True):
See Also
--------
:meth:`~gensim.corpora.dictionary.Dictionary.load_from_text`
Load :class:`~gensim.corpora.dictionary.Dictionary` from text file.

Examples
--------
Expand All @@ -495,7 +491,7 @@ def save_as_text(self, fname, sort_by_word=True):
>>> dct = Dictionary(corpus)
>>> dct.save_as_text(tmp_fname)
>>>
>>> loaded_dct = Dictionary.load_from_text("testdata")
>>> loaded_dct = Dictionary.load_from_text(tmp_fname)
>>> assert dct.token2id == loaded_dct.token2id

"""
Expand All @@ -513,7 +509,8 @@ def save_as_text(self, fname, sort_by_word=True):
fout.write(utils.to_utf8(line))

def merge_with(self, other):
"""Merge another dictionary into this dictionary, mapping the same tokens to the same ids and new tokens to new ids.
"""Merge another dictionary into this dictionary, mapping the same tokens to the same ids
and new tokens to new ids.

Notes
-----
Expand All @@ -525,7 +522,7 @@ def merge_with(self, other):

Parameters
----------
other : any Mapping, e.g. `dict`, :class:`~gensim.corpora.dictionary.Dictionary`, …
other : {dict, :class:`~gensim.corpora.dictionary.Dictionary`}
Other dictionary.

Return
Expand Down Expand Up @@ -584,6 +581,7 @@ def load_from_text(fname):
See Also
--------
:meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`
Save :class:`~gensim.corpora.dictionary.Dictionary` to text file.

Examples
--------
Expand All @@ -596,7 +594,7 @@ def load_from_text(fname):
>>> dct = Dictionary(corpus)
>>> dct.save_as_text(tmp_fname)
>>>
>>> loaded_dct = Dictionary.load_from_text("testdata")
>>> loaded_dct = Dictionary.load_from_text(tmp_fname)
>>> assert dct.token2id == loaded_dct.token2id

"""
Expand Down
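The `doc2bow` docstrings above describe mapping a tokenized document to sorted `(token_id, count)` pairs, dropping (or optionally reporting) tokens missing from the dictionary. A minimal plain-Python sketch of that documented behavior, under the assumption of a hand-built `token2id` mapping (`doc2bow_sketch` is hypothetical, not the gensim API — use `Dictionary.doc2bow` in practice):

```python
from collections import Counter

def doc2bow_sketch(token2id, document, return_missing=False):
    # Count tokens, then keep only those present in the mapping,
    # sorted by id -- mirroring the documented doc2bow output format.
    counts = Counter(document)
    result = sorted((token2id[tok], cnt) for tok, cnt in counts.items() if tok in token2id)
    if return_missing:
        # Missing tokens = present in `document` but not in the dictionary.
        missing = {tok: cnt for tok, cnt in counts.items() if tok not in token2id}
        return result, missing
    return result

# Hypothetical mapping matching the docstring's example vocabulary.
token2id = {'computer': 0, 'human': 1, 'interface': 2, 'cat': 3, 'say': 4, 'meow': 5, 'dog': 6}
print(doc2bow_sketch(token2id, ["dog", "computer", "non_existent_word"]))
# -> [(0, 1), (6, 1)]
```

Note how `non_existent_word` is silently dropped unless `return_missing=True`, matching the example output `[(0, 1), (6, 1)]` in the docstring.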
63 changes: 27 additions & 36 deletions gensim/corpora/hashdictionary.py
Expand Up @@ -4,10 +4,8 @@
# Copyright (C) 2012 Homer Strong, Radim Rehurek
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""
This module implements the "hashing trick" [1]_ -- a mapping between words and their integer ids
using a fixed, static mapping (hash function).
"""Implements the `"hashing trick" <http://en.wikipedia.org/wiki/Hashing-Trick>`_ -- a mapping between words
and their integer ids using a fixed, static mapping (hash function).

Notes
-----
Expand All @@ -27,11 +25,6 @@

* Multiple words may map to the same id, causing hash collisions. The word <-> id mapping is no longer a bijection.

References
----------

.. [1] http://en.wikipedia.org/wiki/Hashing-Trick

"""

from __future__ import with_statement
Expand All @@ -50,17 +43,13 @@
class HashDictionary(utils.SaveLoad, dict):
"""Mapping between words and their integer ids, using a hashing function.

Notes
-----

Unlike :class:`~gensim.corpora.dictionary.Dictionary`,
building a :class:`~gensim.corpora.hashdictionary.HashDictionary` before using it **isn't a necessary step**.

You can start converting words to ids immediately, without training on a corpus.

Examples
--------

>>> from gensim.corpora import HashDictionary
>>>
>>> dct = HashDictionary(debug=False) # needs no training corpus!
Expand All @@ -75,15 +64,15 @@ def __init__(self, documents=None, id_range=32000, myhash=zlib.adler32, debug=Tr

Parameters
----------
documents : iterable of iterable of str
documents : iterable of iterable of str, optional
Iterable of documents. If given, used to collect additional corpus statistics.
:class:`~gensim.corpora.hashdictionary.HashDictionary` can work
without these statistics (optional parameter).
id_range : int, optional
Number of hash-values in table, used as `id = myhash(key) %% id_range`.
myhash : function
myhash : function, optional
Hash function, should support interface `myhash(str) -> int`, uses `zlib.adler32` by default.
debug : bool
debug : bool, optional
If True - store which tokens have mapped to a given id. **Will use a lot of RAM**.
If you find yourself running out of memory (or not sure that you really need raw tokens),
keep `debug=False`.
Expand All @@ -110,7 +99,10 @@ def __init__(self, documents=None, id_range=32000, myhash=zlib.adler32, debug=Tr
def __getitem__(self, tokenid):
"""Get all words that have mapped to the given id so far, as a set.

Works only if you initialized your `HashDictionary` object with `debug=True`.
Warnings
--------
Works only if you initialized your :class:`~gensim.corpora.hashdictionary.HashDictionary` object
with `debug=True`.

Parameters
----------
Expand Down Expand Up @@ -162,8 +154,11 @@ def from_documents(*args, **kwargs):
return HashDictionary(*args, **kwargs)

def add_documents(self, documents):
"""Collect corpus statistics from a corpus. Useful only if `debug=True`, to build
the reverse `id=>set(words)` mapping.
"""Collect corpus statistics from a corpus.

Warnings
--------
Useful only if `debug=True`, to build the reverse `id=>set(words)` mapping.

Notes
-----
Expand Down Expand Up @@ -199,8 +194,8 @@ def add_documents(self, documents):
)

def doc2bow(self, document, allow_update=False, return_missing=False):
"""Convert a sequence of words `document` into the bag-of-words format of
`[(word_id, word_count)]` (e.g. `[(1, 4), (150, 1), (2005, 2)]`).
"""Convert a sequence of words `document` into the bag-of-words format of `[(word_id, word_count)]`
(e.g. `[(1, 4), (150, 1), (2005, 2)]`).

Notes
-----
Expand All @@ -217,7 +212,7 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
A sequence of word tokens = **tokenized and normalized** strings.
allow_update : bool, optional
If True - update corpus statistics and if `debug=True`, also the reverse id=>word mapping.
return_missing : bool
return_missing : bool, optional
Not used. Only here for compatibility with the Dictionary class.

Return
Expand Down Expand Up @@ -263,12 +258,16 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
return result

def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
"""Filter tokens in the debug dictionary by their frequency. Only makes sense when `debug=True`.
"""Filter tokens in the debug dictionary by their frequency.

Since :class:`~gensim.corpora.hashdictionary.HashDictionary` id range is fixed and doesn't depend on the number
of tokens seen, this doesn't really "remove" anything. It only clears some
internal corpus statistics, for easier debugging and a smaller RAM footprint.

Warnings
--------
Only makes sense when `debug=True`.

Parameters
----------
no_below : int, optional
Expand All @@ -287,18 +286,6 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
#. More than `no_above` documents (fraction of total corpus size, **not absolute number**).
#. After (1) and (2), keep only the first `keep_n` most frequent tokens (or keep all if `None`).

Examples
--------

>>> from gensim.corpora import HashDictionary
>>>
>>> dct = HashDictionary(debug=True)
>>>
>>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
>>> dct.filter_extremes(no_below=1, no_above=0.5, keep_n=1)
>>> print dct.token2id
{'maso': 15025}

"""
no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold
ok = [item for item in iteritems(self.dfs_debug) if no_below <= item[1] <= no_above_abs]
Expand All @@ -319,7 +306,11 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
)

def save_as_text(self, fname):
"""Save the debug token=>id mapping to a text file. Only makes sense when `debug=True`, for debugging.
"""Save the debug token=>id mapping to a text file.

Warnings
--------
Only makes sense when `debug=True`, for debugging.

Parameters
----------
Expand Down
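The HashDictionary docstrings above explain the "hashing trick": a fixed, static `id = myhash(key) % id_range` mapping that needs no training corpus but is not a bijection (distinct words can collide). A small sketch of that mapping using the documented default `zlib.adler32` (the helper `token_to_id` is hypothetical, for illustration only):

```python
import zlib

def token_to_id(token, id_range=32000, myhash=zlib.adler32):
    # Fixed, static mapping: no corpus statistics needed, ids are
    # deterministic, and collisions across tokens are possible.
    return myhash(token.encode('utf-8')) % id_range

# Ids are available immediately, with no training step.
ids = {tok: token_to_id(tok) for tok in ["cat", "say", "meow"]}
```

Every id falls in `[0, id_range)`, and repeated calls for the same token always return the same id — which is exactly why a `HashDictionary` can start converting words to ids immediately.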
23 changes: 12 additions & 11 deletions gensim/corpora/mmcorpus.py
Expand Up @@ -4,8 +4,7 @@
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""Corpus in the Matrix Market format, https://math.nist.gov/MatrixMarket/formats.html."""
"""Corpus in the `Matrix Market format <https://math.nist.gov/MatrixMarket/formats.html>`_."""

import logging

Expand All @@ -17,12 +16,15 @@


class MmCorpus(matutils.MmReader, IndexedCorpus):
"""Corpus serialized using the `sparse coordinate Matrix Market format <https://math.nist.gov/MatrixMarket/formats.html>`_.
"""Corpus serialized using the `sparse coordinate Matrix Market format
<https://math.nist.gov/MatrixMarket/formats.html>`_.

Wrap a term-document matrix on disk (in matrix-market format), and present it
as an object which supports iteration over the matrix rows (~documents).

Notable attributes
Notable instance attributes:

Attributes
------------------
num_docs : int
Number of documents in the market matrix file.
Expand All @@ -48,7 +50,6 @@ class MmCorpus(matutils.MmReader, IndexedCorpus):
... pass

"""

def __init__(self, fname):
"""

Expand All @@ -69,7 +70,7 @@ def __iter__(self):
Yields
------
list of (int, numeric)
Document in the `sparse Gensim bag-of-words format <https://radimrehurek.com/gensim/intro.html#core-concepts>`_.
Document in the `sparse Gensim bag-of-words format <intro.rst#core-concepts>`__.

Notes
------
Expand All @@ -93,16 +94,16 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False):
Corpus in Bow format.
id2word : dict of (int, str), optional
Mapping between word_id -> word. Used to retrieve the total vocabulary size if provided.
Otherwise, the total vacabulary size is estimated based on the highest feature id encountered in `corpus`.
Otherwise, the total vocabulary size is estimated based on the highest feature id encountered in `corpus`.
progress_cnt : int, optional
How often to report (log) progress.
metadata : bool, optional
If true, writes out additional metadata.

Notes
-----
This function is automatically called by :class:`~gensim.corpora.mmcorpus.MmCorpus.serialize()`; don't
call it directly, call `serialize` instead.
Warnings
--------
This function is automatically called by :class:`~gensim.corpora.mmcorpus.MmCorpus.serialize`, don't
call it directly, call :class:`~gensim.corpora.mmcorpus.MmCorpus.serialize` instead.

Example
-------
Expand Down
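The MmCorpus docstrings above describe serializing a bag-of-words corpus in the sparse coordinate Matrix Market format. A minimal, hypothetical sketch of that layout (banner line, `docs terms nnz` size line, 1-based coordinate entries) — real `MmCorpus.serialize` additionally handles indexing, metadata, and streaming, so use the gensim class rather than this sketch:

```python
import os
import tempfile

def save_mm(fname, corpus, num_terms):
    # Flatten the bow corpus into 1-based (doc, term, value) triples.
    entries = [(d + 1, t + 1, v) for d, doc in enumerate(corpus) for t, v in doc]
    with open(fname, 'w') as f:
        f.write('%%MatrixMarket matrix coordinate real general\n')
        f.write('%d %d %d\n' % (len(corpus), num_terms, len(entries)))
        for d, t, v in entries:
            f.write('%d %d %g\n' % (d, t, v))

def load_mm(fname):
    with open(fname) as f:
        f.readline()  # skip the %%MatrixMarket banner
        ndocs, nterms, nnz = map(int, f.readline().split())
        docs = [[] for _ in range(ndocs)]
        for line in f:
            d, t, v = line.split()
            docs[int(d) - 1].append((int(t) - 1, float(v)))
    return docs

# Two documents in sparse bag-of-words format.
corpus = [[(0, 1.0), (2, 2.0)], [(1, 3.0)]]
fname = os.path.join(tempfile.gettempdir(), 'corpus_sketch.mm')
save_mm(fname, corpus, num_terms=3)
assert load_mm(fname) == corpus
```

The round trip preserves the corpus exactly; note the off-by-one shifts, since Matrix Market indices are 1-based while gensim ids are 0-based.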