[WIP] Refactor documentation API Reference for gensim.parsing #1684

Merged: 19 commits, Nov 13, 2017
Changes from 5 commits
263 changes: 261 additions & 2 deletions gensim/parsing/preprocessing.py


def remove_stopwords(s):
Owner (@piskvorky):
Since we're refactoring, the set of stopwords should be a parameter: remove_stopwords(s, stopwords=STOPWORDS). (A small sketch of this signature follows the function body below.)

IIRC @menshikh-iv had plans to remove this entire package, so not sure if this is relevant.

Contributor:
@piskvorky as I investigated, this package is actively used; for this reason it will be moved (and slightly refactored).

"""Takes string, removes all words that are among stopwords.
Owner (@piskvorky):
Coding style: docstrings in the imperative mood: "Do X", not "Does X".

Owner (@piskvorky):
Can you start the docstring on its own line, rather than continuing right after the """? It's harder to read otherwise.


Owner (@piskvorky):
Hmm, ok.


Parameters
----------
s : str

Returns
-------
str
Unicode string without stopwords.

Examples
--------
>>> from gensim.parsing.preprocessing import remove_stopwords
>>> s = "Better late than never, but better never late."
>>> remove_stopwords(s)
u'Better late never, better late.'

"""

Owner (@piskvorky):
Coding style (PEP 257): no blank line before or after the docstring.

s = utils.to_unicode(s)
return " ".join(w for w in s.split() if w not in STOPWORDS)
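For illustration only, a minimal sketch of what the two suggestions above could look like together (stopwords passed as a parameter, no blank lines around the docstring per PEP 257); this is a sketch under those assumptions, not the final refactored code:

from gensim import utils
from gensim.parsing.preprocessing import STOPWORDS

def remove_stopwords(s, stopwords=STOPWORDS):
    """Remove from `s` all words that appear in `stopwords`."""
    s = utils.to_unicode(s)
    return " ".join(w for w in s.split() if w not in stopwords)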



def strip_punctuation(s):
"""Takes string, replaces all punctuation characters with spaces.

Parameters
----------
s : str

Returns
-------
str
Unicode string without punctuation characters.

Examples
--------
>>> from gensim.parsing.preprocessing import strip_punctuation
>>> s = "A semicolon is a stronger break than a comma, but not as much as a full stop!"
>>> strip_punctuation(s)
u'A semicolon is a stronger break than a comma but not as much as a full stop '

"""

s = utils.to_unicode(s)
return RE_PUNCT.sub(" ", s)


# unicode.translate cannot delete characters like str can
strip_punctuation2 = strip_punctuation
"""
Same as strip_punctuation
"""

Owner (@piskvorky):
That won't work, this is not how docstrings work.

Contributor:
No need for a docstring here; this will be removed in the refactoring.

# def strip_punctuation2(s):
# s = utils.to_unicode(s)
# return s.translate(None, string.punctuation)
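As a side note (illustration only, not part of the diff), the Python 2 behaviour that the comment and the commented-out code above refer to:

import string

byte_str = "Hello, world!"
print(byte_str.translate(None, string.punctuation))  # str.translate can delete characters: 'Hello world'

uni_str = u"Hello, world!"
# uni_str.translate(None, string.punctuation) raises TypeError on unicode objects;
# unicode.translate needs a mapping of code points instead:
print(uni_str.translate({ord(c): None for c in string.punctuation}))  # u'Hello world'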


def strip_tags(s):
"""Takes string and removes tags.

Parameters
----------
s : str
Contributor:
Please add a description to the argument (for example "Input string."), here and everywhere else.


Returns
-------
str
Unicode string without tags.

Examples
--------
>>> from gensim.parsing.preprocessing import strip_tags
>>> s = "<i>Hello</i> <b>World</b>!"
>>> strip_tags(s)
u'Hello World!'

"""

s = utils.to_unicode(s)
return RE_TAGS.sub("", s)


def strip_short(s, minsize=3):
"""Takes string and removes words with length lesser than minsize (default = 3).

Parameters
----------
s : str
minsize : int, optional

Returns
-------
str
Unicode string without words with length lesser than minsize.

Contributor:
Redundant newline.


Examples
--------
>>> from gensim.parsing.preprocessing import strip_short
>>> s = "salut les amis du 59"
>>> strip_short(s)
u'salut les amis'

>>> from gensim.parsing.preprocessing import strip_short
>>> s = "one two three four five six seven eight nine ten"
>>> strip_short(s,5)
u'three seven eight'

"""

s = utils.to_unicode(s)
return " ".join(e for e in s.split() if len(e) >= minsize)



def strip_numeric(s):
"""Takes string and removes digits from it.
Owner (@piskvorky):
Coding style: docstrings in Python should be in the imperative mood: "Do X", not "Does X".


Parameters
----------
s : str

Returns
-------
str
Unicode string without digits.

Examples
--------
>>> from gensim.parsing.preprocessing import strip_numeric
>>> s = "0text24gensim365test"
>>> strip_numeric(s)
u'textgensimtest'

"""

s = utils.to_unicode(s)
return RE_NUMERIC.sub("", s)



def strip_non_alphanum(s):
"""Takes string and removes not a word characters from it.
(Word characters - alphanumeric & underscore)

Parameters
----------
s : str

Returns
-------
str
Unicode string without not a word characters.
Contributor:
with word characters only?


Examples
--------
>>> from gensim.parsing.preprocessing import strip_non_alphanum
>>> s = "if-you#can%read$this&then@this#method^works"
>>> strip_non_alphanum(s)
u'if you can read this then this method works'

"""

s = utils.to_unicode(s)
return RE_NONALPHA.sub(" ", s)



def strip_multiple_whitespaces(s):
r"""Takes string, removes repeating in a row whitespace characters (spaces, tabs, line breaks) from it
and turns tabs & line breaks into spaces.

Owner (@piskvorky):
Why is this docstring r"""?

Contributor (@menshikh-iv, Nov 5, 2017):
This is a special case: we use \n and \r in the Examples section, and Sphinx goes mad on them. The solution is to use a raw string here. (See the short sketch after this function.)

Parameters
----------
s : str

Returns
-------
str
Unicode string without repeating in a row whitespace characters.

Examples
--------
>>> from gensim.parsing.preprocessing import strip_multiple_whitespaces
>>> s = "salut" + '\r' + " les" + '\n' + " loulous!"
>>> strip_multiple_whitespaces(s)
u'salut les loulous!'

"""

s = utils.to_unicode(s)
return RE_WHITESPACE.sub(" ", s)
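For illustration (not part of the diff), a tiny sketch of why the raw docstring matters: without the r prefix, the \n and \r escapes written in the Examples section would turn into real control characters inside the docstring, which is what confuses Sphinx.

plain = "newline: '\n', carriage return: '\r'"   # escapes become real control characters
raw = r"newline: '\n', carriage return: '\r'"    # backslashes stay literal, as intended for the docs
print(len(raw) - len(plain))  # 2: the two backslashes survive only in the raw string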



def split_alphanum(s):
"""Takes string, adds spaces between digits & letters.

Parameters
----------
s : str

Returns
-------
str
Unicode string with spaces between digits & letters.

Examples
--------
>>> from gensim.parsing.preprocessing import split_alphanum
>>> s = "24.0hours7 days365 a1b2c3"
>>> split_alphanum(s)
u'24.0 hours 7 days 365 a 1 b 2 c 3'

"""

s = utils.to_unicode(s)
s = RE_AL_NUM.sub(r"\1 \2", s)
return RE_NUM_AL.sub(r"\1 \2", s)


def stem_text(text):
"""Takes string, transforms it into lowercase and (porter-)stemmed version.

Parameters
----------
text : str

Returns
-------
str
Lowercase and (porter-)stemmed version of string `text`.

Examples
--------
>>> from gensim.parsing.preprocessing import stem_text
>>> text = "While it is quite useful to be able to search a large collection of documents almost instantly for a joint occurrence of a collection of exact words, for many searching purposes, a little fuzziness would help. "
>>> stem_text(text)
u'while it is quit us to be abl to search a larg collect of document almost instantli for a joint occurr of a collect of exact words, for mani search purposes, a littl fuzzi would help.'

"""

text = utils.to_unicode(text)
p = PorterStemmer()
return ' '.join(p.stem(word) for word in text.split())


stem = stem_text



DEFAULT_FILTERS = [
lambda x: x.lower(), strip_tags, strip_punctuation,
strip_multiple_whitespaces, strip_numeric,


def preprocess_string(s, filters=DEFAULT_FILTERS):
"""Takes string, applies list of chosen filters to it, where filters are methods from this module. Default list of filters consists of: strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short, stem_text. <function <lambda>> in signature means that we use lambda function for applying methods to filters.
Owner (@piskvorky):
Coding style: line way too long.

Contributor:
Use references instead of raw text, i.e.

:func:`~gensim.parsing.preprocessing.strip_tags`

instead of strip_tags (here and everywhere else). (A sketch with these references follows the function body below.)


Parameters
----------
s : str
filters: list, optional

Returns
-------
list
List of unicode strings.

Examples
--------
>>> from gensim.parsing.preprocessing import preprocess_string
>>> s = "<i>Hel 9lo</i> <b>Wo9 rld</b>! Th3 weather_is really g00d today, isn't it?"
>>> preprocess_string(s)
[u'hel', u'rld', u'weather', u'todai', u'isn']

>>> from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
>>> s = "<i>Hel 9lo</i> <b>Wo9 rld</b>! Th3 weather_is really g00d today, isn't it?"
>>> CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation]
>>> preprocess_string(s,CUSTOM_FILTERS)
Owner (@piskvorky):
Coding style: space after comma.

[u'hel', u'9lo', u'wo9', u'rld', u'th3', u'weather', u'is', u'really', u'g00d', u'today', u'isn', u't', u'it']

"""

s = utils.to_unicode(s)
for f in filters:
s = f(s)
return s.split()
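For illustration only, roughly how the summary of this docstring could read once the suggested cross-references are used (a sketch, not the merged wording):

def preprocess_string(s, filters=DEFAULT_FILTERS):
    """Apply the list of chosen filters to `s`.

    The default list of filters is:

    * :func:`~gensim.parsing.preprocessing.strip_tags`,
    * :func:`~gensim.parsing.preprocessing.strip_punctuation`,
    * :func:`~gensim.parsing.preprocessing.strip_multiple_whitespaces`,
    * :func:`~gensim.parsing.preprocessing.strip_numeric`,
    * :func:`~gensim.parsing.preprocessing.remove_stopwords`,
    * :func:`~gensim.parsing.preprocessing.strip_short`,
    * :func:`~gensim.parsing.preprocessing.stem_text`.

    """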


def preprocess_documents(docs):
"""Takes list of strings, splits it into sentences, then applies default filters to every sentence.
Owner (@piskvorky):
I don't see any splitting into sentences, where does that come from?


Parameters
----------
docs : list
Contributor (@anotherbugmaster, Nov 9, 2017):
Add the description of docs here though.


Returns
-------
list
Contributor:
Also, you could write list of (list of str) to specify the exact type.

List of lists, filled by unicode strings.
Contributor:
Processed documents split by whitespace


Examples
--------
>>> from gensim.parsing.preprocessing import preprocess_documents
>>> s = ["<i>Hel 9lo</i> <b>Wo9 rld</b>!", "Th3 weather_is really g00d today, isn't it?"]
>>> preprocess_documents(s)
[[u'hel', u'rld'], [u'weather', u'todai', u'isn']]

"""

return [preprocess_string(d) for d in docs]


def read_file(path):
r"""Reads file in specified directory.
Owner (@piskvorky):
This entire function should be removed, it's too trivial.


Parameters
----------
path : str

Returns
-------
list
List of unicode strings.
Owner (@piskvorky):
Doesn't match the example.


Examples
--------
>>> from gensim.parsing.preprocessing import read_file
>>> path = "/media/work/october_2017/gensim/gensim/test/test_data/mihalcea_tarau.summ.txt"
Contributor:
This path will work only on your filesystem; utils to retrieve paths to test files will be ready very soon. (A hypothetical sketch follows the function body below.)

>>> read_file(path)
"Hurricane Gilbert swept toward the Dominican Republic Sunday, and the Civil Defense alerted its heavily populated south coast to prepare for high winds, heavy rains and high seas.\nThe National Hurricane Center in Miami reported its position at 2 a.m. Sunday at latitude 16.1 north, longitude 67.5 west, about 140 miles south of Ponce, Puerto Rico, and 200 miles southeast of Santo Domingo.\nThe National Weather Service in San Juan, Puerto Rico, said Gilbert was moving westward at 15 mph with a ``broad area of cloudiness and heavy weather'' rotating around the center of the storm.\nStrong winds associated with the Gilbert brought coastal flooding, strong southeast winds and up to 12 feet feet to Puerto Rico's south coast."

"""

with utils.smart_open(path) as fin:
return fin.read()
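As an illustration of what the comment above anticipates, assuming the test-data helper ends up exposed as gensim.test.utils.datapath (a hypothetical name at the time of this review), the example could be made portable:

>>> from gensim.test.utils import datapath
>>> from gensim.parsing.preprocessing import read_file
>>> path = datapath("mihalcea_tarau.summ.txt")  # resolves inside gensim/test/test_data
>>> read_file(path)[:40]
'Hurricane Gilbert swept toward the Domin'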
