diff --git a/gensim/parsing/__init__.py b/gensim/parsing/__init__.py index 5dcc010aec..5bbf84239e 100644 --- a/gensim/parsing/__init__.py +++ b/gensim/parsing/__init__.py @@ -1,8 +1,5 @@ -""" -This package contains functions to preprocess raw text -""" +"""This package contains functions to preprocess raw text""" -# bring model classes directly into package namespace, to save some typing from .porter import PorterStemmer # noqa:F401 from .preprocessing import (remove_stopwords, strip_punctuation, strip_punctuation2, # noqa:F401 strip_tags, strip_short, strip_numeric, diff --git a/gensim/parsing/porter.py b/gensim/parsing/porter.py index 048e056418..92c52e0c6d 100644 --- a/gensim/parsing/porter.py +++ b/gensim/parsing/porter.py @@ -3,32 +3,27 @@ """Porter Stemming Algorithm This is the Porter stemming algorithm, ported to Python from the version coded up in ANSI C by the author. It may be be regarded -as canonical, in that it follows the algorithm presented in +as canonical, in that it follows the algorithm presented in [1]_, see also [2]_ + +Author - Vivake Gupta (v@nano.com), optimizations and cleanup of the code by Lars Buitinck. + +Examples: +--------- +>>> from gensim.parsing.porter import PorterStemmer +>>> +>>> p = PorterStemmer() +>>> p.stem("apple") +'appl' +>>> +>>> p.stem_sentence("Cats and ponies have meeting") +'cat and poni have meet' +>>> +>>> p.stem_documents(["Cats and ponies", "have meeting"]) +['cat and poni', 'have meet'] + +.. [1] Porter, 1980, An algorithm for suffix stripping, http://www.cs.odu.edu/~jbollen/IR04/readings/readings5.pdf +.. [2] http://www.tartarus.org/~martin/PorterStemmer -Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, -no. 3, pp 130-137, - -only differing from it at the points maked --DEPARTURE-- below. 
- -See also http://www.tartarus.org/~martin/PorterStemmer - -The algorithm as described in the paper could be exactly replicated -by adjusting the points of DEPARTURE, but this is barely necessary, -because (a) the points of DEPARTURE are definitely improvements, and -(b) no encoding of the Porter stemmer I have seen is anything like -as exact as this version, even with the points of DEPARTURE! - -Vivake Gupta (v@nano.com) - -Release 1: January 2001 - -Further adjustments by Santiago Bruno (bananabruno@gmail.com) -to allow word input not restricted to one word per line, leading -to: - -Release 2: July 2008 - -Optimizations and cleanup of the code by Lars Buitinck, July 2012. """ @@ -36,22 +31,47 @@ class PorterStemmer(object): + """Class contains implementation of Porter stemming algorithm. + + Attributes + ---------- + b : str + Buffer holding a word to be stemmed. The letters are in b[0], b[1] ... ending at b[`k`]. + k : int + Index of the last letter of the word in `b`, readjusted downwards as the stemming progresses. + j : int + General offset into the string. + + """ def __init__(self): - """The main part of the stemming algorithm starts here. - b is a buffer holding a word to be stemmed. The letters are in b[0], - b[1] ... ending at b[k]. k is readjusted downwards as the stemming - progresses. - - Note that only lower case sequences are stemmed. Forcing to lower case - should be done before stem(...) is called. - """ - self.b = "" # buffer for word to be stemmed self.k = 0 self.j = 0 # j is a general offset into the string def _cons(self, i): - """True <=> b[i] is a consonant.""" + """Check if b[i] is a consonant letter. + + Parameters + ---------- + i : int + Index for `b`.
+ + Returns + ------- + bool + + Examples + -------- + >>> from gensim.parsing.porter import PorterStemmer + >>> p = PorterStemmer() + >>> p.b = "hi" + >>> p._cons(1) + False + >>> p.b = "meow" + >>> p._cons(3) + True + + """ ch = self.b[i] if ch in "aeiou": return False @@ -60,7 +80,7 @@ def _cons(self, i): return True def _m(self): - """Returns the number of consonant sequences between 0 and j. + """Calculate the number of consonant sequences between 0 and j. If c is a consonant sequence and v a vowel sequence, and <..> indicates arbitrary presence, @@ -69,7 +89,21 @@ def _m(self): vc gives 1 vcvc gives 2 vcvcvc gives 3 - .... + + Returns + ------- + int + The number of consonant sequences between 0 and j. + + Examples + -------- + >>> from gensim.parsing.porter import PorterStemmer + >>> p = PorterStemmer() + >>> p.b = "aobm" + >>> p.j = 11 + >>> p._m() + 2 + """ i = 0 while True: @@ -98,39 +132,144 @@ def _m(self): i += 1 def _vowelinstem(self): - """True <=> 0,...j contains a vowel""" + """Check if b[0: j + 1] contains a vowel letter. + + Returns + ------- + bool + + Examples + -------- + >>> from gensim.parsing.porter import PorterStemmer + >>> p = PorterStemmer() + >>> p.b = "gnsm" + >>> p.j = 3 + >>> p._vowelinstem() + False + + >>> from gensim.parsing.porter import PorterStemmer + >>> p = PorterStemmer() + >>> p.b = "gensim" + >>> p.j = 5 + >>> p._vowelinstem() + True + + """ return not all(self._cons(i) for i in xrange(self.j + 1)) def _doublec(self, j): - """True <=> j,(j-1) contain a double consonant.""" + """Check if b[j - 1: j + 1] contain a double consonant letter. 
+ + Parameters + ---------- + j : int + Index for `b` + + Returns + ------- + bool + + Examples + -------- + >>> from gensim.parsing.porter import PorterStemmer + >>> p = PorterStemmer() + >>> p.b = "real" + >>> p.j = 3 + >>> p._doublec(3) + False + + >>> from gensim.parsing.porter import PorterStemmer + >>> p = PorterStemmer() + >>> p.b = "really" + >>> p.j = 5 + >>> p._doublec(4) + True + + """ return j > 0 and self.b[j] == self.b[j - 1] and self._cons(j) def _cvc(self, i): - """True <=> i-2,i-1,i has the form consonant - vowel - consonant - and also if the second c is not w,x or y. This is used when trying to - restore an e at the end of a short word, e.g. + """Check if b[i - 2: i + 1] makes the (consonant, vowel, consonant) pattern and also + if the second 'c' is not 'w', 'x' or 'y'. This is used when trying to restore an 'e' at the end of a short word, + e.g. cav(e), lov(e), hop(e), crim(e), but snow, box, tray. + + Parameters + ---------- + i : int + Index for `b` + + Returns + ------- + bool + + Examples + -------- + >>> from gensim.parsing.porter import PorterStemmer + >>> p = PorterStemmer() + >>> p.b = "lib" + >>> p.j = 2 + >>> p._cvc(2) + True + + >>> from gensim.parsing.porter import PorterStemmer + >>> p = PorterStemmer() + >>> p.b = "dll" + >>> p.j = 2 + >>> p._cvc(2) + False + + >>> from gensim.parsing.porter import PorterStemmer + >>> p = PorterStemmer() + >>> p.b = "wow" + >>> p.j = 2 + >>> p._cvc(2) + False - cav(e), lov(e), hop(e), crim(e), but - snow, box, tray. """ if i < 2 or not self._cons(i) or self._cons(i - 1) or not self._cons(i - 2): return False return self.b[i] not in "wxy" def _ends(self, s): - """True <=> 0,...k ends with the string s.""" + """Check if b[: k + 1] ends with `s`.
+ + Parameters + ---------- + s : str + + Returns + ------- + bool + + Examples + -------- + >>> from gensim.parsing.porter import PorterStemmer + >>> p = PorterStemmer() + >>> p.b = "cowboy" + >>> p.j = 5 + >>> p.k = 2 + >>> p._ends("cow") + True + + """ if s[-1] != self.b[self.k]: # tiny speed-up - return 0 + return False length = len(s) if length > (self.k + 1): - return 0 + return False if self.b[self.k - length + 1:self.k + 1] != s: - return 0 + return False self.j = self.k - length - return 1 + return True def _setto(self, s): - """Set (j+1),...k to the characters in the string s, adjusting k.""" + """Replace everything after b[`j`] with `s`, adjusting `k`. + + Parameters + ---------- + s : str + + """ self.b = self.b[:self.j + 1] + s self.k = len(self.b) - 1 @@ -139,7 +278,7 @@ def _r(self, s): self._setto(s) def _step1ab(self): - """Get rid of plurals and -ed or -ing. E.g., + """Get rid of plurals and -ed or -ing. caresses -> caress ponies -> poni @@ -158,6 +297,7 @@ def _step1ab(self): messing -> mess meetings -> meet + """ if self.b[self.k] == 's': if self._ends("sses"): @@ -184,7 +324,7 @@ def _step1ab(self): self._setto("e") def _step1c(self): - """Turn terminal y to i when there is another vowel in the stem.""" + """Turn terminal 'y' to 'i' when there is another vowel in the stem.""" if self._ends("y") and self._vowelinstem(): self.b = self.b[:self.k] + 'i' @@ -193,6 +333,7 @@ def _step2(self): So, -ization ( = -ize plus -ation) maps to -ize etc. Note that the string before the suffix must give _m() > 0. + """ ch = self.b[self.k - 1] if ch == 'a': @@ -272,7 +413,7 @@ def _step3(self): self._r("") def _step4(self): - """_step4() takes off -ant, -ence etc., in context vcvc.""" + """Take off -ant, -ence etc., in context vcvc.""" ch = self.b[self.k - 1] if ch == 'a': if not self._ends("al"): @@ -329,8 +470,7 @@ def _step4(self): self.k = self.j def _step5(self): - """Remove a final -e if _m() > 1, and change -ll to -l if m() > 1.
- """ + """Remove a final -e if _m() > 1, and change -ll to -l if m() > 1.""" k = self.j = self.k if self.b[k] == 'e': a = self._m() @@ -340,7 +480,25 @@ def _step5(self): self.k -= 1 def stem(self, w): - """Stem the word w, return the stemmed form.""" + """Stem the word `w`. + + Parameters + ---------- + w : str + + Returns + ------- + str + Stemmed version of `w`. + + Examples + -------- + >>> from gensim.parsing.porter import PorterStemmer + >>> p = PorterStemmer() + >>> p.stem("ponies") + 'poni' + + """ w = w.lower() k = len(w) - 1 if k <= 1: @@ -363,9 +521,49 @@ def stem(self, w): return self.b[:self.k + 1] def stem_sentence(self, txt): + """Stem the sentence `txt`. + + Parameters + ---------- + txt : str + Input sentence. + + Returns + ------- + str + Stemmed sentence. + + Examples + -------- + >>> from gensim.parsing.porter import PorterStemmer + >>> p = PorterStemmer() + >>> p.stem_sentence("Wow very nice woman with apple") + 'wow veri nice woman with appl' + + """ return " ".join(self.stem(x) for x in txt.split()) def stem_documents(self, docs): + """Stem documents. + + Parameters + ---------- + docs : list of str + Input documents + + Returns + ------- + list of str + Stemmed documents. + + Examples + -------- + >>> from gensim.parsing.porter import PorterStemmer + >>> p = PorterStemmer() + >>> p.stem_documents(["Have a very nice weekend", "Have a very nice weekend"]) + ['have a veri nice weekend', 'have a veri nice weekend'] + + """ return [self.stem_sentence(x) for x in docs] diff --git a/gensim/parsing/preprocessing.py b/gensim/parsing/preprocessing.py index 6d9fa59079..f0cf22a6e8 100644 --- a/gensim/parsing/preprocessing.py +++ b/gensim/parsing/preprocessing.py @@ -3,6 +3,36 @@ # # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +"""This module contains methods for parsing and preprocessing strings. 
Let's consider the most noticeable: + +* :func:`~gensim.parsing.preprocessing.remove_stopwords` - remove all stopwords from string +* :func:`~gensim.parsing.preprocessing.preprocess_string` - preprocess string (in default NLP meaning) + +Examples: +--------- +>>> from gensim.parsing.preprocessing import remove_stopwords, preprocess_string +>>> remove_stopwords("Better late than never, but better never late.") +u'Better late never, better late.' +>>> +>>> preprocess_string("Hel 9lo Wo9 rld! Th3 weather_is really g00d today, isn't it?") +[u'hel', u'rld', u'weather', u'todai', u'isn'] + + +Data: +----- + +.. data:: STOPWORDS - Set of stopwords from Stone, Denis, Kwantes (2010). +.. data:: RE_PUNCT - Regexp for matching punctuation characters. +.. data:: RE_TAGS - Regexp for matching tags. +.. data:: RE_NUMERIC - Regexp for matching numbers. +.. data:: RE_NONALPHA - Regexp for matching a non-word character. +.. data:: RE_AL_NUM - Regexp for matching a position between letters and digits. +.. data:: RE_NUM_AL - Regexp for matching a position between digits and letters. +.. data:: RE_WHITESPACE - Regexp for matching whitespace characters. +.. data:: DEFAULT_FILTERS - List of functions for string preprocessing.
+ +""" + import re import string import glob @@ -11,104 +41,262 @@ from gensim.parsing.porter import PorterStemmer -# improved list from Stone, Denis, Kwantes (2010) -STOPWORDS = """ -a about above across after afterwards again against all almost alone along already also although always am among amongst amoungst amount an and another any anyhow anyone anything anyway anywhere are around as at back be -became because become becomes becoming been before beforehand behind being below beside besides between beyond bill both bottom but by call can -cannot cant co computer con could couldnt cry de describe -detail did didn do does doesn doing don done down due during -each eg eight either eleven else elsewhere empty enough etc even ever every everyone everything everywhere except few fifteen -fify fill find fire first five for former formerly forty found four from front full further get give go -had has hasnt have he hence her here hereafter hereby herein hereupon hers herself him himself his how however hundred i ie -if in inc indeed interest into is it its itself keep last latter latterly least less ltd -just -kg km -made make many may me meanwhile might mill mine more moreover most mostly move much must my myself name namely -neither never nevertheless next nine no nobody none noone nor not nothing now nowhere of off -often on once one only onto or other others otherwise our ours ourselves out over own part per -perhaps please put rather re -quite -rather really regarding -same say see seem seemed seeming seems serious several she should show side since sincere six sixty so some somehow someone something sometime sometimes somewhere still such system take ten -than that the their them themselves then thence there thereafter thereby therefore therein thereupon these they thick thin third this those though three through throughout thru thus to together too top toward towards twelve twenty two un under -until up unless upon us used using -various very very via -was we 
well were what whatever when whence whenever where whereafter whereas whereby wherein whereupon wherever whether which while whither who whoever whole whom whose why will with within without would yet you -your yours yourself yourselves -""" -STOPWORDS = frozenset(w for w in STOPWORDS.split() if w) +STOPWORDS = frozenset([ + 'all', 'six', 'just', 'less', 'being', 'indeed', 'over', 'move', 'anyway', 'four', 'not', 'own', 'through', + 'using', 'fify', 'where', 'mill', 'only', 'find', 'before', 'one', 'whose', 'system', 'how', 'somewhere', + 'much', 'thick', 'show', 'had', 'enough', 'should', 'to', 'must', 'whom', 'seeming', 'yourselves', 'under', + 'ours', 'two', 'has', 'might', 'thereafter', 'latterly', 'do', 'them', 'his', 'around', 'than', 'get', 'very', + 'de', 'none', 'cannot', 'every', 'un', 'they', 'front', 'during', 'thus', 'now', 'him', 'nor', 'name', 'regarding', + 'several', 'hereafter', 'did', 'always', 'who', 'didn', 'whither', 'this', 'someone', 'either', 'each', 'become', + 'thereupon', 'sometime', 'side', 'towards', 'therein', 'twelve', 'because', 'often', 'ten', 'our', 'doing', 'km', + 'eg', 'some', 'back', 'used', 'up', 'go', 'namely', 'computer', 'are', 'further', 'beyond', 'ourselves', 'yet', + 'out', 'even', 'will', 'what', 'still', 'for', 'bottom', 'mine', 'since', 'please', 'forty', 'per', 'its', + 'everything', 'behind', 'does', 'various', 'above', 'between', 'it', 'neither', 'seemed', 'ever', 'across', 'she', + 'somehow', 'be', 'we', 'full', 'never', 'sixty', 'however', 'here', 'otherwise', 'were', 'whereupon', 'nowhere', + 'although', 'found', 'alone', 're', 'along', 'quite', 'fifteen', 'by', 'both', 'about', 'last', 'would', + 'anything', 'via', 'many', 'could', 'thence', 'put', 'against', 'keep', 'etc', 'amount', 'became', 'ltd', 'hence', + 'onto', 'or', 'con', 'among', 'already', 'co', 'afterwards', 'formerly', 'within', 'seems', 'into', 'others', + 'while', 'whatever', 'except', 'down', 'hers', 'everyone', 'done', 'least', 'another', 
'whoever', 'moreover', + 'couldnt', 'throughout', 'anyhow', 'yourself', 'three', 'from', 'her', 'few', 'together', 'top', 'there', 'due', + 'been', 'next', 'anyone', 'eleven', 'cry', 'call', 'therefore', 'interest', 'then', 'thru', 'themselves', + 'hundred', 'really', 'sincere', 'empty', 'more', 'himself', 'elsewhere', 'mostly', 'on', 'fire', 'am', 'becoming', + 'hereby', 'amongst', 'else', 'part', 'everywhere', 'too', 'kg', 'herself', 'former', 'those', 'he', 'me', 'myself', + 'made', 'twenty', 'these', 'was', 'bill', 'cant', 'us', 'until', 'besides', 'nevertheless', 'below', 'anywhere', + 'nine', 'can', 'whether', 'of', 'your', 'toward', 'my', 'say', 'something', 'and', 'whereafter', 'whenever', + 'give', 'almost', 'wherever', 'is', 'describe', 'beforehand', 'herein', 'doesn', 'an', 'as', 'itself', 'at', + 'have', 'in', 'seem', 'whence', 'ie', 'any', 'fill', 'again', 'hasnt', 'inc', 'thereby', 'thin', 'no', 'perhaps', + 'latter', 'meanwhile', 'when', 'detail', 'same', 'wherein', 'beside', 'also', 'that', 'other', 'take', 'which', + 'becomes', 'you', 'if', 'nobody', 'unless', 'whereas', 'see', 'though', 'may', 'after', 'upon', 'most', 'hereupon', + 'eight', 'but', 'serious', 'nothing', 'such', 'why', 'off', 'a', 'don', 'whereby', 'third', 'i', 'whole', 'noone', + 'sometimes', 'well', 'amoungst', 'yours', 'their', 'rather', 'without', 'so', 'five', 'the', 'first', 'with', + 'make', 'once' +]) + + +RE_PUNCT = re.compile(r'([%s])+' % re.escape(string.punctuation), re.UNICODE) +RE_TAGS = re.compile(r"<([^>]+)>", re.UNICODE) +RE_NUMERIC = re.compile(r"[0-9]+", re.UNICODE) +RE_NONALPHA = re.compile(r"\W", re.UNICODE) +RE_AL_NUM = re.compile(r"([a-z]+)([0-9]+)", flags=re.UNICODE) +RE_NUM_AL = re.compile(r"([0-9]+)([a-z]+)", flags=re.UNICODE) +RE_WHITESPACE = re.compile(r"(\s)+", re.UNICODE) def remove_stopwords(s): + """Remove :const:`~gensim.parsing.preprocessing.STOPWORDS` from `s`. 
+ + Parameters + ---------- + s : str + + Returns + ------- + str + Unicode string without :const:`~gensim.parsing.preprocessing.STOPWORDS`. + + Examples + -------- + >>> from gensim.parsing.preprocessing import remove_stopwords + >>> remove_stopwords("Better late than never, but better never late.") + u'Better late never, better late.' + + """ s = utils.to_unicode(s) return " ".join(w for w in s.split() if w not in STOPWORDS) -RE_PUNCT = re.compile(r'([%s])+' % re.escape(string.punctuation), re.UNICODE) +def strip_punctuation(s): + """Replace punctuation characters with spaces in `s` using :const:`~gensim.parsing.preprocessing.RE_PUNCT`. + Parameters + ---------- + s : str -def strip_punctuation(s): + Returns + ------- + str + Unicode string without punctuation characters. + + Examples + -------- + >>> from gensim.parsing.preprocessing import strip_punctuation + >>> strip_punctuation("A semicolon is a stronger break than a comma, but not as much as a full stop!") + u'A semicolon is a stronger break than a comma but not as much as a full stop ' + + """ s = utils.to_unicode(s) return RE_PUNCT.sub(" ", s) -# unicode.translate cannot delete characters like str can strip_punctuation2 = strip_punctuation -# def strip_punctuation2(s): -# s = utils.to_unicode(s) -# return s.translate(None, string.punctuation) -RE_TAGS = re.compile(r"<([^>]+)>", re.UNICODE) +def strip_tags(s): + """Remove tags from `s` using :const:`~gensim.parsing.preprocessing.RE_TAGS`. + Parameters + ---------- + s : str -def strip_tags(s): + Returns + ------- + str + Unicode string without tags. + + Examples + -------- + >>> from gensim.parsing.preprocessing import strip_tags + >>> strip_tags("Hello World!") + u'Hello World!' + + """ s = utils.to_unicode(s) return RE_TAGS.sub("", s) def strip_short(s, minsize=3): + """Remove words with length lesser than `minsize` from `s`. + + Parameters + ---------- + s : str + minsize : int, optional + + Returns + ------- + str + Unicode string without short words. 
+ + Examples + -------- + >>> from gensim.parsing.preprocessing import strip_short + >>> strip_short("salut les amis du 59") + u'salut les amis' + >>> + >>> strip_short("one two three four five six seven eight nine ten", minsize=5) + u'three seven eight' + + """ s = utils.to_unicode(s) return " ".join(e for e in s.split() if len(e) >= minsize) -RE_NUMERIC = re.compile(r"[0-9]+", re.UNICODE) +def strip_numeric(s): + """Remove digits from `s` using :const:`~gensim.parsing.preprocessing.RE_NUMERIC`. + Parameters + ---------- + s : str -def strip_numeric(s): + Returns + ------- + str + Unicode string without digits. + + Examples + -------- + >>> from gensim.parsing.preprocessing import strip_numeric + >>> strip_numeric("0text24gensim365test") + u'textgensimtest' + + """ s = utils.to_unicode(s) return RE_NUMERIC.sub("", s) -RE_NONALPHA = re.compile(r"\W", re.UNICODE) +def strip_non_alphanum(s): + """Replace non-word characters with spaces in `s` using :const:`~gensim.parsing.preprocessing.RE_NONALPHA`. + Parameters + ---------- + s : str -def strip_non_alphanum(s): + Returns + ------- + str + Unicode string with word characters only. + + Notes + ----- + Word characters - alphanumeric & underscore. + + Examples + -------- + >>> from gensim.parsing.preprocessing import strip_non_alphanum + >>> strip_non_alphanum("if-you#can%read$this&then@this#method^works") + u'if you can read this then this method works' + + """ s = utils.to_unicode(s) return RE_NONALPHA.sub(" ", s) -RE_WHITESPACE = re.compile(r"(\s)+", re.UNICODE) +def strip_multiple_whitespaces(s): + r"""Remove repeating whitespace characters (spaces, tabs, line breaks) from `s` + and turn tabs & line breaks into spaces using :const:`~gensim.parsing.preprocessing.RE_WHITESPACE`. + Parameters + ---------- + s : str -def strip_multiple_whitespaces(s): + Returns + ------- + str + Unicode string without consecutive whitespace characters.
+ + Examples + -------- + >>> from gensim.parsing.preprocessing import strip_multiple_whitespaces + >>> strip_multiple_whitespaces("salut" + '\r' + " les" + '\n' + " loulous!") + u'salut les loulous!' + + """ s = utils.to_unicode(s) return RE_WHITESPACE.sub(" ", s) -RE_AL_NUM = re.compile(r"([a-z]+)([0-9]+)", flags=re.UNICODE) -RE_NUM_AL = re.compile(r"([0-9]+)([a-z]+)", flags=re.UNICODE) +def split_alphanum(s): + """Add spaces between digits & letters in `s` using :const:`~gensim.parsing.preprocessing.RE_AL_NUM` and :const:`~gensim.parsing.preprocessing.RE_NUM_AL`. + Parameters + ---------- + s : str -def split_alphanum(s): + Returns + ------- + str + Unicode string with spaces between digits & letters. + + Examples + -------- + >>> from gensim.parsing.preprocessing import split_alphanum + >>> split_alphanum("24.0hours7 days365 a1b2c3") + u'24.0 hours 7 days 365 a 1 b 2 c 3' + + """ s = utils.to_unicode(s) s = RE_AL_NUM.sub(r"\1 \2", s) return RE_NUM_AL.sub(r"\1 \2", s) def stem_text(text): - """ - Return lowercase and (porter-)stemmed version of string `text`. + """Transform `text` into lowercase and stem it. + + Parameters + ---------- + text : str + + Returns + ------- + str + Unicode lowercased and porter-stemmed version of string `text`. + + Examples + -------- + >>> from gensim.parsing.preprocessing import stem_text + >>> stem_text("While it is quite useful to be able to search a large collection of documents almost instantly.") + u'while it is quit us to be abl to search a larg collect of document almost instantly.' + """ text = utils.to_unicode(text) p = PorterStemmer() @@ -117,6 +305,7 @@ def stem_text(text): stem = stem_text + DEFAULT_FILTERS = [ lambda x: x.lower(), strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, @@ -125,6 +314,40 @@ def stem_text(text): def preprocess_string(s, filters=DEFAULT_FILTERS): + """Apply list of chosen filters to `s`.
+ + Default list of filters: + + * :func:`~gensim.parsing.preprocessing.strip_tags`, + * :func:`~gensim.parsing.preprocessing.strip_punctuation`, + * :func:`~gensim.parsing.preprocessing.strip_multiple_whitespaces`, + * :func:`~gensim.parsing.preprocessing.strip_numeric`, + * :func:`~gensim.parsing.preprocessing.remove_stopwords`, + * :func:`~gensim.parsing.preprocessing.strip_short`, + * :func:`~gensim.parsing.preprocessing.stem_text`. + + Parameters + ---------- + s : str + filters : list of functions, optional + + Returns + ------- + list of str + Processed strings (cleaned). + + Examples + -------- + >>> from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation + >>> preprocess_string("Hel 9lo Wo9 rld! Th3 weather_is really g00d today, isn't it?") + [u'hel', u'rld', u'weather', u'todai', u'isn'] + >>> + >>> s = "Hel 9lo Wo9 rld! Th3 weather_is really g00d today, isn't it?" + >>> CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation] + >>> preprocess_string(s, CUSTOM_FILTERS) + [u'hel', u'9lo', u'wo9', u'rld', u'th3', u'weather', u'is', u'really', u'g00d', u'today', u'isn', u't', u'it'] + + """ s = utils.to_unicode(s) for f in filters: s = f(s) @@ -132,6 +355,24 @@ def preprocess_string(s, filters=DEFAULT_FILTERS): def preprocess_documents(docs): + """Apply :const:`~gensim.parsing.preprocessing.DEFAULT_FILTERS` to the documents strings. + + Parameters + ---------- + docs : list of str + + Returns + ------- + list of (list of str) + Processed documents split by whitespace. + + Examples + -------- + >>> from gensim.parsing.preprocessing import preprocess_documents + >>> preprocess_documents(["Hel 9lo Wo9 rld!", "Th3 weather_is really g00d today, isn't it?"]) + [[u'hel', u'rld'], [u'weather', u'todai', u'isn']] + + """ return [preprocess_string(d) for d in docs]