diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 07cb70630c..de04b36c62 100755 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -41,23 +41,23 @@ TOKEN_MAX_LEN = 15 -RE_P0 = re.compile('', re.DOTALL | re.UNICODE) # comments -RE_P1 = re.compile(' ].*?)(|/>)', re.DOTALL | re.UNICODE) # footnotes -RE_P2 = re.compile("(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$", re.UNICODE) # links to languages -RE_P3 = re.compile("{{([^}{]*)}}", re.DOTALL | re.UNICODE) # template -RE_P4 = re.compile("{{([^}]*)}}", re.DOTALL | re.UNICODE) # template -RE_P5 = re.compile('\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE) # remove URL, keep description -RE_P6 = re.compile("\[([^][]*)\|([^][]*)\]", re.DOTALL | re.UNICODE) # simplify links, keep description -RE_P7 = re.compile('\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of images -RE_P8 = re.compile('\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of files -RE_P9 = re.compile(' ].*?)(|/>)', re.DOTALL | re.UNICODE) # outside links -RE_P10 = re.compile(' ].*?)(|/>)', re.DOTALL | re.UNICODE) # math content -RE_P11 = re.compile('<(.*?)>', re.DOTALL | re.UNICODE) # all other tags -RE_P12 = re.compile('\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE) # table formatting -RE_P13 = re.compile('\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE) # table cell formatting -RE_P14 = re.compile('\[\[Category:[^][]*\]\]', re.UNICODE) # categories +RE_P0 = re.compile(r'', re.DOTALL | re.UNICODE) # comments +RE_P1 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE) # footnotes +RE_P2 = re.compile(r'(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$', re.UNICODE) # links to languages +RE_P3 = re.compile(r'{{([^}{]*)}}', re.DOTALL | re.UNICODE) # template +RE_P4 = re.compile(r'{{([^}]*)}}', re.DOTALL | re.UNICODE) # template +RE_P5 = re.compile(r'\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE) # remove URL, keep description +RE_P6 = re.compile(r'\[([^][]*)\|([^][]*)\]', re.DOTALL | re.UNICODE) # simplify links, keep description +RE_P7 = re.compile(r'\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of images +RE_P8 = re.compile(r'\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of files +RE_P9 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE) # outside links +RE_P10 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE) # math content +RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE) # all other tags +RE_P12 = re.compile(r'\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE) # table formatting +RE_P13 = re.compile(r'\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE) # table cell formatting +RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE) # categories # Remove File and Image template -RE_P15 = re.compile('\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE) +RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE) # MediaWiki namespaces (https://www.mediawiki.org/wiki/Manual:Namespace) that # ought to be ignored @@ -81,7 +81,7 @@ def filter_wiki(raw): def remove_markup(text): - text = re.sub(RE_P2, "", text) # remove the last list (=languages) + text = re.sub(RE_P2, '', text) # remove the last list (=languages) # the wiki markup is recursive (markup inside markup etc) # instead of writing a recursive grammar, here we deal with that by removing # markup in a loop, starting with inner-most expressions and working outwards, @@ -91,11 +91,11 @@ def remove_markup(text): iters = 0 while True: old, iters = text, iters + 1 - text = re.sub(RE_P0, "", text) # remove comments + text = re.sub(RE_P0, '', text) # remove comments text = re.sub(RE_P1, '', text) # remove footnotes - text = re.sub(RE_P9, "", text) # remove outside links - text = re.sub(RE_P10, "", text) # remove math content - text = re.sub(RE_P11, "", text) # remove all remaining tags + text = re.sub(RE_P9, '', text) # remove outside links + text = re.sub(RE_P10, '', text) # remove math content + text = re.sub(RE_P11, '', text) # remove all remaining tags text = re.sub(RE_P14, '', text) # remove categories text = re.sub(RE_P5, '\\3', text) # remove urls, keep description text = re.sub(RE_P6, '\\2', text) # simplify links, keep description only diff --git a/gensim/examples/dmlcz/sources.py b/gensim/examples/dmlcz/sources.py index 4193da0820..8124101acd 100644 --- a/gensim/examples/dmlcz/sources.py +++ b/gensim/examples/dmlcz/sources.py @@ -28,7 +28,7 @@ if sys.version_info[0] >= 3: unicode = str -PAT_TAG = re.compile('<(.*?)>(.*)') +PAT_TAG = re.compile(r'<(.*?)>(.*)') logger = logging.getLogger('gensim.corpora.sources') diff --git a/gensim/parsing/preprocessing.py b/gensim/parsing/preprocessing.py index ab25361f60..6d9fa59079 100644 --- a/gensim/parsing/preprocessing.py +++ b/gensim/parsing/preprocessing.py @@ -44,7 +44,7 @@ def remove_stopwords(s): return " ".join(w for w in s.split() if w not in STOPWORDS) -RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE) +RE_PUNCT = re.compile(r'([%s])+' % re.escape(string.punctuation), re.UNICODE) def strip_punctuation(s): diff --git a/gensim/summarization/textcleaner.py b/gensim/summarization/textcleaner.py index fa6a56b887..5f33bbcea9 100644 --- a/gensim/summarization/textcleaner.py +++ b/gensim/summarization/textcleaner.py @@ -21,13 +21,13 @@ HAS_PATTERN = False -SEPARATOR = r"@" -RE_SENTENCE = re.compile('(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE) # backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$) -AB_SENIOR = re.compile("([A-Z][a-z]{1,2}\.)\s(\w)", re.UNICODE) -AB_ACRONYM = re.compile("(\.[a-zA-Z]\.)\s(\w)", re.UNICODE) -AB_ACRONYM_LETTERS = re.compile("([a-zA-Z])\.([a-zA-Z])\.", re.UNICODE) -UNDO_AB_SENIOR = re.compile("([A-Z][a-z]{1,2}\.)" + SEPARATOR + "(\w)", re.UNICODE) -UNDO_AB_ACRONYM = re.compile("(\.[a-zA-Z]\.)" + SEPARATOR + "(\w)", re.UNICODE) +SEPARATOR = r'@' +RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE) # backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$) +AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE) +AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)\s(\w)', re.UNICODE) +AB_ACRONYM_LETTERS = re.compile(r'([a-zA-Z])\.([a-zA-Z])\.', re.UNICODE) +UNDO_AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)' + SEPARATOR + r'(\w)', re.UNICODE) +UNDO_AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)' + SEPARATOR + r'(\w)', re.UNICODE) def split_sentences(text): diff --git a/gensim/utils.py b/gensim/utils.py index fc3ca51906..74e623f0b3 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -79,7 +79,7 @@ def smart_open(fname, mode='rb'): return open(fname, mode) -PAT_ALPHABETIC = re.compile('(((?![\d])\w)+)', re.UNICODE) +PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE) RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE) @@ -1039,7 +1039,7 @@ def has_pattern(): return False -def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False, +def lemmatize(content, allowed_tags=re.compile(r'(NN|VB|JJ|RB)'), light=False, stopwords=frozenset(), min_length=2, max_length=15): """ This function is only available when the optional 'pattern' package is installed.