diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index 6bd96da716..b0e5094ac0 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -5,9 +5,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Blei's LDA-C format. -""" +"""Corpus in Blei's LDA-C format.""" from __future__ import with_statement @@ -19,30 +17,44 @@ from six.moves import xrange -logger = logging.getLogger('gensim.corpora.bleicorpus') +logger = logging.getLogger(__name__) class BleiCorpus(IndexedCorpus): - """ - Corpus in Blei's LDA-C format. + """Corpus in Blei's LDA-C format. The corpus is represented as two files: one describing the documents, and another describing the mapping between words and their ids. Each document is one line:: - N fieldId1:fieldValue1 fieldId2:fieldValue2 ... fieldIdN:fieldValueN + N fieldId1:fieldValue1 fieldId2:fieldValue2 ... fieldIdN:fieldValueN + + + The vocabulary is a file with words, one word per line; word at line K has an implicit `id=K`. - The vocabulary is a file with words, one word per line; word at line K has an - implicit ``id=K``. """ def __init__(self, fname, fname_vocab=None): """ - Initialize the corpus from a file. - `fname_vocab` is the file with vocabulary; if not specified, it defaults to - `fname.vocab`. + Parameters + ---------- + fname : str + Path to corpus. + fname_vocab : str, optional + Vocabulary file. If `fname_vocab` is None, the following locations are tried, in order: + + * `fname`.vocab + * `fname`/vocab.txt + * `fname_without_ext`.vocab + * `fname_folder`/vocab.txt + + Raises + ------ + IOError + If the vocabulary file doesn't exist. + """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s", fname) @@ -67,8 +79,13 @@ def __init__(self, fname, fname_vocab=None): self.id2word = dict(enumerate(words)) def __iter__(self): - """ - Iterate over the corpus, returning one sparse vector at a time. + """Iterate over the corpus, returning one sparse (BoW) vector at a time. + + Yields + ------ + list of (int, float) + Document's BoW representation. + """ lineno = -1 with utils.smart_open(self.fname) as fin: @@ -77,6 +94,19 @@ def __iter__(self): self.length = lineno + 1 def line2doc(self, line): + """Convert a line in Blei's LDA-C format to a document (BoW representation). + + Parameters + ---------- + line : str + Line in Blei's LDA-C format. + + Returns + ------- + list of (int, float) + Document's BoW representation. + + """ parts = utils.to_unicode(line).split() if int(parts[0]) != len(parts) - 1: raise ValueError("invalid format in %s: %s" % (self.fname, repr(line))) @@ -86,14 +116,28 @@ def line2doc(self, line): @staticmethod def save_corpus(fname, corpus, id2word=None, metadata=False): - """ - Save a corpus in the LDA-C format. - - There are actually two files saved: `fname` and `fname.vocab`, where - `fname.vocab` is the vocabulary file. + """Save a corpus in the LDA-C format. + + Notes + ----- + There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file. + + Parameters + ---------- + fname : str + Path to output file. + corpus : iterable of iterable of (int, float) + Input corpus in BoW format. + id2word : dict of (int, str), optional + Mapping id -> word for `corpus`. + metadata : bool, optional + This parameter is ignored. + + Returns + ------- + list of int + Offsets for each line in file (in bytes). - This function is automatically called by `BleiCorpus.serialize`; don't - call it directly, call `serialize` instead.
""" if id2word is None: logger.info("no word id mapping provided; initializing from corpus") @@ -121,8 +165,19 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): return offsets def docbyoffset(self, offset): - """ - Return the document stored at file position `offset`. + """Get document corresponding to `offset`. + Offset can be given from :meth:`~gensim.corpora.bleicorpus.BleiCorpus.save_corpus`. + + Parameters + ---------- + offset : int + Position of the document in the file (in bytes). + + Returns + ------- + list of (int, float) + Document in BoW format. + """ with utils.smart_open(self.fname) as f: f.seek(offset) diff --git a/gensim/corpora/csvcorpus.py b/gensim/corpora/csvcorpus.py index 969437e571..16a88a93e9 100644 --- a/gensim/corpora/csvcorpus.py +++ b/gensim/corpora/csvcorpus.py @@ -4,10 +4,7 @@ # Copyright (C) 2013 Zygmunt Zając # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Corpus in CSV format. - -""" +"""Corpus in CSV format.""" from __future__ import with_statement @@ -18,22 +15,28 @@ from gensim import interfaces, utils -logger = logging.getLogger('gensim.corpora.csvcorpus') +logger = logging.getLogger(__name__) class CsvCorpus(interfaces.CorpusABC): - """ - Corpus in CSV format. The CSV delimiter, headers etc. are guessed automatically - based on the file content. + """Corpus in CSV format. + Notes + ----- + The CSV delimiter, headers etc. are guessed automatically based on the file content. All row values are expected to be ints/floats. """ def __init__(self, fname, labels): """ - Initialize the corpus from a file. - `labels` = are class labels present in the input file? => skip the first column + + Parameters + ---------- + fname : str + Path to corpus. + labels : bool + If True - ignore first column (class labels). """ logger.info("loading corpus from %s", fname) @@ -48,8 +51,12 @@ def __init__(self, fname, labels): logger.info("sniffed CSV delimiter=%r, headers=%s", self.dialect.delimiter, self.headers) def __iter__(self): - """ - Iterate over the corpus, returning one sparse vector at a time. + """Iterate over the corpus, returning one BoW vector at a time. + + Yields + ------ + list of (int, float) + Document in BoW format. """ reader = csv.reader(utils.smart_open(self.fname), self.dialect) diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py index 559081b886..c4e58cb95a 100644 --- a/gensim/corpora/indexedcorpus.py +++ b/gensim/corpora/indexedcorpus.py @@ -5,17 +5,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Indexed corpus is a mechanism for random-accessing corpora. - -While the standard corpus interface in gensim allows iterating over corpus with -`for doc in corpus: pass`, indexed corpus allows accessing the documents with -`corpus[docno]` (in O(1) look-up time). - -This functionality is achieved by storing an extra file (by default named the same -as the corpus file plus '.index' suffix) that stores the byte offset of the beginning -of each document. -""" +"""Base Indexed Corpus class.""" import logging import six @@ -24,24 +14,44 @@ from gensim import interfaces, utils -logger = logging.getLogger('gensim.corpora.indexedcorpus') +logger = logging.getLogger(__name__) class IndexedCorpus(interfaces.CorpusABC): + """Indexed corpus is a mechanism for random-accessing corpora. + + While the standard corpus interface in gensim allows iterating over corpus, + we'll show it with :class:`~gensim.corpora.mmcorpus.MmCorpus`. 
 + + >>> from gensim.corpora import MmCorpus + >>> from gensim.test.utils import datapath + >>> + >>> corpus = MmCorpus(datapath('testcorpus.mm')) + >>> for doc in corpus: + ... pass + + :class:`~gensim.corpora.indexedcorpus.IndexedCorpus` additionally allows accessing documents by index + in :math:`O(1)` look-up time. + + >>> document_index = 3 + >>> doc = corpus[document_index] + + Notes + ----- + This functionality is achieved by storing an extra file (by default named `fname.index`) + that stores the byte offset of the beginning of each document. + + """ + def __init__(self, fname, index_fname=None): """ - Initialize this abstract base class, by loading a previously saved index - from `index_fname` (or `fname.index` if `index_fname` is not set). - This index will allow subclasses to support the `corpus[docno]` syntax - (random access to document #`docno` in O(1)). - - >>> # save corpus in SvmLightCorpus format with an index - >>> corpus = [[(1, 0.5)], [(0, 1.0), (1, 2.0)]] - >>> gensim.corpora.SvmLightCorpus.serialize('testfile.svmlight', corpus) - >>> # load back as a document stream (*not* plain Python list) - >>> corpus_with_random_access = gensim.corpora.SvmLightCorpus('tstfile.svmlight') - >>> print(corpus_with_random_access[1]) - [(0, 1.0), (1, 2.0)] + + Parameters + ---------- + fname : str + Path to corpus. + index_fname : str, optional + Path to the index file; if not provided, `fname.index` is used. """ try: @@ -58,25 +68,38 @@ def __init__(self, fname, index_fname=None): @classmethod def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False): - """ - Iterate through the document stream `corpus`, saving the documents to `fname` - and recording byte offset of each document. Save the resulting index - structure to file `index_fname` (or `fname`.index is not set). - - This relies on the underlying corpus class `serializer` providing (in - addition to standard iteration): + """Serialize corpus with offset metadata, allowing documents to be accessed by index after loading. + + Parameters + ---------- + fname : str + Path to output file. + corpus : iterable of iterable of (int, float) + Corpus in BoW format. + id2word : dict of (int, str), optional + Mapping id -> word. + index_fname : str, optional + Where to save the resulting index; if None, the index is stored to `fname`.index. + progress_cnt : int, optional + Number of documents after which progress info is printed. + labels : iterable, optional + Class labels, passed through to `save_corpus` for corpus formats that support them (e.g. SVMlight). + metadata : bool, optional + If True, write out document metadata (e.g. article titles) to an accompanying pickle file. + + Examples + -------- + >>> from gensim.corpora import MmCorpus + >>> from gensim.test.utils import get_tmpfile + >>> + >>> corpus = [[(1, 0.3), (2, 0.1)], [(1, 0.1)], [(2, 0.3)]] + >>> output_fname = get_tmpfile("test.mm") + >>> + >>> MmCorpus.serialize(output_fname, corpus) + >>> mm = MmCorpus(output_fname) # `mm` document stream now has random access + >>> print(mm[1]) # retrieve document no. 1 by its index + [(1, 0.1)] - * `save_corpus` method that returns a sequence of byte offsets, one for - each saved document, - * the `docbyoffset(offset)` method, which returns a document - positioned at `offset` bytes within the persistent storage (file). - * metadata if set to true will ensure that serialize will write out article titles to a pickle file.
- - Example: - - >>> MmCorpus.serialize('test.mm', corpus) - >>> mm = MmCorpus('test.mm') # `mm` document stream now has random access - >>> print(mm[42]) # retrieve document no. 42, etc. """ if getattr(corpus, 'fname', None) == fname: raise ValueError("identical input vs. output corpus filename, refusing to serialize: %s" % fname) @@ -107,9 +130,17 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, utils.pickle(offsets, index_fname) def __len__(self): - """ - Return the index length if the corpus is indexed. Otherwise, make a pass - over self to calculate the corpus length and cache this number. + """Get the index length. + + Notes + ----- + If the corpus is not indexed, make a pass over it to count the documents and cache this value. + + Returns + ------- + int + Length of the index (number of documents). + """ if self.index is not None: return len(self.index) @@ -119,11 +150,34 @@ def __len__(self): return self.length def __getitem__(self, docno): + """Get document by `docno` index. + + Parameters + ---------- + docno : {int, iterable of int} + Document number, or a slice/list/numpy array of document numbers. + + Returns + ------- + list of (int, float) + If `docno` is an int, the document in BoW format. + + :class:`~gensim.utils.SlicedCorpus` + If `docno` is an iterable of int, several documents in BoW format + wrapped into a :class:`~gensim.utils.SlicedCorpus`. + + Raises + ------ + RuntimeError + If the index doesn't exist. + + """ if self.index is None: raise RuntimeError("Cannot call corpus[docid] without an index") if isinstance(docno, (slice, list, numpy.ndarray)): return utils.SlicedCorpus(self, docno) elif isinstance(docno, six.integer_types + (numpy.integer,)): return self.docbyoffset(self.index[docno]) + # TODO: no `docbyoffset` method, should be defined in this class else: raise ValueError('Unrecognised value for docno, use either a single integer, a slice or a numpy.ndarray') diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py index c19aa321e2..459274cfae 100644 --- a/gensim/corpora/svmlightcorpus.py +++ b/gensim/corpora/svmlightcorpus.py @@ -5,9 +5,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Corpus in SVMlight format. -""" +"""Corpus in SVMlight format.""" from __future__ import with_statement @@ -18,17 +16,16 @@ from gensim.corpora import IndexedCorpus -logger = logging.getLogger('gensim.corpora.svmlightcorpus') +logger = logging.getLogger(__name__) class SvmLightCorpus(IndexedCorpus): - """ - Corpus in SVMlight format. + """Corpus in SVMlight format. Quoting http://svmlight.joachims.org/: - The input file contains the training examples. The first lines - may contain comments and are ignored if they start with #. Each of the following - lines represents one training example and is of the following format:: + The input file contains the training examples. The first lines may contain comments and are ignored + if they start with #. Each of the following lines represents one training example + and is of the following format:: <line> .=. <target> <feature>:<value> <feature>:<value> ... <feature>:<value> # <info> <target> .=. +1 | -1 | 0 | <float> <feature> .=. <integer> | "qid" <value> .=. <float> <info> .=. <string> The "qid" feature (used for SVMlight ranking), if present, is ignored. - Although not mentioned in the specification above, SVMlight also expect its - feature ids to be 1-based (counting starts at 1). We convert features to 0-base - internally by decrementing all ids when loading a SVMlight input file, and - increment them again when saving as SVMlight.
+ Notes + ----- + Although not mentioned in the specification above, SVMlight also expects its feature ids to be 1-based + (counting starts at 1). We convert features to 0-base internally by decrementing all ids when loading a SVMlight + input file, and increment them again when saving as SVMlight. """ def __init__(self, fname, store_labels=True): """ - Initialize the corpus from a file. - Although vector labels (~SVM target class) are not used in gensim in any way, - they are parsed and stored in `self.labels` for convenience. Set `store_labels=False` - to skip storing these labels (e.g. if there are too many vectors to store - the self.labels array in memory). + Parameters + ---------- + fname : str + Path to corpus. + store_labels : bool, optional + Whether to store labels (~SVM target class). They are not used in gensim in any way, + but are stored in `self.labels` for convenience by default. """ IndexedCorpus.__init__(self, fname) @@ -64,8 +64,13 @@ def __init__(self, fname, store_labels=True): self.labels = [] def __iter__(self): - """ - Iterate over the corpus, returning one sparse vector at a time. + """Iterate over the corpus, returning one sparse (BoW) vector at a time. + + Yields + ------ + list of (int, float) + Document in BoW format. + """ lineno = -1 self.labels = [] @@ -80,14 +85,29 @@ def __iter__(self): @staticmethod def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): - """ - Save a corpus in the SVMlight format. + """Save a corpus in the SVMlight format. + + The SVMlight `<target>` class tag is taken from the `labels` array, or set to 0 for all documents + if `labels` is not supplied. + + Parameters + ---------- + fname : str + Path to output file. + corpus : iterable of iterable of (int, float) + Corpus in BoW format. + id2word : dict of (int, str), optional + Mapping id -> word. + labels : list or False + SVMlight `<target>` class tags, or False if not present. + metadata : bool + This argument is ignored. + + Returns + ------- + list of int + Offsets for each line in file (in bytes). - The SVMlight `<target>` class tag is taken from the `labels` array, or set - to 0 for all documents if `labels` is not supplied. - - This function is automatically called by `SvmLightCorpus.serialize`; don't - call it directly, call `serialize` instead. """ logger.info("converting corpus to SVMlight format: %s", fname) @@ -100,16 +120,37 @@ def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): return offsets def docbyoffset(self, offset): - """ - Return the document stored at file position `offset`. + """Get the document stored at file position `offset`. + + Parameters + ---------- + offset : int + Document's position. + + Returns + ------- + list of (int, float) + Document in BoW format. + """ with utils.smart_open(self.fname) as f: f.seek(offset) return self.line2doc(f.readline())[0] + # TODO: it breaks if line2doc returns None def line2doc(self, line): - """ - Create a document from a single line (string) in SVMlight format + """Get a document from a single line in SVMlight format. + This method is the inverse of :meth:`~gensim.corpora.svmlightcorpus.SvmLightCorpus.doc2line`. + + Parameters + ---------- + line : str + Line in SVMlight format. + + Returns + ------- + (list of (int, float), str) + Document in BoW format and target class label. + """ line = utils.to_unicode(line) line = line[: line.find('#')].strip() @@ -125,8 +166,21 @@ def line2doc(self, line): @staticmethod def doc2line(doc, label=0): - """ - Output the document in SVMlight format, as a string. Inverse function to `line2doc`.
+ """Convert BoW representation of document in SVMlight format. + This method inverse of :meth:`~gensim.corpora.svmlightcorpus.SvmLightCorpus.line2doc`. + + Parameters + ---------- + doc : list of (int, float) + Document in BoW format. + label : int, optional + Document label (if provided). + + Returns + ------- + str + `doc` in SVMlight format. + """ pairs = ' '.join("%i:%s" % (termid + 1, termval) for termid, termval in doc) # +1 to convert 0-base to 1-base return "%s %s\n" % (label, pairs) diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py old mode 100755 new mode 100644 index 7148b90884..cd57f20109 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -7,15 +7,19 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Construct a corpus from a Wikipedia (or other MediaWiki-based) database dump. +"""Construct a corpus from a Wikipedia (or other MediaWiki-based) database dump. + +Notes +----- +If you have the `pattern` package installed, this module will use a fancy lemmatization to get a lemma +of each token (instead of plain alphabetic tokenizer). The package is available at [1]_ . -If you have the `pattern` package installed, this module will use a fancy -lemmatization to get a lemma of each token (instead of plain alphabetic -tokenizer). The package is available at https://github.com/clips/pattern . +See :mod:`~gensim.scripts.make_wiki` for a canned (example) script based on this module. + +References +---------- +.. [1] https://github.com/clips/pattern -See scripts/process_wiki.py for a canned (example) script based on this -module. """ @@ -34,45 +38,74 @@ logger = logging.getLogger(__name__) -# ignore articles shorter than ARTICLE_MIN_WORDS characters (after full preprocessing) ARTICLE_MIN_WORDS = 50 +"""Ignore shorter articles (after full preprocessing).""" # default thresholds for lengths of individual tokens TOKEN_MIN_LEN = 2 TOKEN_MAX_LEN = 15 -RE_P0 = re.compile(r'', re.DOTALL | re.UNICODE) # comments -RE_P1 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE) # footnotes -RE_P2 = re.compile(r'(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$', re.UNICODE) # links to languages -RE_P3 = re.compile(r'{{([^}{]*)}}', re.DOTALL | re.UNICODE) # template -RE_P4 = re.compile(r'{{([^}]*)}}', re.DOTALL | re.UNICODE) # template -RE_P5 = re.compile(r'\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE) # remove URL, keep description -RE_P6 = re.compile(r'\[([^][]*)\|([^][]*)\]', re.DOTALL | re.UNICODE) # simplify links, keep description -RE_P7 = re.compile(r'\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of images -RE_P8 = re.compile(r'\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of files -RE_P9 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE) # outside links -RE_P10 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE) # math content -RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE) # all other tags -RE_P12 = re.compile(r'\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE) # table formatting -RE_P13 = re.compile(r'\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE) # table cell formatting -RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE) # categories -RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE) # Remove File and Image template - - -# MediaWiki namespaces (https://www.mediawiki.org/wiki/Manual:Namespace) that -# ought to be ignored +RE_P0 = re.compile(r'', re.DOTALL | re.UNICODE) +"""Comments.""" +RE_P1 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE) 
+"""Footnotes.""" +RE_P2 = re.compile(r'(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$', re.UNICODE) +"""Links to languages.""" +RE_P3 = re.compile(r'{{([^}{]*)}}', re.DOTALL | re.UNICODE) +"""Template.""" +RE_P4 = re.compile(r'{{([^}]*)}}', re.DOTALL | re.UNICODE) +"""Template.""" +RE_P5 = re.compile(r'\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE) +"""Remove URL, keep description.""" +RE_P6 = re.compile(r'\[([^][]*)\|([^][]*)\]', re.DOTALL | re.UNICODE) +"""Simplify links, keep description.""" +RE_P7 = re.compile(r'\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) +"""Keep description of images.""" +RE_P8 = re.compile(r'\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) +"""Keep description of files.""" +RE_P9 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE) +"""External links.""" +RE_P10 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE) +"""Math content.""" +RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE) +"""All other tags.""" +RE_P12 = re.compile(r'\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE) +"""Table formatting.""" +RE_P13 = re.compile(r'\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE) +"""Table cell formatting.""" +RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE) +"""Categories.""" +RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE) +"""Remove File and Image templates.""" + IGNORED_NAMESPACES = [ 'Wikipedia', 'Category', 'File', 'Portal', 'Template', 'MediaWiki', 'User', 'Help', 'Book', 'Draft', 'WikiProject', 'Special', 'Talk' ] +"""MediaWiki namespaces [2]_ that ought to be ignored. + +References +---------- +.. [2] https://www.mediawiki.org/wiki/Manual:Namespace + +""" def filter_wiki(raw): - """ - Filter out wiki mark-up from `raw`, leaving only text. `raw` is either unicode - or utf-8 encoded string. + """Filter out wiki markup from `raw`, leaving only text. + + Parameters + ---------- + raw : str + Unicode or utf-8 encoded string. + + Returns + ------- + str + `raw` without markup. + """ # parsing of the wiki markup is not perfect, but sufficient for our purposes # contributions to improving this code are welcome :) @@ -82,6 +115,19 @@ def filter_wiki(raw): def remove_markup(text): + """Filter out wiki markup from `text`, leaving only text. + + Parameters + ---------- + text : str + String containing markup. + + Returns + ------- + str + `text` without markup. + + """ text = re.sub(RE_P2, '', text) # remove the last list (=languages) # the wiki markup is recursive (markup inside markup etc) # instead of writing a recursive grammar, here we deal with that by removing @@ -119,12 +165,24 @@ def remove_markup(text): def remove_template(s): """Remove template wikimedia markup. - Return a copy of `s` with all the wikimedia markup template removed. See - http://meta.wikimedia.org/wiki/Help:Template for wikimedia templates - details. + Parameters + ---------- + s : str + String containing markup template. + + Returns + ------- + str + Сopy of `s` with all the wikimedia markup template removed. See [4]_ for wikimedia templates details. + + Notes + ----- + Since template can be nested, it is difficult remove them using regular expressions. + + References + ---------- + .. [4] http://meta.wikimedia.org/wiki/Help:Template - Note: Since template can be nested, it is difficult remove them using - regular expresssions. """ # Find the start and end position of each template by finding the opening @@ -157,9 +215,20 @@ def remove_template(s): def remove_file(s): """Remove the 'File:' and 'Image:' markup, keeping the file caption. 
- Return a copy of `s` with all the 'File:' and 'Image:' markup replaced by - their corresponding captions. See http://www.mediawiki.org/wiki/Help:Images - for the markup details. + Parameters + ---------- + s : str + String containing 'File:' and 'Image:' markup. + + Returns + ------- + str + Copy of `s` with all the 'File:' and 'Image:' markup replaced by their corresponding captions. [3]_ + + References + ---------- + .. [3] http://www.mediawiki.org/wiki/Help:Images + """ # The regex RE_P15 match a File: or Image: markup for match in re.finditer(RE_P15, s): @@ -170,13 +239,26 @@ def remove_file(s): def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True): - """ - Tokenize a piece of text from wikipedia. The input string `content` is assumed - to be mark-up free (see `filter_wiki()`). + """Tokenize a piece of text from wikipedia. Set `token_min_len`, `token_max_len` as character length (not bytes!) thresholds for individual tokens. - Return list of tokens as utf8 bytestrings. + Parameters + ---------- + content : str + String without markup (see :func:`~gensim.corpora.wikicorpus.filter_wiki`). + token_min_len : int + Minimal token length. + token_max_len : int + Maximal token length. + lower : bool + If True, convert `content` to lower case. + + Returns + ------- + list of str + List of tokens from `content`. + """ # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.) return [ @@ -186,7 +268,19 @@ def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, def get_namespace(tag): - """Returns the namespace of tag.""" + """Get the namespace of a tag. + + Parameters + ---------- + tag : str + Namespace or tag. + + Returns + ------- + str + Matched namespace or tag. + + """ m = re.match("^{(.*?)}", tag) namespace = m.group(1) if m else "" if not namespace.startswith("http://www.mediawiki.org/xml/export-"): @@ -198,10 +292,19 @@ def get_namespace(tag): def extract_pages(f, filter_namespaces=False): - """ - Extract pages from a MediaWiki database dump = open file-like object `f`. + """Extract pages from a MediaWiki database dump. - Return an iterable over (str, str, str) which generates (title, content, pageid) triplets. + Parameters + ---------- + f : file + File-like object. + filter_namespaces : list of str or bool + Namespaces that will be extracted. + + Yields + ------ + tuple of (str or None, str, str) + Title, text and page id. """ elems = (elem for _, elem in iterparse(f, events=("end",))) @@ -247,12 +350,35 @@ def extract_pages(f, filter_namespaces=False): def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True): - """ - Parse a wikipedia article, returning its content as a list of tokens - (utf8-encoded strings). + """Parse a wikipedia article, extract all tokens. + + Notes + ----- + Set the `tokenizer_func` parameter (default is :func:`~gensim.corpora.wikicorpus.tokenize`) for languages + like Japanese or Thai to perform better tokenization. + The `tokenizer_func` needs to take 4 parameters: (text: str, token_min_len: int, token_max_len: int, lower: bool). + + Parameters + ---------- + args : (str, bool, str, int) + Article text, lemmatize flag (if True, :func:`~gensim.utils.lemmatize` will be used), article title, + page identifier. + tokenizer_func : function + Function for tokenization (default is :func:`~gensim.corpora.wikicorpus.tokenize`).
 + Needs to have interface: + tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str. + token_min_len : int + Minimal token length. + token_max_len : int + Maximal token length. + lower : bool + If True, convert article text to lower case. + + Returns + ------- + (list of str, str, int) + List of tokens from article, title and page id. - Set `tokenizer_func` (defaults to `tokenize`) parameter for languages like japanese or thai to perform better - tokenization. The `tokenizer_func` needs to take 4 parameters: (text, token_min_len, token_max_len, lower). """ text, lemmatize, title, pageid = args text = filter_wiki(text) @@ -264,13 +390,36 @@ def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN, def init_to_ignore_interrupt(): - """Should only be used when master is prepared to handle termination of child processes.""" + """Enable ignoring of keyboard interrupts (SIGINT) in child processes. + + Warnings + -------- + Should only be used when master is prepared to handle termination of + child processes. + + """ signal.signal(signal.SIGINT, signal.SIG_IGN) def _process_article(args): - """Should not be called explicitly. Use `process_article` instead.""" + """Same as :func:`~gensim.corpora.wikicorpus.process_article`, but with args in list format. + Parameters + ---------- + args : [(str, bool, str, int), (function, int, int, bool)] + First element is the same as `args` from :func:`~gensim.corpora.wikicorpus.process_article`, + second element contains the tokenizer function, minimal token length, maximal token length and lowercase flag. + + Returns + ------- + (list of str, str, int) + List of tokens from article, title and page id. + + Warnings + -------- + Should not be called explicitly. Use :func:`~gensim.corpora.wikicorpus.process_article` instead. + + """ tokenizer_func, token_min_len, token_max_len, lower = args[-1] args = args[:-1] @@ -281,17 +430,33 @@ def _process_article(args): class WikiCorpus(TextCorpus): - """ - Treat a wikipedia articles dump (<LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2 - or <LANG>wiki-latest-pages-articles.xml.bz2) as a (read-only) corpus. + """Treat a wikipedia articles dump as a **read-only** corpus. + + Supported dump formats: - The documents are extracted on-the-fly, so that the whole (massive) dump - can stay compressed on disk. + * <LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2 + * <LANG>wiki-latest-pages-articles.xml.bz2 - **Note:** "multistream" archives are *not* supported in Python 2 due to - `limitations in the core bz2 library + The documents are extracted on-the-fly, so that the whole (massive) dump can stay compressed on disk. + + Notes + ----- + Dumps for English wikipedia can be found `here <https://dumps.wikimedia.org/enwiki/>`_. + + Attributes + ---------- + metadata : bool + Whether to write article titles to the serialized corpus. + + Warnings + -------- + "Multistream" archives are *not* supported in Python 2 due to `limitations in the core bz2 library `_. + Examples + -------- + >>> from gensim.corpora import WikiCorpus, MmCorpus + >>> >>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h >>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format and mapping @@ -299,26 +464,37 @@ class WikiCorpus(TextCorpus): def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True): - """ - Initialize the corpus.
Unless a dictionary is provided, this scans the - corpus once, to determine its vocabulary. - - If `pattern` package is installed, use fancier shallow parsing to get - token lemmas. Otherwise, use simple regexp tokenization. You can override - this automatic logic by forcing the `lemmatize` parameter explicitly. - self.metadata if set to true will ensure that serialize will write out article titles to a pickle file. - - Set `article_min_tokens` as a min threshold for article token count (defaults to 50). Any article below this is - ignored. - - Set `tokenizer_func` (defaults to `tokenize`) with a custom function reference to control tokenization else use - the default regexp tokenization. Set this parameter for languages like japanese or thai to perform better - tokenization. The `tokenizer_func` needs to take 4 parameters: (text, token_min_len, token_max_len, lower). The - parameter values are as configured on the class instance by default. - - Set `lower` to control if everything should be converted to lowercase or not (default True). - - Set `token_min_len`, `token_max_len` as thresholds for token lengths that are returned (default to 2 and 15). + """Initialize the corpus. + + Unless a dictionary is provided, this scans the corpus once, + to determine its vocabulary. + + Parameters + ---------- + fname : str + Path to file with wikipedia dump. + processes : int, optional + Number of processes to run; defaults to **number of CPUs - 1**. + lemmatize : bool + Whether to use lemmatization instead of simple regexp tokenization. + Defaults to `True` if the *pattern* package is installed. + dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional + Dictionary. If not provided, the corpus is scanned once to determine its vocabulary + (**this takes a really long time**). + filter_namespaces : tuple of str + Namespaces to consider. + tokenizer_func : function, optional + Function that will be used for tokenization. By default, use :func:`~gensim.corpora.wikicorpus.tokenize`. + Needs to support the interface: + tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str. + article_min_tokens : int, optional + Minimum number of tokens in an article. Articles with fewer tokens are ignored. + token_min_len : int, optional + Minimal token length. + token_max_len : int, optional + Maximal token length. + lower : bool, optional + If True, convert all text to lower case. """ self.fname = fname @@ -336,18 +512,23 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction self.dictionary = dictionary or Dictionary(self.get_texts()) def get_texts(self): - """ - Iterate over the dump, returning text version of each article as a list - of tokens. + """Iterate over the dump, yielding a list of tokens for each article. - Only articles of sufficient length are returned (short articles & redirects - etc are ignored). This is controlled by `article_min_tokens` on the class instance. - - Note that this iterates over the **texts**; if you want vectors, just use - the standard corpus interface instead of this function:: + Notes + ----- + This iterates over the **texts**. If you want vectors, just use the standard corpus interface + instead of this method: >>> for vec in wiki_corpus: >>> print(vec) + + Yields + ------ + list of str + If `metadata` is False, yield only the list of tokens extracted from the article. + (list of str, (int, str)) + List of tokens (extracted from the article), page id and article title otherwise. + """ articles, articles_all = 0, 0
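A minimal usage sketch (not part of the patch) tying together the APIs documented above: `serialize()` writing both the corpus file and its byte-offset index, O(1) random access via `corpus[docno]`, and the SVMlight `doc2line()`/`line2doc()` round-trip. The tiny corpus and the temporary file names are illustrative only.

import os
import tempfile

from gensim.corpora import BleiCorpus, SvmLightCorpus

# A tiny corpus in BoW format: one list of (word_id, weight) pairs per document.
corpus = [[(0, 1.0), (1, 2.0)], [(1, 0.5)], [(0, 3.0), (2, 1.0)]]

tmp_dir = tempfile.mkdtemp()
svm_path = os.path.join(tmp_dir, 'corpus.svmlight')
blei_path = os.path.join(tmp_dir, 'corpus.lda-c')

# serialize() calls save_corpus() internally and stores the returned byte offsets
# in 'corpus.svmlight.index', which is what makes corpus[docno] work after loading.
SvmLightCorpus.serialize(svm_path, corpus)
loaded = SvmLightCorpus(svm_path)
print(len(loaded))    # 3, read from the index
print(loaded[1])      # [(1, 0.5)], O(1) lookup via docbyoffset()

# doc2line() and line2doc() are inverses of each other (modulo the 0/1-based id shift).
line = SvmLightCorpus.doc2line(corpus[0])    # '0 1:1.0 2:2.0\n'
print(loaded.line2doc(line))                 # ([(0, 1.0), (1, 2.0)], '0')

# BleiCorpus.serialize() additionally writes the vocabulary to 'corpus.lda-c.vocab'.
BleiCorpus.serialize(blei_path, corpus)
print(BleiCorpus(blei_path)[2])              # [(0, 3.0), (2, 1.0)]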