diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py
index ec804d0dfd..a736849b4e 100644
--- a/gensim/corpora/dictionary.py
+++ b/gensim/corpora/dictionary.py
@@ -5,15 +5,7 @@
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
-"""
-This module implements the concept of Dictionary -- a mapping between words and
-their integer ids.
-
-Dictionaries can be created from a corpus and can later be pruned according to
-document frequency (removing (un)common words via the :func:`Dictionary.filter_extremes` method),
-save/loaded from disk (via :func:`Dictionary.save` and :func:`Dictionary.load` methods), merged
-with other dictionary (:func:`Dictionary.merge_with`) etc.
-"""
+"""This module implements the concept of Dictionary -- a mapping between words and their integer ids."""
 
 from __future__ import with_statement
@@ -32,32 +24,79 @@
     unicode = str
 
-logger = logging.getLogger('gensim.corpora.dictionary')
+logger = logging.getLogger(__name__)
 
 
 class Dictionary(utils.SaveLoad, Mapping):
-    """
-    Dictionary encapsulates the mapping between normalized words and their integer ids.
+    """Dictionary encapsulates the mapping between normalized words and their integer ids.
+
+    Attributes
+    ----------
+    token2id : dict of (str, int)
+        Mapping token -> token id.
+    id2token : dict of (int, str)
+        Reverse mapping for token2id, initialized in a lazy manner to save memory.
+    dfs : dict of (int, int)
+        Document frequencies: token_id -> how many documents contain this token.
+    num_docs : int
+        Number of documents processed.
+    num_pos : int
+        Total number of corpus positions (number of processed words).
+    num_nnz : int
+        Total number of non-zeroes in the BOW matrix.
 
-    The main function is `doc2bow`, which converts a collection of words to its
-    bag-of-words representation: a list of (word_id, word_frequency) 2-tuples.
     """
     def __init__(self, documents=None, prune_at=2000000):
        """

+        Parameters
+        ----------
+        documents : iterable of iterable of str, optional
+            Documents used for initialization.
+        prune_at : int, optional
+            Dictionary will keep no more than `prune_at` unique words (to limit its RAM footprint).
+
+        Examples
+        --------
+        >>> from gensim.corpora import Dictionary
+        >>>
+        >>> texts = [['human', 'interface', 'computer']]
+        >>> dct = Dictionary(texts)  # fit dictionary
+        >>> dct.add_documents([["cat", "say", "meow"], ["dog"]])  # update dictionary with new documents
+        >>> dct.doc2bow(["dog", "computer", "non_existent_word"])
+        [(0, 1), (6, 1)]
+
         """
-        self.token2id = {}  # token -> tokenId
-        self.id2token = {}  # reverse mapping for token2id; only formed on request, to save memory
-        self.dfs = {}  # document frequencies: tokenId -> in how many documents this token appeared
+        self.token2id = {}
+        self.id2token = {}
+        self.dfs = {}
 
-        self.num_docs = 0  # number of documents processed
-        self.num_pos = 0  # total number of corpus positions
-        self.num_nnz = 0  # total number of non-zeroes in the BOW matrix
+        self.num_docs = 0
+        self.num_pos = 0
+        self.num_nnz = 0
 
         if documents is not None:
             self.add_documents(documents, prune_at=prune_at)
 
     def __getitem__(self, tokenid):
+        """Get token by provided `tokenid`.
+
+        Parameters
+        ----------
+        tokenid : int
+            Id of token.
+
+        Returns
+        -------
+        str
+            Token corresponding to `tokenid`.
+
+        Raises
+        ------
+        KeyError
+            If `tokenid` isn't contained in :class:`~gensim.corpora.dictionary.Dictionary`.
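+
+        Examples
+        --------
+        >>> from gensim.corpora import Dictionary
+        >>>
+        >>> dct = Dictionary([["a", "b", "c"]])
+        >>> dct[0] in ("a", "b", "c")  # ids are assigned by doc2bow; don't rely on a particular token getting id 0
+        True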
+ + """ if len(self.id2token) != len(self.token2id): # the word->id mapping has changed (presumably via add_documents); # recompute id->word accordingly @@ -65,6 +104,7 @@ def __getitem__(self, tokenid): return self.id2token[tokenid] # will throw for non-existent ids def __iter__(self): + """Iterate over tokens that stored.""" return iter(self.keys()) if PY3: @@ -78,12 +118,24 @@ def itervalues(self): return self.values() def keys(self): - """Return a list of all token ids.""" + """Get all stored ids. + + Returns + ------- + list of int + List of all token ids. + + """ return list(self.token2id.values()) def __len__(self): - """ - Return the number of token->id mappings in the dictionary. + """Get number of stored tokens. + + Returns + ------- + int + Number of stored tokens. + """ return len(self.token2id) @@ -93,20 +145,44 @@ def __str__(self): @staticmethod def from_documents(documents): + """Create :class:`~gensim.corpora.dictionary.Dictionary` based on `documents` + + Parameters + ---------- + documents : iterable of iterable of str + Input corpus. + + Returns + ------- + :class:`~gensim.corpora.dictionary.Dictionary` + Dictionary filled by `documents`. + + """ return Dictionary(documents=documents) def add_documents(self, documents, prune_at=2000000): - """ - Update dictionary from a collection of documents. Each document is a list - of tokens = **tokenized and normalized** strings (either utf8 or unicode). + """Update dictionary from a collection of `documents`. - This is a convenience wrapper for calling `doc2bow` on each document - with `allow_update=True`, which also prunes infrequent words, keeping the - total number of unique words <= `prune_at`. This is to save memory on very - large inputs. To disable this pruning, set `prune_at=None`. + Parameters + ---------- + documents : iterable of iterable of str + Input corpus. All tokens should be already **tokenized and normalized**. + prune_at : int, optional + Total number of unique words. Dictionary will keep not more than `prune_at` words. + + Examples + -------- + >>> from gensim.corpora import Dictionary + >>> + >>> corpus = ["máma mele maso".split(), "ema má máma".split()] + + >>> dct = Dictionary(corpus) + >>> len(dct) + 5 + >>> dct.add_documents([["this","is","sparta"],["just","joking"]]) + >>> len(dct) + 10 - >>> print(Dictionary(["máma mele maso".split(), "ema má máma".split()])) - Dictionary(5 unique tokens) """ for docno, document in enumerate(documents): # log progress & run a regular check for pruning, once every 10k docs @@ -124,19 +200,34 @@ def add_documents(self, documents, prune_at=2000000): ) def doc2bow(self, document, allow_update=False, return_missing=False): - """ - Convert `document` (a list of words) into the bag-of-words format = list - of `(token_id, token_count)` 2-tuples. Each word is assumed to be a - **tokenized and normalized** string (either unicode or utf8-encoded). No further preprocessing - is done on the words in `document`; apply tokenization, stemming etc. before - calling this method. - - If `allow_update` is set, then also update dictionary in the process: create - ids for new words. At the same time, update document frequencies -- for - each word appearing in this document, increase its document frequency (`self.dfs`) - by one. - - If `allow_update` is **not** set, this function is `const`, aka read-only. + """Convert `document` into the bag-of-words (BoW) format = list of (token_id, token_count). + + Parameters + ---------- + document : list of str + Input document. 
+        allow_update : bool, optional
+            If True, update dictionary in the process (i.e. add new tokens and update frequencies).
+        return_missing : bool, optional
+            Also return missing tokens (tokens that are not present in the current dictionary).
+
+        Returns
+        -------
+        list of (int, int)
+            BoW representation of `document`.
+        list of (int, int), dict of (str, int)
+            If `return_missing` is True, return BoW representation of `document` + dictionary with missing
+            tokens and their frequencies.
+
+        Examples
+        --------
+        >>> from gensim.corpora import Dictionary
+        >>> dct = Dictionary(["máma mele maso".split(), "ema má máma".split()])
+        >>> dct.doc2bow(["this", "is", "máma"])
+        [(2, 1)]
+        >>> dct.doc2bow(["this", "is", "máma"], return_missing=True)
+        ([(2, 1)], {u'this': 1, u'is': 1})
+
         """
         if isinstance(document, string_types):
             raise TypeError("doc2bow expects an array of unicode tokens on input, not a single string")
@@ -176,35 +267,30 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
 
     def doc2idx(self, document, unknown_word_index=-1):
         """Convert `document` (a list of words) into a list of indexes = list of `token_id`.
-        Each word is assumed to be a **tokenized and normalized** string (either unicode or utf8-encoded).
-        No further preprocessing is done on the words in `document`; apply tokenization, stemming etc. before calling
-        this method.
-
-        Replace all unknown words i.e, words not in the dictionary with the index as set via `unknown_word_index`,
-        defaults to -1.
 
         Notes
         -----
-        This function is `const`, aka read-only
+        Replace all unknown words, i.e. words not in the dictionary, with the index set via `unknown_word_index`
+        (default -1).
 
         Parameters
         ----------
         document : list of str
-            Tokenized, normalized and preprocessed words
+            Input document.
         unknown_word_index : int, optional
             Index to use for words not in the dictionary.
 
         Returns
         -------
         list of int
-            Indexes in the dictionary for words in the `document` preserving the order of words
+            Indexes in the dictionary for words in the `document` (preserving the order of words).
 
         Examples
         --------
-        >>> dictionary_obj = Dictionary()
-        >>> dictionary_obj.token2id = {'computer': 0, 'human': 1, 'response': 2, 'survey': 3}
-        >>> dictionary_obj.doc2idx(document=['human', 'computer', 'interface'], unknown_word_index=-1)
-        [1, 0, -1]
+        >>> from gensim.corpora import Dictionary
+        >>>
+        >>> corpus = [["a", "a", "b"], ["a", "c"]]
+        >>> dct = Dictionary(corpus)
+        >>> dct.doc2idx(["a", "a", "c", "not_in_dictionary", "c"])
+        [0, 0, 2, -1, 2]
 
         """
         if isinstance(document, string_types):
@@ -214,21 +300,44 @@ def doc2idx(self, document, unknown_word_index=-1):
         return [self.token2id.get(word, unknown_word_index) for word in document]
 
     def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None):
-        """
-        Filter out tokens that appear in
+        """Filter tokens in dictionary by frequency.
+
+        Parameters
+        ----------
+        no_below : int, optional
+            Keep tokens which are contained in at least `no_below` documents.
+        no_above : float, optional
+            Keep tokens which are contained in no more than `no_above` documents
+            (fraction of total corpus size, not an absolute number).
+        keep_n : int, optional
+            Keep only the first `keep_n` most frequent tokens.
+        keep_tokens : iterable of str, optional
+            Iterable of tokens that **must** stay in dictionary after filtering.
+
+        Notes
+        -----
+        The filtering works as follows:
+
+        #. Remove tokens that appear in less than `no_below` documents (absolute number).
+        #. Remove tokens that appear in more than `no_above` documents (fraction of total corpus size,
+           **not an absolute number**).
+        #. After (1) and (2), keep only the first `keep_n` most frequent tokens (or keep all if `None`).
 
-        1. less than `no_below` documents (absolute number) or
-        2. more than `no_above` documents (fraction of total corpus size, *not*
-           absolute number).
-        3. if tokens are given in keep_tokens (list of strings), they will be kept regardless of
-           the `no_below` and `no_above` settings
-        4. after (1), (2) and (3), keep only the first `keep_n` most frequent tokens (or
-           keep all if `None`). After the pruning, shrink resulting gaps in word ids.
+        Tokens listed in `keep_tokens` are kept regardless of (1) and (2). After the pruning, resulting gaps
+        in word ids are shrunk. Due to the gap shrinking, the same word may have a different word id
+        before and after the call to this function!
+
+        Examples
+        --------
+        >>> from gensim.corpora import Dictionary
+        >>>
+        >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
+        >>> dct = Dictionary(corpus)
+        >>> len(dct)
+        5
+        >>> dct.filter_extremes(no_below=1, no_above=0.5, keep_n=1)
+        >>> len(dct)
+        1
 
-        **Note**: Due to the gap shrinking, the same word may have a different
-        word id before and after the call to this function!
         """
         no_above_abs = int(no_above * self.num_docs)  # convert fractional threshold to absolute threshold
@@ -259,13 +368,25 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=N
         logger.info("resulting dictionary: %s", self)
 
     def filter_n_most_frequent(self, remove_n):
-        """
-        Filter out the 'remove_n' most frequent tokens that appear in the documents.
+        """Filter out the `remove_n` most frequent tokens that appear in the documents.
 
-        After the pruning, shrink resulting gaps in word ids.
+        Parameters
+        ----------
+        remove_n : int
+            Number of the most frequent tokens that will be removed.
+
+        Examples
+        --------
+        >>> from gensim.corpora import Dictionary
+        >>>
+        >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
+        >>> dct = Dictionary(corpus)
+        >>> len(dct)
+        5
+        >>> dct.filter_n_most_frequent(2)
+        >>> len(dct)
+        3
 
-        **Note**: Due to the gap shrinking, the same word may have a different
-        word id before and after the call to this function!
         """
         # determine which tokens to keep
         most_frequent_ids = (v for v in itervalues(self.token2id))
@@ -279,11 +400,33 @@ def filter_n_most_frequent(self, remove_n):
         logger.info("resulting dictionary: %s", self)
 
     def filter_tokens(self, bad_ids=None, good_ids=None):
-        """
-        Remove the selected `bad_ids` tokens from all dictionary mappings, or, keep
-        selected `good_ids` in the mapping and remove the rest.
+        """Remove the selected `bad_ids` tokens from :class:`~gensim.corpora.dictionary.Dictionary`.
+        Alternatively, keep selected `good_ids` in :class:`~gensim.corpora.dictionary.Dictionary` and remove the rest.
+
+        Parameters
+        ----------
+        bad_ids : iterable of int, optional
+            Collection of word ids to be removed.
+        good_ids : collection of int, optional
+            Keep selected collection of word ids and remove the rest.
+
+        Examples
+        --------
+        >>> from gensim.corpora import Dictionary
+        >>>
+        >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
+        >>> dct = Dictionary(corpus)
+        >>> 'ema' in dct.token2id
+        True
+        >>> dct.filter_tokens(bad_ids=[dct.token2id['ema']])
+        >>> 'ema' in dct.token2id
+        False
+        >>> len(dct)
+        4
+        >>> dct.filter_tokens(good_ids=[dct.token2id['maso']])
+        >>> len(dct)
+        1
 
-        `bad_ids` and `good_ids` are collections of word ids to be removed.
         """
         if bad_ids is not None:
             bad_ids = set(bad_ids)
@@ -296,13 +439,7 @@ def filter_tokens(self, bad_ids=None, good_ids=None):
         self.compactify()
 
     def compactify(self):
-        """
-        Assign new word ids to all words.
-
-        This is done to make the ids more compact, e.g. after some tokens have
-        been removed via :func:`filter_tokens` and there are gaps in the id series.
-        Calling this method will remove the gaps.
-        """
+        """Assign new word ids to all words, shrinking any gaps in the id series."""
         logger.debug("rebuilding dictionary, shrinking gaps")
 
         # build mapping from old id -> new id
@@ -314,14 +451,48 @@ def compactify(self):
         self.dfs = {idmap[tokenid]: freq for tokenid, freq in iteritems(self.dfs)}
 
     def save_as_text(self, fname, sort_by_word=True):
-        """
-        Save this Dictionary to a text file, in format:
-        `num_docs`
-        `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`. Sorted by word,
-        or by decreasing word frequency.
+        """Save :class:`~gensim.corpora.dictionary.Dictionary` to a text file.
+
+        Parameters
+        ----------
+        fname : str
+            Path to output file.
+        sort_by_word : bool, optional
+            If True, sort by word in lexicographical order.
+
+        Notes
+        -----
+        Format::
+
+            num_docs
+            id_1[TAB]word_1[TAB]document_frequency_1[NEWLINE]
+            id_2[TAB]word_2[TAB]document_frequency_2[NEWLINE]
+            ....
+            id_k[TAB]word_k[TAB]document_frequency_k[NEWLINE]
+
+        Warnings
+        --------
+        The text format should be used for corpus inspection only. Use
+        :meth:`~gensim.corpora.dictionary.Dictionary.save` and
+        :meth:`~gensim.corpora.dictionary.Dictionary.load` to store in binary format (pickle)
+        for better performance.
+
+        See Also
+        --------
+        :meth:`~gensim.corpora.dictionary.Dictionary.load_from_text`
+
+        Examples
+        --------
+        >>> from gensim.corpora import Dictionary
+        >>> from gensim.test.utils import get_tmpfile
+        >>>
+        >>> tmp_fname = get_tmpfile("dictionary")
+        >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
+        >>>
+        >>> dct = Dictionary(corpus)
+        >>> dct.save_as_text(tmp_fname)
+        >>>
+        >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
+        >>> assert dct.token2id == loaded_dct.token2id
 
-        Note: text format should be use for corpus inspection. Use `save`/`load`
-        to store in binary format (pickle) for improved performance.
         """
         logger.info("saving dictionary mapping to %s", fname)
         with utils.smart_open(fname, 'wb') as fout:
@@ -337,24 +508,41 @@ def save_as_text(self, fname, sort_by_word=True):
             fout.write(utils.to_utf8(line))
 
     def merge_with(self, other):
-        """
-        Merge another dictionary into this dictionary, mapping same tokens to the
-        same ids and new tokens to new ids. The purpose is to merge two corpora
-        created using two different dictionaries, one from `self` and one from `other`.
+        """Merge another dictionary into this dictionary, mapping the same tokens to the same ids
+        and new tokens to new ids.
 
+        Notes
+        -----
+        The purpose is to merge two corpora created using two different dictionaries: `self` and `other`.
         `other` can be any id=>word mapping (a dict, a Dictionary object, ...).
 
-        Return a transformation object which, when accessed as `result[doc_from_other_corpus]`,
-        will convert documents from a corpus built using the `other` dictionary
-        into a document using the new, merged dictionary (see :class:`gensim.interfaces.TransformationABC`).
+        Get a transformation object which, when accessed as `result[doc_from_other_corpus]`, will convert documents
+        from a corpus built using the `other` dictionary into a document using the new, merged dictionary.
+
+        Warnings
+        --------
+        This method will change `self` dictionary in place.
+
+        Parameters
+        ----------
+        other : :class:`~gensim.corpora.dictionary.Dictionary`
+            Other dictionary.
 
-        Example:
+        Returns
+        -------
+        :class:`gensim.models.VocabTransform`
+            Transformation object.
-
-        >>> dict1 = Dictionary(some_documents)
-        >>> dict2 = Dictionary(other_documents)  # ids not compatible with dict1!
-        >>> dict2_to_dict1 = dict1.merge_with(dict2)
-        >>> # now we can merge corpora from the two incompatible dictionaries into one
-        >>> merged_corpus = itertools.chain(some_corpus_from_dict1, dict2_to_dict1[some_corpus_from_dict2])
+        Examples
+        --------
+        >>> from gensim.corpora import Dictionary
+        >>>
+        >>> corpus_1, corpus_2 = [["a", "b", "c"]], [["a", "f", "f"]]
+        >>> dct_1, dct_2 = Dictionary(corpus_1), Dictionary(corpus_2)
+        >>> dct_1.doc2bow(corpus_2[0])
+        [(0, 1)]
+        >>> transformer = dct_1.merge_with(dct_2)
+        >>> dct_1.doc2bow(corpus_2[0])
+        [(0, 1), (3, 2)]
 
         """
         old2new = {}
@@ -383,9 +571,32 @@ def merge_with(self, other):
 
     @staticmethod
     def load_from_text(fname):
-        """
-        Load a previously stored Dictionary from a text file.
-        Mirror function to `save_as_text`.
+        """Load a previously stored :class:`~gensim.corpora.dictionary.Dictionary` from a text file.
+        Mirror function to :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`.
+
+        Parameters
+        ----------
+        fname : str
+            Path to a file produced by :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`.
+
+        See Also
+        --------
+        :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`
+
+        Examples
+        --------
+        >>> from gensim.corpora import Dictionary
+        >>> from gensim.test.utils import get_tmpfile
+        >>>
+        >>> tmp_fname = get_tmpfile("dictionary")
+        >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
+        >>>
+        >>> dct = Dictionary(corpus)
+        >>> dct.save_as_text(tmp_fname)
+        >>>
+        >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
+        >>> assert dct.token2id == loaded_dct.token2id
+
         """
         result = Dictionary()
         with utils.smart_open(fname) as f:
@@ -412,20 +623,38 @@ def load_from_text(fname):
 
     @staticmethod
     def from_corpus(corpus, id2word=None):
-        """
-        Create Dictionary from an existing corpus. This can be useful if you only
-        have a term-document BOW matrix (represented by `corpus`), but not the
-        original text corpus.
+        """Create :class:`~gensim.corpora.dictionary.Dictionary` from an existing corpus.
 
-        This will scan the term-document count matrix for all word ids that
-        appear in it, then construct and return Dictionary which maps each
-        `word_id -> id2word[word_id]`.
+        Parameters
+        ----------
+        corpus : iterable of iterable of (int, number)
+            Corpus in BoW format.
+        id2word : dict of (int, object), optional
+            Mapping id -> word. If None, the mapping `id2word[word_id] = str(word_id)` will be used.
 
-        `id2word` is an optional dictionary that maps the `word_id` to a token. In
-        case `id2word` isn't specified the mapping `id2word[word_id] = str(word_id)`
-        will be used.
-        """
+        Notes
+        -----
+        This can be useful if you only have a term-document BOW matrix (represented by `corpus`), but not the
+        original text corpus. This method will scan the term-document count matrix for all word ids that appear
+        in it, then construct :class:`~gensim.corpora.dictionary.Dictionary` which maps each
+        `word_id -> id2word[word_id]`.
+
+        Returns
+        -------
+        :class:`~gensim.corpora.dictionary.Dictionary`
+            Inferred dictionary from corpus.
+
+        Examples
+        --------
+        >>> from gensim.corpora import Dictionary
+        >>>
+        >>> corpus = [[(1, 1.0)], [], [(0, 5.0), (2, 1.0)], []]
+        >>> dct = Dictionary.from_corpus(corpus)
+        >>> len(dct)
+        3
+
+        """
         result = Dictionary()
         max_id = -1
         for docno, document in enumerate(corpus):
diff --git a/gensim/corpora/hashdictionary.py b/gensim/corpora/hashdictionary.py
index 687ec241ac..79065e5ead 100644
--- a/gensim/corpora/hashdictionary.py
+++ b/gensim/corpora/hashdictionary.py
@@ -5,19 +5,32 @@
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
-"""
-This module implements the `"hashing trick" <http://en.wikipedia.org/wiki/Hashing-Trick>`_ --
-a mapping between words and their integer ids using a fixed, static mapping. The
-static mapping has a constant memory footprint, regardless of the number of word-types (features)
-in your corpus, so it's suitable for processing extremely large corpora.
+"""This module implements the "hashing trick" [1]_ -- a mapping between words and their integer ids
+using a fixed and static mapping.
+
+Notes
+-----
+The static mapping has a constant memory footprint, regardless of the number of word-types (features) in your corpus,
+so it's suitable for processing extremely large corpora. The ids are computed as `hash(word) % id_range`,
+where `hash` is a user-configurable function (`zlib.adler32` by default).
+
+Advantages:
+
+* New words can be represented immediately, without an extra pass through the corpus
+  to collect all the ids first.
+* Can be used with non-repeatable (once-only) streams of documents.
+* Any token can be mapped to an id, not only the tokens seen during initialization
+  (a typical limitation of :class:`~gensim.corpora.dictionary.Dictionary`).
 
-The ids are computed as `hash(word) % id_range`, where `hash` is a user-configurable
-function (adler32 by default). Using HashDictionary, new words can be represented immediately,
-without an extra pass through the corpus to collect all the ids first. This is another
-advantage: HashDictionary can be used with non-repeatable (once-only) streams of documents.
-A disadvantage of HashDictionary is that, unlike plain :class:`Dictionary`, several words may map
-to the same id, causing hash collisions. The word<->id mapping is no longer a bijection.
+Disadvantages:
+
+* Multiple words may map to the same id, causing hash collisions. The word <-> id mapping is no longer a bijection.
+
+
+References
+----------
+.. [1] http://en.wikipedia.org/wiki/Hashing-Trick
 
 """
@@ -35,23 +48,40 @@
 
 
 class HashDictionary(utils.SaveLoad, dict):
-    """
-    HashDictionary encapsulates the mapping between normalized words and their
-    integer ids.
-
-    Unlike `Dictionary`, building a `HashDictionary` before using it is not a necessary
-    step. The documents can be computed immediately, from an uninitialized `HashDictionary`,
-    without seeing the rest of the corpus first.
-
-    The main function is `doc2bow`, which converts a collection of words to its
-    bag-of-words representation: a list of (word_id, word_frequency) 2-tuples.
+    """Encapsulates the mapping between normalized words and their integer ids.
+
+    Notes
+    -----
+    Unlike :class:`~gensim.corpora.dictionary.Dictionary`,
+    building a :class:`~gensim.corpora.hashdictionary.HashDictionary` before using it **isn't a necessary step**.
+    Documents can be converted to bag-of-words immediately, from an uninitialized
+    :class:`~gensim.corpora.hashdictionary.HashDictionary`, without seeing the rest of the corpus first.
+
+    Examples
+    --------
+    >>> from gensim.corpora import HashDictionary
+    >>>
+    >>> texts = [['human', 'interface', 'computer']]
+    >>> dct = HashDictionary(texts)
+    >>> dct.doc2bow(texts[0])
+    [(10608, 1), (12466, 1), (31002, 1)]
 
     """
     def __init__(self, documents=None, id_range=32000, myhash=zlib.adler32, debug=True):
         """
-        By default, keep track of debug statistics and mappings. If you find yourself
-        running out of memory (or are sure you don't need the debug info), set
-        `debug=False`.
+
+        Parameters
+        ----------
+        documents : iterable of iterable of str, optional
+            Iterable of documents. If given, used to initialize the dictionary (see
+            :meth:`~gensim.corpora.hashdictionary.HashDictionary.add_documents`).
+        id_range : int, optional
+            Number of hash-values in the table, used as `id = myhash(key) % id_range`.
+        myhash : function, optional
+            Hash function that supports the interface myhash(str) -> int; `zlib.adler32` by default.
+        debug : bool, optional
+            If True, store the raw token -> id mapping.
+            If you find yourself running out of memory (or don't need the raw tokens), set `debug=False`.
+
         """
         self.myhash = myhash  # hash fnc: string->integer
         self.id_range = id_range  # hash range: id = myhash(key) % id_range
@@ -72,17 +102,39 @@ def __init__(self, documents=None, id_range=32000, myhash=zlib.adler32, debug=Tr
             self.add_documents(documents)
 
     def __getitem__(self, tokenid):
-        """
-        Return all words that have mapped to the given id so far, as a set.
+        """Get all words that have mapped to the given id so far, as a set.
+
+        Warnings
+        --------
+        Works only if `debug=True`.
+
+        Parameters
+        ----------
+        tokenid : int
+            Token identifier (result of hashing).
+
+        Returns
+        -------
+        set of str
+            Set of all corresponding words.
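+
+        Examples
+        --------
+        >>> from gensim.corpora import HashDictionary
+        >>>
+        >>> dct = HashDictionary([["hello", "world"]])  # debug=True by default, so raw tokens are tracked
+        >>> "hello" in dct[dct.restricted_hash("hello")]  # don't rely on exact ids: they depend on `myhash`
+        True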
-
-        Only works if `self.debug` was enabled.
+
         """
         return self.id2token.get(tokenid, set())
 
     def restricted_hash(self, token):
-        """
-        Calculate id of the given token. Also keep track of what words were mapped
-        to what ids, for debugging reasons.
+        """Calculate id of the given token.
+        Also keep track of what words were mapped to what ids, for debugging reasons.
+
+        Parameters
+        ----------
+        token : str
+            Input token.
+
+        Returns
+        -------
+        int
+            Hash value of `token`, calculated as `myhash(token) % id_range`.
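+
+        Examples
+        --------
+        >>> from gensim.corpora import HashDictionary
+        >>>
+        >>> dct = HashDictionary(id_range=1000)  # illustrative id_range; any positive int works
+        >>> 0 <= dct.restricted_hash("interface") < 1000  # the id always falls inside [0, id_range)
+        True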
+
+        """
         h = self.myhash(utils.to_utf8(token)) % self.id_range
         if self.debug:
@@ -91,13 +143,11 @@ def restricted_hash(self, token):
         return h
 
     def __len__(self):
-        """
-        Return the number of distinct ids = the entire dictionary size.
-        """
+        """Get the number of distinct ids = the entire dictionary size."""
         return self.id_range
 
     def keys(self):
-        """Return a list of all token ids."""
+        """Get a list of all token ids."""
         return range(len(self))
 
     def __str__(self):
@@ -108,12 +158,29 @@ def from_documents(*args, **kwargs):
         return HashDictionary(*args, **kwargs)
 
     def add_documents(self, documents):
-        """
-        Build dictionary from a collection of documents. Each document is a list
-        of tokens = **tokenized and normalized** utf-8 encoded strings.
+        """Build dictionary from a collection of documents.
+
+        Notes
+        -----
+        This is only a convenience wrapper for calling `doc2bow` on each document with `allow_update=True`.
+
+        Parameters
+        ----------
+        documents : iterable of list of str
+            Collection of documents.
+
+        Examples
+        --------
+        >>> from gensim.corpora import HashDictionary
+        >>>
+        >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
+        >>> dct = HashDictionary(corpus)
+        >>> "sparta" in dct.token2id
+        False
+        >>> dct.add_documents([["this", "is", "sparta"], ["just", "joking"]])  # add more documents to dictionary
+        >>> "sparta" in dct.token2id
+        True
 
-        This is only a convenience wrapper for calling `doc2bow` on each document
-        with `allow_update=True`.
         """
         for docno, document in enumerate(documents):
             if docno % 10000 == 0:
@@ -125,17 +192,43 @@ def add_documents(self, documents):
             )
 
     def doc2bow(self, document, allow_update=False, return_missing=False):
-        """
-        Convert `document` (a list of words) into the bag-of-words format = list
-        of `(token_id, token_count)` 2-tuples. Each word is assumed to be a
-        **tokenized and normalized** utf-8 encoded string. No further preprocessing
-        is done on the words in `document`; apply tokenization, stemming etc. before
-        calling this method.
-
-        If `allow_update` or `self.allow_update` is set, then also update dictionary
-        in the process: update overall corpus statistics and document frequencies.
-        For each id appearing in this document, increase its document frequency
-        (`self.dfs`) by one.
+        """Convert `document` into the bag-of-words format, like [(1, 4), (150, 1), (2005, 2)].
+
+        Notes
+        -----
+        Each word is assumed to be a **tokenized and normalized** utf-8 encoded string. No further preprocessing
+        is done on the words in `document`; apply tokenization, stemming etc. before calling this method.
+
+        If `allow_update` or `self.allow_update` is set, then also update the dictionary in the process: update
+        overall corpus statistics and document frequencies. For each id appearing in this document, increase its
+        document frequency (`self.dfs`) by one.
+
+        Parameters
+        ----------
+        document : list of str
+            A list of tokens = **tokenized and normalized** strings (either utf8 or unicode).
+        allow_update : bool, optional
+            If True, update the dictionary in the process.
+        return_missing : bool, optional
+            Also return missing tokens. Irrelevant for this class: thanks to the hashing trick no token is
+            ever missing, so the returned dict is always empty.
+
+        Returns
+        -------
+        list of (int, int)
+            Document in Bag-of-words (BoW) format.
+        list of (int, int), dict
+            If `return_missing=True`, return document in Bag-of-words (BoW) format + an empty dictionary.
+
+        Examples
+        --------
+        >>> from gensim.corpora import HashDictionary
+        >>>
+        >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
+        >>> dct = HashDictionary(corpus)
+        >>> dct.doc2bow(["this", "is", "máma"])
+        [(1721, 1), (5280, 1), (22493, 1)]
+        >>> dct.doc2bow(["this", "is", "máma"], return_missing=True)
+        ([(1721, 1), (5280, 1), (22493, 1)], {})
 
         """
         result = {}
@@ -167,19 +260,40 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
         return result
 
     def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
-        """
-        Remove document frequency statistics for tokens that appear in
-
-        1. less than `no_below` documents (absolute number) or
-        2. more than `no_above` documents (fraction of total corpus size, *not*
-           absolute number).
-        3. after (1) and (2), keep only the first `keep_n` most frequent tokens (or
-           keep all if `None`).
-
-        **Note:** since HashDictionary's id range is fixed and doesn't depend on
-        the number of tokens seen, this doesn't really "remove" anything. It only
-        clears some supplementary statistics, for easier debugging and a smaller RAM
-        footprint.
+        """Filter tokens in dictionary by frequency.
+
+        Parameters
+        ----------
+        no_below : int, optional
+            Keep tokens which are contained in at least `no_below` documents.
+        no_above : float, optional
+            Keep tokens which are contained in no more than `no_above` documents
+            (fraction of total corpus size, not an absolute number).
+        keep_n : int, optional
+            Keep only the first `keep_n` most frequent tokens.
+
+        Notes
+        -----
+        The filtering works as follows:
+
+        #. Drop statistics for tokens that appear in less than `no_below` documents (absolute number).
+        #. Drop statistics for tokens that appear in more than `no_above` documents (fraction of total
+           corpus size, **not an absolute number**).
+        #. After (1) and (2), keep only the first `keep_n` most frequent tokens (or keep all if `None`).
+
+        Since the id range of :class:`~gensim.corpora.hashdictionary.HashDictionary` is fixed and doesn't depend
+        on the number of tokens seen, this doesn't really "remove" anything. It only clears some supplementary
+        statistics, for easier debugging and a smaller RAM footprint.
+
+        Examples
+        --------
+        >>> from gensim.corpora import HashDictionary
+        >>>
+        >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
+        >>> dct = HashDictionary(corpus)
+        >>> dct.filter_extremes(no_below=1, no_above=0.5, keep_n=1)
+        >>> print(dct.token2id)
+        {'maso': 15025}
+
         """
         no_above_abs = int(no_above * self.num_docs)  # convert fractional threshold to absolute threshold
         ok = [item for item in iteritems(self.dfs_debug) if no_below <= item[1] <= no_above_abs]
@@ -200,13 +314,28 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
         )
 
     def save_as_text(self, fname):
-        """
-        Save this HashDictionary to a text file, for easier debugging.
+        """Save this HashDictionary to a text file.
 
+        Parameters
+        ----------
+        fname : str
+            Path to output file.
+
+        Notes
+        -----
         The format is:
         `id[TAB]document frequency of this id[TAB]tab-separated set of words in UTF8 that map to this id[NEWLINE]`.
-
-        Note: use `save`/`load` to store in binary format instead (pickle).
+
+        Use :meth:`~gensim.corpora.hashdictionary.HashDictionary.save` and
+        :meth:`~gensim.corpora.hashdictionary.HashDictionary.load` to store in binary format instead (pickle).
+
+        Examples
+        --------
+        >>> from gensim.corpora import HashDictionary
+        >>> from gensim.test.utils import get_tmpfile
+        >>>
+        >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
+        >>> data = HashDictionary(corpus)
+        >>> data.save_as_text(get_tmpfile("dictionary_in_text_format"))
+
         """
         logger.info("saving HashDictionary mapping to %s" % fname)
         with utils.smart_open(fname, 'wb') as fout: