diff --git a/docs/src/models/rpmodel.rst b/docs/src/models/rpmodel.rst index 47eba01262..91ef71872a 100644 --- a/docs/src/models/rpmodel.rst +++ b/docs/src/models/rpmodel.rst @@ -5,4 +5,6 @@ :synopsis: Random Projections :members: :inherited-members: - + :undoc-members: + :show-inheritance: + :special-members: __getitem__ diff --git a/gensim/models/rpmodel.py b/gensim/models/rpmodel.py index 0c8f7c8b26..0826a7c359 100644 --- a/gensim/models/rpmodel.py +++ b/gensim/models/rpmodel.py @@ -5,6 +5,35 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +"""Random Projections (also known as Random Indexing). + +For theoretical background on Random Projections, see [1]_. + + +Examples +-------- +>>> from gensim.models import RpModel +>>> from gensim.corpora import Dictionary +>>> from gensim.test.utils import common_texts, temporary_file +>>> +>>> dictionary = Dictionary(common_texts) # fit dictionary +>>> corpus = [dictionary.doc2bow(text) for text in common_texts] # convert texts to BoW format +>>> +>>> model = RpModel(corpus, id2word=dictionary) # fit model +>>> result = model[corpus[3]] # apply model to document, result is vector in BoW format +>>> +>>> with temporary_file("model_file") as fname: +... model.save(fname) # save model to file +... loaded_model = RpModel.load(fname) # load model + + +References +---------- +.. [1] Kanerva et al., 2000, Random indexing of text samples for Latent Semantic Analysis, + https://cloudfront.escholarship.org/dist/prd/content/qt5644k0w6/qt5644k0w6.pdf + +""" + import logging import numpy as np @@ -16,30 +45,21 @@ class RpModel(interfaces.TransformationABC): - """ - Objects of this class allow building and maintaining a model for Random Projections - (also known as Random Indexing). For theoretical background on RP, see: - - Kanerva et al.: "Random indexing of text samples for Latent Semantic Analysis." - The main methods are: + def __init__(self, corpus, id2word=None, num_topics=300): + """ - 1. 
constructor, which creates the random projection matrix - 2. the [] method, which transforms a simple count representation into the TfIdf - space. + Parameters + ---------- + corpus : iterable of iterable of (int, int) + Input corpus. - >>> rp = RpModel(corpus) - >>> print(rp[some_doc]) - >>> rp.save('/tmp/foo.rp_model') + id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional + Mapping `token_id` -> `token`, will be determined from corpus if `id2word == None`. - Model persistency is achieved via its load/save methods. - """ + num_topics : int, optional + Number of topics. - def __init__(self, corpus, id2word=None, num_topics=300): - """ - `id2word` is a mapping from word ids (integers) to words (strings). It is - used to determine the vocabulary size, as well as for debugging and topic - printing. If not set, it will be determined from the corpus. """ self.id2word = id2word self.num_topics = num_topics @@ -50,8 +70,13 @@ def __str__(self): return "RpModel(num_terms=%s, num_topics=%s)" % (self.num_terms, self.num_topics) def initialize(self, corpus): - """ - Initialize the random projection matrix. + """Initialize the random projection matrix. + + Parameters + ---------- + corpus : iterable of iterable of (int, int) + Input corpus. + """ if self.id2word is None: logger.info("no word id mapping provided; initializing from corpus, assuming identity") @@ -73,8 +98,32 @@ def initialize(self, corpus): # are smarter and this is no longer needed? def __getitem__(self, bow): - """ - Return RP representation of the input vector and/or corpus. + """Get random-projection representation of the input vector or corpus. + + Parameters + ---------- + bow : {list of (int, int), iterable of list of (int, int)} + Input document or corpus. + + Returns + ------- + list of (int, float) + if `bow` is document OR + :class:`~gensim.interfaces.TransformedCorpus` + if `bow` is corpus. 
+ + Examples + -------- + >>> from gensim.models import RpModel + >>> from gensim.corpora import Dictionary + >>> from gensim.test.utils import common_texts + >>> + >>> dictionary = Dictionary(common_texts) # fit dictionary + >>> corpus = [dictionary.doc2bow(text) for text in common_texts] # convert texts to BoW format + >>> + >>> model = RpModel(corpus, id2word=dictionary) # fit model + >>> result = model[corpus[0]] # apply model to document, result is vector in BoW format, i.e. [(1, 0.3), ... ] + """ # if the input vector is in fact a corpus, return a transformed corpus as result is_corpus, bow = utils.is_corpus(bow) @@ -96,5 +145,13 @@ def __getitem__(self, bow): ] def __setstate__(self, state): + """Sets the internal state and updates freshly_loaded to True, called when unpickled. + + Parameters + ---------- + state : dict + State of the class. + + """ self.__dict__ = state self.freshly_loaded = True