Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix docstrings for gensim.models.rpmodel #1802

Merged
merged 14 commits into from
Dec 27, 2017
4 changes: 3 additions & 1 deletion docs/src/models/rpmodel.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@
:synopsis: Random Projections
:members:
:inherited-members:

:undoc-members:
:show-inheritance:
:special-members: __getitem__
103 changes: 80 additions & 23 deletions gensim/models/rpmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,35 @@
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""Random Projections (also known as Random Indexing).

For theoretical background on Random Projections, see [1]_.


Examples
--------
>>> from gensim.models import RpModel
>>> from gensim.corpora import Dictionary
>>> from gensim.test.utils import common_texts, temporary_file
>>>
>>> dictionary = Dictionary(common_texts) # fit dictionary
>>> corpus = [dictionary.doc2bow(text) for text in common_texts] # convert texts to BoW format
>>>
>>> model = RpModel(corpus, id2word=dictionary) # fit model
>>> result = model[corpus[3]] # apply model to document, result is vector in BoW format
>>>
>>> with temporary_file("model_file") as fname:
... model.save(fname) # save model to file
... loaded_model = RpModel.load(fname) # load model


References
----------
.. [1] Kanerva et al., 2000, Random indexing of text samples for Latent Semantic Analysis,
https://cloudfront.escholarship.org/dist/prd/content/qt5644k0w6/qt5644k0w6.pdf

"""

import logging

import numpy as np
Expand All @@ -16,30 +45,21 @@


class RpModel(interfaces.TransformationABC):
"""
Objects of this class allow building and maintaining a model for Random Projections
(also known as Random Indexing). For theoretical background on RP, see:

Kanerva et al.: "Random indexing of text samples for Latent Semantic Analysis."

The main methods are:
def __init__(self, corpus, id2word=None, num_topics=300):
"""

1. constructor, which creates the random projection matrix
2. the [] method, which transforms a simple count representation into the TfIdf
space.
Parameters
----------
corpus : iterable of iterable of (int, int)
Input corpus.

>>> rp = RpModel(corpus)
>>> print(rp[some_doc])
>>> rp.save('/tmp/foo.rp_model')
id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
Mapping `token_id` -> `token`, will be determined from the corpus if `id2word` is None.

Model persistency is achieved via its load/save methods.
"""
num_topics : int, optional
Number of topics.

def __init__(self, corpus, id2word=None, num_topics=300):
"""
`id2word` is a mapping from word ids (integers) to words (strings). It is
used to determine the vocabulary size, as well as for debugging and topic
printing. If not set, it will be determined from the corpus.
"""
self.id2word = id2word
self.num_topics = num_topics
Expand All @@ -50,8 +70,13 @@ def __str__(self):
return "RpModel(num_terms=%s, num_topics=%s)" % (self.num_terms, self.num_topics)

def initialize(self, corpus):
"""
Initialize the random projection matrix.
"""Initialize the random projection matrix.

Parameters
----------
corpus : iterable of iterable of (int, int)
Input corpus.

"""
if self.id2word is None:
logger.info("no word id mapping provided; initializing from corpus, assuming identity")
Expand All @@ -73,8 +98,32 @@ def initialize(self, corpus):
# are smarter and this is no longer needed?

def __getitem__(self, bow):
"""
Return RP representation of the input vector and/or corpus.
"""Get random-projection representation of the input vector or corpus.

Parameters
----------
bow : {list of (int, int), iterable of list of (int, int)}
Input document or corpus.

Returns
-------
list of (int, float)
if `bow` is document OR
:class:`~gensim.interfaces.TransformedCorpus`
if `bow` is corpus.

Examples
--------
>>> from gensim.models import RpModel
>>> from gensim.corpora import Dictionary
>>> from gensim.test.utils import common_texts
>>>
>>> dictionary = Dictionary(common_texts) # fit dictionary
>>> corpus = [dictionary.doc2bow(text) for text in common_texts] # convert texts to BoW format
>>>
>>> model = RpModel(corpus, id2word=dictionary) # fit model
>>> result = model[corpus[0]] # apply model to document, result is vector in BoW format, i.e. [(1, 0.3), ... ]

"""
# if the input vector is in fact a corpus, return a transformed corpus as result
is_corpus, bow = utils.is_corpus(bow)
Expand All @@ -96,5 +145,13 @@ def __getitem__(self, bow):
]

def __setstate__(self, state):
"""Sets the internal state and updates freshly_loaded to True, called when unpickled.

Parameters
----------
state : dict
State of the class.

"""
self.__dict__ = state
self.freshly_loaded = True