From 7f699bf71d66aa925ab21c0e6ff4b9c60501376f Mon Sep 17 00:00:00 2001
From: Sourav Singh
Date: Thu, 15 Feb 2018 00:15:28 +0530
Subject: [PATCH 01/15] Add docstrings for Author-topic model

The PR aims to fix the docstrings for Author-topic model.
---
 gensim/models/atmodel.py | 170 +++++++++++++++++++--------------------
 1 file changed, 85 insertions(+), 85 deletions(-)

diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py
index 5463e8a025..837553ccda 100755
--- a/gensim/models/atmodel.py
+++ b/gensim/models/atmodel.py
@@ -68,6 +68,18 @@ class AuthorTopicState(LdaState):
     """

     def __init__(self, eta, lambda_shape, gamma_shape):
+        """Initializes parameters for the Author-Topic model.
+
+        Parameters
+        ----------
+        eta: float
+            Dirichlet topic parameter for sparsity.
+        lambda_shape: float
+            Initialize topic parameters.
+        gamma_shape: int
+            Initialize topic parameters.
+
+        """
         self.eta = eta
         self.sstats = np.zeros(lambda_shape)
         self.gamma = np.zeros(gamma_shape)
@@ -76,7 +88,16 @@ def __init__(self, eta, lambda_shape, gamma_shape):

 def construct_doc2author(corpus, author2doc):
-    """Make a mapping from document IDs to author IDs."""
+    """Make a mapping from document IDs to author IDs.
+
+    Parameters
+    ----------
+    corpus: list of list of str
+        Corpus of documents.
+    author2doc: dict
+        Mapping of authors to documents.
+
+    """
     doc2author = {}
     for d, _ in enumerate(corpus):
         author_ids = []
@@ -88,7 +109,13 @@ def construct_doc2author(corpus, author2doc):

 def construct_author2doc(doc2author):
-    """Make a mapping from author IDs to document IDs."""
+    """Make a mapping from author IDs to document IDs.
+
+    Parameters
+    ----------
+    doc2author: dict
+        Mapping of documents to authors.
+    """

     # First get a set of all authors.
     authors_ids = set()
@@ -107,18 +134,7 @@ def construct_author2doc(doc2author):

 class AuthorTopicModel(LdaModel):
-    """
-    The constructor estimates the author-topic model parameters based
-    on a training corpus:
-
-    >>> model = AuthorTopicModel(corpus, num_topics=10, author2doc=author2doc, id2word=id2word)
-
-    The model can be updated (trained) with new documents via
-
-    >>> model.update(other_corpus, other_author2doc)
-
-    Model persistency is achieved through its `load`/`save` methods.
-    """
+    """The constructor estimates the author-topic model parameters based on a training corpus."""

     def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, doc2author=None,
                  chunksize=2000, passes=1, iterations=50, decay=0.5, offset=1.0,
                  alpha=None, eta=None, update_every=1, eval_every=10,
                  gamma_threshold=0.001, serialized=False, serialization_path=None,
                  minimum_probability=0.01, random_state=None):
         """
-        If the iterable corpus and one of author2doc/doc2author dictionaries are given,
-        start training straight away. If not given, the model is left untrained
-        (presumably because you want to call the `update` method manually).
-
-        `num_topics` is the number of requested latent topics to be extracted from
-        the training corpus.
-
-        `id2word` is a mapping from word ids (integers) to words (strings). It is
-        used to determine the vocabulary size, as well as for debugging and topic
-        printing.
-
-        `author2doc` is a dictionary where the keys are the names of authors, and the
-        values are lists of documents that the author contributes to.
-
-        `doc2author` is a dictionary where the keys are document IDs (indexes to corpus)
-        and the values are lists of author names. I.e. this is the reverse mapping of
-        `author2doc`.
Only one of the two, `author2doc` and `doc2author` have to be - supplied. - - `passes` is the number of times the model makes a pass over the entire trianing - data. - - `iterations` is the maximum number of times the model loops over each document - (M-step). The iterations stop when convergence is reached. - - `chunksize` controls the size of the mini-batches. - - `alpha` and `eta` are hyperparameters that affect sparsity of the author-topic - (theta) and topic-word (lambda) distributions. Both default to a symmetric - 1.0/num_topics prior. - - `alpha` can be set to an explicit array = prior of your choice. It also - support special values of 'asymmetric' and 'auto': the former uses a fixed - normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric - prior directly from your data. - - `eta` can be a scalar for a symmetric prior over topic/word - distributions, or a vector of shape num_words, which can be used to - impose (user defined) asymmetric priors over the word distribution. - It also supports the special value 'auto', which learns an asymmetric - prior over words directly from your data. `eta` can also be a matrix - of shape num_topics x num_words, which can be used to impose - asymmetric priors over the word distribution on a per-topic basis - (can not be learned from data). - - Calculate and log perplexity estimate from the latest mini-batch every - `eval_every` model updates. Set to None to disable perplexity estimation. - - `decay` and `offset` parameters are the same as Kappa and Tau_0 in - Hoffman et al, respectively. `decay` controls how quickly old documents are - forgotten, while `offset` down-weights early iterations. - - `minimum_probability` controls filtering the topics returned for a document (bow). - - `random_state` can be an integer or a numpy.random.RandomState object. Set the - state of the random number generator inside the author-topic model, to ensure - reproducibility of your experiments, for example. - - `serialized` indicates whether the input corpora to the model are simple - in-memory lists (`serialized = False`) or saved to the hard-drive - (`serialized = True`). Note that this behaviour is quite different from - other Gensim models. If your data is too large to fit in to memory, use - this functionality. Note that calling `AuthorTopicModel.update` with new - data may be cumbersome as it requires all the existing data to be - re-serialized. - - `serialization_path` must be set to a filepath, if `serialized = True` is - used. Use, for example, `serialization_path = /tmp/serialized_model.mm` or use your - working directory by setting `serialization_path = serialized_model.mm`. An existing - file *cannot* be overwritten; either delete the old file or choose a different - name. + API for Author-Topic model. + + Parameters + ---------- + num_topic: int, optional + Number of topics to be extracted from the training corpus. + + id2word: dict of {int: str}, optional + A mapping from word ids (integers) to words (strings). + + author2doc: dict + A dictionary where keys are the names of authors and values are lists of + documents that the author contributes to. + + doc2author: dict + A dictionary where the keys are document IDs and the values are lists of author names. + + passes: int + Number of times the model makes a pass over the entire training data. + + iterations: int + Maximum number of times the model loops over each document + + chunksize: int + Controls the size of the mini-batches. 
+ + alpha: float + Hyperparameters for author-topic model.Supports special values of 'asymmetric' + and 'auto': the former uses a fixed normalized asymmetric 1.0/topicno prior, + the latter learns an asymmetric prior directly from your data. + + eta: float + Hyperparameters for author-topic model. + + eval_every: int + Calculate and estimate log perplexity for latest mini-batch. + + decay: float + Controls how old documents are forgotten. + + offset: float + Controls down-weighting of iterations. + + minimum_probability: float + Controls filtering the topics returned for a document (bow). + + random_state: int or a numpy.random.RandomState object. + Set the state of the random number generator inside the author-topic model. + + serialized: bool + Indicates whether the input corpora to the model are simple lists + or saved to the hard-drive. + + serialization_path: str + Must be set to a filepath, if `serialized = True` is used. Example: From fc82f9ad79b2b3cc9cce99d925b0c96c008d5a1e Mon Sep 17 00:00:00 2001 From: Sourav Date: Sun, 18 Feb 2018 22:17:30 +0530 Subject: [PATCH 02/15] Make PEP8 fixes --- gensim/models/atmodel.py | 58 ++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 837553ccda..f89692a36d 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -69,7 +69,7 @@ class AuthorTopicState(LdaState): def __init__(self, eta, lambda_shape, gamma_shape): """Ïnitializes parameters for the Author-Topic model. - + Parameters ---------- eta: float @@ -78,7 +78,7 @@ def __init__(self, eta, lambda_shape, gamma_shape): Initialize topic parameters. gamma_shape: int Initialize topic parameters. - + """ self.eta = eta self.sstats = np.zeros(lambda_shape) @@ -89,14 +89,14 @@ def __init__(self, eta, lambda_shape, gamma_shape): def construct_doc2author(corpus, author2doc): """Make a mapping from document IDs to author IDs. - + Parameters ---------- corpus: list of list of str Corpus of documents. author2doc: dict Mapping of authors to documents. - + """ doc2author = {} for d, _ in enumerate(corpus): @@ -110,7 +110,7 @@ def construct_doc2author(corpus, author2doc): def construct_author2doc(doc2author): """Make a mapping from author IDs to document IDs. - + Parameters ---------- doc2author: dict @@ -143,7 +143,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d minimum_probability=0.01, random_state=None): """ API for Author-Topic model. - + Parameters ---------- num_topic: int, optional @@ -152,51 +152,51 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d id2word: dict of {int: str}, optional A mapping from word ids (integers) to words (strings). - author2doc: dict - A dictionary where keys are the names of authors and values are lists of + author2doc: dict + A dictionary where keys are the names of authors and values are lists of documents that the author contributes to. doc2author: dict - A dictionary where the keys are document IDs and the values are lists of author names. + A dictionary where the keys are document IDs and the values are lists of author names. passes: int Number of times the model makes a pass over the entire training data. iterations: int Maximum number of times the model loops over each document - - chunksize: int + + chunksize: int Controls the size of the mini-batches. 
alpha: float - Hyperparameters for author-topic model.Supports special values of 'asymmetric' - and 'auto': the former uses a fixed normalized asymmetric 1.0/topicno prior, + Hyperparameters for author-topic model.Supports special values of 'asymmetric' + and 'auto': the former uses a fixed normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric prior directly from your data. - + eta: float Hyperparameters for author-topic model. - + eval_every: int Calculate and estimate log perplexity for latest mini-batch. - + decay: float Controls how old documents are forgotten. - + offset: float Controls down-weighting of iterations. - - minimum_probability: float + + minimum_probability: float Controls filtering the topics returned for a document (bow). - random_state: int or a numpy.random.RandomState object. + random_state: int or a numpy.random.RandomState object. Set the state of the random number generator inside the author-topic model. - - serialized: bool - Indicates whether the input corpora to the model are simple lists + + serialized: bool + Indicates whether the input corpora to the model are simple lists or saved to the hard-drive. - serialization_path: str - Must be set to a filepath, if `serialized = True` is used. + serialization_path: str + Must be set to a filepath, if `serialized = True` is used. Example: @@ -287,8 +287,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d self.random_state = utils.get_random_state(random_state) assert (self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms)), ( - "Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" % - (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms) + "Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" % + (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms) ) # VB constants @@ -435,7 +435,7 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c # phi is computed implicitly below, for ai, a in enumerate(authors_d): tilde_gamma[ai, :] = self.alpha + len(self.author2doc[self.id2author[a]])\ - * expElogthetad[ai, :] * np.dot(cts / phinorm, expElogbetad.T) + * expElogthetad[ai, :] * np.dot(cts / phinorm, expElogbetad.T) # Update gamma. # Interpolation between document d's "local" gamma (tilde_gamma), @@ -516,7 +516,7 @@ def log_perplexity(self, chunk, chunk_doc_idx=None, total_docs=None): corpus_words = sum(cnt for document in chunk for _, cnt in document) subsample_ratio = 1.0 * total_docs / len(chunk) perwordbound = self.bound(chunk, chunk_doc_idx, subsample_ratio=subsample_ratio) / \ - (subsample_ratio * corpus_words) + (subsample_ratio * corpus_words) logger.info( "%.3f per-word bound, %.1f perplexity estimate based on a corpus of %i documents with %i words", perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words From 5e81b7e81017a0e30a994299215b8c3c0ab6e32a Mon Sep 17 00:00:00 2001 From: Sourav Date: Tue, 20 Feb 2018 18:37:40 +0530 Subject: [PATCH 03/15] Add docstrings for rest of the functions. --- gensim/models/atmodel.py | 159 ++++++++++++++++++++++++++++----------- 1 file changed, 115 insertions(+), 44 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index f89692a36d..d0a8823737 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -74,7 +74,7 @@ def __init__(self, eta, lambda_shape, gamma_shape): ---------- eta: float Dirichlet topic parameter for sparsity. 
- lambda_shape: float + lambda_shape: int Initialize topic parameters. gamma_shape: int Initialize topic parameters. @@ -94,7 +94,7 @@ def construct_doc2author(corpus, author2doc): ---------- corpus: list of list of str Corpus of documents. - author2doc: dict + author2doc: dict of (str, str) Mapping of authors to documents. """ @@ -113,7 +113,7 @@ def construct_author2doc(doc2author): Parameters ---------- - doc2author: dict + doc2author: dict of (str, str) Mapping of documents to authors. """ @@ -148,63 +148,48 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d ---------- num_topic: int, optional Number of topics to be extracted from the training corpus. - id2word: dict of {int: str}, optional A mapping from word ids (integers) to words (strings). - author2doc: dict A dictionary where keys are the names of authors and values are lists of documents that the author contributes to. - doc2author: dict A dictionary where the keys are document IDs and the values are lists of author names. - passes: int Number of times the model makes a pass over the entire training data. - iterations: int Maximum number of times the model loops over each document - chunksize: int Controls the size of the mini-batches. - alpha: float Hyperparameters for author-topic model.Supports special values of 'asymmetric' and 'auto': the former uses a fixed normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric prior directly from your data. - eta: float - Hyperparameters for author-topic model. - + Hyperparameters for author-topic model.\ eval_every: int Calculate and estimate log perplexity for latest mini-batch. - decay: float Controls how old documents are forgotten. - offset: float Controls down-weighting of iterations. - minimum_probability: float Controls filtering the topics returned for a document (bow). - random_state: int or a numpy.random.RandomState object. - Set the state of the random number generator inside the author-topic model. - + Set the state of the random number generator inside the author-topic model. serialized: bool Indicates whether the input corpora to the model are simple lists or saved to the hard-drive. - serialization_path: str - Must be set to a filepath, if `serialized = True` is used. - - Example: + Must be set to a filepath, if `serialized = True` is used. + Example + ------- + >>> import numpy as np + >>> from gensim.models import AuthorTopicModel >>> model = AuthorTopicModel(corpus, num_topics=100, author2doc=author2doc, id2word=id2word) # train model >>> model.update(corpus2) # update the author-topic model with additional documents - >>> model = AuthorTopicModel( - ... corpus, num_topics=50, author2doc=author2doc, id2word=id2word, alpha='auto', eval_every=5) """ # NOTE: this doesn't call constructor of a base class, but duplicates most of this code @@ -306,6 +291,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d self.update(corpus, author2doc, doc2author, chunks_as_numpy=use_numpy) def __str__(self): + """Return a string representation of AuthorTopicModel class.""" return "AuthorTopicModel(num_terms=%s, num_topics=%s, num_authors=%s, decay=%s, chunksize=%s)" % \ (self.num_terms, self.num_topics, self.num_authors, self.decay, self.chunksize) @@ -332,6 +318,11 @@ def extend_corpus(self, corpus): are added in the process. If serialization is not used, the corpus, as a list of documents, is simply extended. + Parameters + ---------- + corpus: list of list of str + Corpus of documents. 
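+
+        Example
+        -------
+        A minimal usage sketch; the trained `model` and the extra BoW documents
+        below are illustrative assumptions, not values from the test suite:
+
+        >>> extra_corpus = [[(0, 1), (1, 1)], [(1, 2), (2, 1)]]  # two new documents as (token_id, count) pairs
+        >>> model.extend_corpus(extra_corpus)  # model.corpus now also contains the two new documents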
+ """ if self.serialized: # Re-serialize the entire corpus while appending the new documents. @@ -354,7 +345,13 @@ def extend_corpus(self, corpus): self.corpus.extend(corpus) def compute_phinorm(self, expElogthetad, expElogbetad): - """Efficiently computes the normalizing factor in phi.""" + """Efficiently computes the normalizing factor in phi. + + Parameters + ---------- + expElogthetad: numpy.ndarray + expElogbetad: numpy.ndarray + """ expElogtheta_sum = expElogthetad.sum(axis=0) phinorm = expElogtheta_sum.dot(expElogbetad) + 1e-100 @@ -379,6 +376,26 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c Avoids computing the `phi` variational parameter directly using the optimization presented in **Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001**. + Parameters + ---------- + chunk: int + The chunk numer of the sparse document vector on which inference needs to be done. + author2doc: dict + A dictionary where keys are the names of authors and values are lists of + documents that the author contributes to. + doc2author: dict + A dictionary where the keys are document IDs and the values are lists of author names. + rhot: float + Value of rho for conducting inference on documents. + collect_sstats: boolean, optional + If True, collect sufficient statistics needed to update the model's topic-word + distributions, and return a 2-tuple `(gamma_chunk, sstats)`. + Otherwise, returns `(gamma_chunk, None)`.`gamma_chunk` is of shape + `len(chunk_authors) x self.num_topics`,where `chunk_authors` is the + number of authors in the documents in the current chunk. + chunk_doc_idx: numpy.ndarray + Assigns the value for document index. + """ try: len(chunk) @@ -486,9 +503,25 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c def do_estep(self, chunk, author2doc, doc2author, rhot, state=None, chunk_doc_idx=None): """ - Perform inference on a chunk of documents, and accumulate the collected + Performs inference on a chunk of documents, and accumulate the collected sufficient statistics in `state` (or `self.state` if None). + Parameters + ---------- + chunk: int + The chunk numer of the sparse document vector on which inference needs to be done. + author2doc: dict + A dictionary where keys are the names of authors and values are lists of + documents that the author contributes to. + doc2author: dict + A dictionary where the keys are document IDs and the values are lists of author names. + rhot: float + Value of rho for conducting inference on documents. + state: int, optional + Initializes the state for a new E-M iteration. + chunk_doc_idx: numpy.ndarray + Assigns the value for document index. + """ # TODO: this method is somewhat similar to the one in LdaModel. Refactor if possible. @@ -508,6 +541,15 @@ def log_perplexity(self, chunk, chunk_doc_idx=None, total_docs=None): documents as evaluation corpus. Also output the calculated statistics. incl. perplexity=2^(-bound), to log at INFO level. + Parameters + ---------- + chunk: int + The chunk numer of the sparse document vector on which inference needs to be done. + chunk_doc_idx: numpy.ndarray + Assigns the value for document index. + total_docs: int + Initializes the value for total number of documents. + """ # TODO: This method is very similar to the one in LdaModel. Refactor. 
@@ -557,21 +599,22 @@ def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, It is not possible to add new authors to existing documents, as all documents in `corpus` are assumed to be new documents. - Args: - corpus (gensim corpus): The corpus with which the author-topic model should be updated. - - author2doc (dict): author to document mapping corresponding to indexes in input - corpus. - - doc2author (dict): document to author mapping corresponding to indexes in input - corpus. - - chunks_as_numpy (bool): Whether each chunk passed to `.inference` should be a np - array of not. np can in some settings turn the term IDs - into floats, these will be converted back into integers in - inference, which incurs a performance hit. For distributed - computing it may be desirable to keep the chunks as np - arrays. + Parameters + ---------- + corpus: list of list of str + The corpus with which the author-topic model should be updated. + author2doc: dict + A dictionary where keys are the names of authors and values are lists of + documents that the author contributes to. + doc2author: dict + A dictionary where the keys are document IDs and the values are lists of author names. + chunks_as_numpy: bool + Whether each chunk passed to `.inference` should be a np + array of not. np can in some settings turn the term IDs + into floats, these will be converted back into integers in + inference, which incurs a performance hit. For distributed + computing it may be desirable to keep the chunks as np + arrays. For other parameter settings, see :class:`AuthorTopicModel` constructor. @@ -807,8 +850,23 @@ def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, corrsponding to this test set are provided. There must not be any new authors passed to this method. `chunk_doc_idx` is not needed in this case. - To obtain the per-word bound, compute: + Parameters + ---------- + chunk: int + The chunk numer of the sparse document vector on which inference needs to be done. + author2doc: dict + A dictionary where keys are the names of authors and values are lists of + documents that the author contributes to. + doc2author: dict + A dictionary where the keys are document IDs and the values are lists of author names. + chunk_doc_idx: numpy.ndarray + Assigns the value for document index. + subsample_ratio: float, optional + Used for calculation of word score for estimation of variational bound. + + Example + ------- >>> corpus_words = sum(cnt for document in corpus for _, cnt in document) >>> model.bound(corpus, author2doc=author2doc, doc2author=doc2author) / corpus_words @@ -928,6 +986,12 @@ def get_author_topics(self, author_name, minimum_probability=None): Obtaining topic probabilities of each word, as in LDA (via `per_word_topics`), is not supported. + Parameters + ---------- + author_name: str + Name of the author for which the topic distribution needs to be estimated. + minimum_probability: float, optional + Sets the minimum probability value for showing the topics of a given author. """ author_id = self.author2id[author_name] @@ -950,10 +1014,17 @@ def __getitem__(self, author_names, eps=None): Return topic distribution for input author as a list of (topic_id, topic_probabiity) 2-tuples. - Ingores topics with probaility less than `eps`. + Ignores topics with probaility less than `eps`. Do not call this method directly, instead use `model[author_names]`. 
+ Parameters + ---------- + author_names: str + Name of the author for which the topic distribution needs to be estimated. + eps: float, optional + Sets the minimum probability value for showing the topics of a given author. + """ if isinstance(author_names, list): items = [] From 5017968ad27939e23b62f9f236fd9ebdc63b45bb Mon Sep 17 00:00:00 2001 From: Sourav Date: Tue, 20 Feb 2018 18:54:56 +0530 Subject: [PATCH 04/15] Make updates to the docstrings --- gensim/models/atmodel.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index d0a8823737..8c866508e6 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -166,9 +166,14 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d and 'auto': the former uses a fixed normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric prior directly from your data. eta: float - Hyperparameters for author-topic model.\ + Hyperparameters for author-topic model. + update_every: int + Make updates in topic probaility for latest mini-batch. eval_every: int Calculate and estimate log perplexity for latest mini-batch. + gamma_threshold: float + Threshold value of gamma(topic difference between consecutive two topics) + until which the iterations continue. decay: float Controls how old documents are forgotten. offset: float @@ -608,6 +613,23 @@ def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, documents that the author contributes to. doc2author: dict A dictionary where the keys are document IDs and the values are lists of author names. + chunksize: int + Controls the size of the mini-batches. + decay: float + Controls how old documents are forgotten. + offset: float + Controls down-weighting of iterations. + passes: int + Number of times the model makes a pass over the entire training data. + update_every: int + Make updates in topic probaility for latest mini-batch. + eval_every: int + Calculate and estimate log perplexity for latest mini-batch. + iterations: int + Maximum number of times the model loops over each document + gamma_threshold: float + Threshold value of gamma(topic difference between consecutive two topics) + until which the iterations continue. chunks_as_numpy: bool Whether each chunk passed to `.inference` should be a np array of not. np can in some settings turn the term IDs @@ -616,8 +638,6 @@ def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, computing it may be desirable to keep the chunks as np arrays. - For other parameter settings, see :class:`AuthorTopicModel` constructor. - """ # use parameters given in constructor, unless user explicitly overrode them From d4d13fd76e16341c0bd55ec7acd1706d75859531 Mon Sep 17 00:00:00 2001 From: Sourav Singh Date: Thu, 22 Feb 2018 00:52:27 +0530 Subject: [PATCH 05/15] Make fixes to docstrings --- gensim/models/atmodel.py | 198 +++++++++++++++++++++++++-------------- 1 file changed, 127 insertions(+), 71 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 8c866508e6..536c685467 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -19,15 +19,12 @@ The model is closely related to Latent Dirichlet Allocation. The AuthorTopicModel class inherits the LdaModel class, and its usage is thus similar. -Distributed computation and multiprocessing is not implemented at the moment, but may be -coming in the future. 
The model was introduced by Rosen-Zvi and co-authors in 2004 and is described in [1]_
and a tutorial for using Author-topic model can be found at [2]_.

.. [1] The Author-Topic Model for Authors and Documents, https://arxiv.org/abs/1207.4169

.. [2] https://github.com/RaRe-Technologies/gensim/tree/develop/docs/notebooks/atmodel_tutorial.ipynb.

"""

# TODO: this class inherits LdaModel and overwrites some methods. There is some code

 class AuthorTopicState(LdaState):
     """
     NOTE: distributed mode not available yet in the author-topic model. This AuthorTopicState
-    object is kept so that when the time comes to imlement it, it will be easier.
+    object is kept so that when the time comes to implement it, it will be easier.

     Encapsulate information for distributed computation of AuthorTopicModel objects.

 def construct_doc2author(corpus, author2doc):
     corpus: list of list of str
         Corpus of documents.
-    author2doc: dict of (str, str)
+    author2doc: dict of (str: list of int)
         Mapping of authors to documents.
+
+    Returns
+    -------
+    dict of {int: list of str}
+        Document to Author mapping.
     """

 def construct_author2doc(doc2author):
     Parameters
     ----------
-    doc2author: dict of (str, str)
+    doc2author: dict of {int: list of str}
         Mapping of documents to authors.
+
+    Returns
+    -------
+    dict of {str: list of int}
+        Mapping of authors to documents.
+
+    Examples
+    --------
+    >>> from gensim.models.atmodel import construct_author2doc
+    >>> doc2author = {
+    ...     0: ['john', 'jack'],
+    ...     1: ['john', 'jill'],
+    ...     2: ['john', 'jane', 'jack']
+    ... }
+    >>> author2doc = construct_author2doc(doc2author)

         Parameters
         ----------
         corpus : iterable of iterable of (int, int)
             Corpus of documents in appropriate format (BoW, UCI, etc.).
         num_topics : int, optional
             Number of topics to be extracted from the training corpus.
         id2word : dict of {int: str}, optional
             A mapping from word ids (integers) to words (strings).
         author2doc : dict of {str: list of int}
             A dictionary where keys are the names of authors and values are lists of
             documents that the author contributes to.
         doc2author : dict of {int: list of str}
             A dictionary where the keys are document IDs and the values are lists of author names.
         passes : int
             Number of times the model makes a pass over the entire training data.
         iterations : int
             Maximum number of times the model loops over each document.
         chunksize : int
             Controls the size of the mini-batches.
         alpha : float
             Hyperparameters for author-topic model. Supports special values of 'asymmetric'
             and 'auto': the former uses a fixed normalized asymmetric 1.0/topicno prior,
             the latter learns an asymmetric prior directly from your data.
         eta : float
             Hyperparameters for author-topic model.
         update_every : int
             Make updates in topic probability for latest mini-batch.
- eval_every: int + eval_every : int Calculate and estimate log perplexity for latest mini-batch. - gamma_threshold: float + gamma_threshold : float Threshold value of gamma(topic difference between consecutive two topics) until which the iterations continue. - decay: float + decay : float Controls how old documents are forgotten. - offset: float + offset : float Controls down-weighting of iterations. - minimum_probability: float + minimum_probability : float Controls filtering the topics returned for a document (bow). - random_state: int or a numpy.random.RandomState object. + random_state : int or a numpy.random.RandomState object. Set the state of the random number generator inside the author-topic model. - serialized: bool + serialized : bool Indicates whether the input corpora to the model are simple lists or saved to the hard-drive. - serialization_path: str + serialization_path : str Must be set to a filepath, if `serialized = True` is used. Example @@ -193,7 +213,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d >>> import numpy as np >>> from gensim.models import AuthorTopicModel >>> model = AuthorTopicModel(corpus, num_topics=100, author2doc=author2doc, id2word=id2word) # train model - >>> model.update(corpus2) # update the author-topic model with additional documents + >>> model.update(corpus, author2doc) # update the author-topic model with additional documents """ @@ -325,9 +345,8 @@ def extend_corpus(self, corpus): Parameters ---------- - corpus: list of list of str + corpus : iterable of iterable of (int, int) Corpus of documents. - """ if self.serialized: # Re-serialize the entire corpus while appending the new documents. @@ -355,7 +374,15 @@ def compute_phinorm(self, expElogthetad, expElogbetad): Parameters ---------- expElogthetad: numpy.ndarray + Value of variational distribution q(theta|gamma). expElogbetad: numpy.ndarray + Value of variational distribution q(beta|lambda). + + Returns + ------- + float + Value of normalizing factor. + """ expElogtheta_sum = expElogthetad.sum(axis=0) phinorm = expElogtheta_sum.dot(expElogbetad) + 1e-100 @@ -383,22 +410,22 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c Parameters ---------- - chunk: int + chunk : int The chunk numer of the sparse document vector on which inference needs to be done. - author2doc: dict + author2doc : dict of {str: list of int} A dictionary where keys are the names of authors and values are lists of documents that the author contributes to. - doc2author: dict + doc2author : dict of {int: list of str} A dictionary where the keys are document IDs and the values are lists of author names. - rhot: float + rhot : float Value of rho for conducting inference on documents. - collect_sstats: boolean, optional + collect_sstats : boolean, optional If True, collect sufficient statistics needed to update the model's topic-word distributions, and return a 2-tuple `(gamma_chunk, sstats)`. Otherwise, returns `(gamma_chunk, None)`.`gamma_chunk` is of shape `len(chunk_authors) x self.num_topics`,where `chunk_authors` is the number of authors in the documents in the current chunk. - chunk_doc_idx: numpy.ndarray + chunk_doc_idx : numpy.ndarray Assigns the value for document index. """ @@ -513,20 +540,24 @@ def do_estep(self, chunk, author2doc, doc2author, rhot, state=None, chunk_doc_id Parameters ---------- - chunk: int + chunk : int The chunk numer of the sparse document vector on which inference needs to be done. 
        author2doc : dict of {str: list of ints}
            A dictionary where keys are the names of authors and values are lists of
            documents that the author contributes to.
        doc2author : dict of {int: list of str}
            A dictionary where the keys are document IDs and the values are lists of author names.
        rhot : float
            Value of rho for conducting inference on documents.
        state : int, optional
            Initializes the state for a new E-M iteration.
        chunk_doc_idx : numpy.ndarray
            Assigns the value for document index.

        Returns
        -------
        float
            Value of gamma for training of the model.
        """
        # TODO: this method is somewhat similar to the one in LdaModel. Refactor if possible.

    def log_perplexity(self, chunk, chunk_doc_idx=None, total_docs=None):
        Parameters
        ----------
        chunk : int
            The chunk number of the sparse document vector on which inference needs to be done.
        chunk_doc_idx : numpy.ndarray
            Assigns the value for document index.
        total_docs : int
            Initializes the value for total number of documents.

        Returns
        -------
        float
            Value of per-word likelihood bound.

        """
        # TODO: This method is very similar to the one in LdaModel. Refactor.

    def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None,
        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
            The corpus with which the author-topic model should be updated.
        author2doc : dict of {str: list of ints}
            A dictionary where keys are the names of authors and values are lists of
            documents that the author contributes to.
        doc2author : dict of {int: list of str}
            A dictionary where the keys are document IDs and the values are lists of author names.
        chunksize : int
            Controls the size of the mini-batches.
        decay : float
            Controls how old documents are forgotten.
        offset : float
            Controls down-weighting of iterations.
        passes : int
            Number of times the model makes a pass over the entire training data.
        update_every : int
            Make updates in topic probability for latest mini-batch.
        eval_every : int
            Calculate and estimate log perplexity for latest mini-batch.
        iterations : int
            Maximum number of times the model loops over each document.
        gamma_threshold : float
            Threshold value of gamma (topic difference between two consecutive topics)
            until which the iterations continue.
        chunks_as_numpy : bool
            Whether each chunk passed to `.inference` should be a np
            array or not. np can in some settings turn the term IDs
            into floats, these will be converted back into integers in
            inference, which incurs a performance hit. For distributed
            computing it may be desirable to keep the chunks as np
            arrays.

        """
        # use parameters given in constructor, unless user explicitly overrode them

    def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, doc2author=None):
        Parameters
        ----------
        chunk : int
            The chunk number of the sparse document vector on which inference needs to be done.
        author2doc : dict of {str: list of ints}
            A dictionary where keys are the names of authors and values are lists of
            documents that the author contributes to.
        doc2author : dict of {int: list of str}
            A dictionary where the keys are document IDs and the values are lists of author names.
- chunk_doc_idx: numpy.ndarray + chunk_doc_idx : numpy.ndarray Assigns the value for document index. - subsample_ratio: float, optional + subsample_ratio : float, optional Used for calculation of word score for estimation of variational bound. - + + Returns + ------- + float + Value of variational bound score. Example ------- @@ -1008,10 +1048,21 @@ def get_author_topics(self, author_name, minimum_probability=None): Parameters ---------- - author_name: str + author_name : str Name of the author for which the topic distribution needs to be estimated. - minimum_probability: float, optional + minimum_probability : float, optional Sets the minimum probability value for showing the topics of a given author. + + Returns + ------- + list of 2-tuples + Topic distribution of an author as a list of topic ID and its probability. + + Example + ------- + >>> author_vecs = [model.get_author_topics(author) for author in model.id2author.values()] + >>> print(author_vecs) + """ author_id = self.author2id[author_name] @@ -1040,11 +1091,16 @@ def __getitem__(self, author_names, eps=None): Parameters ---------- - author_names: str + author_names : str Name of the author for which the topic distribution needs to be estimated. - eps: float, optional + eps : float, optional Sets the minimum probability value for showing the topics of a given author. - + + Returns + ------- + list of 2-tuples + Topic distribution for the author as a list. + """ if isinstance(author_names, list): items = [] From 8e4459e496f521c9c42d728e59d3e1f4d9d1fd82 Mon Sep 17 00:00:00 2001 From: Sourav Singh Date: Tue, 27 Feb 2018 20:49:15 +0530 Subject: [PATCH 06/15] Update atmodel.py --- gensim/models/atmodel.py | 44 ++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 536c685467..f38aa6ba57 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -54,10 +54,7 @@ class AuthorTopicState(LdaState): """ - NOTE: distributed mode not available yet in the author-topic model. This AuthorTopicState - object is kept so that when the time comes to implement it, it will be easier. - - Encapsulate information for distributed computation of AuthorTopicModel objects. + Encapsulate information for computation of AuthorTopicModel objects. Objects of this class are sent over the network, so try to keep them lean to reduce traffic. @@ -75,6 +72,11 @@ def __init__(self, eta, lambda_shape, gamma_shape): Initialize topic parameters. gamma_shape: int Initialize topic parameters. + + Note + ---- + Distributed mode not available yet in the author-topic model. This AuthorTopicState + object is kept so that when the time comes to implement it, it will be easier. """ self.eta = eta @@ -164,16 +166,16 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d Parameters ---------- - corpus : iterable of iterable of (int, int) + corpus : iterable of list of (int, number) Corpus of documents in appropriate format(BoW, UCI etc). num_topics : int, optional Number of topics to be extracted from the training corpus. - id2word : dict of {int: str}, optional + id2word : dict of (int: str), optional A mapping from word ids (integers) to words (strings). - author2doc : dict of {str: list of int} + author2doc : dict of (str: list of int) A dictionary where keys are the names of authors and values are lists of documents that the author contributes to. 
- doc2author : dict of {int: list of str} + doc2author : dict of (int: list of str) A dictionary where the keys are document IDs and the values are lists of author names. passes : int Number of times the model makes a pass over the entire training data. @@ -345,7 +347,7 @@ def extend_corpus(self, corpus): Parameters ---------- - corpus : iterable of iterable of (int, int) + corpus : iterable of list of (int, number) Corpus of documents. """ if self.serialized: @@ -412,10 +414,10 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c ---------- chunk : int The chunk numer of the sparse document vector on which inference needs to be done. - author2doc : dict of {str: list of int} + author2doc : dict of (str: list of int) A dictionary where keys are the names of authors and values are lists of documents that the author contributes to. - doc2author : dict of {int: list of str} + doc2author : dict of (int: list of str) A dictionary where the keys are document IDs and the values are lists of author names. rhot : float Value of rho for conducting inference on documents. @@ -542,10 +544,10 @@ def do_estep(self, chunk, author2doc, doc2author, rhot, state=None, chunk_doc_id ---------- chunk : int The chunk numer of the sparse document vector on which inference needs to be done. - author2doc : dict of {str: list of ints} + author2doc : dict of (str: list of ints) A dictionary where keys are the names of authors and values are lists of documents that the author contributes to. - doc2author : dict of {int: list of str} + doc2author : dict of (int: list of str) A dictionary where the keys are document IDs and the values are lists of author names. rhot : float Value of rho for conducting inference on documents. @@ -642,12 +644,12 @@ def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, Parameters ---------- - corpus : iterable of iterable of (int, int) + corpus : iterable of list of (int, number) The corpus with which the author-topic model should be updated. - author2doc : dict of {str: list of ints} + author2doc : dict of (str: list of ints) A dictionary where keys are the names of authors and values are lists of documents that the author contributes to. - doc2author : dict of {int: list of str} + doc2author : dict of (int: list of str) A dictionary where the keys are document IDs and the values are lists of author names. chunksize : int Controls the size of the mini-batches. @@ -910,10 +912,10 @@ def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, ---------- chunk : int The chunk numer of the sparse document vector on which inference needs to be done. - author2doc : dict of {str: list of ints} + author2doc : dict of (str: list of ints) A dictionary where keys are the names of authors and values are lists of documents that the author contributes to. - doc2author : dict of {int: list of str} + doc2author : dict of (int: list of str) A dictionary where the keys are document IDs and the values are lists of author names. chunk_doc_idx : numpy.ndarray Assigns the value for document index. @@ -1085,8 +1087,6 @@ def __getitem__(self, author_names, eps=None): Return topic distribution for input author as a list of (topic_id, topic_probabiity) 2-tuples. - Ignores topics with probaility less than `eps`. - Do not call this method directly, instead use `model[author_names]`. 
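        A short usage sketch (assumes a trained `model`; the author names and the
        output shape are illustrative):

        >>> topics = model['jane']  # list of (topic_id, topic_probability) for one author
        >>> both = model[['jane', 'jack']]  # one topic list per author when a list is passed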
Parameters @@ -1096,6 +1096,10 @@ def __getitem__(self, author_names, eps=None): eps : float, optional Sets the minimum probability value for showing the topics of a given author. + Warnings + -------- + Ignores topics with probaility less than `eps`. + Returns ------- list of 2-tuples From 98087f0be0d63e95def9b8f95af6acd91102d236 Mon Sep 17 00:00:00 2001 From: Sourav Singh Date: Tue, 27 Feb 2018 21:47:22 +0530 Subject: [PATCH 07/15] Make update to example --- gensim/models/atmodel.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index f38aa6ba57..64f5939a22 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -214,10 +214,22 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d ------- >>> import numpy as np >>> from gensim.models import AuthorTopicModel - >>> model = AuthorTopicModel(corpus, num_topics=100, author2doc=author2doc, id2word=id2word) # train model + >>> from gensim.corpora import mmcorpus + >>> from gensim.test.utils import (datapath, common_dictionary as dictionary, common_corpus as corpus) + >>> author2doc = { + ... 'john': [0, 1, 2, 3, 4, 5, 6], + ... 'jane': [2, 3, 4, 5, 6, 7, 8], + ... 'jack': [0, 2, 4, 6, 8] + ... } + >>> doc2author = { + ... 0: ['john', 'jack'], + ... 1: ['john', 'jill'], + ... 2: ['john', 'jane', 'jack'] + ... } + >>> corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) + >>> model = AuthorTopicModel(corpus, author2doc=author2doc, id2word=dictionary, num_topics=4, passes=100) # train model >>> model.update(corpus, author2doc) # update the author-topic model with additional documents - """ # NOTE: this doesn't call constructor of a base class, but duplicates most of this code # so we have to set dtype to float64 default here From 0dbf1689fbe839f2683cba608e446bea0846c952 Mon Sep 17 00:00:00 2001 From: Sourav Singh Date: Fri, 9 Mar 2018 00:07:07 +0530 Subject: [PATCH 08/15] Make final edits --- gensim/models/atmodel.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 64f5939a22..1bb80f5a97 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -388,9 +388,9 @@ def compute_phinorm(self, expElogthetad, expElogbetad): Parameters ---------- expElogthetad: numpy.ndarray - Value of variational distribution q(theta|gamma). + Value of variational distribution .. math:: q(theta|gamma). expElogbetad: numpy.ndarray - Value of variational distribution q(beta|lambda). + Value of variational distribution .. math:: q(beta|lambda). Returns ------- @@ -1069,11 +1069,27 @@ def get_author_topics(self, author_name, minimum_probability=None): Returns ------- - list of 2-tuples + list of (topic_id, topic_probability) as a 2-tuple Topic distribution of an author as a list of topic ID and its probability. Example ------- + >>> import numpy as np + >>> from gensim.models import AuthorTopicModel + >>> from gensim.corpora import mmcorpus + >>> from gensim.test.utils import (datapath, common_dictionary as dictionary, common_corpus as corpus) + >>> author2doc = { + ... 'john': [0, 1, 2, 3, 4, 5, 6], + ... 'jane': [2, 3, 4, 5, 6, 7, 8], + ... 'jack': [0, 2, 4, 6, 8] + ... } + >>> doc2author = { + ... 0: ['john', 'jack'], + ... 1: ['john', 'jill'], + ... 2: ['john', 'jane', 'jack'] + ... 
}
>>> corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
>>> model = AuthorTopicModel(corpus, author2doc=author2doc, id2word=dictionary, num_topics=4, passes=100) # train model
>>> author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]
>>> print(author_vecs)

        """
        author_id = self.author2id[author_name]

    def __getitem__(self, author_names, eps=None):
        Returns
        -------
-        list of 2-tuples
+        list of (topic_id, topic_probability) as a 2-tuple
            Topic distribution for the author as a list.

        """
        if isinstance(author_names, list):
            items = []

From c7e4db85b87e1fc8ceaaeacb858068f80ed262b7 Mon Sep 17 00:00:00 2001
From: Sourav Singh
Date: Tue, 13 Mar 2018 22:16:51 +0530
Subject: [PATCH 09/15] Update atmodel.py
---
 gensim/models/atmodel.py | 109 +++++++++++++++++++++------------
 1 file changed, 58 insertions(+), 51 deletions(-)

diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py
index 1bb80f5a97..856b264ea0 100755
--- a/gensim/models/atmodel.py
+++ b/gensim/models/atmodel.py
-The model was introduced by Rosen-Zvi and co-authors in 2004 and is described in [1]_
-and a tutorial for using Author-topic model can be found at [2]_.
-
-.. [1] The Author-Topic Model for Authors and Documents, https://arxiv.org/abs/1207.4169
-
-.. [2] https://github.com/RaRe-Technologies/gensim/tree/develop/docs/notebooks/atmodel_tutorial.ipynb.
+The model was introduced by Rosen-Zvi and co-authors in 2004 and is described in
+`The Author-Topic Model for Authors and Documents `_
+and a tutorial for using Author-topic model can be found at _.
+
+Example
+-------
+>>> import numpy as np
+>>> from gensim.models import AuthorTopicModel
+>>> from gensim.corpora import mmcorpus
+>>> from gensim.test.utils import (datapath, common_dictionary as dictionary, common_corpus as corpus)
+>>> author2doc = {
+...     'john': [0, 1, 2, 3, 4, 5, 6],
+...     'jane': [2, 3, 4, 5, 6, 7, 8],
+...     'jack': [0, 2, 4, 6, 8]
+... }
+>>> doc2author = {
+...     0: ['john', 'jack'],
+...     1: ['john', 'jill'],
+...     2: ['john', 'jane', 'jack']
+... }
+>>> corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
+>>> model = AuthorTopicModel(corpus, author2doc=author2doc, id2word=dictionary, num_topics=4, passes=100) # train model
+>>> model.update(corpus, author2doc) # update the author-topic model with additional documents
+>>> author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]
+>>> print(author_vecs)  # Prints top authors

     def __init__(self, eta, lambda_shape, gamma_shape):
         eta: float
             Dirichlet topic parameter for sparsity.
-        lambda_shape: int
+        lambda_shape: (int, int)
             Initialize topic parameters.
         gamma_shape: int
             Initialize topic parameters.

 def construct_doc2author(corpus, author2doc):
     Parameters
     ----------
-    corpus: list of list of str
+    corpus: iterable of list of str
         Corpus of documents.
-    author2doc: dict of (str: list of int)
+    author2doc: dict of (str, list of int)
         Mapping of authors to documents.

     Returns
     -------
-    dict of {int: list of str}
+    dict of {int, list of str}
         Document to Author mapping.

 def construct_author2doc(doc2author):
     Parameters
     ----------
-    doc2author: dict of {int: list of str}
+    doc2author: dict of (int, list of str)
         Mapping of documents to authors.
     Returns
     -------
     dict of {str, list of int}
         Mapping of authors to documents.

     def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, doc2author=None,
         Parameters
         ----------
-        corpus : iterable of iterable of (int, int)
+        corpus : iterable of list of (int, float)
             Corpus of documents in appropriate format (BoW, UCI, etc.).
         num_topics : int, optional
             Number of topics to be extracted from the training corpus.
-        id2word : dict of {int: str}, optional
+        id2word : dict of (int, str), optional
             A mapping from word ids (integers) to words (strings).
-        author2doc : dict of {str: list of int}
+        author2doc : dict of (str, list of int)
             A dictionary where keys are the names of authors and values are lists of
             documents that the author contributes to.
-        doc2author : dict of {int: list of str}
+        doc2author : dict of (int, list of str)
             A dictionary where the keys are document IDs and the values are lists of author names.

     def __str__(self):
-        """Return a string representation of AuthorTopicModel class."""
+        """Return a string representation of AuthorTopicModel class.
+
+        Returns
+        -------
+        str
+            String representation of Author-Topic model class.
+        """
         return "AuthorTopicModel(num_terms=%s, num_topics=%s, num_authors=%s, decay=%s, chunksize=%s)" % \
             (self.num_terms, self.num_topics, self.num_authors, self.decay, self.chunksize)

     def extend_corpus(self, corpus):
         Parameters
         ----------
-        corpus : iterable of iterable of (int, int)
+        corpus : iterable of list of (int, float)
             Corpus of documents.

     def compute_phinorm(self, expElogthetad, expElogbetad):
         Parameters
         ----------
         expElogthetad: numpy.ndarray
-            Value of variational distribution .. math:: q(theta|gamma).
+            Value of variational distribution :math:`q(\theta|\gamma)`.
         expElogbetad: numpy.ndarray
-            Value of variational distribution .. math:: q(beta|lambda).
+            Value of variational distribution :math:`q(\beta|\lambda)`.
        Returns
        -------
        float
            Value of normalizing factor.

        """

    def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, chunk_doc_idx=None):
         chunk : int
             The chunk number of the sparse document vector on which inference needs to be done.
-        author2doc : dict of (str: list of int)
+        author2doc : dict of (str, list of int)
             A dictionary where keys are the names of authors and values are lists of
             documents that the author contributes to.
-        doc2author : dict of (int: list of str)
+        doc2author : dict of (int, list of str)
             A dictionary where the keys are document IDs and the values are lists of author names.

    def do_estep(self, chunk, author2doc, doc2author, rhot, state=None, chunk_doc_idx=None):
         chunk : int
             The chunk number of the sparse document vector on which inference needs to be done.
-        author2doc : dict of {str: list of ints}
+        author2doc : dict of (str, list of int)
             A dictionary where keys are the names of authors and values are lists of
             documents that the author contributes to.
-        doc2author : dict of {int: list of str}
+        doc2author : dict of (int, list of str)
             A dictionary where the keys are document IDs and the values are lists of author names.

    def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None,
         Parameters
         ----------
-        corpus : iterable of iterable of (int, int)
+        corpus : iterable of list of (int, float)
             The corpus with which the author-topic model should be updated.
-        author2doc : dict of {str: list of ints}
+        author2doc : dict of (str, list of int)
             A dictionary where keys are the names of authors and values are lists of
             documents that the author contributes to.
-        doc2author : dict of {int: list of str}
+        doc2author : dict of (int, list of str)
             A dictionary where the keys are document IDs and the values are lists of author names.

    def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, doc2author=None):
         """
         Estimate the variational bound of documents from `corpus`:
-        E_q[log p(corpus)] - E_q[log q(corpus)]
+        :math:`E_q[\log p(corpus)] - E_q[\log q(corpus)]`
         chunk : int
             The chunk number of the sparse document vector on which inference needs to be done.
-        author2doc : dict of {str: list of ints}
+        author2doc : dict of (str, list of int)
             A dictionary where keys are the names of authors and values are lists of
             documents that the author contributes to.
-        doc2author : dict of {int: list of str}
+        doc2author : dict of (int, list of str)
             A dictionary where the keys are document IDs and the values are lists of author names.

    def get_author_topics(self, author_name, minimum_probability=None):
         Returns
         -------
-        list of (topic_id, topic_probability) as a 2-tuple
+        list of (int, float) as a 2-tuple
             Topic distribution of an author as a list of topic ID and its probability.
Example @@ -1130,7 +1137,7 @@ def __getitem__(self, author_names, eps=None): Returns ------- - list of (topic_id, topic_probability) as a 2-tuple + list of (int, float) as a 2-tuple Topic distribution for the author as a list. """ From 0eb9184db05da4d8baaae422efdad7579ede54ec Mon Sep 17 00:00:00 2001 From: Sourav Singh Date: Fri, 23 Mar 2018 00:19:30 +0530 Subject: [PATCH 10/15] Update atmodel.py --- gensim/models/atmodel.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 856b264ea0..571811c482 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -16,12 +16,13 @@ The model can be updated with additional documents after training has been completed. It is also possible to continue training on the existing data. -The model is closely related to Latent Dirichlet Allocation. The AuthorTopicModel class +The model is closely related to :class:`gensim.models.ldamodel.LdaModel`. The AuthorTopicModel class inherits the LdaModel class, and its usage is thus similar. The model was introduced by Rosen-Zvi and co-authors in 2004 and is described in -`The Author-Topic Model for Authors and Documents `_ -and a tutorial for using Author-topic model can be found at _. +`The Author-Topic Model for Authors and Documents `_.The model correlates +the authorship information with the topics to give a better insight on the subject knowledge of an author. +A tutorial for using Author-topic model can be found at _. Example ------- @@ -43,7 +44,8 @@ >>> model = AuthorTopicModel(corpus, author2doc=author2doc, id2word=dictionary, num_topics=4, passes=100) # train model >>> model.update(corpus, author2doc) # update the author-topic model with additional documents >>> author_vecs = [model.get_author_topics(author) for author in model.id2author.values()] ->>> print(author_vecs) #Prints top authors +>>> print(author_vecs) #Prints all authors +>>> print """ @@ -563,7 +565,7 @@ def do_estep(self, chunk, author2doc, doc2author, rhot, state=None, chunk_doc_id ---------- chunk : int The chunk numer of the sparse document vector on which inference needs to be done. - author2doc : dict of (strm list of ints) + author2doc : dict of (str, list of int) A dictionary where keys are the names of authors and values are lists of documents that the author contributes to. doc2author : dict of (intm list of str) @@ -1076,7 +1078,7 @@ def get_author_topics(self, author_name, minimum_probability=None): Returns ------- - list of (int, float) as a 2-tuple + list of (int, float) Topic distribution of an author as a list of topic ID and its probability. Example @@ -1137,7 +1139,7 @@ def __getitem__(self, author_names, eps=None): Returns ------- - list of (int, float) as a 2-tuple + list of (int, float) Topic distribution for the author as a list. """ From 2626d574a945217f404f8c7c743daf326dec07f3 Mon Sep 17 00:00:00 2001 From: ivan Date: Mon, 2 Apr 2018 20:06:40 +0500 Subject: [PATCH 11/15] fix PEP8 --- gensim/models/atmodel.py | 45 ++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 6bd4543dd3..804780b539 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -19,10 +19,11 @@ The model is closely related to :class:`gensim.models.ldamodel.LdaModel`. The AuthorTopicModel class inherits the LdaModel class, and its usage is thus similar. 
-The model was introduced by Rosen-Zvi and co-authors in 2004 and is described in -`The Author-Topic Model for Authors and Documents `_.The model correlates +The model was introduced by Rosen-Zvi and co-authors in 2004 and is described in +`The Author-Topic Model for Authors and Documents `_.The model correlates the authorship information with the topics to give a better insight on the subject knowledge of an author. -A tutorial for using Author-topic model can be found at _. +A tutorial for using Author-topic model can be found at +_. Example ------- @@ -94,7 +95,7 @@ def __init__(self, eta, lambda_shape, gamma_shape): Initialize topic parameters. gamma_shape: int Initialize topic parameters. - + Note ---- Distributed mode not available yet in the author-topic model. This AuthorTopicState @@ -117,7 +118,7 @@ def construct_doc2author(corpus, author2doc): Corpus of documents. author2doc: dict of (str, list of int) Mapping of authors to documents. - + Returns ------- dict of {int, list of str} @@ -141,12 +142,12 @@ def construct_author2doc(doc2author): ---------- doc2author: dict of {int, list of str) Mapping of documents to authors. - + Returns ------- dict of {str, list of int} Mapping of authors to documents. - + Examples -------- >>> from gensim.models.atmodel import construct_author2doc @@ -232,7 +233,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d serialization_path : str Must be set to a filepath, if `serialized = True` is used. - + """ # NOTE: this doesn't call constructor of a base class, but duplicates most of this code # so we have to set dtype to float64 default here @@ -334,7 +335,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d def __str__(self): """Return a string representation of AuthorTopicModel class. - + Returns ------- str @@ -400,12 +401,12 @@ def compute_phinorm(self, expElogthetad, expElogbetad): Value of variational distribution :math: q(\theta|\gamma). expElogbetad: numpy.ndarray Value of variational distribution :math: q(\beta|\lambda). - + Returns ------- float Value of normalizing factor. - + """ expElogtheta_sum = expElogthetad.sum(axis=0) phinorm = expElogtheta_sum.dot(expElogbetad) + 1e-100 @@ -576,7 +577,7 @@ def do_estep(self, chunk, author2doc, doc2author, rhot, state=None, chunk_doc_id Initializes the state for a new E-M iteration. chunk_doc_idx : numpy.ndarray Assigns the value for document index. - + Returns ------- float @@ -608,12 +609,12 @@ def log_perplexity(self, chunk, chunk_doc_idx=None, total_docs=None): Assigns the value for document index. total_docs : int Initializes the value for total number of documents. - + Returns ------- float Value of per-word likelihood bound. - + """ # TODO: This method is very similar to the one in LdaModel. Refactor. @@ -942,7 +943,7 @@ def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, Assigns the value for document index. subsample_ratio : float, optional Used for calculation of word score for estimation of variational bound. - + Returns ------- float @@ -1148,12 +1149,12 @@ def get_author_topics(self, author_name, minimum_probability=None): Name of the author for which the topic distribution needs to be estimated. minimum_probability : float, optional Sets the minimum probability value for showing the topics of a given author. - + Returns ------- list of (int, float) Topic distribution of an author as a list of topic ID and its probability. 
- + Example ------- >>> import numpy as np @@ -1171,10 +1172,10 @@ def get_author_topics(self, author_name, minimum_probability=None): ... 2: ['john', 'jane', 'jack'] ... } >>> corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) - >>> model = AuthorTopicModel(corpus, author2doc=author2doc, id2word=dictionary, num_topics=4, passes=100) # train model + >>> model = AuthorTopicModel(corpus, author2doc=author2doc, id2word=dictionary, num_topics=4, passes=100) >>> author_vecs = [model.get_author_topics(author) for author in model.id2author.values()] >>> print(author_vecs) - + """ @@ -1206,16 +1207,16 @@ def __getitem__(self, author_names, eps=None): Name of the author for which the topic distribution needs to be estimated. eps : float, optional Sets the minimum probability value for showing the topics of a given author. - + Warnings -------- Ignores topics with probaility less than `eps`. - + Returns ------- list of (int, float) Topic distribution for the author as a list. - + """ if isinstance(author_names, list): items = [] From e8e91b7cd5c28e093829a1db52b31ffb5a7885e4 Mon Sep 17 00:00:00 2001 From: ivan Date: Mon, 2 Apr 2018 22:56:45 +0500 Subject: [PATCH 12/15] fix atmodel[1] --- gensim/models/atmodel.py | 205 +++++++++++++++++---------------------- 1 file changed, 87 insertions(+), 118 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 804780b539..56f1377a73 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -6,50 +6,49 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Author-topic model in Python. +"""Author-topic model. -This module trains the author-topic model on documents and corresponding author-document -dictionaries. The training is online and is constant in memory w.r.t. the number of -documents. The model is *not* constant in memory w.r.t. the number of authors. +This module trains the author-topic model on documents and corresponding author-document dictionaries. +The training is online and is constant in memory w.r.t. the number of documents. +The model is *not* constant in memory w.r.t. the number of authors. The model can be updated with additional documents after training has been completed. It is also possible to continue training on the existing data. -The model is closely related to :class:`gensim.models.ldamodel.LdaModel`. The AuthorTopicModel class -inherits the LdaModel class, and its usage is thus similar. +The model is closely related to :class:`~gensim.models.ldamodel.LdaModel`. +The :class:`~gensim.models.atmodel.AuthorTopicModel` class inherits :class:`~gensim.models.ldamodel.LdaModel`, +and its usage is thus similar. -The model was introduced by Rosen-Zvi and co-authors in 2004 and is described in -`The Author-Topic Model for Authors and Documents `_.The model correlates -the authorship information with the topics to give a better insight on the subject knowledge of an author. -A tutorial for using Author-topic model can be found at -_. +The model was introduced by `Rosen-Zvi and co-authors: "The Author-Topic Model for Authors and Documents" +`_. The model correlates the authorship information with the topics to give a better +insight on the subject knowledge of an author. 
Example ------- ->>> import numpy as np >>> from gensim.models import AuthorTopicModel >>> from gensim.corpora import mmcorpus ->>> from gensim.test.utils import (datapath, common_dictionary as dictionary, common_corpus as corpus) +>>> from gensim.test.utils import common_dictionary, datapath, temporary_file + >>> author2doc = { ... 'john': [0, 1, 2, 3, 4, 5, 6], ... 'jane': [2, 3, 4, 5, 6, 7, 8], ... 'jack': [0, 2, 4, 6, 8] ... } ->>> doc2author = { -... 0: ['john', 'jack'], -... 1: ['john', 'jill'], -... 2: ['john', 'jane', 'jack'] -... } +>>> >>> corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) ->>> model = AuthorTopicModel(corpus, author2doc=author2doc, id2word=dictionary, num_topics=4, passes=100) # train model ->>> model.update(corpus, author2doc) # update the author-topic model with additional documents +>>> +>>> with temporary_file("serialized") as s_path: +... model = AuthorTopicModel( +... corpus, author2doc=author2doc, id2word=common_dictionary, num_topics=4, +... serialized=True, serialization_path=s_path +... ) +... +... model.update(corpus, author2doc) # update the author-topic model with additional documents +>>> +>>> # construct vectors for authors >>> author_vecs = [model.get_author_topics(author) for author in model.id2author.values()] ->>> print(author_vecs) #Prints all authors ->>> print """ - # TODO: this class inherits LdaModel and overwrites some methods. There is some code # duplication still, and a refactor could be made to avoid this. Comments with "TODOs" # are included in the code where this is the case, for example in the log_perplexity @@ -72,35 +71,24 @@ from six.moves import xrange import six -logger = logging.getLogger('gensim.models.atmodel') +logger = logging.getLogger(__name__) class AuthorTopicState(LdaState): - """ - Encapsulate information for computation of AuthorTopicModel objects. - - Objects of this class are sent over the network, so try to keep them lean to - reduce traffic. - - """ + """Encapsulate information for computation of :class:`~gensim.models.atmodel.AuthorTopicModel`.""" def __init__(self, eta, lambda_shape, gamma_shape): - """Ïnitializes parameters for the Author-Topic model. + """ Parameters ---------- - eta: float + eta: numpy.ndarray Dirichlet topic parameter for sparsity. lambda_shape: (int, int) Initialize topic parameters. gamma_shape: int Initialize topic parameters. - Note - ---- - Distributed mode not available yet in the author-topic model. This AuthorTopicState - object is kept so that when the time comes to implement it, it will be easier. - """ self.eta = eta self.sstats = np.zeros(lambda_shape) @@ -110,18 +98,18 @@ def __init__(self, eta, lambda_shape, gamma_shape): def construct_doc2author(corpus, author2doc): - """Make a mapping from document IDs to author IDs. + """Create a mapping from document IDs to author IDs. Parameters ---------- - corpus: iterable of list of str - Corpus of documents. + corpus: iterable of list of (int, float) + Corpus in BoW format. author2doc: dict of (str, list of int) Mapping of authors to documents. Returns ------- - dict of {int, list of str} + dict of (int, list of str) Document to Author mapping. """ @@ -140,26 +128,15 @@ def construct_author2doc(doc2author): Parameters ---------- - doc2author: dict of {int, list of str) - Mapping of documents to authors. + doc2author: dict of (int, list of str) + Mapping of document id to authors. Returns ------- - dict of {str, list of int} - Mapping of authors to documents. 
- - Examples - -------- - >>> from gensim.models.atmodel import construct_author2doc - >>> doc2author = { - ... 0: ['john', 'jack'], - ... 1: ['john', 'jill'], - ... 2: ['john', 'jane', 'jack'] - ... } - >>> author2doc = construct_author2doc(doc2author) + dict of (str, list of int) + Mapping of authors to document ids. """ - # First get a set of all authors. authors_ids = set() for d, a_doc_ids in doc2author.items(): @@ -919,24 +896,27 @@ def rho(): del other def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, doc2author=None): - """ - Estimate the variational bound of documents from `corpus`: - :math: E_q[log p(corpus)] - E_q[log q(corpus)] + """Estimate the variational bound of documents from `corpus`. + + :math:`\mathbb{E_{q}}[\log p(corpus)] - \mathbb{E_{q}}[\log q(corpus)]` + Notes + ----- There are basically two use cases of this method: - 1. `chunk` is a subset of the training corpus, and `chunk_doc_idx` is provided, - indicating the indexes of the documents in the training corpus. - 2. `chunk` is a test set (held-out data), and author2doc and doc2author - corrsponding to this test set are provided. There must not be any new authors - passed to this method. `chunk_doc_idx` is not needed in this case. + + #. `chunk` is a subset of the training corpus, and `chunk_doc_idx` is provided, + indicating the indexes of the documents in the training corpus. + #. `chunk` is a test set (held-out data), and `author2doc` and `doc2author` corresponding to this test set + are provided. There must not be any new authors passed to this method, `chunk_doc_idx` is not needed + in this case. Parameters ---------- - chunk : int - The chunk numer of the sparse document vector on which inference needs to be done. - author2doc : dict of (str, list of ints) - A dictionary where keys are the names of authors and values are lists of - documents that the author contributes to. + chunk : iterable of list of (int, float) + Corpus in BoW format. + author2doc : dict of (str, list of int) + A dictionary where keys are the names of authors and values are lists of documents that the author + contributes to. doc2author : dict of (int, list of str) A dictionary where the keys are document IDs and the values are lists of author names. chunk_doc_idx : numpy.ndarray @@ -949,13 +929,7 @@ def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, float Value of variational bound score. - Example - ------- - >>> corpus_words = sum(cnt for document in corpus for _, cnt in document) - >>> model.bound(corpus, author2doc=author2doc, doc2author=doc2author) / corpus_words - """ - # TODO: enable evaluation of documents with new authors. One could, for example, make it # possible to pass a list of documents to self.inference with no author dictionaries, # assuming all the documents correspond to one (unseen) author, learn the author's @@ -1049,13 +1023,19 @@ def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, return total_score def get_document_topics(self, word_id, minimum_probability=None): - """ - This method overwrites `LdaModel.get_document_topics` and simply raises an - exception. `get_document_topics` is not valid for the author-topic model, - use `get_author_topics` instead. + """Override :meth:`~gensim.models.ldamodel.LdaModel.get_document_topics` and simply raises an exception. 
- """ + Warnings + -------- + This method invalid for model, use :meth:`~gensim.models.atmodel.AuthorTopicModel.get_author_topics` or + :meth:`~gensim.models.atmodel.AuthorTopicModel.get_new_author_topics` instead. + + Raises + ------ + NotImplementedError + Always. + """ raise NotImplementedError( 'Method "get_document_topics" is not valid for the author-topic model. ' 'Use the "get_author_topics" method.' @@ -1069,7 +1049,7 @@ def get_new_author_topics(self, corpus, minimum_probability=None): Parameters ---------- - corpus : iterable of iterable of (int, int) + corpus : iterable of list of (int, float) Corpus in BoW format. minimum_probability : float, optional Ignore topics with probability below this value, if None - 1e-8 is used. @@ -1134,51 +1114,47 @@ def rollback_new_author_chages(): return new_author_topics def get_author_topics(self, author_name, minimum_probability=None): - """ - Return topic distribution the given author. - - Input as as a list of - (topic_id, topic_probability) 2-tuples. - Ignore topics with very low probability (below `minimum_probability`). - Obtaining topic probabilities of each word, as in LDA (via `per_word_topics`), - is not supported. + """Get topic distribution the given author. Parameters ---------- author_name : str Name of the author for which the topic distribution needs to be estimated. minimum_probability : float, optional - Sets the minimum probability value for showing the topics of a given author. + Sets the minimum probability value for showing the topics of a given author, topics with probability < + `minimum_probability` will be ignored. Returns ------- list of (int, float) - Topic distribution of an author as a list of topic ID and its probability. + Topic distribution of an author. Example ------- - >>> import numpy as np >>> from gensim.models import AuthorTopicModel >>> from gensim.corpora import mmcorpus - >>> from gensim.test.utils import (datapath, common_dictionary as dictionary, common_corpus as corpus) + >>> from gensim.test.utils import common_dictionary, datapath, temporary_file + >>> author2doc = { ... 'john': [0, 1, 2, 3, 4, 5, 6], ... 'jane': [2, 3, 4, 5, 6, 7, 8], ... 'jack': [0, 2, 4, 6, 8] ... } - >>> doc2author = { - ... 0: ['john', 'jack'], - ... 1: ['john', 'jill'], - ... 2: ['john', 'jane', 'jack'] - ... } + >>> >>> corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) - >>> model = AuthorTopicModel(corpus, author2doc=author2doc, id2word=dictionary, num_topics=4, passes=100) + >>> + >>> with temporary_file("serialized") as s_path: + ... model = AuthorTopicModel( + ... corpus, author2doc=author2doc, id2word=common_dictionary, num_topics=4, + ... serialized=True, serialization_path=s_path + ... ) + ... + ... model.update(corpus, author2doc) # update the author-topic model with additional documents + >>> + >>> # construct vectors for authors >>> author_vecs = [model.get_author_topics(author) for author in model.id2author.values()] - >>> print(author_vecs) - """ - author_id = self.author2id[author_name] if minimum_probability is None: @@ -1195,27 +1171,20 @@ def get_author_topics(self, author_name, minimum_probability=None): return author_topics def __getitem__(self, author_names, eps=None): - """ - Return topic distribution for input author as a list of - (topic_id, topic_probabiity) 2-tuples. - - Do not call this method directly, instead use `model[author_names]`. + """Get topic distribution for input `author_names`. 
Parameters ---------- - author_names : str - Name of the author for which the topic distribution needs to be estimated. + author_names : {str, list of str} + Name(s) of the author for which the topic distribution needs to be estimated. eps : float, optional - Sets the minimum probability value for showing the topics of a given author. - - Warnings - -------- - Ignores topics with probaility less than `eps`. + The minimum probability value for showing the topics of a given author, topics with probability < `eps` + will be ignored. Returns ------- - list of (int, float) - Topic distribution for the author as a list. + list of (int, float) **or** list of list of (int, float) + Topic distribution for the author(s), type depends on type of `author_names`. """ if isinstance(author_names, list): From 6ed5d10e2b761523a9d9c2b023e676fab8808332 Mon Sep 17 00:00:00 2001 From: ivan Date: Tue, 3 Apr 2018 09:39:27 +0500 Subject: [PATCH 13/15] fix atmodel[2] --- gensim/models/atmodel.py | 113 ++++++++++++++++++--------------------- 1 file changed, 51 insertions(+), 62 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 56f1377a73..0966d89fc5 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -162,7 +162,6 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d gamma_threshold=0.001, serialized=False, serialization_path=None, minimum_probability=0.01, random_state=None): """ - API for Author-Topic model. Parameters ---------- @@ -573,18 +572,15 @@ def do_estep(self, chunk, author2doc, doc2author, rhot, state=None, chunk_doc_id return gamma def log_perplexity(self, chunk, chunk_doc_idx=None, total_docs=None): - """ - Calculate and return per-word likelihood bound, using the `chunk` of - documents as evaluation corpus. Also output the calculated statistics. incl. - perplexity=2^(-bound), to log at INFO level. + """Calculate per-word likelihood bound, using the `chunk` of documents as evaluation corpus. Parameters ---------- - chunk : int - The chunk numer of the sparse document vector on which inference needs to be done. - chunk_doc_idx : numpy.ndarray + chunk : iterable of list of (int, float) + Corpus in BoW format. + chunk_doc_idx : numpy.ndarray, optional Assigns the value for document index. - total_docs : int + total_docs : int, optional Initializes the value for total number of documents. Returns @@ -593,7 +589,6 @@ def log_perplexity(self, chunk, chunk_doc_idx=None, total_docs=None): Value of per-word likelihood bound. """ - # TODO: This method is very similar to the one in LdaModel. Refactor. if total_docs is None: total_docs = len(chunk) @@ -610,73 +605,67 @@ def log_perplexity(self, chunk, chunk_doc_idx=None, total_docs=None): def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, decay=None, offset=None, passes=None, update_every=None, eval_every=None, iterations=None, gamma_threshold=None, chunks_as_numpy=False): - """ - Train the model with new documents, by EM-iterating over `corpus` until - the topics converge (or until the maximum number of allowed iterations - is reached). `corpus` must be an iterable (repeatable stream of documents), - - This update also supports updating an already trained model (`self`) - with new documents from `corpus`; the two models are then merged in - proportion to the number of old vs. new documents. This feature is still - experimental for non-stationary input streams. 
- - For stationary input (no topic drift in new documents), on the other hand, - this equals the online update of Hoffman et al. and is guaranteed to - converge for any `decay` in (0.5, 1.0>. Additionally, for smaller - `corpus` sizes, an increasing `offset` may be beneficial (see + """Train the model with new documents, by EM-iterating over `corpus` until the topics converge (or until the + maximum number of allowed iterations is reached). + + Notes + ----- + This update also supports updating an already trained model (self) + with new documents from `corpus`: the two models are then merged in proportion to the number of old vs. new + documents. This feature is still experimental for non-stationary input streams. + + For stationary input (no topic drift in new documents), on the other hand, this equals the online update of + `Hoffman et al. Stochastic Variational Inference + `_ and is guaranteed to converge for any `decay` + in (0.5, 1.0>. Additionally, for smaller `corpus` sizes, an increasing `offset` may be beneficial (see Table 1 in Hoffman et al.) - If update is called with authors that already exist in the model, it will - resume training on not only new documents for that author, but also the - previously seen documents. This is necessary for those authors' topic + If update is called with authors that already exist in the model, it will resume training on not only new + documents for that author, but also the previously seen documents. This is necessary for those authors' topic distributions to converge. - Every time `update(corpus, author2doc)` is called, the new documents are - to appended to all the previously seen documents, and author2doc is - combined with the previously seen authors. + Every time `update(corpus, author2doc)` is called, the new documents are to appended to all the previously seen + documents, and author2doc is combined with the previously seen authors. To resume training on all the data seen by the model, simply call - `update()`. + :meth:`~gensim.models.atmodel.AuthorTopicModel.update`. - It is not possible to add new authors to existing documents, as all - documents in `corpus` are assumed to be new documents. + It is not possible to add new authors to existing documents, as all documents in `corpus` are assumed to be + new documents. Parameters ---------- - corpus : iterable of list of str - The corpus with which the author-topic model should be updated. - author2doc : dict of (str, list of ints) - A dictionary where keys are the names of authors and values are lists of - documents that the author contributes to. - doc2author : dict of (int, list of str) + corpus : iterable of list of (int, float) + The corpus in BoW format. + author2doc : dict of (str, list of int), optional + A dictionary where keys are the names of authors and values are lists of document IDs that the author + contributes to. + doc2author : dict of (int, list of str), optional A dictionary where the keys are document IDs and the values are lists of author names. - chunksize : int + chunksize : int, optional Controls the size of the mini-batches. - decay : float + decay : float, optional Controls how old documents are forgotten. - offset : float + offset : float, optional Controls down-weighting of iterations. - passes : int + passes : int, optional Number of times the model makes a pass over the entire training data. - update_every : int - Make updates in topic probaility for latest mini-batch. 
- eval_every : int + update_every : int, optional + Make updates in topic probability for latest mini-batch. + eval_every : int, optional Calculate and estimate log perplexity for latest mini-batch. - iterations : int + iterations : int, optional Maximum number of times the model loops over each document - gamma_threshold : float + gamma_threshold : float, optional Threshold value of gamma(topic difference between consecutive two topics) until which the iterations continue. - chunks_as_numpy : bool - Whether each chunk passed to `.inference` should be a np - array of not. np can in some settings turn the term IDs - into floats, these will be converted back into integers in - inference, which incurs a performance hit. For distributed - computing it may be desirable to keep the chunks as np - arrays. + chunks_as_numpy : bool, optional + Whether each chunk passed to :meth:`~gensim.models.atmodel.AuthorTopicModel.inference` should be a numpy + array of not. Numpy can in some settings turn the term IDs into floats, these will be converted back into + integers in inference, which incurs a performance hit. For distributed computing (not supported now) + it may be desirable to keep the chunks as numpy arrays. """ - # use parameters given in constructor, unless user explicitly overrode them if decay is None: decay = self.decay @@ -914,15 +903,15 @@ def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, ---------- chunk : iterable of list of (int, float) Corpus in BoW format. - author2doc : dict of (str, list of int) - A dictionary where keys are the names of authors and values are lists of documents that the author - contributes to. - doc2author : dict of (int, list of str) - A dictionary where the keys are document IDs and the values are lists of author names. - chunk_doc_idx : numpy.ndarray + chunk_doc_idx : numpy.ndarray, optional Assigns the value for document index. subsample_ratio : float, optional Used for calculation of word score for estimation of variational bound. + author2doc : dict of (str, list of int), optinal + A dictionary where keys are the names of authors and values are lists of documents that the author + contributes to. + doc2author : dict of (int, list of str), optional + A dictionary where the keys are document IDs and the values are lists of author names. Returns ------- From db7484d993734d030dc8222dfb3e96fe3a67b960 Mon Sep 17 00:00:00 2001 From: ivan Date: Tue, 3 Apr 2018 10:12:34 +0500 Subject: [PATCH 14/15] fix atmodel[3] --- gensim/models/atmodel.py | 163 +++++++++++++++++++-------------------- 1 file changed, 81 insertions(+), 82 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 0966d89fc5..78c51cf2df 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -165,50 +165,49 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d Parameters ---------- - corpus : iterable of list of str - Corpus of documents in appropriate format(BoW, UCI etc). + corpus : iterable of list of (int, float), optional + Corpus in BoW format num_topics : int, optional Number of topics to be extracted from the training corpus. - id2word : dict of (int, str), optional + id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional A mapping from word ids (integers) to words (strings). - author2doc : dict of (str, list of int) - A dictionary where keys are the names of authors and values are lists of - documents that the author contributes to. 
- doc2author : dict of (int, list of str) + author2doc : dict of (str, list of int), optional + A dictionary where keys are the names of authors and values are lists of document IDs that the author + contributes to. + doc2author : dict of (int, list of str), optional A dictionary where the keys are document IDs and the values are lists of author names. - passes : int - Number of times the model makes a pass over the entire training data. - iterations : int - Maximum number of times the model loops over each document - chunksize : int + chunksize : int, optional Controls the size of the mini-batches. - alpha : float + passes : int, optional + Number of times the model makes a pass over the entire training data. + iterations : int, optional + Maximum number of times the model loops over each document. + decay : float, optional + Controls how old documents are forgotten. + offset : float, optional + Controls down-weighting of iterations. + alpha : float, optional Hyperparameters for author-topic model.Supports special values of 'asymmetric' and 'auto': the former uses a fixed normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric prior directly from your data. - eta : float + eta : float, optional Hyperparameters for author-topic model. - update_every : int - Make updates in topic probaility for latest mini-batch. - eval_every : int + update_every : int, optional + Make updates in topic probability for latest mini-batch. + eval_every : int, optional Calculate and estimate log perplexity for latest mini-batch. - gamma_threshold : float + gamma_threshold : float, optional Threshold value of gamma(topic difference between consecutive two topics) until which the iterations continue. - decay : float - Controls how old documents are forgotten. - offset : float - Controls down-weighting of iterations. - minimum_probability : float - Controls filtering the topics returned for a document (bow). - random_state : int or a numpy.random.RandomState object. - Set the state of the random number generator inside the author-topic model. - serialized : bool + serialized : bool, optional Indicates whether the input corpora to the model are simple lists or saved to the hard-drive. - serialization_path : str + serialization_path : str, optional Must be set to a filepath, if `serialized = True` is used. - + minimum_probability : float, optional + Controls filtering the topics returned for a document (bow). + random_state : {int, numpy.random.RandomState}, optional + Set the state of the random number generator inside the author-topic model. """ # NOTE: this doesn't call constructor of a base class, but duplicates most of this code @@ -310,21 +309,21 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d self.update(corpus, author2doc, doc2author, chunks_as_numpy=use_numpy) def __str__(self): - """Return a string representation of AuthorTopicModel class. + """Get a string representation of object. Returns ------- str - String representation of Author-Topic model class. + String representation of current instance. + """ return "AuthorTopicModel(num_terms=%s, num_topics=%s, num_authors=%s, decay=%s, chunksize=%s)" % \ (self.num_terms, self.num_topics, self.num_authors, self.decay, self.chunksize) def init_empty_corpus(self): - """ - Initialize an empty corpus. If the corpora are to be treated as lists, simply - initialize an empty list. If serialization is used, initialize an empty corpus - of the class `gensim.corpora.MmCorpus`. + """Initialize an empty corpus. 
+ If the corpora are to be treated as lists, simply initialize an empty list. + If serialization is used, initialize an empty corpus using :class:`~gensim.corpora.mmcorpus.MmCorpus`. """ if self.serialized: @@ -337,16 +336,21 @@ def init_empty_corpus(self): self.corpus = [] def extend_corpus(self, corpus): - """ - Add new documents in `corpus` to `self.corpus`. If serialization is used, - then the entire corpus (`self.corpus`) is re-serialized and the new documents - are added in the process. If serialization is not used, the corpus, as a list - of documents, is simply extended. + """Add new documents from `corpus` to `self.corpus`. + + If serialization is used, then the entire corpus (`self.corpus`) is re-serialized and the new documents + are added in the process. If serialization is not used, the corpus, as a list of documents, is simply extended. Parameters ---------- - corpus : iterable of list of str - Corpus of documents. + corpus : iterable of list of (int, float) + Corpus in BoW format + + Raises + ------ + AssertionError + If serialized == False and corpus isn't list. + """ if self.serialized: # Re-serialize the entire corpus while appending the new documents. @@ -374,9 +378,9 @@ def compute_phinorm(self, expElogthetad, expElogbetad): Parameters ---------- expElogthetad: numpy.ndarray - Value of variational distribution :math: q(\theta|\gamma). + Value of variational distribution :math:`q(\theta|\gamma)`. expElogbetad: numpy.ndarray - Value of variational distribution :math: q(\beta|\lambda). + Value of variational distribution :math:`q(\\beta|\lambda)`. Returns ------- @@ -390,44 +394,41 @@ def compute_phinorm(self, expElogthetad, expElogbetad): return phinorm def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, chunk_doc_idx=None): - """ - Given a chunk of sparse document vectors, update gamma (parameters - controlling the topic weights) for each author corresponding to the - documents in the chunk. + """Give a `chunk` of sparse document vectors, update gamma for each author corresponding to the `chuck`. - The whole input chunk of document is assumed to fit in RAM; chunking of - a large corpus must be done earlier in the pipeline. - - If `collect_sstats` is True, also collect sufficient statistics needed - to update the model's topic-word distributions, and return a 2-tuple - `(gamma_chunk, sstats)`. Otherwise, return `(gamma_chunk, None)`. - `gamma_cunk` is of shape `len(chunk_authors) x self.num_topics`, where - `chunk_authors` is the number of authors in the documents in the - current chunk. + Warnings + -------- + The whole input chunk of document is assumed to fit in RAM, chunking of a large corpus must be done earlier + in the pipeline. Avoids computing the `phi` variational parameter directly using the - optimization presented in **Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001**. + optimization presented in `Lee, Seung: "Algorithms for non-negative matrix factorization", NIPS 2001 + _`. Parameters ---------- - chunk : int - The chunk numer of the sparse document vector on which inference needs to be done. - author2doc : dict of (str, list of int) - A dictionary where keys are the names of authors and values are lists of - documents that the author contributes to. - doc2author : dict of (int, list of str) + chunk : iterable of list of (int, float) + Corpus in BoW format. 
+ author2doc : dict of (str, list of int), optional + A dictionary where keys are the names of authors and values are lists of document IDs that the author + contributes to. + doc2author : dict of (int, list of str), optional A dictionary where the keys are document IDs and the values are lists of author names. rhot : float Value of rho for conducting inference on documents. collect_sstats : boolean, optional - If True, collect sufficient statistics needed to update the model's topic-word - distributions, and return a 2-tuple `(gamma_chunk, sstats)`. - Otherwise, returns `(gamma_chunk, None)`.`gamma_chunk` is of shape - `len(chunk_authors) x self.num_topics`,where `chunk_authors` is the - number of authors in the documents in the current chunk. - chunk_doc_idx : numpy.ndarray + If True - collect sufficient statistics needed to update the model's topic-word distributions, and return + `(gamma_chunk, sstats)`. Otherwise, return `(gamma_chunk, None)`. `gamma_chunk` is of shape + `len(chunk_authors) x self.num_topics`,where `chunk_authors` is the number of authors in the documents in + the current chunk. + chunk_doc_idx : numpy.ndarray, optional Assigns the value for document index. + Returns + ------- + (numpy.ndarray, numpy.ndarray) + gamma_chunk and sstats (if `collect_sstats == True`, otherwise - None) + """ try: len(chunk) @@ -534,32 +535,30 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c return gamma_chunk, sstats def do_estep(self, chunk, author2doc, doc2author, rhot, state=None, chunk_doc_idx=None): - """ - Performs inference on a chunk of documents, and accumulate the collected - sufficient statistics in `state` (or `self.state` if None). + """Performs inference (E-step) on a chunk of documents, and accumulate the collected sufficient statistics. Parameters ---------- - chunk : int - The chunk numer of the sparse document vector on which inference needs to be done. - author2doc : dict of (str, list of int) - A dictionary where keys are the names of authors and values are lists of - documents that the author contributes to. - doc2author : dict of (intm list of str) + chunk : iterable of list of (int, float) + Corpus in BoW format. + author2doc : dict of (str, list of int), optional + A dictionary where keys are the names of authors and values are lists of document IDs that the author + contributes to. + doc2author : dict of (int, list of str), optional A dictionary where the keys are document IDs and the values are lists of author names. rhot : float Value of rho for conducting inference on documents. state : int, optional - Initializes the state for a new E-M iteration. - chunk_doc_idx : numpy.ndarray + Initializes the state for a new E iteration. + chunk_doc_idx : numpy.ndarray, optional Assigns the value for document index. Returns ------- float Value of gamma for training of model. - """ + """ # TODO: this method is somewhat similar to the one in LdaModel. Refactor if possible. 
if state is None: state = self.state From 82a9e407420670d49ef9263fc715ecc3f186401d Mon Sep 17 00:00:00 2001 From: ivan Date: Tue, 3 Apr 2018 10:17:37 +0500 Subject: [PATCH 15/15] fix atmodel[4] --- gensim/models/atmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 78c51cf2df..9253a6a6d8 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -403,7 +403,7 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c Avoids computing the `phi` variational parameter directly using the optimization presented in `Lee, Seung: "Algorithms for non-negative matrix factorization", NIPS 2001 - _`. + `_. Parameters ----------