From c3bc1b8b2f6e6d7b34057e4bbd84b95e32c5aadf Mon Sep 17 00:00:00 2001
From: Jonathan Schneider
Date: Wed, 5 May 2021 19:20:43 +0200
Subject: [PATCH 1/4] Update link to Hoffman paper (online VB LDA)

The previous links pointed either to Matthew Hoffman's Google Scholar profile
or to an unofficial copy, not the official paper. Use full author names at the
first occurrence and the first author only afterward.
---
 gensim/models/ldamodel.py | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 52d9be3fcf..4b9de92490 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -13,9 +13,10 @@ for online training.
 
 The core estimation code is based on the `onlineldavb.py script
-<https://github.com/blei-lab/onlineldavb>`_, by `Hoffman, Blei, Bach:
+<https://github.com/blei-lab/onlineldavb>`_, by
+`Matthew D. Hoffman, David M. Blei, Francis Bach:
 Online Learning for Latent Dirichlet Allocation, NIPS 2010
-<...>`_.
+<https://papers.neurips.cc/paper/2010/file/71f6278d140af599e06ad9bf1ba03cb0-Paper.pdf>`_.
 
 The algorithm:
 
@@ -199,7 +200,7 @@ def blend(self, rhot, other, targetsize=None):
         The number of documents is stretched in both state objects, so that they are of comparable magnitude.
         This procedure corresponds to the stochastic gradient update from
         `Hoffman et al. :"Online Learning for Latent Dirichlet Allocation"
-        <...>`_, see equations (5) and (9).
+        <https://papers.neurips.cc/paper/2010/file/71f6278d140af599e06ad9bf1ba03cb0-Paper.pdf>`_, see equations (5) and (9).
 
         Parameters
         ----------
@@ -311,8 +312,9 @@ def load(cls, fname, *args, **kwargs):
 
 class LdaModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
-    """Train and use Online Latent Dirichlet Allocation (OLDA) models as presented in
-    `Hoffman et al. :"Online Learning for Latent Dirichlet Allocation" <...>`_.
+    """Train and use Online Latent Dirichlet Allocation models as presented in
+    `Hoffman et al. :"Online Learning for Latent Dirichlet Allocation"
+    <https://papers.neurips.cc/paper/2010/file/71f6278d140af599e06ad9bf1ba03cb0-Paper.pdf>`_.
 
     Examples
     --------
@@ -395,12 +397,12 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
         decay : float, optional
             A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten
             when each new document is examined. Corresponds to Kappa from
-            `Matthew D. Hoffman, David M. Blei, Francis Bach:
-            "Online Learning for Latent Dirichlet Allocation NIPS'10" <...>`_.
+            `Hoffman et al. :"Online Learning for Latent Dirichlet Allocation"
+            <https://papers.neurips.cc/paper/2010/file/71f6278d140af599e06ad9bf1ba03cb0-Paper.pdf>`_.
         offset : float, optional
             Hyper-parameter that controls how much we will slow down the first steps the first few iterations.
-            Corresponds to Tau_0 from `Matthew D. Hoffman, David M. Blei, Francis Bach:
-            "Online Learning for Latent Dirichlet Allocation NIPS'10" <...>`_.
+            Corresponds to Tau_0 from `Hoffman et al. :"Online Learning for Latent Dirichlet Allocation"
+            <https://papers.neurips.cc/paper/2010/file/71f6278d140af599e06ad9bf1ba03cb0-Paper.pdf>`_.
         eval_every : int, optional
             Log perplexity is estimated every that many updates. Setting this to one slows down training by ~2x.
         iterations : int, optional
@@ -643,7 +645,7 @@ def inference(self, chunk, collect_sstats=False):
         """Given a chunk of sparse document vectors, estimate gamma (parameters controlling the topic weights)
         for each document in the chunk.
 
-        This function does not modify the model The whole input chunk of document is assumed to fit in RAM;
+        This function does not modify the model. The whole input chunk of document is assumed to fit in RAM;
         chunking of a large corpus must be done earlier in the pipeline.
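The `decay` and `offset` parameters documented in the hunks above map onto the learning-rate
schedule of the Hoffman et al. paper: mini-batch number ``t`` is weighted by
:math:`\rho_t = (\tau_0 + t)^{-\kappa}`, with `decay` playing the role of :math:`\kappa` and
`offset` the role of :math:`\tau_0`. A minimal sketch of that schedule in plain Python; the
function name is illustrative only and not part of gensim's API:

.. sourcecode:: python

    def update_weight(offset, decay, t):
        """Weight rho_t = (offset + t) ** -decay given to mini-batch number t.

        Any decay in (0.5, 1] satisfies the Robbins-Monro conditions
        (the sum of rho_t diverges while the sum of rho_t ** 2 converges),
        which is why the docstrings can promise convergence on that interval.
        """
        return (offset + t) ** -decay

    # A larger offset down-weights the earliest mini-batches:
    print(update_weight(offset=1.0, decay=0.7, t=0))   # 1.0
    print(update_weight(offset=64.0, decay=0.7, t=0))  # ~0.054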
 
         Avoids computing the `phi` variational parameter directly using the
         optimization presented in `Lee, Seung: Algorithms for non-negative matrix factorization"
         <...>`_.
@@ -863,8 +865,8 @@ def update(self, corpus, chunksize=None, decay=None, offset=None,
         This update also supports updating an already trained model with new documents; the two models are then merged
         in proportion to the number of old vs. new documents. This feature is still experimental for non-stationary
         input streams. For stationary input (no topic drift in new documents), on the other hand, this equals the
-        online update of `Matthew D. Hoffman, David M. Blei, Francis Bach:
-        "Online Learning for Latent Dirichlet Allocation NIPS'10" <...>`_.
+        online update of `Hoffman et al. :"Online Learning for Latent Dirichlet Allocation"
+        <https://papers.neurips.cc/paper/2010/file/71f6278d140af599e06ad9bf1ba03cb0-Paper.pdf>`_
         and is guaranteed to converge for any `decay` in (0.5, 1.0). Additionally, for smaller corpus sizes, an
         increasing `offset` may be beneficial (see Table 1 in the same paper).
@@ -878,12 +880,12 @@
         decay : float, optional
             A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten
             when each new document is examined. Corresponds to Kappa from
-            `Matthew D. Hoffman, David M. Blei, Francis Bach:
-            "Online Learning for Latent Dirichlet Allocation NIPS'10" <...>`_.
+            `Hoffman et al. :"Online Learning for Latent Dirichlet Allocation"
+            <https://papers.neurips.cc/paper/2010/file/71f6278d140af599e06ad9bf1ba03cb0-Paper.pdf>`_.
         offset : float, optional
             Hyper-parameter that controls how much we will slow down the first steps the first few iterations.
-            Corresponds to Tau_0 from `Matthew D. Hoffman, David M. Blei, Francis Bach:
-            "Online Learning for Latent Dirichlet Allocation NIPS'10" <...>`_.
+            Corresponds to Tau_0 from `Hoffman et al. :"Online Learning for Latent Dirichlet Allocation"
+            <https://papers.neurips.cc/paper/2010/file/71f6278d140af599e06ad9bf1ba03cb0-Paper.pdf>`_.
         passes : int, optional
             Number of passes through the corpus during training.
         update_every : int, optional

From 53109fa5a80dfdd7a0c4b6f4c239663ab27786d0 Mon Sep 17 00:00:00 2001
From: Jonathan Schneider
Date: Tue, 11 May 2021 18:54:42 +0200
Subject: [PATCH 2/4] Refactor links to Hoffman paper and use latex symbols
 for original parameter names

---
 gensim/models/atmodel.py      | 32 ++++++++++++++---------
 gensim/models/ldamodel.py     | 48 +++++++++++++++++------------------
 gensim/models/ldamulticore.py | 28 ++++++++++----------
 3 files changed, 58 insertions(+), 50 deletions(-)

diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py
index 4c579b5b1e..c1d79c5244 100755
--- a/gensim/models/atmodel.py
+++ b/gensim/models/atmodel.py
@@ -23,6 +23,9 @@
 <...>`_. The model correlates the authorship information with the topics to give a better insight
 on the subject knowledge of an author.
 
+.. _'Online Learning for LDA' by Hoffman et al.: online-lda_
+.. _online-lda: https://papers.neurips.cc/paper/2010/file/71f6278d140af599e06ad9bf1ba03cb0-Paper.pdf
+
 Example
 -------
 .. sourcecode:: pycon
@@ -185,9 +188,12 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d
         iterations : int, optional
             Maximum number of times the model loops over each document.
         decay : float, optional
-            Controls how old documents are forgotten.
+            A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten
+            when each new document is examined. Corresponds to :math:`\\kappa` from
+            `'Online Learning for LDA' by Hoffman et al.`_
         offset : float, optional
-            Controls down-weighting of iterations.
+            Hyper-parameter that controls how much we will slow down the first steps the first few iterations.
+            Corresponds to :math:`\\tau_0` from `'Online Learning for LDA' by Hoffman et al.`_
         alpha : {float, numpy.ndarray of float, list of float, str}, optional
             A-priori belief on document-topic distribution, this can be:
                 * scalar for a symmetric prior over document-topic distribution,
@@ -618,15 +624,14 @@ def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None,
 
         Notes
         -----
-        This update also supports updating an already trained model (self)
-        with new documents from `corpus`: the two models are then merged in proportion to the number of old vs. new
-        documents. This feature is still experimental for non-stationary input streams.
+        This update also supports updating an already trained model (`self`) with new documents from `corpus`;
+        the two models are then merged in proportion to the number of old vs. new documents.
+        This feature is still experimental for non-stationary input streams.
 
-        For stationary input (no topic drift in new documents), on the other hand, this equals the online update of
-        `Hoffman et al. Stochastic Variational Inference
-        <...>`_ and is guaranteed to converge for any `decay`
-        in (0.5, 1.0>. Additionally, for smaller `corpus` sizes, an increasing `offset` may be beneficial (see
-        Table 1 in Hoffman et al.)
+        For stationary input (no topic drift in new documents), on the other hand, this equals the
+        online update of `'Online Learning for LDA' by Hoffman et al.`_
+        and is guaranteed to converge for any `decay` in (0.5, 1]. Additionally, for smaller corpus sizes, an
+        increasing `offset` may be beneficial (see Table 1 in the same paper).
 
         If update is called with authors that already exist in the model, it will resume training on not only new
         documents for that author, but also the previously seen documents. This is necessary for those authors' topic
@@ -653,9 +658,12 @@ def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None,
         chunksize : int, optional
             Controls the size of the mini-batches.
         decay : float, optional
-            Controls how old documents are forgotten.
+            A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten
+            when each new document is examined. Corresponds to :math:`\\kappa` from
+            `'Online Learning for LDA' by Hoffman et al.`_
         offset : float, optional
-            Controls down-weighting of iterations.
+            Hyper-parameter that controls how much we will slow down the first steps the first few iterations.
+            Corresponds to :math:`\\tau_0` from `'Online Learning for LDA' by Hoffman et al.`_
         passes : int, optional
             Number of times the model makes a pass over the entire training data.
         update_every : int, optional

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 4b9de92490..1a44407251 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -14,9 +14,12 @@
 The core estimation code is based on the `onlineldavb.py script
 <https://github.com/blei-lab/onlineldavb>`_, by
-`Matthew D. Hoffman, David M. Blei, Francis Bach:
-Online Learning for Latent Dirichlet Allocation, NIPS 2010
-<https://papers.neurips.cc/paper/2010/file/71f6278d140af599e06ad9bf1ba03cb0-Paper.pdf>`_.
+Matthew D. Hoffman, David M. Blei, Francis Bach:
+`'Online Learning for Latent Dirichlet Allocation', NIPS 2010`_.
+
+.. _'Online Learning for Latent Dirichlet Allocation', NIPS 2010: online-lda_
+.. _'Online Learning for LDA' by Hoffman et al.: online-lda_
+.. _online-lda: https://papers.neurips.cc/paper/2010/file/71f6278d140af599e06ad9bf1ba03cb0-Paper.pdf
 
 The algorithm:
 
@@ -199,8 +202,7 @@ def blend(self, rhot, other, targetsize=None):
         The number of documents is stretched in both state objects, so that they are of comparable magnitude.
         This procedure corresponds to the stochastic gradient update from
-        `Hoffman et al. :"Online Learning for Latent Dirichlet Allocation"
-        <https://papers.neurips.cc/paper/2010/file/71f6278d140af599e06ad9bf1ba03cb0-Paper.pdf>`_, see equations (5) and (9).
+        `'Online Learning for LDA' by Hoffman et al.`_, see equations (5) and (9).
 
         Parameters
         ----------
@@ -312,9 +314,7 @@ def load(cls, fname, *args, **kwargs):
 
 class LdaModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
-    """Train and use Online Latent Dirichlet Allocation models as presented in
-    `Hoffman et al. :"Online Learning for Latent Dirichlet Allocation"
-    <https://papers.neurips.cc/paper/2010/file/71f6278d140af599e06ad9bf1ba03cb0-Paper.pdf>`_.
+    """Train and use Online Latent Dirichlet Allocation models as presented in `'Online Learning for LDA' by Hoffman et al.`_
 
     Examples
     --------
@@ -396,13 +396,11 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
             * 'auto': Learns an asymmetric prior from the corpus.
         decay : float, optional
             A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten
-            when each new document is examined. Corresponds to Kappa from
-            `Hoffman et al. :"Online Learning for Latent Dirichlet Allocation"
-            <https://papers.neurips.cc/paper/2010/file/71f6278d140af599e06ad9bf1ba03cb0-Paper.pdf>`_.
+            when each new document is examined.
+            Corresponds to :math:`\\kappa` from `'Online Learning for LDA' by Hoffman et al.`_
         offset : float, optional
             Hyper-parameter that controls how much we will slow down the first steps the first few iterations.
-            Corresponds to Tau_0 from `Hoffman et al. :"Online Learning for Latent Dirichlet Allocation"
-            <https://papers.neurips.cc/paper/2010/file/71f6278d140af599e06ad9bf1ba03cb0-Paper.pdf>`_.
+            Corresponds to :math:`\\tau_0` from `'Online Learning for LDA' by Hoffman et al.`_
         eval_every : int, optional
             Log perplexity is estimated every that many updates. Setting this to one slows down training by ~2x.
         iterations : int, optional
@@ -862,13 +860,15 @@ def update(self, corpus, chunksize=None, decay=None, offset=None,
 
         Notes
         -----
-        This update also supports updating an already trained model with new documents; the two models are then merged
-        in proportion to the number of old vs. new documents. This feature is still experimental for non-stationary
-        input streams. For stationary input (no topic drift in new documents), on the other hand, this equals the
-        online update of `Hoffman et al. :"Online Learning for Latent Dirichlet Allocation"
-        <https://papers.neurips.cc/paper/2010/file/71f6278d140af599e06ad9bf1ba03cb0-Paper.pdf>`_
-        and is guaranteed to converge for any `decay` in (0.5, 1.0). Additionally, for smaller corpus sizes, an
-        increasing `offset` may be beneficial (see Table 1 in the same paper).
+        This update also supports updating an already trained model (`self`) with new documents from `corpus`;
+        the two models are then merged in proportion to the number of old vs. new documents.
+        This feature is still experimental for non-stationary input streams.
+
+        For stationary input (no topic drift in new documents), on the other hand,
+        this equals the online update of `'Online Learning for LDA' by Hoffman et al.`_
+        and is guaranteed to converge for any `decay` in (0.5, 1].
+        Additionally, for smaller corpus sizes,
+        an increasing `offset` may be beneficial (see Table 1 in the same paper).
 
         Parameters
         ----------
@@ -879,13 +879,11 @@ def update(self, corpus, chunksize=None, decay=None, offset=None,
             Number of documents to be used in each training chunk.
         decay : float, optional
             A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten
-            when each new document is examined. Corresponds to Kappa from
-            `Hoffman et al. :"Online Learning for Latent Dirichlet Allocation"
-            <https://papers.neurips.cc/paper/2010/file/71f6278d140af599e06ad9bf1ba03cb0-Paper.pdf>`_.
+            when each new document is examined. Corresponds to :math:`\\kappa` from
+            `'Online Learning for LDA' by Hoffman et al.`_
         offset : float, optional
             Hyper-parameter that controls how much we will slow down the first steps the first few iterations.
-            Corresponds to Tau_0 from `Hoffman et al. :"Online Learning for Latent Dirichlet Allocation"
-            <https://papers.neurips.cc/paper/2010/file/71f6278d140af599e06ad9bf1ba03cb0-Paper.pdf>`_.
+            Corresponds to :math:`\\tau_0` from `'Online Learning for LDA' by Hoffman et al.`_
         passes : int, optional
             Number of passes through the corpus during training.
         update_every : int, optional

diff --git a/gensim/models/ldamulticore.py b/gensim/models/ldamulticore.py
index 6e5479972a..fdb5ce70a9 100644
--- a/gensim/models/ldamulticore.py
+++ b/gensim/models/ldamulticore.py
@@ -38,8 +38,13 @@
 unseen documents. The model can also be updated with new documents
 for online training.
 
 The core estimation code is based on the `onlineldavb.py script
-<https://github.com/blei-lab/onlineldavb>`_, by `Hoffman, Blei, Bach:
-Online Learning for Latent Dirichlet Allocation, NIPS 2010 <...>`_.
+<https://github.com/blei-lab/onlineldavb>`_, by
+Matthew D. Hoffman, David M. Blei, Francis Bach:
+`'Online Learning for Latent Dirichlet Allocation', NIPS 2010`_.
+
+.. _'Online Learning for Latent Dirichlet Allocation', NIPS 2010: online-lda_
+.. _'Online Learning for LDA' by Hoffman et al.: online-lda_
+.. _online-lda: https://papers.neurips.cc/paper/2010/file/71f6278d140af599e06ad9bf1ba03cb0-Paper.pdf
 
 Usage examples
 --------------
@@ -147,13 +152,11 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
             * 'auto': Learns an asymmetric prior from the corpus.
         decay : float, optional
             A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten
-            when each new document is examined. Corresponds to Kappa from
-            `Matthew D. Hoffman, David M. Blei, Francis Bach:
-            "Online Learning for Latent Dirichlet Allocation NIPS'10" <...>`_.
+            when each new document is examined. Corresponds to :math:`\\kappa` from
+            `'Online Learning for LDA' by Hoffman et al.`_
         offset : float, optional
             Hyper-parameter that controls how much we will slow down the first steps the first few iterations.
-            Corresponds to Tau_0 from `Matthew D. Hoffman, David M. Blei, Francis Bach:
-            "Online Learning for Latent Dirichlet Allocation NIPS'10" <...>`_.
+            Corresponds to :math:`\\tau_0` from `'Online Learning for LDA' by Hoffman et al.`_
         eval_every : int, optional
             Log perplexity is estimated every that many updates. Setting this to one slows down training by ~2x.
         iterations : int, optional
@@ -198,14 +201,13 @@ def update(self, corpus, chunks_as_numpy=False):
 
         Notes
         -----
-        This update also supports updating an already trained model (`self`)
-        with new documents from `corpus`; the two models are then merged in
-        proportion to the number of old vs. new documents. This feature is still
-        experimental for non-stationary input streams.
+        This update also supports updating an already trained model (`self`) with new documents from `corpus`;
+        the two models are then merged in proportion to the number of old vs. new documents.
+        This feature is still experimental for non-stationary input streams.
 
         For stationary input (no topic drift in new documents), on the other hand,
-        this equals the online update of Hoffman et al. and is guaranteed to
-        converge for any `decay` in (0.5, 1.0>.
+        this equals the online update of `'Online Learning for LDA' by Hoffman et al.`_
+        and is guaranteed to converge for any `decay` in (0.5, 1].
 
         Parameters
         ----------

From adb74a3620b245d7528e65f1874de8072f1321db Mon Sep 17 00:00:00 2001
From: Jonathan Schneider
Date: Wed, 12 May 2021 10:25:47 +0200
Subject: [PATCH 3/4] Fix flake8

---
 gensim/models/ldamodel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 1a44407251..6691ddcc31 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -314,7 +314,7 @@ def load(cls, fname, *args, **kwargs):
 
 class LdaModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
-    """Train and use Online Latent Dirichlet Allocation models as presented in `'Online Learning for LDA' by Hoffman et al.`_
+    """Train and use Online Latent Dirichlet Allocation model as presented in `'Online Learning for LDA' by Hoffman et al.`_
 
     Examples
     --------

From 17a0e597c7f8a62b5aa87bee026bc7f47a280456 Mon Sep 17 00:00:00 2001
From: Michael Penkov
Date: Wed, 12 May 2021 22:32:34 +0900
Subject: [PATCH 4/4] Update CHANGELOG.md

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 40cc7498c2..48c4aebb4a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ Changes
 - Improve & unify docs for dirichlet priors (PR [#3125](https://github.com/RaRe-Technologies/gensim/pull/3125), [@jonaschn](https://github.com/jonaschn))
 - Materialize and copy the corpus passed to SoftCosineSimilarity (PR [#3128](https://github.com/RaRe-Technologies/gensim/pull/3128), [@Witiko](https://github.com/Witiko))
 - [#3115](https://github.com/RaRe-Technologies/gensim/pull/3115): Make LSI dispatcher CLI param for number of jobs optional, by [@robguinness](https://github.com/robguinness))
+- Update link to Hoffman paper (online VB LDA) (PR [#3133](https://github.com/RaRe-Technologies/gensim/pull/3133), [@jonaschn](https://github.com/jonaschn))
 
 ### Documentation
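As a companion to the docstrings this patch series touches, here is a minimal usage sketch of
training an `LdaModel` and then updating it online with new documents, as described in the
update() notes above. The toy corpus and the parameter values are invented purely for
illustration; only the public gensim calls shown are assumed:

.. sourcecode:: python

    from gensim.corpora import Dictionary
    from gensim.models import LdaModel

    # Toy corpus, invented for illustration only.
    docs = [["human", "computer", "interaction"],
            ["graph", "trees", "minors"],
            ["graph", "minors", "survey"]]
    dictionary = Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    # decay corresponds to kappa and offset to tau_0 in Hoffman et al.;
    # any decay in (0.5, 1] keeps the online update convergent.
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2,
                   decay=0.7, offset=1.0, passes=2)

    # Online update with new documents: the old and new models are merged
    # in proportion to the number of old vs. new documents.
    new_docs = [["human", "graph", "survey"]]
    lda.update([dictionary.doc2bow(doc) for doc in new_docs])
    print(lda.print_topics())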