From fc3102697716d77892a34adcfb83fdfb398a28f4 Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Wed, 31 May 2017 01:14:21 +0530
Subject: [PATCH 01/15] added topic coherence logging

---
 gensim/models/ldamodel.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 67398ab099..525003ee9f 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -39,6 +39,8 @@
 from gensim.matutils import dirichlet_expectation
 from gensim.models import basemodel
 
+import gensim
+
 from itertools import chain
 from scipy.special import gammaln, psi  # gamma function utils
 from scipy.special import polygamma
@@ -527,6 +529,12 @@ def log_perplexity(self, chunk, total_docs=None):
                     (perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words))
         return perwordbound
 
+    def log_coherence(self, model, chunk):
+        cm = gensim.models.CoherenceModel(model=model, corpus=chunk, coherence='u_mass')
+        corpus_words = sum(cnt for document in chunk for _, cnt in document)
+        logger.info("%.3f coherence estimate based on a held-out corpus of %i documents with %i words", cm.get_coherence(), len(chunk), corpus_words)
+        return cm.get_coherence()
+
     def update(self, corpus, chunksize=None, decay=None, offset=None,
                passes=None, update_every=None, eval_every=None, iterations=None,
                gamma_threshold=None, chunks_as_numpy=False):
@@ -634,6 +642,7 @@ def rho():
             if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
                 self.log_perplexity(chunk, total_docs=lencorpus)
+                self.log_coherence(self, chunk)
 
             if self.dispatcher:
                 # add the chunk to dispatcher's job queue, so workers can munch on it
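The `log_coherence` helper added above delegates to gensim's existing `CoherenceModel`. For reference, a minimal standalone sketch of the same `'u_mass'` computation outside of training; the toy `texts` data here is illustrative only and not part of the patch:

    from gensim.corpora import Dictionary
    from gensim.models import LdaModel, CoherenceModel

    # toy tokenized corpus (illustrative)
    texts = [['human', 'interface', 'computer'],
             ['survey', 'user', 'computer', 'system', 'response', 'time'],
             ['eps', 'user', 'interface', 'system'],
             ['graph', 'minors', 'trees'],
             ['graph', 'minors', 'survey']]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2)
    # 'u_mass' is estimated from document co-occurrence statistics, so a
    # bag-of-words corpus is enough; no tokenized texts are required
    cm = CoherenceModel(model=lda, corpus=corpus, coherence='u_mass')
    print(cm.get_coherence())  # a single float; values closer to 0 are better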
From ff460a5c3dec0af0b4ce1381aa7e55d57b79fdc4 Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Thu, 1 Jun 2017 02:22:21 +0530
Subject: [PATCH 02/15] make coherence measure optional

---
 gensim/models/ldamodel.py | 56 +++++++++++++++++++++++++++++++++++----
 1 file changed, 51 insertions(+), 5 deletions(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 525003ee9f..20cbd810c8 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -195,7 +195,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
                  alpha='symmetric', eta=None, decay=0.5, offset=1.0,
                  eval_every=10, iterations=50, gamma_threshold=0.001,
                  minimum_probability=0.01, random_state=None, ns_conf={},
-                 minimum_phi_value=0.01, per_word_topics=False):
+                 minimum_phi_value=0.01, per_word_topics=False, texts=None,
+                 coherence='u_mass', window_size=None, topn=10):
         """
         If given, start training from the iterable `corpus` straight away. If not given,
         the model is left untrained (presumably because you want to call `update()` manually).
@@ -239,6 +240,31 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
 
         `random_state` can be a np.random.RandomState object or the seed for one
 
+        `texts` : Tokenized texts. Needed for coherence models that use sliding window based probability estimator, eg::
+            texts = [['system', 'human', 'system', 'eps'],
+                     ['user', 'response', 'time'],
+                     ['trees'],
+                     ['graph', 'trees'],
+                     ['graph', 'minors', 'trees'],
+                     ['graph', 'minors', 'survey']]
+
+        `coherence` : Coherence measure to be used. Supported values are:
+            'u_mass'
+            'c_v'
+            'c_uci' also popularly known as c_pmi
+            'c_npmi'
+            For 'u_mass' corpus should be provided. If texts is provided, it will be converted to corpus using the dictionary.
+            For 'c_v', 'c_uci' and 'c_npmi' texts should be provided. Corpus is not needed.
+
+        `window_size` : Is the size of the window to be used for coherence measures using boolean sliding window as their
+            probability estimator. For 'u_mass' this doesn't matter.
+            If left 'None' the default window sizes are used which are:
+            'c_v' : 110
+            'c_uci' : 10
+            'c_npmi' : 10
+
+        `topn` Integer corresponding to the number of top words to be extracted from each topic.
+
         Example:
 
         >>> lda = LdaModel(corpus, num_topics=100)  # train model
@@ -281,6 +307,11 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
         self.minimum_phi_value = minimum_phi_value
         self.per_word_topics = per_word_topics
 
+        self.texts = texts
+        self.coherence = coherence
+        self.window_size = window_size
+        self.topn = topn
+
         self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')
 
         assert self.alpha.shape == (self.num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)
@@ -529,15 +560,16 @@ def log_perplexity(self, chunk, total_docs=None):
                     (perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words))
         return perwordbound
 
-    def log_coherence(self, model, chunk):
-        cm = gensim.models.CoherenceModel(model=model, corpus=chunk, coherence='u_mass')
+    def log_coherence(self, model, chunk, texts, coherence, window_size, topn):
+        cm = gensim.models.CoherenceModel(model=model, corpus=chunk, texts=texts, window_size=window_size, coherence=coherence, topn=topn)
         corpus_words = sum(cnt for document in chunk for _, cnt in document)
         logger.info("%.3f coherence estimate based on a held-out corpus of %i documents with %i words", cm.get_coherence(), len(chunk), corpus_words)
         return cm.get_coherence()
 
     def update(self, corpus, chunksize=None, decay=None, offset=None,
                passes=None, update_every=None, eval_every=None, iterations=None,
-               gamma_threshold=None, chunks_as_numpy=False):
+               gamma_threshold=None, chunks_as_numpy=False, texts=None,
+               coherence=None, window_size=None, topn=None):
         """
         Train the model with new documents, by EM-iterating over `corpus` until
         the topics converge (or until the maximum number of allowed iterations
@@ -584,6 +616,14 @@ def update(self, corpus, chunksize=None, decay=None, offset=None,
             iterations = self.iterations
         if gamma_threshold is None:
             gamma_threshold = self.gamma_threshold
+        if texts is None:
+            texts = self.texts
+        if coherence is None:
+            coherence = self.coherence
+        if window_size is None:
+            window_size = self.window_size
+        if topn is None:
+            topn = self.topn
 
         try:
             lencorpus = len(corpus)
@@ -642,7 +682,13 @@ def rho():
             if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
                 self.log_perplexity(chunk, total_docs=lencorpus)
-                self.log_coherence(self, chunk)
+                if self.texts is not None:
+                    init = (chunk_no + 1) * chunksize - chunksize
+                    end = init + chunksize
+                    if end > lencorpus:
+                        end = lencorpus
+                    texts = self.texts[init:end]
+                self.log_coherence(self, chunk, texts, coherence, window_size, topn)
 
             if self.dispatcher:
                 # add the chunk to dispatcher's job queue, so workers can munch on it

From 33cc2467ce1b45999d3154a22490c1e8cd375689 Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Thu, 1 Jun 2017 23:19:12 +0530
Subject: [PATCH 03/15] added comment for texts

---
 gensim/models/ldamodel.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 20cbd810c8..043f1046b0 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -682,11 +682,13 @@ def rho():
             if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
                 self.log_perplexity(chunk, total_docs=lencorpus)
+                # texts input is needed for sliding window based coherence measures (c_v, c_uci, c_npmi)
                 if self.texts is not None:
                     init = (chunk_no + 1) * chunksize - chunksize
                     end = init + chunksize
                     if end > lencorpus:
                         end = lencorpus
+                    # texts subarray corresponding to chunk
                     texts = self.texts[init:end]
                 self.log_coherence(self, chunk, texts, coherence, window_size, topn)
 
             if self.dispatcher:
                 # add the chunk to dispatcher's job queue, so workers can munch on it

From c0f5c26278bacdffe722aa15dedc3202cb6634dd Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Mon, 5 Jun 2017 22:53:07 +0530
Subject: [PATCH 04/15] log 'u_mass' only

---
 gensim/models/ldamodel.py | 68 +++++++--------------------------------
 1 file changed, 11 insertions(+), 57 deletions(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 043f1046b0..aef2168c3f 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -34,13 +34,12 @@
 import numpy as np
 import numbers
 import os
+import gensim
 
 from gensim import interfaces, utils, matutils
 from gensim.matutils import dirichlet_expectation
 from gensim.models import basemodel
 
-import gensim
-
 from itertools import chain
 from scipy.special import gammaln, psi  # gamma function utils
 from scipy.special import polygamma
@@ -195,8 +194,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
                  alpha='symmetric', eta=None, decay=0.5, offset=1.0,
                  eval_every=10, iterations=50, gamma_threshold=0.001,
                  minimum_probability=0.01, random_state=None, ns_conf={},
-                 minimum_phi_value=0.01, per_word_topics=False, texts=None,
-                 coherence='u_mass', window_size=None, topn=10):
+                 minimum_phi_value=0.01, per_word_topics=False):
         """
         If given, start training from the iterable `corpus` straight away. If not given,
         the model is left untrained (presumably because you want to call `update()` manually).
@@ -240,31 +238,6 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
 
         `random_state` can be a np.random.RandomState object or the seed for one
 
-        `texts` : Tokenized texts. Needed for coherence models that use sliding window based probability estimator, eg::
-            texts = [['system', 'human', 'system', 'eps'],
-                     ['user', 'response', 'time'],
-                     ['trees'],
-                     ['graph', 'trees'],
-                     ['graph', 'minors', 'trees'],
-                     ['graph', 'minors', 'survey']]
-
-        `coherence` : Coherence measure to be used. Supported values are:
-            'u_mass'
-            'c_v'
-            'c_uci' also popularly known as c_pmi
-            'c_npmi'
-            For 'u_mass' corpus should be provided. If texts is provided, it will be converted to corpus using the dictionary.
-            For 'c_v', 'c_uci' and 'c_npmi' texts should be provided. Corpus is not needed.
-
-        `window_size` : Is the size of the window to be used for coherence measures using boolean sliding window as their
-            probability estimator. For 'u_mass' this doesn't matter.
-            If left 'None' the default window sizes are used which are:
-            'c_v' : 110
-            'c_uci' : 10
-            'c_npmi' : 10
-
-        `topn` Integer corresponding to the number of top words to be extracted from each topic.
-
         Example:
 
         >>> lda = LdaModel(corpus, num_topics=100)  # train model
@@ -307,11 +280,6 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
         self.minimum_phi_value = minimum_phi_value
         self.per_word_topics = per_word_topics
 
-        self.texts = texts
-        self.coherence = coherence
-        self.window_size = window_size
-        self.topn = topn
-
         self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')
 
         assert self.alpha.shape == (self.num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)
@@ -560,16 +528,18 @@ def log_perplexity(self, chunk, total_docs=None):
                     (perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words))
         return perwordbound
 
-    def log_coherence(self, model, chunk, texts, coherence, window_size, topn):
-        cm = gensim.models.CoherenceModel(model=model, corpus=chunk, texts=texts, window_size=window_size, coherence=coherence, topn=topn)
+    def log_coherence(self, model, chunk):
+        """
+        Log 'u_mass' coherence using the `chunk` of documents as evaluation corpus.
+        """
+        cm = gensim.models.CoherenceModel(model=model, corpus=chunk, coherence='u_mass')
         corpus_words = sum(cnt for document in chunk for _, cnt in document)
         logger.info("%.3f coherence estimate based on a held-out corpus of %i documents with %i words", cm.get_coherence(), len(chunk), corpus_words)
         return cm.get_coherence()
 
     def update(self, corpus, chunksize=None, decay=None, offset=None,
               passes=None, update_every=None, eval_every=None, iterations=None,
-              gamma_threshold=None, chunks_as_numpy=False, texts=None,
-              coherence=None, window_size=None, topn=None):
+              gamma_threshold=None, chunks_as_numpy=False):
         """
         Train the model with new documents, by EM-iterating over `corpus` until
         the topics converge (or until the maximum number of allowed iterations
@@ -616,14 +586,6 @@ def update(self, corpus, chunksize=None, decay=None, offset=None,
             iterations = self.iterations
         if gamma_threshold is None:
             gamma_threshold = self.gamma_threshold
-        if texts is None:
-            texts = self.texts
-        if coherence is None:
-            coherence = self.coherence
-        if window_size is None:
-            window_size = self.window_size
-        if topn is None:
-            topn = self.topn
 
         try:
             lencorpus = len(corpus)
@@ -651,8 +613,8 @@ def update(self, corpus, chunksize=None, decay=None, offset=None,
         logger.info(
             "running %s LDA training, %s topics, %i passes over "
             "the supplied corpus of %i documents, updating model once "
-            "every %i documents, evaluating perplexity every %i documents, "
-            "iterating %ix with a convergence threshold of %f",
+            "every %i documents, evaluating perplexity and coherence "
+            "every %i documents, iterating %ix with a convergence threshold of %f",
             updatetype, self.num_topics, passes, lencorpus,
             updateafter, evalafter, iterations,
             gamma_threshold)
@@ -682,15 +644,7 @@ def rho():
             if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
                 self.log_perplexity(chunk, total_docs=lencorpus)
-                # texts input is needed for sliding window based coherence measures (c_v, c_uci, c_npmi)
-                if self.texts is not None:
-                    init = (chunk_no + 1) * chunksize - chunksize
-                    end = init + chunksize
-                    if end > lencorpus:
-                        end = lencorpus
-                    # texts subarray corresponding to chunk
-                    texts = self.texts[init:end]
-                self.log_coherence(self, chunk, texts, coherence, window_size, topn)
+                self.log_coherence(self, chunk)
 
             if self.dispatcher:
                 # add the chunk to dispatcher's job queue, so workers can munch on it

From ad0d1aa14e0da6a6330a57ad32705caaa96222c7 Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Tue, 6 Jun 2017 00:36:47 +0530
Subject: [PATCH 05/15] update coherence_tutorial with logging description

---
 docs/notebooks/topic_coherence_tutorial.ipynb | 164 +++++++++++-------
 1 file changed, 102 insertions(+), 62 deletions(-)

diff --git a/docs/notebooks/topic_coherence_tutorial.ipynb b/docs/notebooks/topic_coherence_tutorial.ipynb
index ea2cf4ef7e..848daa0a23 100644
--- a/docs/notebooks/topic_coherence_tutorial.ipynb
+++ b/docs/notebooks/topic_coherence_tutorial.ipynb
@@ -23,9 +23,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {
-    "collapsed": false
+    "collapsed": true
    },
    "outputs": [],
    "source": [
@@ -63,9 +63,9 @@
    },
    "outputs": [],
    "source": [
-    "logger = logging.getLogger()\n",
-    "logger.setLevel(logging.DEBUG)\n",
-    "logging.debug(\"test\")"
+    "logging.basicConfig(level=logging.DEBUG)\n",
+    "logger = logging.getLogger(__name__)\n",
+    "logger.setLevel(logging.DEBUG)"
    ]
   },
   {
@@ -104,10 +104,17 @@
   {
    "cell_type": "code",
    "execution_count": 4,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])\n",
+      "INFO:gensim.corpora.dictionary:built Dictionary(12 unique tokens: ['graph', 'system', 'user', 'minors', 'human']...) from 9 documents (total 29 corpus positions)\n"
+     ]
+    }
+   ],
    "source": [
    "dictionary = Dictionary(texts)\n",
    "corpus = [dictionary.doc2bow(text) for text in texts]"
@@ -130,15 +137,72 @@
   {
    "cell_type": "code",
    "execution_count": 5,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:gensim.models.ldamodel:using symmetric alpha at 0.5\n",
+      "INFO:gensim.models.ldamodel:using symmetric eta at 0.08333333333333333\n",
+      "INFO:gensim.models.ldamodel:using serial LDA version on this node\n",
+      "INFO:gensim.models.ldamodel:running online LDA training, 2 topics, 1 passes over the supplied corpus of 9 documents, updating model once every 9 documents, evaluating perplexity and coherence every 9 documents, iterating 50x with a convergence threshold of 0.001000\n",
+      "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n",
+      "DEBUG:gensim.models.ldamodel:bound: at document #0\n",
+      "INFO:gensim.models.ldamodel:-3.297 per-word bound, 9.8 perplexity estimate based on a held-out corpus of 9 documents with 29 words\n",
+      "INFO:gensim.models.ldamodel:-15.478 coherence estimate based on a held-out corpus of 9 documents with 29 words\n",
+      "INFO:gensim.models.ldamodel:PROGRESS: pass 0, at document #9/9\n",
+      "DEBUG:gensim.models.ldamodel:performing inference on a chunk of 9 documents\n",
+      "DEBUG:gensim.models.ldamodel:6/9 documents converged within 50 iterations\n",
+      "DEBUG:gensim.models.ldamodel:updating topics\n",
+      "INFO:gensim.models.ldamodel:topic #0 (0.500): 0.169*\"system\" + 0.120*\"human\" + 0.098*\"eps\" + 0.094*\"interface\" + 0.091*\"computer\" + 0.071*\"user\" + 0.067*\"graph\" + 0.062*\"trees\" + 0.062*\"time\" + 0.061*\"minors\"\n",
+      "INFO:gensim.models.ldamodel:topic #1 (0.500): 0.129*\"trees\" + 0.125*\"graph\" + 0.121*\"user\" + 0.098*\"response\" + 0.086*\"survey\" + 0.084*\"minors\" + 0.083*\"time\" + 0.080*\"system\" + 0.057*\"computer\" + 0.055*\"interface\"\n",
+      "INFO:gensim.models.ldamodel:topic diff=0.420882, rho=1.000000\n"
+     ]
+    }
+   ],
+   "source": [
+    "goodLdaModel = LdaModel(corpus=corpus, id2word=dictionary, iterations=50, num_topics=2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:gensim.models.ldamodel:using symmetric alpha at 0.5\n",
+      "INFO:gensim.models.ldamodel:using symmetric eta at 0.08333333333333333\n",
+      "INFO:gensim.models.ldamodel:using serial LDA version on this node\n",
+      "INFO:gensim.models.ldamodel:running online LDA training, 2 topics, 1 passes over the supplied corpus of 9 documents, updating model once every 9 documents, evaluating perplexity and coherence every 9 documents, iterating 1x with a convergence threshold of 0.001000\n",
+      "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n",
+      "DEBUG:gensim.models.ldamodel:bound: at document #0\n",
+      "INFO:gensim.models.ldamodel:-3.317 per-word bound, 10.0 perplexity estimate based on a held-out corpus of 9 documents with 29 words\n",
+      "INFO:gensim.models.ldamodel:-16.260 coherence estimate based on a held-out corpus of 9 documents with 29 words\n",
+      "INFO:gensim.models.ldamodel:PROGRESS: pass 0, at document #9/9\n",
+      "DEBUG:gensim.models.ldamodel:performing inference on a chunk of 9 documents\n",
+      "DEBUG:gensim.models.ldamodel:0/9 documents converged within 1 iterations\n",
+      "DEBUG:gensim.models.ldamodel:updating topics\n",
+      "INFO:gensim.models.ldamodel:topic #0 (0.500): 0.138*\"system\" + 0.111*\"graph\" + 0.098*\"trees\" + 0.082*\"user\" + 0.081*\"survey\" + 0.075*\"interface\" + 0.073*\"time\" + 0.072*\"minors\" + 0.072*\"human\" + 0.069*\"eps\"\n",
+      "INFO:gensim.models.ldamodel:topic #1 (0.500): 0.114*\"user\" + 0.105*\"system\" + 0.097*\"trees\" + 0.086*\"response\" + 0.083*\"graph\" + 0.079*\"computer\" + 0.077*\"eps\" + 0.075*\"human\" + 0.074*\"minors\" + 0.073*\"time\"\n",
+      "INFO:gensim.models.ldamodel:topic diff=0.255033, rho=1.000000\n"
+     ]
+    }
+   ],
    "source": [
-    "goodLdaModel = LdaModel(corpus=corpus, id2word=dictionary, iterations=50, num_topics=2)\n",
    "badLdaModel = LdaModel(corpus=corpus, id2word=dictionary, iterations=1, num_topics=2)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now, the LdaModel in gensim also logs the evaluation parameters `perplexity` and `coherence` according to `eval_every`. These parameter values can be used to monitor the LDA training and evaluate how the topics are improving during training."
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -148,9 +212,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 7,
    "metadata": {
-    "collapsed": false
+    "collapsed": true
    },
    "outputs": [],
    "source": [
@@ -159,7 +223,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 8,
    "metadata": {
    "collapsed": true
   },
@@ -185,9 +249,7 @@
   {
    "cell_type": "code",
    "execution_count": 16,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
    {
     "name": "stdout",
@@ -234,7 +296,7 @@
    "cell_type": "code",
    "execution_count": 17,
    "metadata": {
-    "collapsed": false
+    "collapsed": true
    },
    "outputs": [],
    "source": [
@@ -244,9 +306,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 18,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
    {
     "data": {
@@ -373,9 +433,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 19,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
    {
     "data": {
@@ -503,9 +561,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 20,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
    {
     "name": "stdout",
@@ -522,9 +578,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 21,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
@@ -577,9 +631,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 27,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
    {
     "name": "stdout",
@@ -603,9 +655,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 28,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
    {
     "name": "stdout",
@@ -622,9 +672,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 29,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
    {
     "name": "stdout",
@@ -656,7 +704,7 @@
    "cell_type": "code",
    "execution_count": 5,
    "metadata": {
-    "collapsed": false
+    "collapsed": true
    },
    "outputs": [],
    "source": [
@@ -679,9 +727,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 8,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
    {
     "name": "stdout",
@@ -701,7 +747,7 @@
    "cell_type": "code",
    "execution_count": 20,
    "metadata": {
-    "collapsed": false
+    "collapsed": true
    },
    "outputs": [],
    "source": [
@@ -724,9 +770,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 22,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
    {
     "name": "stdout",
@@ -779,9 +823,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 9,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
    {
     "data": {
@@ -836,9 +878,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 11,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
    {
     "data": {
@@ -872,23 +912,23 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 2",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "python2"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
-    "version": 2
+    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.11"
+   "pygments_lexer": "ipython3",
+   "version": "3.4.3"
   }
  },
 "nbformat": 4,
- "nbformat_minor": 0
+ "nbformat_minor": 1
 }
From 940c7b775f824e49362c7d970080dc0267d5ed37 Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Tue, 6 Jun 2017 01:34:23 +0530
Subject: [PATCH 06/15] add id2word parameter

---
 gensim/models/ldamodel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index aef2168c3f..9fdb2dc0f1 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -532,7 +532,7 @@ def log_coherence(self, model, chunk):
         """
         Log 'u_mass' coherence using the `chunk` of documents as evaluation corpus.
         """
-        cm = gensim.models.CoherenceModel(model=model, corpus=chunk, coherence='u_mass')
+        cm = gensim.models.CoherenceModel(model=model, corpus=chunk, dictionary=self.id2word, coherence='u_mass')
         corpus_words = sum(cnt for document in chunk for _, cnt in document)
         logger.info("%.3f coherence estimate based on a held-out corpus of %i documents with %i words", cm.get_coherence(), len(chunk), corpus_words)
         return cm.get_coherence()

From d47d7facffe33abbcea1664edb197a46c835da4c Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Fri, 9 Jun 2017 17:15:21 +0530
Subject: [PATCH 07/15] made requested changes

---
 gensim/models/ldamodel.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 9fdb2dc0f1..9fbc516553 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -528,14 +528,15 @@ def log_perplexity(self, chunk, total_docs=None):
                     (perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words))
         return perwordbound
 
-    def log_coherence(self, model, chunk):
+    def log_coherence(self, chunk):
         """
         Log 'u_mass' coherence using the `chunk` of documents as evaluation corpus.
         """
-        cm = gensim.models.CoherenceModel(model=model, corpus=chunk, dictionary=self.id2word, coherence='u_mass')
+        cm = gensim.models.CoherenceModel(model=self, corpus=chunk, dictionary=self.id2word, coherence='u_mass')
+        coherence = cm.get_coherence() 
         corpus_words = sum(cnt for document in chunk for _, cnt in document)
-        logger.info("%.3f coherence estimate based on a held-out corpus of %i documents with %i words", cm.get_coherence(), len(chunk), corpus_words)
-        return cm.get_coherence()
+        logger.info("%.3f coherence estimate based on a held-out corpus of %i documents with %i words", coherence, len(chunk), corpus_words)
+        return coherence
 
     def update(self, corpus, chunksize=None, decay=None, offset=None,
               passes=None, update_every=None, eval_every=None, iterations=None,
@@ -645,7 +646,7 @@ def rho():
             if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
                 self.log_perplexity(chunk, total_docs=lencorpus)
-                self.log_coherence(self, chunk)
+                self.log_coherence(chunk)
 
             if self.dispatcher:
                 # add the chunk to dispatcher's job queue, so workers can munch on it

From ee161189d625edecf118121a89996bf3226d15f1 Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Fri, 9 Jun 2017 18:11:51 +0530
Subject: [PATCH 08/15] fix flake8

---
 gensim/models/ldamodel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 9fbc516553..b68ca0c0dd 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -533,7 +533,7 @@ def log_coherence(self, chunk):
         Log 'u_mass' coherence using the `chunk` of documents as evaluation corpus.
         """
         cm = gensim.models.CoherenceModel(model=self, corpus=chunk, dictionary=self.id2word, coherence='u_mass')
-        coherence = cm.get_coherence() 
+        coherence = cm.get_coherence()
         corpus_words = sum(cnt for document in chunk for _, cnt in document)
         logger.info("%.3f coherence estimate based on a held-out corpus of %i documents with %i words", coherence, len(chunk), corpus_words)
         return coherence

From 5e37b3f168af431e6757ea62b85cf9f30d9bd58f Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Tue, 13 Jun 2017 04:37:13 +0530
Subject: [PATCH 09/15] make coherence measure optional

---
 gensim/models/ldamodel.py | 57 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 51 insertions(+), 6 deletions(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index b68ca0c0dd..ad058d0c85 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -194,7 +194,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
                  alpha='symmetric', eta=None, decay=0.5, offset=1.0,
                  eval_every=10, iterations=50, gamma_threshold=0.001,
                  minimum_probability=0.01, random_state=None, ns_conf={},
-                 minimum_phi_value=0.01, per_word_topics=False):
+                 minimum_phi_value=0.01, per_word_topics=False,
+                 coherence='u_mass', texts=None, window_size=None, topn=10):
         """
         If given, start training from the iterable `corpus` straight away. If not given,
         the model is left untrained (presumably because you want to call `update()` manually).
@@ -238,6 +239,30 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
 
         `random_state` can be a np.random.RandomState object or the seed for one
 
+        `texts` : Tokenized texts. Needed when eval_every is not None and coherence is logged for sliding_window_based measures c_v, c_uci, c_npmi. eg::
+            texts = [['system', 'human', 'system', 'eps'],
+                     ['user', 'response', 'time'],
+                     ['trees'],
+                     ['graph', 'trees'],
+                     ['graph', 'minors', 'trees'],
+                     ['graph', 'minors', 'survey']]
+
+        `coherence` : Coherence measure to be used for logging coherence. Supported values are:
+            'u_mass'
+            'c_v'
+            'c_uci' also popularly known as c_pmi
+            'c_npmi'
+            For 'c_v', 'c_uci' and 'c_npmi' texts should be provided.
+
+        `window_size` : Is the size of the window to be used for coherence measures using boolean sliding window as their
+            probability estimator. For 'u_mass' this doesn't matter.
+            If left 'None' the default window sizes are used which are:
+            'c_v' : 110
+            'c_uci' : 10
+            'c_npmi' : 10
+
+        `topn` Integer corresponding to the number of top words to be extracted from each topic for coherence logging.
+
         Example:
 
         >>> lda = LdaModel(corpus, num_topics=100)  # train model
@@ -280,6 +305,11 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
         self.minimum_phi_value = minimum_phi_value
         self.per_word_topics = per_word_topics
 
+        self.texts = texts
+        self.coherence = coherence
+        self.window_size = window_size
+        self.topn = topn
+
         self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')
 
         assert self.alpha.shape == (self.num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)
@@ -528,11 +558,11 @@ def log_perplexity(self, chunk, total_docs=None):
                     (perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words))
         return perwordbound
 
-    def log_coherence(self, chunk):
+    def log_coherence(self, chunk, texts, coherence, window_size, topn):
         """
-        Log 'u_mass' coherence using the `chunk` of documents as evaluation corpus.
+        Log coherence using the `chunk` of documents as evaluation corpus.
         """
-        cm = gensim.models.CoherenceModel(model=self, corpus=chunk, dictionary=self.id2word, coherence='u_mass')
+        cm = gensim.models.CoherenceModel(model=self, corpus=chunk, texts=texts, dictionary=self.id2word, coherence=coherence, window_size=window_size, topn=topn)
         coherence = cm.get_coherence()
         corpus_words = sum(cnt for document in chunk for _, cnt in document)
         logger.info("%.3f coherence estimate based on a held-out corpus of %i documents with %i words", coherence, len(chunk), corpus_words)
@@ -540,7 +570,8 @@ def log_coherence(self, chunk, texts, coherence, window_size, topn):
 
     def update(self, corpus, chunksize=None, decay=None, offset=None,
               passes=None, update_every=None, eval_every=None, iterations=None,
-              gamma_threshold=None, chunks_as_numpy=False):
+              gamma_threshold=None, chunks_as_numpy=False, coherence=None,
+              texts=None, window_size=None, topn=None):
         """
         Train the model with new documents, by EM-iterating over `corpus` until
         the topics converge (or until the maximum number of allowed iterations
@@ -618,6 +648,14 @@ def update(self, corpus, chunksize=None, decay=None, offset=None,
             iterations = self.iterations
         if gamma_threshold is None:
             gamma_threshold = self.gamma_threshold
+        if coherence is None:
+            coherence = self.coherence
+        if texts is None:
+            texts = self.texts
+        if window_size is None:
+            window_size = self.window_size
+        if topn is None:
+            topn = self.topn
 
         try:
             lencorpus = len(corpus)
@@ -684,7 +722,13 @@ def rho():
             if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
                 self.log_perplexity(chunk, total_docs=lencorpus)
-                self.log_coherence(chunk)
+                # texts input is needed for sliding window based coherence measures (c_v, c_uci, c_npmi)
+                if texts is not None:
+                    init = (chunk_no + 1) * chunksize - chunksize
+                    end = init + chunksize
+                    # texts subarray corresponding to current chunk
+                    texts_chunk = texts[init:end]
+                self.log_coherence(chunk, texts_chunk, coherence, window_size, topn)
 
             if self.dispatcher:
                 # add the chunk to dispatcher's job queue, so workers can munch on it

From 4dad8f4cef88a8be82a23c017d9a05aee5f5b00c Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Tue, 13 Jun 2017 14:31:59 +0530
Subject: [PATCH 10/15] fix failing tests

---
 gensim/models/ldamodel.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index ad058d0c85..11deacd735 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -260,7 +260,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
             'c_v' : 110
             'c_uci' : 10
             'c_npmi' : 10
-
+
         `topn` Integer corresponding to the number of top words to be extracted from each topic for coherence logging.
 
         Example:
@@ -690,6 +690,8 @@ def rho():
                     end = init + chunksize
                     # texts subarray corresponding to current chunk
                     texts_chunk = texts[init:end]
+                else:
+                    texts_chunk = None
                 self.log_coherence(chunk, texts_chunk, coherence, window_size, topn)
 
             if self.dispatcher:
                 # add the chunk to dispatcher's job queue, so workers can munch on it
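With the series up to PATCH 10 applied, coherence logging is driven entirely by constructor arguments. A hypothetical training run that would emit 'c_v' coherence estimates at every evaluation point — note that the `coherence`, `texts`, `window_size` and `topn` keyword arguments exist only on this branch, not in released gensim:

    import logging
    from gensim.corpora import Dictionary
    from gensim.models import LdaModel

    logging.basicConfig(level=logging.INFO)  # coherence lines are logged at INFO

    texts = [['human', 'interface', 'computer'],
             ['survey', 'user', 'computer', 'system', 'response', 'time'],
             ['eps', 'user', 'interface', 'system']]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # 'c_v' needs the tokenized `texts` so each evaluated chunk can be paired
    # with its sliding-window statistics; 'u_mass' would need only `corpus`
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2,
                   eval_every=1, coherence='c_v', texts=texts, topn=10)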
From 950da4506e810e6ef3a934cc41728ba6cf79a436 Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Fri, 16 Jun 2017 23:30:04 +0530
Subject: [PATCH 11/15] add diff logging

---
 gensim/models/ldamodel.py | 36 +++++++++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 11deacd735..50e3c643c2 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -35,6 +35,7 @@
 import numbers
 import os
 import gensim
+import copy
 
 from gensim import interfaces, utils, matutils
 from gensim.matutils import dirichlet_expectation
@@ -194,8 +195,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
                  alpha='symmetric', eta=None, decay=0.5, offset=1.0,
                  eval_every=10, iterations=50, gamma_threshold=0.001,
                  minimum_probability=0.01, random_state=None, ns_conf={},
-                 minimum_phi_value=0.01, per_word_topics=False,
-                 coherence='u_mass', texts=None, window_size=None, topn=10):
+                 minimum_phi_value=0.01, per_word_topics=False, coherence='u_mass',
+                 texts=None, window_size=None, topn=10, log_diff=False, distance="jaccard"):
         """
         If given, start training from the iterable `corpus` straight away. If not given,
         the model is left untrained (presumably because you want to call `update()` manually).
@@ -263,6 +264,10 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
 
         `topn` Integer corresponding to the number of top words to be extracted from each topic for coherence logging.
 
+        `log_diff` set to True to log topic diff between consecutive epochs
+
+        `distance` is the distance measure to use for `log_diff`
+
         Example:
 
         >>> lda = LdaModel(corpus, num_topics=100)  # train model
@@ -305,6 +310,9 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
         self.minimum_phi_value = minimum_phi_value
         self.per_word_topics = per_word_topics
 
+        self.log_diff = log_diff
+        self.distance = distance
+
         self.texts = texts
         self.coherence = coherence
         self.window_size = window_size
@@ -568,10 +576,19 @@ def log_coherence(self, chunk, texts, coherence, window_size, topn):
         logger.info("%.3f coherence estimate based on a held-out corpus of %i documents with %i words", coherence, len(chunk), corpus_words)
         return coherence
 
+    def log_epoch_diff(self, epoch, other_model):
+        """
+        Log topic diff between consecutive epochs
+        """
+        diff_matrix, annotation = self.diff(other_model)
+        diff_diagonal = np.diagonal(diff_matrix)
+        logger.info("Topic difference between %i and %i epoch %s", epoch - 1, epoch, diff_diagonal)
+        return diff_diagonal
+
     def update(self, corpus, chunksize=None, decay=None, offset=None,
               passes=None, update_every=None, eval_every=None, iterations=None,
               gamma_threshold=None, chunks_as_numpy=False, coherence=None,
-              texts=None, window_size=None, topn=None):
+              texts=None, window_size=None, topn=None, log_diff=None, distance=None):
         """
         Train the model with new documents, by EM-iterating over `corpus` until
         the topics converge (or until the maximum number of allowed iterations
@@ -626,6 +643,10 @@ def update(self, corpus, chunksize=None, decay=None, offset=None,
             window_size = self.window_size
         if topn is None:
             topn = self.topn
+        if log_diff is None:
+            log_diff = self.log_diff
+        if distance is None:
+            distance = self.distance
 
         try:
             lencorpus = len(corpus)
@@ -727,6 +748,15 @@ def rho():
                     other = LdaState(self.eta, self.state.sstats.shape)
                 dirty = False
             # endfor single corpus iteration
+
+            # log diff between consecutive epochs
+            if log_diff:
+                if pass_ == 0:
+                    # save randomly initialized model for diff with first pass
+                    previous = copy.deepcopy(self)
+                self.log_epoch_diff(pass_, previous)
+                previous = copy.deepcopy(self)
+
             if reallen != lencorpus:
                 raise RuntimeError("input corpus size changed during training (don't use generators as input)")

From 017a75423b8cc55b110ae1bfeaabc96b40044874 Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Fri, 16 Jun 2017 23:59:50 +0530
Subject: [PATCH 12/15] make distance measure optional for diff

---
 gensim/models/ldamodel.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 0aa39ea9db..6b4759d1b8 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -578,13 +578,16 @@ def log_coherence(self, chunk, texts, coherence, window_size, topn):
         logger.info("%.3f coherence estimate based on a held-out corpus of %i documents with %i words", coherence, len(chunk), corpus_words)
         return coherence
 
-    def log_epoch_diff(self, epoch, other_model):
+    def log_epoch_diff(self, epoch, other_model, distance):
         """
         Log topic diff between consecutive epochs
         """
-        diff_matrix, annotation = self.diff(other_model)
+        diff_matrix, annotation = self.diff(other_model, distance)
         diff_diagonal = np.diagonal(diff_matrix)
-        logger.info("Topic difference between %i and %i epoch %s", epoch - 1, epoch, diff_diagonal)
+        prev_epoch = epoch - 1
+        if epoch == 0:
+            prev_epoch = "initial random model"
+        logger.info("Topic difference between %s and %s epoch: %s", prev_epoch, epoch, diff_diagonal)
         return diff_diagonal
 
     def update(self, corpus, chunksize=None, decay=None, offset=None,
@@ -760,7 +763,7 @@ def rho():
                 if pass_ == 0:
                     # save randomly initialized model for diff with first pass
                     previous = copy.deepcopy(self)
-                self.log_epoch_diff(pass_, previous)
+                self.log_epoch_diff(pass_, previous, distance)
                 previous = copy.deepcopy(self)
 
             if reallen != lencorpus:

From f07d1d26c323f1a603b2290d49c5540f3f68f666 Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Sat, 17 Jun 2017 03:02:36 +0530
Subject: [PATCH 13/15] fix flake8

---
 gensim/models/ldamodel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 6b4759d1b8..1ca5e123fd 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -592,7 +592,7 @@ def log_epoch_diff(self, epoch, other_model, distance):
     def update(self, corpus, chunksize=None, decay=None, offset=None,
               passes=None, update_every=None, eval_every=None, iterations=None,
-              gamma_threshold=None, chunks_as_numpy=False, coherence=None, 
+              gamma_threshold=None, chunks_as_numpy=False, coherence=None,
               texts=None, window_size=None, topn=None, log_diff=None, distance=None):
         """
         Train the model with new documents, by EM-iterating over `corpus` until
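The epoch-diff logging added in PATCH 11 and refined above builds on `LdaModel.diff`, which is already part of gensim. A standalone sketch of the same comparison between two independently trained models; `distance='jaccard'` mirrors the patch's default, and the per-pair `annotation` that `diff` also returns is ignored here:

    import numpy as np
    from gensim.corpora import Dictionary
    from gensim.models import LdaModel

    texts = [['human', 'interface', 'computer'],
             ['survey', 'user', 'computer', 'system', 'response', 'time'],
             ['eps', 'user', 'interface', 'system']]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    m1 = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2)
    m2 = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2)

    diff_matrix, annotation = m1.diff(m2, 'jaccard')
    # the diagonal pairs topic i of m1 with topic i of m2, which is exactly
    # what log_epoch_diff extracts to track how much each topic moved
    print(np.diagonal(diff_matrix))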
From 2e3c474d2ea1f26d273b81bcb23f594741733453 Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Wed, 28 Jun 2017 18:33:06 +0530
Subject: [PATCH 14/15] give relevant parameter names

---
 gensim/models/ldamodel.py | 66 ++++++++++++++++++++-------------------
 1 file changed, 34 insertions(+), 32 deletions(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 1ca5e123fd..1346d870c0 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -198,7 +198,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
                  eval_every=10, iterations=50, gamma_threshold=0.001,
                  minimum_probability=0.01, random_state=None, ns_conf={},
                  minimum_phi_value=0.01, per_word_topics=False, coherence='u_mass',
-                 texts=None, window_size=None, topn=10, log_diff=False, distance="jaccard"):
+                 coherence_texts=None, coherence_window_size=None, coherence_topn=10,
+                 log_diff=False, diff_distance="jaccard"):
         """
         If given, start training from the iterable `corpus` straight away. If not given,
         the model is left untrained (presumably because you want to call `update()` manually).
@@ -231,7 +232,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
         Turn on `distributed` to force distributed computing (see the `web tutorial `_
         on how to set up a cluster of machines for gensim).
 
-        Calculate and log perplexity estimate from the latest mini-batch every
+        Calculate and log perplexity and coherence estimate from the latest mini-batch every
         `eval_every` model updates (setting this to 1 slows down training ~2x;
         default is 10 for better performance). Set to None to disable perplexity estimation.
 
@@ -242,14 +243,6 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
 
         `random_state` can be a np.random.RandomState object or the seed for one
 
-        `texts` : Tokenized texts. Needed when eval_every is not None and coherence is logged for sliding_window_based measures c_v, c_uci, c_npmi. eg::
-            texts = [['system', 'human', 'system', 'eps'],
-                     ['user', 'response', 'time'],
-                     ['trees'],
-                     ['graph', 'trees'],
-                     ['graph', 'minors', 'trees'],
-                     ['graph', 'minors', 'survey']]
-
         `coherence` : Coherence measure to be used for logging coherence. Supported values are:
             'u_mass'
             'c_v'
@@ -257,18 +250,26 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
             'c_npmi'
             For 'c_v', 'c_uci' and 'c_npmi' texts should be provided.
 
+        `coherence_texts` : Tokenized texts. Needed when eval_every is not None and coherence is logged for sliding_window_based measures c_v, c_uci, c_npmi. eg::
+            texts = [['system', 'human', 'system', 'eps'],
+                     ['user', 'response', 'time'],
+                     ['trees'],
+                     ['graph', 'trees'],
+                     ['graph', 'minors', 'trees'],
+                     ['graph', 'minors', 'survey']]
+
-        `window_size` : Is the size of the window to be used for coherence measures using boolean sliding window as their
+        `coherence_window_size` : Is the size of the window to be used for coherence measures using boolean sliding window as their
             probability estimator. For 'u_mass' this doesn't matter.
             If left 'None' the default window sizes are used which are:
             'c_v' : 110
             'c_uci' : 10
             'c_npmi' : 10
 
-        `topn` Integer corresponding to the number of top words to be extracted from each topic for coherence logging.
+        `coherence_topn` Integer corresponding to the number of top words to be extracted from each topic for coherence logging.
 
         `log_diff` set to True to log topic diff between consecutive epochs
 
-        `distance` is the distance measure to use for `log_diff`
+        `diff_distance` is the distance measure to use for `log_diff`
 
         Example:
 
@@ -312,13 +313,13 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
         self.minimum_phi_value = minimum_phi_value
         self.per_word_topics = per_word_topics
 
-        self.log_diff = log_diff
-        self.distance = distance
-
-        self.texts = texts
         self.coherence = coherence
-        self.window_size = window_size
-        self.topn = topn
+        self.coherence_texts = coherence_texts
+        self.coherence_window_size = coherence_window_size
+        self.coherence_topn = coherence_topn
+
+        self.log_diff = log_diff
+        self.diff_distance = diff_distance
 
         self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')
 
@@ -593,7 +594,8 @@ def log_epoch_diff(self, epoch, other_model, distance):
     def update(self, corpus, chunksize=None, decay=None, offset=None,
               passes=None, update_every=None, eval_every=None, iterations=None,
               gamma_threshold=None, chunks_as_numpy=False, coherence=None,
-              texts=None, window_size=None, topn=None, log_diff=None, distance=None):
+              coherence_texts=None, coherence_window_size=None, coherence_topn=None,
+              log_diff=None, diff_distance=None):
         """
         Train the model with new documents, by EM-iterating over `corpus` until
         the topics converge (or until the maximum number of allowed iterations
@@ -642,16 +644,16 @@ def update(self, corpus, chunksize=None, decay=None, offset=None,
             gamma_threshold = self.gamma_threshold
         if coherence is None:
             coherence = self.coherence
-        if texts is None:
-            texts = self.texts
-        if window_size is None:
-            window_size = self.window_size
-        if topn is None:
-            topn = self.topn
+        if coherence_texts is None:
+            coherence_texts = self.coherence_texts
+        if coherence_window_size is None:
+            coherence_window_size = self.coherence_window_size
+        if coherence_topn is None:
+            coherence_topn = self.coherence_topn
         if log_diff is None:
             log_diff = self.log_diff
-        if distance is None:
-            distance = self.distance
+        if diff_distance is None:
+            diff_distance = self.diff_distance
 
         try:
             lencorpus = len(corpus)
@@ -715,14 +717,14 @@ def rho():
             if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
                 self.log_perplexity(chunk, total_docs=lencorpus)
                 # texts input is needed for sliding window based coherence measures (c_v, c_uci, c_npmi)
-                if texts is not None:
+                if coherence_texts is not None:
                     init = (chunk_no + 1) * chunksize - chunksize
                     end = init + chunksize
                     # texts subarray corresponding to current chunk
-                    texts_chunk = texts[init:end]
+                    texts_chunk = coherence_texts[init:end]
                 else:
                     texts_chunk = None
-                self.log_coherence(chunk, texts_chunk, coherence, window_size, topn)
+                self.log_coherence(chunk, texts_chunk, coherence, coherence_window_size, coherence_topn)
 
             if self.dispatcher:
                 # add the chunk to dispatcher's job queue, so workers can munch on it
@@ -763,7 +765,7 @@ def rho():
                 if pass_ == 0:
                     # save randomly initialized model for diff with first pass
                     previous = copy.deepcopy(self)
-                self.log_epoch_diff(pass_, previous, distance)
+                self.log_epoch_diff(pass_, previous, diff_distance)
                 previous = copy.deepcopy(self)
 
             if reallen != lencorpus:

From 00e776280f6c7a3b42644641c9ec06dd9c037312 Mon Sep 17 00:00:00 2001
From: Parul Sethi
Date: Wed, 28 Jun 2017 19:51:32 +0530
Subject: [PATCH 15/15] fix flake8

---
 gensim/models/ldamodel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 1346d870c0..0422865d4e 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -198,7 +198,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
                  eval_every=10, iterations=50, gamma_threshold=0.001,
                  minimum_probability=0.01, random_state=None, ns_conf={},
                  minimum_phi_value=0.01, per_word_topics=False, coherence='u_mass',
-                 coherence_texts=None, coherence_window_size=None, coherence_topn=10, 
+                 coherence_texts=None, coherence_window_size=None, coherence_topn=10,
                  log_diff=False, diff_distance="jaccard"):
         """
         If given, start training from the iterable `corpus` straight away. If not given,