From fc3102697716d77892a34adcfb83fdfb398a28f4 Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Wed, 31 May 2017 01:14:21 +0530
Subject: [PATCH 01/15] added topic coherence logging

---
 gensim/models/ldamodel.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 67398ab099..525003ee9f 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -39,6 +39,8 @@
 from gensim.matutils import dirichlet_expectation
 from gensim.models import basemodel
 
+import gensim
+
 from itertools import chain
 from scipy.special import gammaln, psi  # gamma function utils
 from scipy.special import polygamma
@@ -527,6 +529,12 @@ def log_perplexity(self, chunk, total_docs=None):
                     (perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words))
         return perwordbound
 
+    def log_coherence(self, model, chunk):
+        cm = gensim.models.CoherenceModel(model=model, corpus=chunk, coherence='u_mass')
+        corpus_words = sum(cnt for document in chunk for _, cnt in document)
+        logger.info("%.3f coherence estimate based on a held-out corpus of %i documents with %i words", cm.get_coherence(), len(chunk), corpus_words)
+        return cm.get_coherence()
+
     def update(self, corpus, chunksize=None, decay=None, offset=None,
                passes=None, update_every=None, eval_every=None, iterations=None,
                gamma_threshold=None, chunks_as_numpy=False):
@@ -634,6 +642,7 @@ def rho():
             if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
                 self.log_perplexity(chunk, total_docs=lencorpus)
+                self.log_coherence(self, chunk)
 
             if self.dispatcher:
                 # add the chunk to dispatcher's job queue, so workers can munch on it
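The `log_coherence` helper added above delegates to gensim's existing `CoherenceModel`. For reference, a minimal standalone sketch of the same `'u_mass'` computation outside of training; the toy `texts` data here is illustrative only and not part of the patch:

    from gensim.corpora import Dictionary
    from gensim.models import LdaModel, CoherenceModel

    # toy tokenized corpus (illustrative)
    texts = [['human', 'interface', 'computer'],
             ['survey', 'user', 'computer', 'system', 'response', 'time'],
             ['eps', 'user', 'interface', 'system'],
             ['graph', 'minors', 'trees'],
             ['graph', 'minors', 'survey']]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2)
    # 'u_mass' is estimated from document co-occurrence statistics, so a
    # bag-of-words corpus is enough; no tokenized texts are required
    cm = CoherenceModel(model=lda, corpus=corpus, coherence='u_mass')
    print(cm.get_coherence())  # a single float; values closer to 0 are better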
From ff460a5c3dec0af0b4ce1381aa7e55d57b79fdc4 Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Thu, 1 Jun 2017 02:22:21 +0530
Subject: [PATCH 02/15] make coherence measure optional

---
 gensim/models/ldamodel.py | 56 +++++++++++++++++++++++++++++++++++----
 1 file changed, 51 insertions(+), 5 deletions(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 525003ee9f..20cbd810c8 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -195,7 +195,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
                  alpha='symmetric', eta=None, decay=0.5, offset=1.0,
                  eval_every=10, iterations=50, gamma_threshold=0.001,
                  minimum_probability=0.01, random_state=None, ns_conf={},
-                 minimum_phi_value=0.01, per_word_topics=False):
+                 minimum_phi_value=0.01, per_word_topics=False, texts=None,
+                 coherence='u_mass', window_size=None, topn=10):
         """
         If given, start training from the iterable `corpus` straight away. If not given,
         the model is left untrained (presumably because you want to call `update()` manually).
@@ -239,6 +240,31 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
 
         `random_state` can be a np.random.RandomState object or the seed for one
 
+        `texts` : Tokenized texts. Needed for coherence models that use sliding window based probability estimator, eg::
+            texts = [['system', 'human', 'system', 'eps'],
+                     ['user', 'response', 'time'],
+                     ['trees'],
+                     ['graph', 'trees'],
+                     ['graph', 'minors', 'trees'],
+                     ['graph', 'minors', 'survey']]
+
+        `coherence` : Coherence measure to be used. Supported values are:
+            'u_mass'
+            'c_v'
+            'c_uci' also popularly known as c_pmi
+            'c_npmi'
+            For 'u_mass' corpus should be provided. If texts is provided, it will be converted to corpus using the dictionary.
+            For 'c_v', 'c_uci' and 'c_npmi' texts should be provided. Corpus is not needed.
+
+        `window_size` : Is the size of the window to be used for coherence measures using boolean sliding window as their
+            probability estimator. For 'u_mass' this doesn't matter.
+            If left 'None' the default window sizes are used which are:
+            'c_v' : 110
+            'c_uci' : 10
+            'c_npmi' : 10
+
+        `topn` Integer corresponding to the number of top words to be extracted from each topic.
+
         Example:
 
         >>> lda = LdaModel(corpus, num_topics=100)  # train model
@@ -281,6 +307,11 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
         self.minimum_phi_value = minimum_phi_value
         self.per_word_topics = per_word_topics
 
+        self.texts = texts
+        self.coherence = coherence
+        self.window_size = window_size
+        self.topn = topn
+
         self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')
 
         assert self.alpha.shape == (self.num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)
@@ -529,15 +560,16 @@ def log_perplexity(self, chunk, total_docs=None):
                     (perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words))
         return perwordbound
 
-    def log_coherence(self, model, chunk):
-        cm = gensim.models.CoherenceModel(model=model, corpus=chunk, coherence='u_mass')
+    def log_coherence(self, model, chunk, texts, coherence, window_size, topn):
+        cm = gensim.models.CoherenceModel(model=model, corpus=chunk, texts=texts, window_size=window_size, coherence=coherence, topn=topn)
         corpus_words = sum(cnt for document in chunk for _, cnt in document)
         logger.info("%.3f coherence estimate based on a held-out corpus of %i documents with %i words", cm.get_coherence(), len(chunk), corpus_words)
         return cm.get_coherence()
 
     def update(self, corpus, chunksize=None, decay=None, offset=None,
                passes=None, update_every=None, eval_every=None, iterations=None,
-               gamma_threshold=None, chunks_as_numpy=False):
+               gamma_threshold=None, chunks_as_numpy=False, texts=None,
+               coherence=None, window_size=None, topn=None):
         """
         Train the model with new documents, by EM-iterating over `corpus` until
         the topics converge (or until the maximum number of allowed iterations
@@ -584,6 +616,14 @@ def update(self, corpus, chunksize=None, decay=None, offset=None,
             iterations = self.iterations
         if gamma_threshold is None:
             gamma_threshold = self.gamma_threshold
+        if texts is None:
+            texts = self.texts
+        if coherence is None:
+            coherence = self.coherence
+        if window_size is None:
+            window_size = self.window_size
+        if topn is None:
+            topn = self.topn
 
         try:
             lencorpus = len(corpus)
@@ -642,7 +682,13 @@ def rho():
             if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
                 self.log_perplexity(chunk, total_docs=lencorpus)
-                self.log_coherence(self, chunk)
+                if self.texts is not None:
+                    init = (chunk_no + 1) * chunksize - chunksize
+                    end = init + chunksize
+                    if end > lencorpus:
+                        end = lencorpus
+                    texts = self.texts[init:end]
+                self.log_coherence(self, chunk, texts, coherence, window_size, topn)
 
             if self.dispatcher:
                 # add the chunk to dispatcher's job queue, so workers can munch on it

From 33cc2467ce1b45999d3154a22490c1e8cd375689 Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Thu, 1 Jun 2017 23:19:12 +0530
Subject: [PATCH 03/15] added comment for texts

---
 gensim/models/ldamodel.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 20cbd810c8..043f1046b0 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -682,11 +682,13 @@ def rho():
             if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
                 self.log_perplexity(chunk, total_docs=lencorpus)
+                # texts input is needed for sliding window based coherence measures (c_v, c_uci, c_npmi)
                 if self.texts is not None:
                     init = (chunk_no + 1) * chunksize - chunksize
                     end = init + chunksize
                     if end > lencorpus:
                         end = lencorpus
+                    # texts subarray corresponding to chunk
                     texts = self.texts[init:end]
                 self.log_coherence(self, chunk, texts, coherence, window_size, topn)
 
             if self.dispatcher:
                 # add the chunk to dispatcher's job queue, so workers can munch on it

From c0f5c26278bacdffe722aa15dedc3202cb6634dd Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Mon, 5 Jun 2017 22:53:07 +0530
Subject: [PATCH 04/15] log 'u_mass' only

---
 gensim/models/ldamodel.py | 68 +++++++--------------------------------
 1 file changed, 11 insertions(+), 57 deletions(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 043f1046b0..aef2168c3f 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -34,13 +34,12 @@
 import numpy as np
 import numbers
 import os
+import gensim
 
 from gensim import interfaces, utils, matutils
 from gensim.matutils import dirichlet_expectation
 from gensim.models import basemodel
 
-import gensim
-
 from itertools import chain
 from scipy.special import gammaln, psi  # gamma function utils
 from scipy.special import polygamma
@@ -195,8 +194,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
                  alpha='symmetric', eta=None, decay=0.5, offset=1.0,
                  eval_every=10, iterations=50, gamma_threshold=0.001,
                  minimum_probability=0.01, random_state=None, ns_conf={},
-                 minimum_phi_value=0.01, per_word_topics=False, texts=None,
-                 coherence='u_mass', window_size=None, topn=10):
+                 minimum_phi_value=0.01, per_word_topics=False):
         """
         If given, start training from the iterable `corpus` straight away. If not given,
         the model is left untrained (presumably because you want to call `update()` manually).
@@ -240,31 +238,6 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
 
         `random_state` can be a np.random.RandomState object or the seed for one
 
-        `texts` : Tokenized texts. Needed for coherence models that use sliding window based probability estimator, eg::
-            texts = [['system', 'human', 'system', 'eps'],
-                     ['user', 'response', 'time'],
-                     ['trees'],
-                     ['graph', 'trees'],
-                     ['graph', 'minors', 'trees'],
-                     ['graph', 'minors', 'survey']]
-
-        `coherence` : Coherence measure to be used. Supported values are:
-            'u_mass'
-            'c_v'
-            'c_uci' also popularly known as c_pmi
-            'c_npmi'
-            For 'u_mass' corpus should be provided. If texts is provided, it will be converted to corpus using the dictionary.
-            For 'c_v', 'c_uci' and 'c_npmi' texts should be provided. Corpus is not needed.
-
-        `window_size` : Is the size of the window to be used for coherence measures using boolean sliding window as their
-            probability estimator. For 'u_mass' this doesn't matter.
-            If left 'None' the default window sizes are used which are:
-            'c_v' : 110
-            'c_uci' : 10
-            'c_npmi' : 10
-
-        `topn` Integer corresponding to the number of top words to be extracted from each topic.
-
         Example:
 
         >>> lda = LdaModel(corpus, num_topics=100)  # train model
@@ -307,11 +280,6 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
         self.minimum_phi_value = minimum_phi_value
         self.per_word_topics = per_word_topics
 
-        self.texts = texts
-        self.coherence = coherence
-        self.window_size = window_size
-        self.topn = topn
-
         self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')
 
         assert self.alpha.shape == (self.num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)
@@ -560,16 +528,18 @@ def log_perplexity(self, chunk, total_docs=None):
                     (perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words))
         return perwordbound
 
-    def log_coherence(self, model, chunk, texts, coherence, window_size, topn):
-        cm = gensim.models.CoherenceModel(model=model, corpus=chunk, texts=texts, window_size=window_size, coherence=coherence, topn=topn)
+    def log_coherence(self, model, chunk):
+        """
+        Log 'u_mass' coherence using the `chunk` of documents as evaluation corpus.
+        """
+        cm = gensim.models.CoherenceModel(model=model, corpus=chunk, coherence='u_mass')
         corpus_words = sum(cnt for document in chunk for _, cnt in document)
         logger.info("%.3f coherence estimate based on a held-out corpus of %i documents with %i words", cm.get_coherence(), len(chunk), corpus_words)
         return cm.get_coherence()
 
     def update(self, corpus, chunksize=None, decay=None, offset=None,
               passes=None, update_every=None, eval_every=None, iterations=None,
-              gamma_threshold=None, chunks_as_numpy=False, texts=None,
-              coherence=None, window_size=None, topn=None):
+              gamma_threshold=None, chunks_as_numpy=False):
         """
         Train the model with new documents, by EM-iterating over `corpus` until
         the topics converge (or until the maximum number of allowed iterations
@@ -616,14 +586,6 @@ def update(self, corpus, chunksize=None, decay=None, offset=None,
             iterations = self.iterations
         if gamma_threshold is None:
             gamma_threshold = self.gamma_threshold
-        if texts is None:
-            texts = self.texts
-        if coherence is None:
-            coherence = self.coherence
-        if window_size is None:
-            window_size = self.window_size
-        if topn is None:
-            topn = self.topn
 
         try:
             lencorpus = len(corpus)
@@ -651,8 +613,8 @@ def update(self, corpus, chunksize=None, decay=None, offset=None,
         logger.info(
             "running %s LDA training, %s topics, %i passes over "
             "the supplied corpus of %i documents, updating model once "
-            "every %i documents, evaluating perplexity every %i documents, "
-            "iterating %ix with a convergence threshold of %f",
+            "every %i documents, evaluating perplexity and coherence "
+            "every %i documents, iterating %ix with a convergence threshold of %f",
             updatetype, self.num_topics, passes, lencorpus,
             updateafter, evalafter, iterations,
             gamma_threshold)
@@ -682,15 +644,7 @@ def rho():
             if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
                 self.log_perplexity(chunk, total_docs=lencorpus)
-                # texts input is needed for sliding window based coherence measures (c_v, c_uci, c_npmi)
-                if self.texts is not None:
-                    init = (chunk_no + 1) * chunksize - chunksize
-                    end = init + chunksize
-                    if end > lencorpus:
-                        end = lencorpus
-                    # texts subarray corresponding to chunk
-                    texts = self.texts[init:end]
-                self.log_coherence(self, chunk, texts, coherence, window_size, topn)
+                self.log_coherence(self, chunk)
 
             if self.dispatcher:
                 # add the chunk to dispatcher's job queue, so workers can munch on it

From ad0d1aa14e0da6a6330a57ad32705caaa96222c7 Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Tue, 6 Jun 2017 00:36:47 +0530
Subject: [PATCH 05/15] update coherence_tutorial with logging description

---
 docs/notebooks/topic_coherence_tutorial.ipynb | 164 +++++++++++-------
 1 file changed, 102 insertions(+), 62 deletions(-)

diff --git a/docs/notebooks/topic_coherence_tutorial.ipynb b/docs/notebooks/topic_coherence_tutorial.ipynb
index ea2cf4ef7e..848daa0a23 100644
--- a/docs/notebooks/topic_coherence_tutorial.ipynb
+++ b/docs/notebooks/topic_coherence_tutorial.ipynb
@@ -23,9 +23,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {
-    "collapsed": false
+    "collapsed": true
    },
    "outputs": [],
    "source": [
@@ -63,9 +63,9 @@
    },
    "outputs": [],
    "source": [
-    "logger = logging.getLogger()\n",
-    "logger.setLevel(logging.DEBUG)\n",
-    "logging.debug(\"test\")"
+    "logging.basicConfig(level=logging.DEBUG)\n",
+    "logger = logging.getLogger(__name__)\n",
+    "logger.setLevel(logging.DEBUG)"
    ]
   },
   {
@@ -104,10 +104,17 @@
   {
    "cell_type": "code",
    "execution_count": 4,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])\n",
+      "INFO:gensim.corpora.dictionary:built Dictionary(12 unique tokens: ['graph', 'system', 'user', 'minors', 'human']...) from 9 documents (total 29 corpus positions)\n"
+     ]
+    }
+   ],
    "source": [
    "dictionary = Dictionary(texts)\n",
    "corpus = [dictionary.doc2bow(text) for text in texts]"
@@ -130,15 +137,72 @@
   {
    "cell_type": "code",
    "execution_count": 5,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:gensim.models.ldamodel:using symmetric alpha at 0.5\n",
+      "INFO:gensim.models.ldamodel:using symmetric eta at 0.08333333333333333\n",
+      "INFO:gensim.models.ldamodel:using serial LDA version on this node\n",
+      "INFO:gensim.models.ldamodel:running online LDA training, 2 topics, 1 passes over the supplied corpus of 9 documents, updating model once every 9 documents, evaluating perplexity and coherence every 9 documents, iterating 50x with a convergence threshold of 0.001000\n",
+      "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n",
+      "DEBUG:gensim.models.ldamodel:bound: at document #0\n",
+      "INFO:gensim.models.ldamodel:-3.297 per-word bound, 9.8 perplexity estimate based on a held-out corpus of 9 documents with 29 words\n",
+      "INFO:gensim.models.ldamodel:-15.478 coherence estimate based on a held-out corpus of 9 documents with 29 words\n",
+      "INFO:gensim.models.ldamodel:PROGRESS: pass 0, at document #9/9\n",
+      "DEBUG:gensim.models.ldamodel:performing inference on a chunk of 9 documents\n",
+      "DEBUG:gensim.models.ldamodel:6/9 documents converged within 50 iterations\n",
+      "DEBUG:gensim.models.ldamodel:updating topics\n",
+      "INFO:gensim.models.ldamodel:topic #0 (0.500): 0.169*\"system\" + 0.120*\"human\" + 0.098*\"eps\" + 0.094*\"interface\" + 0.091*\"computer\" + 0.071*\"user\" + 0.067*\"graph\" + 0.062*\"trees\" + 0.062*\"time\" + 0.061*\"minors\"\n",
+      "INFO:gensim.models.ldamodel:topic #1 (0.500): 0.129*\"trees\" + 0.125*\"graph\" + 0.121*\"user\" + 0.098*\"response\" + 0.086*\"survey\" + 0.084*\"minors\" + 0.083*\"time\" + 0.080*\"system\" + 0.057*\"computer\" + 0.055*\"interface\"\n",
+      "INFO:gensim.models.ldamodel:topic diff=0.420882, rho=1.000000\n"
+     ]
+    }
+   ],
+   "source": [
+    "goodLdaModel = LdaModel(corpus=corpus, id2word=dictionary, iterations=50, num_topics=2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:gensim.models.ldamodel:using symmetric alpha at 0.5\n",
+      "INFO:gensim.models.ldamodel:using symmetric eta at 0.08333333333333333\n",
+      "INFO:gensim.models.ldamodel:using serial LDA version on this node\n",
+      "INFO:gensim.models.ldamodel:running online LDA training, 2 topics, 1 passes over the supplied corpus of 9 documents, updating model once every 9 documents, evaluating perplexity and coherence every 9 documents, iterating 1x with a convergence threshold of 0.001000\n",
+      "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n",
+      "DEBUG:gensim.models.ldamodel:bound: at document #0\n",
+      "INFO:gensim.models.ldamodel:-3.317 per-word bound, 10.0 perplexity estimate based on a held-out corpus of 9 documents with 29 words\n",
+      "INFO:gensim.models.ldamodel:-16.260 coherence estimate based on a held-out corpus of 9 documents with 29 words\n",
+      "INFO:gensim.models.ldamodel:PROGRESS: pass 0, at document #9/9\n",
+      "DEBUG:gensim.models.ldamodel:performing inference on a chunk of 9 documents\n",
+      "DEBUG:gensim.models.ldamodel:0/9 documents converged within 1 iterations\n",
+      "DEBUG:gensim.models.ldamodel:updating topics\n",
+      "INFO:gensim.models.ldamodel:topic #0 (0.500): 0.138*\"system\" + 0.111*\"graph\" + 0.098*\"trees\" + 0.082*\"user\" + 0.081*\"survey\" + 0.075*\"interface\" + 0.073*\"time\" + 0.072*\"minors\" + 0.072*\"human\" + 0.069*\"eps\"\n",
+      "INFO:gensim.models.ldamodel:topic #1 (0.500): 0.114*\"user\" + 0.105*\"system\" + 0.097*\"trees\" + 0.086*\"response\" + 0.083*\"graph\" + 0.079*\"computer\" + 0.077*\"eps\" + 0.075*\"human\" + 0.074*\"minors\" + 0.073*\"time\"\n",
+      "INFO:gensim.models.ldamodel:topic diff=0.255033, rho=1.000000\n"
+     ]
+    }
+   ],
    "source": [
-    "goodLdaModel = LdaModel(corpus=corpus, id2word=dictionary, iterations=50, num_topics=2)\n",
    "badLdaModel = LdaModel(corpus=corpus, id2word=dictionary, iterations=1, num_topics=2)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now, the LdaModel in gensim also logs the evaluation parameters `perplexity` and `coherence` according to `eval_every`. These parameter values can be used to monitor the LDA training and evaluate how the topics are improving during training."
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -148,9 +212,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 7,
    "metadata": {
-    "collapsed": false
+    "collapsed": true
    },
    "outputs": [],
    "source": [
@@ -159,7 +223,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 8,
    "metadata": {
    "collapsed": true
   },
@@ -185,9 +249,7 @@
   {
    "cell_type": "code",
    "execution_count": 16,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
    {
     "name": "stdout",
@@ -234,7 +296,7 @@
    "cell_type": "code",
    "execution_count": 17,
    "metadata": {
-    "collapsed": false
+    "collapsed": true
    },
    "outputs": [],
    "source": [
@@ -244,9 +306,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 18,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
    {
     "data": {
@@ -373,9 +433,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 19,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
    {
     "data": {
@@ -503,9 +561,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 20,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
    {
     "name": "stdout",
@@ -522,9 +578,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 21,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
@@ -577,9 +631,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 27,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
    {
     "name": "stdout",
@@ -603,9 +655,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 28,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
    {
     "name": "stdout",
@@ -622,9 +672,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 29,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
    {
     "name": "stdout",
@@ -656,7 +704,7 @@
    "cell_type": "code",
    "execution_count": 5,
    "metadata": {
-    "collapsed": false
+    "collapsed": true
    },
    "outputs": [],
    "source": [
@@ -679,9 +727,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 8,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
    {
     "name": "stdout",
@@ -701,7 +747,7 @@
    "cell_type": "code",
    "execution_count": 20,
    "metadata": {
-    "collapsed": false
+    "collapsed": true
    },
    "outputs": [],
    "source": [
@@ -724,9 +770,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 22,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
    {
     "name": "stdout",
@@ -779,9 +823,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 9,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
    {
     "data": {
@@ -836,9 +878,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 11,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
    {
     "data": {
@@ -872,23 +912,23 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 2",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "python2"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
-    "version": 2
+    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.11"
+   "pygments_lexer": "ipython3",
+   "version": "3.4.3"
   }
  },
 "nbformat": 4,
- "nbformat_minor": 0
+ "nbformat_minor": 1
 }
From 940c7b775f824e49362c7d970080dc0267d5ed37 Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Tue, 6 Jun 2017 01:34:23 +0530
Subject: [PATCH 06/15] add id2word parameter

---
 gensim/models/ldamodel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index aef2168c3f..9fdb2dc0f1 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -532,7 +532,7 @@ def log_coherence(self, model, chunk):
         """
         Log 'u_mass' coherence using the `chunk` of documents as evaluation corpus.
         """
-        cm = gensim.models.CoherenceModel(model=model, corpus=chunk, coherence='u_mass')
+        cm = gensim.models.CoherenceModel(model=model, corpus=chunk, dictionary=self.id2word, coherence='u_mass')
         corpus_words = sum(cnt for document in chunk for _, cnt in document)
         logger.info("%.3f coherence estimate based on a held-out corpus of %i documents with %i words", cm.get_coherence(), len(chunk), corpus_words)
         return cm.get_coherence()

From d47d7facffe33abbcea1664edb197a46c835da4c Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Fri, 9 Jun 2017 17:15:21 +0530
Subject: [PATCH 07/15] made requested changes

---
 gensim/models/ldamodel.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 9fdb2dc0f1..9fbc516553 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -528,14 +528,15 @@ def log_perplexity(self, chunk, total_docs=None):
                     (perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words))
         return perwordbound
 
-    def log_coherence(self, model, chunk):
+    def log_coherence(self, chunk):
         """
         Log 'u_mass' coherence using the `chunk` of documents as evaluation corpus.
         """
-        cm = gensim.models.CoherenceModel(model=model, corpus=chunk, dictionary=self.id2word, coherence='u_mass')
+        cm = gensim.models.CoherenceModel(model=self, corpus=chunk, dictionary=self.id2word, coherence='u_mass')
+        coherence = cm.get_coherence() 
         corpus_words = sum(cnt for document in chunk for _, cnt in document)
-        logger.info("%.3f coherence estimate based on a held-out corpus of %i documents with %i words", cm.get_coherence(), len(chunk), corpus_words)
-        return cm.get_coherence()
+        logger.info("%.3f coherence estimate based on a held-out corpus of %i documents with %i words", coherence, len(chunk), corpus_words)
+        return coherence
 
     def update(self, corpus, chunksize=None, decay=None, offset=None,
               passes=None, update_every=None, eval_every=None, iterations=None,
@@ -645,7 +646,7 @@ def rho():
             if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
                 self.log_perplexity(chunk, total_docs=lencorpus)
-                self.log_coherence(self, chunk)
+                self.log_coherence(chunk)
 
             if self.dispatcher:
                 # add the chunk to dispatcher's job queue, so workers can munch on it

From ee161189d625edecf118121a89996bf3226d15f1 Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Fri, 9 Jun 2017 18:11:51 +0530
Subject: [PATCH 08/15] fix flake8

---
 gensim/models/ldamodel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 9fbc516553..b68ca0c0dd 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -533,7 +533,7 @@ def log_coherence(self, chunk):
         Log 'u_mass' coherence using the `chunk` of documents as evaluation corpus.
         """
         cm = gensim.models.CoherenceModel(model=self, corpus=chunk, dictionary=self.id2word, coherence='u_mass')
-        coherence = cm.get_coherence() 
+        coherence = cm.get_coherence()
         corpus_words = sum(cnt for document in chunk for _, cnt in document)
         logger.info("%.3f coherence estimate based on a held-out corpus of %i documents with %i words", coherence, len(chunk), corpus_words)
         return coherence

From 5e37b3f168af431e6757ea62b85cf9f30d9bd58f Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Tue, 13 Jun 2017 04:37:13 +0530
Subject: [PATCH 09/15] make coherence measure optional

---
 gensim/models/ldamodel.py | 57 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 51 insertions(+), 6 deletions(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index b68ca0c0dd..ad058d0c85 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -194,7 +194,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
                  alpha='symmetric', eta=None, decay=0.5, offset=1.0,
                  eval_every=10, iterations=50, gamma_threshold=0.001,
                  minimum_probability=0.01, random_state=None, ns_conf={},
-                 minimum_phi_value=0.01, per_word_topics=False):
+                 minimum_phi_value=0.01, per_word_topics=False,
+                 coherence='u_mass', texts=None, window_size=None, topn=10):
         """
         If given, start training from the iterable `corpus` straight away. If not given,
         the model is left untrained (presumably because you want to call `update()` manually).
@@ -238,6 +239,30 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
 
         `random_state` can be a np.random.RandomState object or the seed for one
 
+        `texts` : Tokenized texts. Needed when eval_every is not None and coherence is logged for sliding_window_based measures c_v, c_uci, c_npmi. eg::
+            texts = [['system', 'human', 'system', 'eps'],
+                     ['user', 'response', 'time'],
+                     ['trees'],
+                     ['graph', 'trees'],
+                     ['graph', 'minors', 'trees'],
+                     ['graph', 'minors', 'survey']]
+
+        `coherence` : Coherence measure to be used for logging coherence. Supported values are:
+            'u_mass'
+            'c_v'
+            'c_uci' also popularly known as c_pmi
+            'c_npmi'
+            For 'c_v', 'c_uci' and 'c_npmi' texts should be provided.
+
+        `window_size` : Is the size of the window to be used for coherence measures using boolean sliding window as their
+            probability estimator. For 'u_mass' this doesn't matter.
+            If left 'None' the default window sizes are used which are:
+            'c_v' : 110
+            'c_uci' : 10
+            'c_npmi' : 10
+
+        `topn` Integer corresponding to the number of top words to be extracted from each topic for coherence logging.
+
         Example:
 
         >>> lda = LdaModel(corpus, num_topics=100)  # train model
@@ -280,6 +305,11 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
         self.minimum_phi_value = minimum_phi_value
         self.per_word_topics = per_word_topics
 
+        self.texts = texts
+        self.coherence = coherence
+        self.window_size = window_size
+        self.topn = topn
+
         self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')
 
         assert self.alpha.shape == (self.num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)
@@ -528,11 +558,11 @@ def log_perplexity(self, chunk, total_docs=None):
                     (perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words))
         return perwordbound
 
-    def log_coherence(self, chunk):
+    def log_coherence(self, chunk, texts, coherence, window_size, topn):
         """
-        Log 'u_mass' coherence using the `chunk` of documents as evaluation corpus.
+        Log coherence using the `chunk` of documents as evaluation corpus.
         """
-        cm = gensim.models.CoherenceModel(model=self, corpus=chunk, dictionary=self.id2word, coherence='u_mass')
+        cm = gensim.models.CoherenceModel(model=self, corpus=chunk, texts=texts, dictionary=self.id2word, coherence=coherence, window_size=window_size, topn=topn)
         coherence = cm.get_coherence()
         corpus_words = sum(cnt for document in chunk for _, cnt in document)
         logger.info("%.3f coherence estimate based on a held-out corpus of %i documents with %i words", coherence, len(chunk), corpus_words)
@@ -540,7 +570,8 @@ def log_coherence(self, chunk, texts, coherence, window_size, topn):
 
     def update(self, corpus, chunksize=None, decay=None, offset=None,
               passes=None, update_every=None, eval_every=None, iterations=None,
-              gamma_threshold=None, chunks_as_numpy=False):
+              gamma_threshold=None, chunks_as_numpy=False, coherence=None,
+              texts=None, window_size=None, topn=None):
         """
         Train the model with new documents, by EM-iterating over `corpus` until
         the topics converge (or until the maximum number of allowed iterations
@@ -618,6 +648,14 @@ def update(self, corpus, chunksize=None, decay=None, offset=None,
             iterations = self.iterations
         if gamma_threshold is None:
             gamma_threshold = self.gamma_threshold
+        if coherence is None:
+            coherence = self.coherence
+        if texts is None:
+            texts = self.texts
+        if window_size is None:
+            window_size = self.window_size
+        if topn is None:
+            topn = self.topn
 
         try:
             lencorpus = len(corpus)
@@ -684,7 +722,13 @@ def rho():
             if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
                 self.log_perplexity(chunk, total_docs=lencorpus)
-                self.log_coherence(chunk)
+                # texts input is needed for sliding window based coherence measures (c_v, c_uci, c_npmi)
+                if texts is not None:
+                    init = (chunk_no + 1) * chunksize - chunksize
+                    end = init + chunksize
+                    # texts subarray corresponding to current chunk
+                    texts_chunk = texts[init:end]
+                self.log_coherence(chunk, texts_chunk, coherence, window_size, topn)
 
             if self.dispatcher:
                 # add the chunk to dispatcher's job queue, so workers can munch on it

From 4dad8f4cef88a8be82a23c017d9a05aee5f5b00c Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Tue, 13 Jun 2017 14:31:59 +0530
Subject: [PATCH 10/15] fix failing tests

---
 gensim/models/ldamodel.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index ad058d0c85..11deacd735 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -260,7 +260,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
             'c_v' : 110
             'c_uci' : 10
             'c_npmi' : 10
-
+
         `topn` Integer corresponding to the number of top words to be extracted from each topic for coherence logging.
 
         Example:
@@ -690,6 +690,8 @@ def rho():
                     end = init + chunksize
                     # texts subarray corresponding to current chunk
                     texts_chunk = texts[init:end]
+                else:
+                    texts_chunk = None
                 self.log_coherence(chunk, texts_chunk, coherence, window_size, topn)
 
             if self.dispatcher:
                 # add the chunk to dispatcher's job queue, so workers can munch on it
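With the series up to PATCH 10 applied, coherence logging is driven entirely by constructor arguments. A hypothetical training run that would emit 'c_v' coherence estimates at every evaluation point — note that the `coherence`, `texts`, `window_size` and `topn` keyword arguments exist only on this branch, not in released gensim:

    import logging
    from gensim.corpora import Dictionary
    from gensim.models import LdaModel

    logging.basicConfig(level=logging.INFO)  # coherence lines are logged at INFO

    texts = [['human', 'interface', 'computer'],
             ['survey', 'user', 'computer', 'system', 'response', 'time'],
             ['eps', 'user', 'interface', 'system']]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # 'c_v' needs the tokenized `texts` so each evaluated chunk can be paired
    # with its sliding-window statistics; 'u_mass' would need only `corpus`
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2,
                   eval_every=1, coherence='c_v', texts=texts, topn=10)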
From 950da4506e810e6ef3a934cc41728ba6cf79a436 Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Fri, 16 Jun 2017 23:30:04 +0530
Subject: [PATCH 11/15] add diff logging

---
 gensim/models/ldamodel.py | 36 +++++++++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 11deacd735..50e3c643c2 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -35,6 +35,7 @@
 import numbers
 import os
 import gensim
+import copy
 
 from gensim import interfaces, utils, matutils
 from gensim.matutils import dirichlet_expectation
@@ -194,8 +195,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
                  alpha='symmetric', eta=None, decay=0.5, offset=1.0,
                  eval_every=10, iterations=50, gamma_threshold=0.001,
                  minimum_probability=0.01, random_state=None, ns_conf={},
-                 minimum_phi_value=0.01, per_word_topics=False,
-                 coherence='u_mass', texts=None, window_size=None, topn=10):
+                 minimum_phi_value=0.01, per_word_topics=False, coherence='u_mass',
+                 texts=None, window_size=None, topn=10, log_diff=False, distance="jaccard"):
         """
         If given, start training from the iterable `corpus` straight away. If not given,
         the model is left untrained (presumably because you want to call `update()` manually).
@@ -263,6 +264,10 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
 
         `topn` Integer corresponding to the number of top words to be extracted from each topic for coherence logging.
 
+        `log_diff` set to True to log topic diff between consecutive epochs
+
+        `distance` is the distance measure to use for `log_diff`
+
         Example:
 
         >>> lda = LdaModel(corpus, num_topics=100)  # train model
@@ -305,6 +310,9 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
         self.minimum_phi_value = minimum_phi_value
         self.per_word_topics = per_word_topics
 
+        self.log_diff = log_diff
+        self.distance = distance
+
         self.texts = texts
         self.coherence = coherence
         self.window_size = window_size
@@ -568,10 +576,19 @@ def log_coherence(self, chunk, texts, coherence, window_size, topn):
         logger.info("%.3f coherence estimate based on a held-out corpus of %i documents with %i words", coherence, len(chunk), corpus_words)
         return coherence
 
+    def log_epoch_diff(self, epoch, other_model):
+        """
+        Log topic diff between consecutive epochs
+        """
+        diff_matrix, annotation = self.diff(other_model)
+        diff_diagonal = np.diagonal(diff_matrix)
+        logger.info("Topic difference between %i and %i epoch %s", epoch - 1, epoch, diff_diagonal)
+        return diff_diagonal
+
     def update(self, corpus, chunksize=None, decay=None, offset=None,
               passes=None, update_every=None, eval_every=None, iterations=None,
               gamma_threshold=None, chunks_as_numpy=False, coherence=None,
-              texts=None, window_size=None, topn=None):
+              texts=None, window_size=None, topn=None, log_diff=None, distance=None):
         """
         Train the model with new documents, by EM-iterating over `corpus` until
         the topics converge (or until the maximum number of allowed iterations
@@ -626,6 +643,10 @@ def update(self, corpus, chunksize=None, decay=None, offset=None,
             window_size = self.window_size
         if topn is None:
             topn = self.topn
+        if log_diff is None:
+            log_diff = self.log_diff
+        if distance is None:
+            distance = self.distance
 
         try:
             lencorpus = len(corpus)
@@ -727,6 +748,15 @@ def rho():
                     other = LdaState(self.eta, self.state.sstats.shape)
                 dirty = False
             # endfor single corpus iteration
+
+            # log diff between consecutive epochs
+            if log_diff:
+                if pass_ == 0:
+                    # save randomly initialized model for diff with first pass
+                    previous = copy.deepcopy(self)
+                self.log_epoch_diff(pass_, previous)
+                previous = copy.deepcopy(self)
+
             if reallen != lencorpus:
                 raise RuntimeError("input corpus size changed during training (don't use generators as input)")

From 017a75423b8cc55b110ae1bfeaabc96b40044874 Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Fri, 16 Jun 2017 23:59:50 +0530
Subject: [PATCH 12/15] make distance measure optional for diff

---
 gensim/models/ldamodel.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 0aa39ea9db..6b4759d1b8 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -578,13 +578,16 @@ def log_coherence(self, chunk, texts, coherence, window_size, topn):
         logger.info("%.3f coherence estimate based on a held-out corpus of %i documents with %i words", coherence, len(chunk), corpus_words)
         return coherence
 
-    def log_epoch_diff(self, epoch, other_model):
+    def log_epoch_diff(self, epoch, other_model, distance):
         """
         Log topic diff between consecutive epochs
         """
-        diff_matrix, annotation = self.diff(other_model)
+        diff_matrix, annotation = self.diff(other_model, distance)
         diff_diagonal = np.diagonal(diff_matrix)
-        logger.info("Topic difference between %i and %i epoch %s", epoch - 1, epoch, diff_diagonal)
+        prev_epoch = epoch - 1
+        if epoch == 0:
+            prev_epoch = "initial random model"
+        logger.info("Topic difference between %s and %s epoch: %s", prev_epoch, epoch, diff_diagonal)
         return diff_diagonal
 
     def update(self, corpus, chunksize=None, decay=None, offset=None,
@@ -760,7 +763,7 @@ def rho():
                 if pass_ == 0:
                     # save randomly initialized model for diff with first pass
                     previous = copy.deepcopy(self)
-                self.log_epoch_diff(pass_, previous)
+                self.log_epoch_diff(pass_, previous, distance)
                 previous = copy.deepcopy(self)
 
             if reallen != lencorpus:

From f07d1d26c323f1a603b2290d49c5540f3f68f666 Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Sat, 17 Jun 2017 03:02:36 +0530
Subject: [PATCH 13/15] fix flake8

---
 gensim/models/ldamodel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 6b4759d1b8..1ca5e123fd 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -592,7 +592,7 @@ def log_epoch_diff(self, epoch, other_model, distance):
     def update(self, corpus, chunksize=None, decay=None, offset=None,
               passes=None, update_every=None, eval_every=None, iterations=None,
-              gamma_threshold=None, chunks_as_numpy=False, coherence=None, 
+              gamma_threshold=None, chunks_as_numpy=False, coherence=None,
               texts=None, window_size=None, topn=None, log_diff=None, distance=None):
         """
         Train the model with new documents, by EM-iterating over `corpus` until
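The epoch-diff logging added in PATCH 11 and refined above builds on `LdaModel.diff`, which is already part of gensim. A standalone sketch of the same comparison between two independently trained models; `distance='jaccard'` mirrors the patch's default, and the per-pair `annotation` that `diff` also returns is ignored here:

    import numpy as np
    from gensim.corpora import Dictionary
    from gensim.models import LdaModel

    texts = [['human', 'interface', 'computer'],
             ['survey', 'user', 'computer', 'system', 'response', 'time'],
             ['eps', 'user', 'interface', 'system']]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    m1 = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2)
    m2 = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2)

    diff_matrix, annotation = m1.diff(m2, 'jaccard')
    # the diagonal pairs topic i of m1 with topic i of m2, which is exactly
    # what log_epoch_diff extracts to track how much each topic moved
    print(np.diagonal(diff_matrix))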
From 2e3c474d2ea1f26d273b81bcb23f594741733453 Mon Sep 17 00:00:00 2001
From: parulsethi
Date: Wed, 28 Jun 2017 18:33:06 +0530
Subject: [PATCH 14/15] give relevant parameter names

---
 gensim/models/ldamodel.py | 66 ++++++++++++++++++++-------------------
 1 file changed, 34 insertions(+), 32 deletions(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 1ca5e123fd..1346d870c0 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -198,7 +198,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
                  eval_every=10, iterations=50, gamma_threshold=0.001,
                  minimum_probability=0.01, random_state=None, ns_conf={},
                  minimum_phi_value=0.01, per_word_topics=False, coherence='u_mass',
-                 texts=None, window_size=None, topn=10, log_diff=False, distance="jaccard"):
+                 coherence_texts=None, coherence_window_size=None, coherence_topn=10,
+                 log_diff=False, diff_distance="jaccard"):
         """
         If given, start training from the iterable `corpus` straight away. If not given,
         the model is left untrained (presumably because you want to call `update()` manually).
@@ -231,7 +232,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
         Turn on `distributed` to force distributed computing (see the `web tutorial `_
         on how to set up a cluster of machines for gensim).
 
-        Calculate and log perplexity estimate from the latest mini-batch every
+        Calculate and log perplexity and coherence estimate from the latest mini-batch every
         `eval_every` model updates (setting this to 1 slows down training ~2x;
         default is 10 for better performance). Set to None to disable perplexity estimation.
 
@@ -242,14 +243,6 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
 
         `random_state` can be a np.random.RandomState object or the seed for one
 
-        `texts` : Tokenized texts. Needed when eval_every is not None and coherence is logged for sliding_window_based measures c_v, c_uci, c_npmi. eg::
-            texts = [['system', 'human', 'system', 'eps'],
-                     ['user', 'response', 'time'],
-                     ['trees'],
-                     ['graph', 'trees'],
-                     ['graph', 'minors', 'trees'],
-                     ['graph', 'minors', 'survey']]
-
         `coherence` : Coherence measure to be used for logging coherence. Supported values are:
             'u_mass'
             'c_v'
@@ -257,18 +250,26 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
             'c_npmi'
             For 'c_v', 'c_uci' and 'c_npmi' texts should be provided.
 
+        `coherence_texts` : Tokenized texts. Needed when eval_every is not None and coherence is logged for sliding_window_based measures c_v, c_uci, c_npmi. eg::
+            texts = [['system', 'human', 'system', 'eps'],
+                     ['user', 'response', 'time'],
+                     ['trees'],
+                     ['graph', 'trees'],
+                     ['graph', 'minors', 'trees'],
+                     ['graph', 'minors', 'survey']]
+
-        `window_size` : Is the size of the window to be used for coherence measures using boolean sliding window as their
+        `coherence_window_size` : Is the size of the window to be used for coherence measures using boolean sliding window as their
             probability estimator. For 'u_mass' this doesn't matter.
             If left 'None' the default window sizes are used which are:
             'c_v' : 110
             'c_uci' : 10
             'c_npmi' : 10
 
-        `topn` Integer corresponding to the number of top words to be extracted from each topic for coherence logging.
+        `coherence_topn` Integer corresponding to the number of top words to be extracted from each topic for coherence logging.
 
         `log_diff` set to True to log topic diff between consecutive epochs
 
-        `distance` is the distance measure to use for `log_diff`
+        `diff_distance` is the distance measure to use for `log_diff`
 
         Example:
 
@@ -312,13 +313,13 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
         self.minimum_phi_value = minimum_phi_value
         self.per_word_topics = per_word_topics
 
-        self.log_diff = log_diff
-        self.distance = distance
-
-        self.texts = texts
         self.coherence = coherence
-        self.window_size = window_size
-        self.topn = topn
+        self.coherence_texts = coherence_texts
+        self.coherence_window_size = coherence_window_size
+        self.coherence_topn = coherence_topn
+
+        self.log_diff = log_diff
+        self.diff_distance = diff_distance
 
         self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')
 
@@ -593,7 +594,8 @@ def log_epoch_diff(self, epoch, other_model, distance):
     def update(self, corpus, chunksize=None, decay=None, offset=None,
               passes=None, update_every=None, eval_every=None, iterations=None,
               gamma_threshold=None, chunks_as_numpy=False, coherence=None,
-              texts=None, window_size=None, topn=None, log_diff=None, distance=None):
+              coherence_texts=None, coherence_window_size=None, coherence_topn=None,
+              log_diff=None, diff_distance=None):
         """
         Train the model with new documents, by EM-iterating over `corpus` until
         the topics converge (or until the maximum number of allowed iterations
@@ -642,16 +644,16 @@ def update(self, corpus, chunksize=None, decay=None, offset=None,
             gamma_threshold = self.gamma_threshold
         if coherence is None:
             coherence = self.coherence
-        if texts is None:
-            texts = self.texts
-        if window_size is None:
-            window_size = self.window_size
-        if topn is None:
-            topn = self.topn
+        if coherence_texts is None:
+            coherence_texts = self.coherence_texts
+        if coherence_window_size is None:
+            coherence_window_size = self.coherence_window_size
+        if coherence_topn is None:
+            coherence_topn = self.coherence_topn
         if log_diff is None:
             log_diff = self.log_diff
-        if distance is None:
-            distance = self.distance
+        if diff_distance is None:
+            diff_distance = self.diff_distance
 
         try:
             lencorpus = len(corpus)
@@ -715,14 +717,14 @@ def rho():
             if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
                 self.log_perplexity(chunk, total_docs=lencorpus)
                 # texts input is needed for sliding window based coherence measures (c_v, c_uci, c_npmi)
-                if texts is not None:
+                if coherence_texts is not None:
                     init = (chunk_no + 1) * chunksize - chunksize
                     end = init + chunksize
                     # texts subarray corresponding to current chunk
-                    texts_chunk = texts[init:end]
+                    texts_chunk = coherence_texts[init:end]
                 else:
                     texts_chunk = None
-                self.log_coherence(chunk, texts_chunk, coherence, window_size, topn)
+                self.log_coherence(chunk, texts_chunk, coherence, coherence_window_size, coherence_topn)
 
             if self.dispatcher:
                 # add the chunk to dispatcher's job queue, so workers can munch on it
@@ -763,7 +765,7 @@ def rho():
                 if pass_ == 0:
                     # save randomly initialized model for diff with first pass
                     previous = copy.deepcopy(self)
-                self.log_epoch_diff(pass_, previous, distance)
+                self.log_epoch_diff(pass_, previous, diff_distance)
                 previous = copy.deepcopy(self)
 
             if reallen != lencorpus:

From 00e776280f6c7a3b42644641c9ec06dd9c037312 Mon Sep 17 00:00:00 2001
From: Parul Sethi
Date: Wed, 28 Jun 2017 19:51:32 +0530
Subject: [PATCH 15/15] fix flake8

---
 gensim/models/ldamodel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 1346d870c0..0422865d4e 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -198,7 +198,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
                  eval_every=10, iterations=50, gamma_threshold=0.001,
                  minimum_probability=0.01, random_state=None, ns_conf={},
                  minimum_phi_value=0.01, per_word_topics=False, coherence='u_mass',
-                 coherence_texts=None, coherence_window_size=None, coherence_topn=10, 
+                 coherence_texts=None, coherence_window_size=None, coherence_topn=10,
                  log_diff=False, diff_distance="jaccard"):
         """
         If given, start training from the iterable `corpus` straight away. If not given,