piskvorky · menshikh-iv · Nov 7, 2017 · Nov 2, 2017 · Nov 3, 2017 · Nov 3, 2017
diff --git a/gensim/matutils.py b/gensim/matutils.py
@@ -28,7 +28,8 @@
 from six.moves import xrange, zip as izip
 
 
-blas = lambda name, ndarray: scipy.linalg.get_blas_funcs((name,), (ndarray,))[0]
+def blas(name, ndarray):  # return the one BLAS function scipy resolves for `name` given `ndarray`
+    return scipy.linalg.get_blas_funcs((name,), (ndarray,))[0]
 
 logger = logging.getLogger(__name__)
 

diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py
@@ -336,18 +336,21 @@ def __getitem__(self, query):
             # the following uses a lot of lazy evaluation and (optionally) parallel
             # processing, to improve query latency and minimize memory footprint.
             offsets = numpy.cumsum([0] + [len(shard) for shard in self.shards])
-            convert = lambda doc, shard_no: [(doc_index + offsets[shard_no], sim) for doc_index, sim in doc]
+
+            def convert(shard_no, doc):  # shift shard-local doc indices by that shard's start offset
+                return [(doc_index + offsets[shard_no], sim) for doc_index, sim in doc]
+
             is_corpus, query = utils.is_corpus(query)
             is_corpus = is_corpus or hasattr(query, 'ndim') and query.ndim > 1 and query.shape[0] > 1
             if not is_corpus:
                 # user asked for num_best most similar and query is a single doc
-                results = (convert(result, shard_no) for shard_no, result in enumerate(shard_results))
+                results = (convert(shard_no, result) for shard_no, result in enumerate(shard_results))
                 result = heapq.nlargest(self.num_best, itertools.chain(*results), key=lambda item: item[1])
             else:
                 # the trickiest combination: returning num_best results when query was a corpus
                 results = []
                 for shard_no, result in enumerate(shard_results):
-                    shard_result = [convert(doc, shard_no) for doc in result]
+                    shard_result = [convert(shard_no, doc) for doc in result]
                     results.append(shard_result)
                 result = []
                 for parts in izip(*results):

diff --git a/gensim/sklearn_api/atmodel.py b/gensim/sklearn_api/atmodel.py
@@ -76,17 +76,11 @@ def transform(self, author_names):
                 "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
             )
 
-        check = lambda x: [x] if not isinstance(x, list) else x
-        author_names = check(author_names)
-        X = [[] for _ in range(0, len(author_names))]
-
-        for k, v in enumerate(author_names):
-            transformed_author = self.gensim_model[v]
-            # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
-            probs_author = matutils.sparse2full(transformed_author, self.num_topics)
-            X[k] = probs_author
-
-        return np.reshape(np.array(X), (len(author_names), self.num_topics))
+        if not isinstance(author_names, list):
+            author_names = [author_names]
+        # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
+        topics = [matutils.sparse2full(self.gensim_model[author_name], self.num_topics) for author_name in author_names]
+        return np.reshape(np.array(topics), (len(author_names), self.num_topics))
 
     def partial_fit(self, X, author2doc=None, doc2author=None):
         """

diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py
@@ -87,12 +87,7 @@ def transform(self, docs):
             )
 
         # The input as array of array
-        check = lambda x: [x] if isinstance(x[0], string_types) else x
-        docs = check(docs)
-        X = [[] for _ in range(0, len(docs))]
-
-        for k, v in enumerate(docs):
-            doc_vec = self.gensim_model.infer_vector(v)
-            X[k] = doc_vec
-
-        return np.reshape(np.array(X), (len(docs), self.gensim_model.vector_size))
+        if isinstance(docs[0], string_types):
+            docs = [docs]
+        vectors = [self.gensim_model.infer_vector(doc) for doc in docs]
+        return np.reshape(np.array(vectors), (len(docs), self.gensim_model.vector_size))
diff --git a/gensim/sklearn_api/hdp.py b/gensim/sklearn_api/hdp.py
@@ -77,21 +77,18 @@ def transform(self, docs):
             )
 
         # The input as array of array
-        check = lambda x: [x] if isinstance(x[0], tuple) else x
-        docs = check(docs)
-        X = [[] for _ in range(0, len(docs))]
-
-        max_num_topics = 0
-        for k, v in enumerate(docs):
-            X[k] = self.gensim_model[v]
-            max_num_topics = max(max_num_topics, max(x[0] for x in X[k]) + 1)
-
-        for k, v in enumerate(X):
-            # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
-            dense_vec = matutils.sparse2full(v, max_num_topics)
-            X[k] = dense_vec
-
-        return np.reshape(np.array(X), (len(docs), max_num_topics))
+        if isinstance(docs[0], tuple):  # first item is a tuple, so treat `docs` as one document
+            docs = [docs]
+        distribution, max_num_topics = [], 0
+        for doc in docs:
+            topicd = self.gensim_model[doc]
+            distribution.append(topicd)
+            if topicd:  # skip empty results: max() over an empty sequence raises ValueError
+                max_num_topics = max(max_num_topics, max(topic[0] for topic in topicd) + 1)
+
+        # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
+        distribution = [matutils.sparse2full(topicd, max_num_topics) for topicd in distribution]
+        return np.reshape(np.array(distribution), (len(docs), max_num_topics))
 
     def partial_fit(self, X):
         """

diff --git a/gensim/sklearn_api/ldamodel.py b/gensim/sklearn_api/ldamodel.py
@@ -83,16 +83,11 @@ def transform(self, docs):
             raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.")
 
         # The input as array of array
-        check = lambda x: [x] if isinstance(x[0], tuple) else x
-        docs = check(docs)
-        X = [[] for _ in range(0, len(docs))]
-
-        for k, v in enumerate(docs):
-            doc_topics = self.gensim_model[v]
-            # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
-            probs_docs = matutils.sparse2full(doc_topics, self.num_topics)
-            X[k] = probs_docs
-        return np.reshape(np.array(X), (len(docs), self.num_topics))
+        if isinstance(docs[0], tuple):
+            docs = [docs]
+        # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
+        distribution = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs]
+        return np.reshape(np.array(distribution), (len(docs), self.num_topics))
 
     def partial_fit(self, X):
         """

diff --git a/gensim/sklearn_api/ldaseqmodel.py b/gensim/sklearn_api/ldaseqmodel.py
@@ -69,12 +69,7 @@ def transform(self, docs):
             raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.")
 
         # The input as array of array
-        check = lambda x: [x] if isinstance(x[0], tuple) else x
-        docs = check(docs)
-        X = [[] for _ in range(0, len(docs))]
-
-        for k, v in enumerate(docs):
-            transformed_author = self.gensim_model[v]
-            X[k] = transformed_author
-
-        return np.reshape(np.array(X), (len(docs), self.num_topics))
+        if isinstance(docs[0], tuple):
+            docs = [docs]
+        proportions = [self.gensim_model[doc] for doc in docs]
+        return np.reshape(np.array(proportions), (len(docs), self.num_topics))
diff --git a/gensim/sklearn_api/lsimodel.py b/gensim/sklearn_api/lsimodel.py
@@ -67,15 +67,11 @@ def transform(self, docs):
             )
 
         # The input as array of array
-        check = lambda x: [x] if isinstance(x[0], tuple) else x
-        docs = check(docs)
-        X = [[] for i in range(0, len(docs))]
-        for k, v in enumerate(docs):
-            doc_topics = self.gensim_model[v]
-            # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
-            probs_docs = matutils.sparse2full(doc_topics, self.num_topics)
-            X[k] = probs_docs
-        return np.reshape(np.array(X), (len(docs), self.num_topics))
+        if isinstance(docs[0], tuple):
+            docs = [docs]
+        # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
+        distribution = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs]
+        return np.reshape(np.array(distribution), (len(docs), self.num_topics))
 
     def partial_fit(self, X):
         """

diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py
@@ -50,15 +50,9 @@ def transform(self, docs):
             raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.")
 
         # input as python lists
-        check = lambda x: [x] if isinstance(x[0], string_types) else x
-        docs = check(docs)
-        X = [[] for _ in range(0, len(docs))]
-
-        for k, v in enumerate(docs):
-            phrase_tokens = self.gensim_model[v]
-            X[k] = phrase_tokens
-
-        return X
+        if isinstance(docs[0], string_types):
+            docs = [docs]
+        return [self.gensim_model[doc] for doc in docs]
 
     def partial_fit(self, X):
         if self.gensim_model is None:

diff --git a/gensim/sklearn_api/rpmodel.py b/gensim/sklearn_api/rpmodel.py
@@ -52,14 +52,8 @@ def transform(self, docs):
             )
 
         # The input as array of array
-        check = lambda x: [x] if isinstance(x[0], tuple) else x
-        docs = check(docs)
-        X = [[] for _ in range(0, len(docs))]
-
-        for k, v in enumerate(docs):
-            transformed_doc = self.gensim_model[v]
-            # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
-            probs_docs = matutils.sparse2full(transformed_doc, self.num_topics)
-            X[k] = probs_docs
-
-        return np.reshape(np.array(X), (len(docs), self.num_topics))
+        if isinstance(docs[0], tuple):
+            docs = [docs]
+        # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
+        presentation = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs]
+        return np.reshape(np.array(presentation), (len(docs), self.num_topics))
diff --git a/gensim/sklearn_api/text2bow.py b/gensim/sklearn_api/text2bow.py
@@ -48,16 +48,10 @@ def transform(self, docs):
             )
 
         # input as python lists
-        check = lambda x: [x] if isinstance(x, string_types) else x
-        docs = check(docs)
-        tokenized_docs = [list(self.tokenizer(x)) for x in docs]
-        X = [[] for _ in range(0, len(tokenized_docs))]
-
-        for k, v in enumerate(tokenized_docs):
-            bow_val = self.gensim_model.doc2bow(v)
-            X[k] = bow_val
-
-        return X
+        if isinstance(docs, string_types):
+            docs = [docs]
+        tokenized_docs = (list(self.tokenizer(doc)) for doc in docs)
+        return [self.gensim_model.doc2bow(doc) for doc in tokenized_docs]
 
     def partial_fit(self, X):
         if self.gensim_model is None:

diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py
@@ -51,12 +51,6 @@ def transform(self, docs):
             )
 
         # input as python lists
-        check = lambda x: [x] if isinstance(x[0], tuple) else x
-        docs = check(docs)
-        X = [[] for _ in range(0, len(docs))]
-
-        for k, v in enumerate(docs):
-            transformed_doc = self.gensim_model[v]
-            X[k] = transformed_doc
-
-        return X
+        if isinstance(docs[0], tuple):
+            docs = [docs]
+        return [self.gensim_model[doc] for doc in docs]
diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py
@@ -75,15 +75,10 @@ def transform(self, words):
             )
 
         # The input as array of array
-        check = lambda x: [x] if isinstance(x, six.string_types) else x
-        words = check(words)
-        X = [[] for _ in range(0, len(words))]
-
-        for k, v in enumerate(words):
-            word_vec = self.gensim_model[v]
-            X[k] = word_vec
-
-        return np.reshape(np.array(X), (len(words), self.size))
+        if isinstance(words, six.string_types):
+            words = [words]
+        vectors = [self.gensim_model[word] for word in words]
+        return np.reshape(np.array(vectors), (len(words), self.size))
 
     def partial_fit(self, X):
         raise NotImplementedError(

diff --git a/gensim/test/test_atmodel.py b/gensim/test/test_atmodel.py
@@ -37,7 +37,10 @@
 # Test that models are compatiple across versions, as done in LdaModel.
 
 module_path = os.path.dirname(__file__)  # needed because sample data files are located in the same folder
-datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
+
+
+def datapath(fname):  # path of `fname` inside the `test_data` folder next to this module
+    return os.path.join(module_path, 'test_data', fname)
 
 # set up vars used in testing ("Deerwester" from the web tutorial)
 texts = [
@@ -475,24 +478,26 @@ def testPasses(self):
         # long message includes the original error message with a custom one
         self.longMessage = True
         # construct what we expect when passes aren't involved
-        test_rhots = list()
+        test_rhots = []
         model = self.class_(id2word=dictionary, chunksize=1, num_topics=2)
-        final_rhot = lambda: pow(model.offset + (1 * model.num_updates) / model.chunksize, -model.decay)
+
+        def final_rhot(model):  # rho_t from the model's offset, num_updates, chunksize and decay
+            return (model.offset + model.num_updates / model.chunksize) ** -model.decay
 
         # generate 5 updates to test rhot on
-        for x in range(5):
+        for _ in range(5):
             model.update(corpus, author2doc)
-            test_rhots.append(final_rhot())
+            test_rhots.append(final_rhot(model))
 
         for passes in [1, 5, 10, 50, 100]:
             model = self.class_(id2word=dictionary, chunksize=1, num_topics=2, passes=passes)
-            self.assertEqual(final_rhot(), 1.0)
+            self.assertEqual(final_rhot(model), 1.0)
             # make sure the rhot matches the test after each update
             for test_rhot in test_rhots:
                 model.update(corpus, author2doc)
 
                 msg = "{}, {}, {}".format(passes, model.num_updates, model.state.numdocs)
-                self.assertAlmostEqual(final_rhot(), test_rhot, msg=msg)
+                self.assertAlmostEqual(final_rhot(model), test_rhot, msg=msg)
 
             self.assertEqual(model.state.numdocs, len(corpus) * len(test_rhots))
             self.assertEqual(model.num_updates, len(corpus) * len(test_rhots))

diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py
@@ -14,6 +14,7 @@
 import unittest
 from unittest import SkipTest
 import multiprocessing as mp
+from functools import partial
 
 import numpy as np
 from gensim.corpora.dictionary import Dictionary
@@ -215,20 +216,17 @@ def testErrors(self):
         )
 
     def testProcesses(self):
-        cpu = mp.cpu_count()
-        get_model = lambda p: CoherenceModel(
-            topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass', processes=p,
+        get_model = partial(  # factory with everything but `processes` pre-bound
+            CoherenceModel, topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass'
         )
 
-        model = CoherenceModel(
-            topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass',
-        )
-        self.assertEqual(model.processes, cpu - 1)
+        used_cpus = mp.cpu_count() - 1  # NOTE(review): 0 on a single-core host; confirm the model's default there
+        self.assertEqual(get_model().processes, used_cpus)
         for p in range(-2, 1):
-            self.assertEqual(get_model(p).processes, cpu - 1)
+            self.assertEqual(get_model(processes=p).processes, used_cpus)
 
         for p in range(1, 4):
-            self.assertEqual(get_model(p).processes, p)
+            self.assertEqual(get_model(processes=p).processes, p)
 
     def testPersistence(self):
         fname = testfile()

diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py
@@ -26,7 +26,10 @@
 from gensim.models import doc2vec, keyedvectors
 
 module_path = os.path.dirname(__file__)  # needed because sample data files are located in the same folder
-datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
+
+
+def datapath(fname):  # path of `fname` inside the `test_data` folder next to this module
+    return os.path.join(module_path, 'test_data', fname)
 
 
 class DocsLeeCorpus(object):

diff --git a/gensim/test/test_dtm.py b/gensim/test/test_dtm.py
@@ -17,7 +17,10 @@
 
 # needed because sample data files are located in the same folder
 module_path = os.path.dirname(__file__)
-datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
+
+
+def datapath(fname):  # path of `fname` inside the `test_data` folder next to this module
+    return os.path.join(module_path, 'test_data', fname)
 
 
 class TestDtmModel(unittest.TestCase):

diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py
@@ -16,12 +16,15 @@
 from gensim.models.wrappers.fasttext import FastText as FT_wrapper
 
 module_path = os.path.dirname(__file__)  # needed because sample data files are located in the same folder
-datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
 logger = logging.getLogger(__name__)
 
 IS_WIN32 = (os.name == "nt") and (struct.calcsize('P') * 8 == 32)
 
 
+def datapath(fname):  # path of `fname` inside the `test_data` folder next to this module
+    return os.path.join(module_path, 'test_data', fname)
+
+
 class LeeCorpus(object):
     def __iter__(self):
         with open(datapath('lee_background.cor')) as f: