
Refactor code with PEP8 and additional limitations. Fix #1521 #1569

Merged (20 commits) on Sep 8, 2017.

Changes from 1 commit
gensim/corpora/csvcorpus.py (2 changes: 1 addition & 1 deletion)

@@ -60,7 +60,7 @@ def __iter__(self):
         for line_no, line in enumerate(reader):
             if self.labels:
                 line.pop(0)  # ignore the first column = class label
-            yield list(enumerate(map(float, line)))
+            yield list(enumerate(float(x) for x in line))

         self.length = line_no + 1  # store the total number of CSV rows = documents
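
An aside for context (not part of the diff): these map rewrites matter for Python 2/3 compatibility as much as for PEP8. On Python 2, map returns a list; on Python 3, a lazy iterator; a comprehension yields the same concrete list on both. A minimal sketch with made-up data:

line = ['1.0', '2.5', '3.3']
lazy = map(float, line)  # a list on Python 2, a lazy map object on Python 3
floats = [float(x) for x in line]  # a concrete list on both versions
assert list(enumerate(floats)) == [(0, 1.0), (1, 2.5), (2, 3.3)]
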
gensim/corpora/lowcorpus.py (5 changes: 2 additions & 3 deletions)

@@ -118,12 +118,11 @@ def line2doc(self, line):
                     use_words.append(word)
                     marker.add(word)
             # construct a list of (wordIndex, wordFrequency) 2-tuples
-            doc = list(zip(map(self.word2id.get, use_words),
-                           map(words.count, use_words)))
+            doc = [(self.word2id.get(w), words.count(w)) for w in use_words]
         else:
             uniq_words = set(words)
             # construct a list of (word, wordFrequency) 2-tuples
-            doc = list(zip(uniq_words, map(words.count, uniq_words)))
+            doc = [(w, words.count(w)) for w in uniq_words]

         # return the document, then forget it and move on to the next one
         # note that this way, only one doc is stored in memory at a time, not the whole corpus
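
A quick equivalence check for the lowcorpus rewrite; words and word2id are toy stand-ins for the real attributes:

word2id = {'human': 0, 'interface': 1}  # hypothetical mapping
words = ['human', 'interface', 'human']
use_words = ['human', 'interface']
old = list(zip(map(word2id.get, use_words), map(words.count, use_words)))
new = [(word2id.get(w), words.count(w)) for w in use_words]
assert old == new == [(0, 2), (1, 1)]
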
gensim/matutils.py (2 changes: 1 addition & 1 deletion)

@@ -821,7 +821,7 @@ def __init__(self, input, transposed=True):
         for lineno, line in enumerate(lines):
             line = utils.to_unicode(line)
             if not line.startswith('%'):
-                self.num_docs, self.num_terms, self.num_nnz = map(int, line.split())
+                self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split())
                 if not self.transposed:
                     self.num_docs, self.num_terms = self.num_terms, self.num_docs
                 break
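
A sketch of the unpacking pattern adopted here; the sample header imitates a Matrix Market size line (docs, terms, non-zeros) and is invented:

header = '100 50 2500'  # hypothetical MM size line
num_docs, num_terms, num_nnz = (int(x) for x in header.split())
assert (num_docs, num_terms, num_nnz) == (100, 50, 2500)
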
gensim/models/keyedvectors.py (4 changes: 2 additions & 2 deletions)

@@ -205,7 +205,7 @@ def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8',
         logger.info("loading projection weights from %s", fname)
         with utils.smart_open(fname) as fin:
             header = utils.to_unicode(fin.readline(), encoding=encoding)
-            vocab_size, vector_size = map(int, header.split())  # throws for invalid file format
+            vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format
             if limit:
                 vocab_size = min(vocab_size, limit)
             result = cls()

@@ -254,7 +254,7 @@ def add_word(word, weights):
                 parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
                 if len(parts) != vector_size + 1:
                     raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
-                word, weights = parts[0], list(map(REAL, parts[1:]))
+                word, weights = parts[0], [REAL(x) for x in parts[1:]]
                 add_word(word, weights)
         if result.syn0.shape[0] != len(result.vocab):
             logger.info(
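
A self-contained sketch of the text-format parsing above; the vector line is made up, and numpy.float32 stands in for gensim's REAL dtype:

import numpy as np

REAL = np.float32  # stand-in for gensim's REAL dtype
vector_size = 3
line = 'king 0.125 -0.5 0.25'  # hypothetical word2vec text-format line
parts = line.rstrip().split(' ')
assert len(parts) == vector_size + 1
word, weights = parts[0], [REAL(x) for x in parts[1:]]
assert word == 'king' and weights == [0.125, -0.5, 0.25]
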
gensim/models/ldaseqmodel.py (4 changes: 2 additions & 2 deletions)

@@ -596,7 +596,7 @@ def fit_sslm(self, sstats):
         totals = np.zeros(sstats.shape[1])

         # computing variance, fwd_variance
-        self.variance, self.fwd_variance = map(np.array, list(zip(*[self.compute_post_variance(w, self.chain_variance) for w in range(0, W)])))
+        self.variance, self.fwd_variance = (np.array(x) for x in list(zip(*[self.compute_post_variance(w, self.chain_variance) for w in range(0, W)])))

         # column sum of sstats
         totals = sstats.sum(axis=0)

@@ -643,7 +643,7 @@ def compute_bound(self, sstats, totals):
         chain_variance = self.chain_variance
         # computing mean, fwd_mean
-        self.mean, self.fwd_mean = map(np.array, (zip(*[self.compute_post_mean(w, self.chain_variance) for w in range(0, W)])))
+        self.mean, self.fwd_mean = (np.array(x) for x in zip(*[self.compute_post_mean(w, self.chain_variance) for w in range(0, W)]))
         self.zeta = self.update_zeta()

         for w in range(0, W):
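
Both rewritten lines use the same transpose idiom: zip(*...) turns a list of (a, b) pairs into an a-sequence and a b-sequence, which are then converted to arrays. A reduced sketch with invented pairs:

import numpy as np

pairs = [(1.0, 10.0), (2.0, 20.0), (3.0, 30.0)]  # e.g. (variance, fwd_variance) per word
variance, fwd_variance = (np.array(x) for x in zip(*pairs))
assert variance.tolist() == [1.0, 2.0, 3.0]
assert fwd_variance.tolist() == [10.0, 20.0, 30.0]
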
gensim/models/word2vec.py (4 changes: 2 additions & 2 deletions)

@@ -1195,7 +1195,7 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='utf8',
         logger.info("loading projection weights from %s" % (fname))
         with utils.smart_open(fname) as fin:
             header = utils.to_unicode(fin.readline(), encoding=encoding)
-            vocab_size, vector_size = map(int, header.split())  # throws for invalid file format
+            vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format
             if not vector_size == self.vector_size:
                 raise ValueError("incompatible vector size %d in file %s" % (vector_size, fname))
             # TOCONSIDER: maybe mismatched vectors still useful enough to merge (truncating/padding)?

@@ -1221,7 +1221,7 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='utf8',
                 parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
                 if len(parts) != vector_size + 1:
                     raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
-                word, weights = parts[0], list(map(REAL, parts[1:]))
+                word, weights = parts[0], [REAL(x) for x in parts[1:]]
                 if word in self.wv.vocab:
                     overlap_count += 1
                     self.wv.syn0[self.wv.vocab[word].index] = weights
gensim/models/wrappers/ldamallet.py (9 changes: 2 additions & 7 deletions)

@@ -305,14 +305,9 @@ def read_doctopics(self, fname, eps=1e-6, renorm=True):
             # the MALLET doctopic format changed in 2.0.8 to exclude the id,
             # this handles the file differently dependent on the pattern
             if len(parts) == 2 * self.num_topics:
-                doc = [(id_, weight)
-                       for id_, weight in zip(map(int, parts[::2]),
-                                              map(float, parts[1::2]))
-                       if abs(weight) > eps]
+                doc = [(id_, weight) for id_, weight in ((int(x), float(y)) for (x, y) in zip(parts[::2], parts[1::2])) if abs(weight) > eps]
             elif len(parts) == self.num_topics and mallet_version != '2.0.7':
-                doc = [(id_, weight)
-                       for id_, weight in enumerate(map(float, parts))
-                       if abs(weight) > eps]
+                doc = [(id_, weight) for id_, weight in enumerate(float(x) for x in parts) if abs(weight) > eps]
Contributor:

@menshikh-iv
Line 308:

doc = [(int(id_), float(weight))
       for id_, weight in zip(*[iter(parts)] * 2)
       if abs(float(weight)) > eps]

Line 310:

doc = [(id_, float(weight))
       for id_, weight in enumerate(parts)
       if abs(float(weight)) > eps]

How about the code above? It makes fewer passes over the data.
Contributor Author:

Ok, accepted.

Owner (@piskvorky, Sep 7, 2017):

@zsef123 Both examples above are incorrect. The correct formatting uses a hanging indent:

doc = [
    (id_, float(weight))
    for id_, weight in enumerate(parts)
    if abs(float(weight)) > eps
]

Actually, this line is not too long, so the simple:

doc = [(id_, float(weight)) for id_, weight in enumerate(parts) if abs(float(weight)) > eps]

would work too.

Contributor Author:

Already done, as in your last variant @piskvorky.

             else:
                 if mallet_version == "2.0.7":
                     """
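
For readers unfamiliar with the reviewer's zip(*[iter(parts)] * 2) suggestion, a standalone sketch with made-up tokens; it pairs consecutive items in a single pass instead of building two slices:

parts = ['0', '0.5', '1', '0.0', '2', '0.4']  # alternating topic-id / weight tokens
eps = 1e-6
pairs = list(zip(*[iter(parts)] * 2))  # one shared iterator, consumed two items at a time
assert pairs == [('0', '0.5'), ('1', '0.0'), ('2', '0.4')]
doc = [(int(id_), float(weight)) for id_, weight in pairs if abs(float(weight)) > eps]
assert doc == [(0, 0.5), (2, 0.4)]
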
gensim/parsing/porter.py (4 changes: 2 additions & 2 deletions)

@@ -363,10 +363,10 @@ def stem(self, w):
         return self.b[:self.k + 1]

     def stem_sentence(self, txt):
-        return " ".join(map(self.stem, txt.split()))
+        return " ".join(self.stem(x) for x in txt.split())

     def stem_documents(self, docs):
-        return map(self.stem_sentence, docs)
+        return [self.stem_sentence(x) for x in docs]


 if __name__ == '__main__':
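
A hedged usage sketch for the two rewritten helpers. One behavioral nuance: on Python 3 the old map-based stem_documents returned a lazy iterator, while the comprehension returns a list on both versions. The stems in the comments are typical Porter outputs, shown as examples rather than guarantees:

from gensim.parsing.porter import PorterStemmer

p = PorterStemmer()
print(p.stem_sentence('cats running quickly'))  # e.g. 'cat run quickli'
print(p.stem_documents(['cats running', 'dogs walked']))  # a plain list on both Python 2 and 3
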
gensim/scripts/word2vec2tensor.py (2 changes: 1 addition & 1 deletion)

@@ -52,7 +52,7 @@ def word2vec2tensor(word2vec_model_path, tensor_filename, binary=False):
     with open(outfiletsvmeta, 'w+') as file_metadata:
         for word in model.index2word:
             file_metadata.write(gensim.utils.to_utf8(word) + gensim.utils.to_utf8('\n'))
-            vector_row = '\t'.join(map(str, model[word]))
+            vector_row = '\t'.join(str(x) for x in model[word])
             file_vector.write(vector_row + '\n')

     logger.info("2D tensor file saved to %s" % outfiletsv)
gensim/sklearn_api/hdp.py (2 changes: 1 addition & 1 deletion)

@@ -82,7 +82,7 @@ def transform(self, docs):
         max_num_topics = 0
         for k, v in enumerate(docs):
             X[k] = self.gensim_model[v]
-            max_num_topics = max(max_num_topics, max(list(map(lambda x: x[0], X[k]))) + 1)
+            max_num_topics = max(max_num_topics, max(x[0] for x in X[k]) + 1)

         for k, v in enumerate(X):
             # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
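
A reduced sketch of the max-topic-id computation above, with a made-up sparse document of (topic_id, probability) pairs:

doc = [(0, 0.6), (3, 0.3), (7, 0.1)]  # hypothetical sparse topic vector
max_num_topics = max(x[0] for x in doc) + 1
assert max_num_topics == 8
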
gensim/sklearn_api/text2bow.py (6 changes: 3 additions & 3 deletions)

@@ -34,7 +34,7 @@ def fit(self, X, y=None):
         """
         Fit the model according to the given training data.
         """
-        tokenized_docs = list(map(lambda x: list(self.tokenizer(x)), X))
+        tokenized_docs = [list(self.tokenizer(x)) for x in X]
         self.gensim_model = Dictionary(documents=tokenized_docs, prune_at=self.prune_at)
         return self

@@ -48,7 +48,7 @@ def transform(self, docs):
         # input as python lists
         check = lambda x: [x] if isinstance(x, string_types) else x
         docs = check(docs)
-        tokenized_docs = list(map(lambda x: list(self.tokenizer(x)), docs))
+        tokenized_docs = [list(self.tokenizer(x)) for x in docs]
         X = [[] for _ in range(0, len(tokenized_docs))]

         for k, v in enumerate(tokenized_docs):

@@ -61,6 +61,6 @@ def partial_fit(self, X):
         if self.gensim_model is None:
             self.gensim_model = Dictionary(prune_at=self.prune_at)

-        tokenized_docs = list(map(lambda x: list(self.tokenizer(x)), X))
+        tokenized_docs = [list(self.tokenizer(x)) for x in X]
         self.gensim_model.add_documents(tokenized_docs)
         return self
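
A quick check that the comprehension matches the old list(map(lambda ...)) form; tokenize is a stand-in for the transformer's self.tokenizer, assumed to return an iterator (hence the list(...) wrapper):

def tokenize(s):  # stand-in for self.tokenizer
    return iter(s.lower().split())

X = ['Human machine interface', 'Graph minors survey']
old = list(map(lambda x: list(tokenize(x)), X))
new = [list(tokenize(x)) for x in X]
assert old == new == [['human', 'machine', 'interface'], ['graph', 'minors', 'survey']]
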
gensim/summarization/bm25.py (4 changes: 2 additions & 2 deletions)

@@ -18,7 +18,7 @@ class BM25(object):
     def __init__(self, corpus):
         self.corpus_size = len(corpus)
-        self.avgdl = sum(map(lambda x: float(len(x)), corpus)) / self.corpus_size
+        self.avgdl = sum(float(len(x)) for x in corpus) / self.corpus_size
         self.corpus = corpus
         self.f = []
         self.df = {}

@@ -62,7 +62,7 @@ def get_scores(self, document, average_idf):
 def get_bm25_weights(corpus):
     bm25 = BM25(corpus)
-    average_idf = sum(map(lambda k: float(bm25.idf[k]), bm25.idf.keys())) / len(bm25.idf.keys())
+    average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)

     weights = []
     for doc in corpus:
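
A toy check that iterating values() gives the same average as mapping over keys(), which is all the get_bm25_weights change does:

idf = {'hello': 1.0, 'world': 3.0}  # made-up idf table
old = sum(map(lambda k: float(idf[k]), idf.keys())) / len(idf.keys())
new = sum(float(val) for val in idf.values()) / len(idf)
assert old == new == 2.0
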
gensim/test/test_atmodel.py (2 changes: 1 addition & 1 deletion)

@@ -436,7 +436,7 @@ def testPasses(self):
         for test_rhot in test_rhots:
             model.update(corpus, author2doc)

-            msg = ", ".join(map(str, [passes, model.num_updates, model.state.numdocs]))
+            msg = ", ".join(str(x) for x in [passes, model.num_updates, model.state.numdocs])
Contributor:

How about msg = "%d, %d, %d" % (passes, model.num_updates, model.state.numdocs)? That is simpler.

Contributor Author:

Agreed, fixed.

             self.assertAlmostEqual(final_rhot(), test_rhot, msg=msg)

         self.assertEqual(model.state.numdocs, len(corpus) * len(test_rhots))
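
A quick check of the reviewer's point that %-formatting builds the same message; the numbers are invented:

passes, num_updates, numdocs = 1, 100, 25
joined = ", ".join(str(x) for x in [passes, num_updates, numdocs])
formatted = "%d, %d, %d" % (passes, num_updates, numdocs)
assert joined == formatted == "1, 100, 25"
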
gensim/test/test_keywords.py (4 changes: 2 additions & 2 deletions)

@@ -35,7 +35,7 @@ def test_text_keywords(self):
         with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kw.txt"), mode="r") as f:
             kw = f.read().strip().split("\n")

-        self.assertEqual(set(map(str, generated_keywords)), set(map(str, kw)))
+        self.assertEqual({str(x) for x in generated_keywords}, {str(x) for x in kw})

     def test_text_keywords_words(self):
         pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

@@ -61,7 +61,7 @@ def test_text_keywords_pos(self):
         with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kwpos.txt"), mode="r") as f:
             kw = f.read().strip().split("\n")

-        self.assertEqual(set(map(str, generated_keywords_NNVBJJ)), set(map(str, kw)))
+        self.assertEqual({str(x) for x in generated_keywords_NNVBJJ}, {str(x) for x in kw})

     def test_text_summarization_raises_exception_on_short_input_text(self):
         pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
gensim/test/test_ldamodel.py (2 changes: 1 addition & 1 deletion)

@@ -355,7 +355,7 @@ def testPasses(self):
         for test_rhot in test_rhots:
             model.update(self.corpus)

-            msg = ", ".join(map(str, [passes, model.num_updates, model.state.numdocs]))
+            msg = ", ".join(str(x) for x in [passes, model.num_updates, model.state.numdocs])
             self.assertAlmostEqual(final_rhot(), test_rhot, msg=msg)

         self.assertEqual(model.state.numdocs, len(corpus) * len(test_rhots))
gensim/test/test_parsing.py (3 changes: 1 addition & 2 deletions)

@@ -36,8 +36,7 @@
 for many searching purposes, a little fuzziness would help. """


-dataset = map(lambda x: strip_punctuation2(x.lower()),
-              [doc1, doc2, doc3, doc4])
+dataset = [strip_punctuation2(x.lower()) for x in [doc1, doc2, doc3, doc4]]
 # doc1 and doc2 have class 0, doc3 and doc4 have class 1
 classes = np.array([[1, 0], [1, 0], [0, 1], [0, 1]])
gensim/test/test_sklearn_api.py (22 changes: 11 additions & 11 deletions)

@@ -190,7 +190,7 @@ def testPipeline(self):
         uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
         cache = pickle.loads(uncompressed_content)
         data = cache
-        id2word = Dictionary(map(lambda x: x.split(), data.data))
+        id2word = Dictionary([x.split() for x in data.data])
         corpus = [id2word.doc2bow(i.split()) for i in data.data]
         numpy.random.mtrand.RandomState(1)  # set seed for getting same result
         clf = linear_model.LogisticRegression(penalty='l2', C=0.1)

@@ -280,7 +280,7 @@ def testPipeline(self):
         uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
         cache = pickle.loads(uncompressed_content)
         data = cache
-        id2word = Dictionary(map(lambda x: x.split(), data.data))
+        id2word = Dictionary([x.split() for x in data.data])
         corpus = [id2word.doc2bow(i.split()) for i in data.data]
         numpy.random.mtrand.RandomState(1)  # set seed for getting same result
         clf = linear_model.LogisticRegression(penalty='l2', C=0.1)

@@ -363,7 +363,7 @@ def testPipeline(self):
         data = cache
         test_data = data.data[0:2]
         test_target = data.target[0:2]
-        id2word = Dictionary(map(lambda x: x.split(), test_data))
+        id2word = Dictionary([x.split() for x in data.data])
Contributor:

data.data needs to be changed to test_data.

Contributor Author:

Thanks, fixed.
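
A reduced sketch of the reviewer's point: the Dictionary should be built from the same slice that feeds doc2bow, otherwise it indexes documents that are never used. The sample documents are made up:

from gensim.corpora import Dictionary

data = ['human interface computer', 'graph minors survey', 'eps user interface']
test_data = data[0:2]
id2word = Dictionary([x.split() for x in test_data])  # the fix: build from test_data, not data
corpus = [id2word.doc2bow(i.split()) for i in test_data]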

         corpus = [id2word.doc2bow(i.split()) for i in test_data]
         model = LdaSeqTransformer(id2word=id2word, num_topics=2, time_slice=[1, 1, 1], initialize='gensim')
         clf = linear_model.LogisticRegression(penalty='l2', C=0.1)

@@ -433,7 +433,7 @@ def testPipeline(self):
         uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
         cache = pickle.loads(uncompressed_content)
         data = cache
-        id2word = Dictionary(map(lambda x: x.split(), data.data))
+        id2word = Dictionary([x.split() for x in data.data])
         corpus = [id2word.doc2bow(i.split()) for i in data.data]
         numpy.random.mtrand.RandomState(1)  # set seed for getting same result
         clf = linear_model.LogisticRegression(penalty='l2', C=0.1)

@@ -517,8 +517,8 @@ def testPipeline(self):
             ('calculus', 'mathematics'), ('mathematical', 'mathematics'), ('geometry', 'mathematics'), ('operations', 'mathematics'), ('curves', 'mathematics'),
             ('natural', 'physics'), ('nuclear', 'physics'), ('science', 'physics'), ('electromagnetism', 'physics'), ('natural', 'physics')
         ]
-        train_input = list(map(lambda x: x[0], train_data))
-        train_target = list(map(lambda x: class_dict[x[1]], train_data))
+        train_input = [x[0] for x in train_data]
+        train_target = [class_dict[x[1]] for x in train_data]

         clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
         clf.fit(model.transform(train_input), train_target)

@@ -682,8 +682,8 @@ def testPipeline(self):
             (['calculus', 'mathematical'], 'mathematics'), (['geometry', 'operations', 'curves'], 'mathematics'),
             (['natural', 'nuclear'], 'physics'), (['science', 'electromagnetism', 'natural'], 'physics')
         ]
-        train_input = list(map(lambda x: x[0], train_data))
-        train_target = list(map(lambda x: class_dict[x[1]], train_data))
+        train_input = [x[0] for x in train_data]
+        train_target = [class_dict[x[1]] for x in train_data]

         clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
         clf.fit(model.transform(train_input), train_target)

@@ -737,7 +737,7 @@ def testTransform(self):
         doc = ['computer system interface time computer system']
         bow_vec = self.model.transform(doc)[0]
         expected_values = [1, 1, 2, 2]  # comparing only the word-counts
-        values = list(map(lambda x: x[1], bow_vec))
+        values = [x[1] for x in bow_vec]
         self.assertEqual(sorted(expected_values), sorted(values))

     def testSetGetParams(self):

@@ -815,7 +815,7 @@ def testPipeline(self):
         uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
         cache = pickle.loads(uncompressed_content)
         data = cache
-        id2word = Dictionary(map(lambda x: x.split(), data.data))
+        id2word = Dictionary([x.split() for x in data.data])
         corpus = [id2word.doc2bow(i.split()) for i in data.data]
         tfidf_model = TfIdfTransformer()
         tfidf_model.fit(corpus)

@@ -881,7 +881,7 @@ def testPipeline(self):
         uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
         cache = pickle.loads(uncompressed_content)
         data = cache
-        id2word = Dictionary(map(lambda x: x.split(), data.data))
+        id2word = Dictionary([x.split() for x in data.data])
         corpus = [id2word.doc2bow(i.split()) for i in data.data]
         model = HdpTransformer(id2word=id2word)
         clf = linear_model.LogisticRegression(penalty='l2', C=0.1)