
Refactor code with PEP8 and additional limitations. Fix #1521 #1569

Merged (20 commits) on Sep 8, 2017.

Changes from 1 commit
gensim/corpora/csvcorpus.py (2 changes: 1 addition & 1 deletion)

@@ -60,7 +60,7 @@ def __iter__(self):
         for line_no, line in enumerate(reader):
             if self.labels:
                 line.pop(0)  # ignore the first column = class label
-            yield list(enumerate(map(float, line)))
+            yield list(enumerate(float(x) for x in line))

         self.length = line_no + 1  # store the total number of CSV rows = documents
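
An aside for context (not part of the diff): these map rewrites matter for Python 2/3 compatibility as much as for PEP8. On Python 2, map returns a list; on Python 3, a lazy iterator; a comprehension yields the same concrete list on both. A minimal sketch with made-up data:

line = ['1.0', '2.5', '3.3']
lazy = map(float, line)  # a list on Python 2, a lazy map object on Python 3
floats = [float(x) for x in line]  # a concrete list on both versions
assert list(enumerate(floats)) == [(0, 1.0), (1, 2.5), (2, 3.3)]
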
gensim/corpora/lowcorpus.py (5 changes: 2 additions & 3 deletions)

@@ -118,12 +118,11 @@ def line2doc(self, line):
                     use_words.append(word)
                     marker.add(word)
             # construct a list of (wordIndex, wordFrequency) 2-tuples
-            doc = list(zip(map(self.word2id.get, use_words),
-                           map(words.count, use_words)))
+            doc = [(self.word2id.get(w), words.count(w)) for w in use_words]
         else:
             uniq_words = set(words)
             # construct a list of (word, wordFrequency) 2-tuples
-            doc = list(zip(uniq_words, map(words.count, uniq_words)))
+            doc = [(w, words.count(w)) for w in uniq_words]

         # return the document, then forget it and move on to the next one
         # note that this way, only one doc is stored in memory at a time, not the whole corpus
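
A quick equivalence check for the lowcorpus rewrite; words and word2id are toy stand-ins for the real attributes:

word2id = {'human': 0, 'interface': 1}  # hypothetical mapping
words = ['human', 'interface', 'human']
use_words = ['human', 'interface']
old = list(zip(map(word2id.get, use_words), map(words.count, use_words)))
new = [(word2id.get(w), words.count(w)) for w in use_words]
assert old == new == [(0, 2), (1, 1)]
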
gensim/matutils.py (2 changes: 1 addition & 1 deletion)

@@ -821,7 +821,7 @@ def __init__(self, input, transposed=True):
         for lineno, line in enumerate(lines):
             line = utils.to_unicode(line)
             if not line.startswith('%'):
-                self.num_docs, self.num_terms, self.num_nnz = map(int, line.split())
+                self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split())
                 if not self.transposed:
                     self.num_docs, self.num_terms = self.num_terms, self.num_docs
                 break
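
A sketch of the unpacking pattern adopted here; the sample header imitates a Matrix Market size line (docs, terms, non-zeros) and is invented:

header = '100 50 2500'  # hypothetical MM size line
num_docs, num_terms, num_nnz = (int(x) for x in header.split())
assert (num_docs, num_terms, num_nnz) == (100, 50, 2500)
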
gensim/models/keyedvectors.py (4 changes: 2 additions & 2 deletions)

@@ -205,7 +205,7 @@ def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8',
         logger.info("loading projection weights from %s", fname)
         with utils.smart_open(fname) as fin:
             header = utils.to_unicode(fin.readline(), encoding=encoding)
-            vocab_size, vector_size = map(int, header.split())  # throws for invalid file format
+            vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format
             if limit:
                 vocab_size = min(vocab_size, limit)
             result = cls()

@@ -254,7 +254,7 @@ def add_word(word, weights):
                 parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
                 if len(parts) != vector_size + 1:
                     raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
-                word, weights = parts[0], list(map(REAL, parts[1:]))
+                word, weights = parts[0], [REAL(x) for x in parts[1:]]
                 add_word(word, weights)
         if result.syn0.shape[0] != len(result.vocab):
             logger.info(
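
A self-contained sketch of the text-format parsing above; the vector line is made up, and numpy.float32 stands in for gensim's REAL dtype:

import numpy as np

REAL = np.float32  # stand-in for gensim's REAL dtype
vector_size = 3
line = 'king 0.125 -0.5 0.25'  # hypothetical word2vec text-format line
parts = line.rstrip().split(' ')
assert len(parts) == vector_size + 1
word, weights = parts[0], [REAL(x) for x in parts[1:]]
assert word == 'king' and weights == [0.125, -0.5, 0.25]
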
gensim/models/ldaseqmodel.py (4 changes: 2 additions & 2 deletions)

@@ -596,7 +596,7 @@ def fit_sslm(self, sstats):
         totals = np.zeros(sstats.shape[1])

         # computing variance, fwd_variance
-        self.variance, self.fwd_variance = map(np.array, list(zip(*[self.compute_post_variance(w, self.chain_variance) for w in range(0, W)])))
+        self.variance, self.fwd_variance = (np.array(x) for x in list(zip(*[self.compute_post_variance(w, self.chain_variance) for w in range(0, W)])))

         # column sum of sstats
         totals = sstats.sum(axis=0)

@@ -643,7 +643,7 @@ def compute_bound(self, sstats, totals):
         chain_variance = self.chain_variance
         # computing mean, fwd_mean
-        self.mean, self.fwd_mean = map(np.array, (zip(*[self.compute_post_mean(w, self.chain_variance) for w in range(0, W)])))
+        self.mean, self.fwd_mean = (np.array(x) for x in zip(*[self.compute_post_mean(w, self.chain_variance) for w in range(0, W)]))
         self.zeta = self.update_zeta()

         for w in range(0, W):
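
Both rewritten lines use the same transpose idiom: zip(*...) turns a list of (a, b) pairs into an a-sequence and a b-sequence, which are then converted to arrays. A reduced sketch with invented pairs:

import numpy as np

pairs = [(1.0, 10.0), (2.0, 20.0), (3.0, 30.0)]  # e.g. (variance, fwd_variance) per word
variance, fwd_variance = (np.array(x) for x in zip(*pairs))
assert variance.tolist() == [1.0, 2.0, 3.0]
assert fwd_variance.tolist() == [10.0, 20.0, 30.0]
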
gensim/models/word2vec.py (4 changes: 2 additions & 2 deletions)

@@ -1195,7 +1195,7 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='utf8',
         logger.info("loading projection weights from %s" % (fname))
         with utils.smart_open(fname) as fin:
             header = utils.to_unicode(fin.readline(), encoding=encoding)
-            vocab_size, vector_size = map(int, header.split())  # throws for invalid file format
+            vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format
             if not vector_size == self.vector_size:
                 raise ValueError("incompatible vector size %d in file %s" % (vector_size, fname))
             # TOCONSIDER: maybe mismatched vectors still useful enough to merge (truncating/padding)?

@@ -1221,7 +1221,7 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='utf8',
                 parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
                 if len(parts) != vector_size + 1:
                     raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
-                word, weights = parts[0], list(map(REAL, parts[1:]))
+                word, weights = parts[0], [REAL(x) for x in parts[1:]]
                 if word in self.wv.vocab:
                     overlap_count += 1
                     self.wv.syn0[self.wv.vocab[word].index] = weights
gensim/models/wrappers/ldamallet.py (9 changes: 2 additions & 7 deletions)

@@ -305,14 +305,9 @@ def read_doctopics(self, fname, eps=1e-6, renorm=True):
             # the MALLET doctopic format changed in 2.0.8 to exclude the id,
             # this handles the file differently dependent on the pattern
             if len(parts) == 2 * self.num_topics:
-                doc = [(id_, weight)
-                       for id_, weight in zip(map(int, parts[::2]),
-                                              map(float, parts[1::2]))
-                       if abs(weight) > eps]
+                doc = [(id_, weight) for id_, weight in ((int(x), float(y)) for (x, y) in zip(parts[::2], parts[1::2])) if abs(weight) > eps]
             elif len(parts) == self.num_topics and mallet_version != '2.0.7':
-                doc = [(id_, weight)
-                       for id_, weight in enumerate(map(float, parts))
-                       if abs(weight) > eps]
+                doc = [(id_, weight) for id_, weight in enumerate(float(x) for x in parts) if abs(weight) > eps]
Contributor:

@menshikh-iv
Line 308:

doc = [(int(id_), float(weight))
       for id_, weight in zip(*[iter(parts)] * 2)
       if abs(float(weight)) > eps]

Line 310:

doc = [(id_, float(weight))
       for id_, weight in enumerate(parts)
       if abs(float(weight)) > eps]

How about the code above? It makes fewer passes over the data.
Contributor Author:

Ok, accepted.

Owner (@piskvorky, Sep 7, 2017):

@zsef123 Both examples above are incorrect. The correct formatting uses a hanging indent:

doc = [
    (id_, float(weight))
    for id_, weight in enumerate(parts)
    if abs(float(weight)) > eps
]

Actually, this line is not too long, so the simple:

doc = [(id_, float(weight)) for id_, weight in enumerate(parts) if abs(float(weight)) > eps]

would work too.

Contributor Author:

Already done, as in your last variant @piskvorky.

             else:
                 if mallet_version == "2.0.7":
                     """
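
For readers unfamiliar with the reviewer's zip(*[iter(parts)] * 2) suggestion, a standalone sketch with made-up tokens; it pairs consecutive items in a single pass instead of building two slices:

parts = ['0', '0.5', '1', '0.0', '2', '0.4']  # alternating topic-id / weight tokens
eps = 1e-6
pairs = list(zip(*[iter(parts)] * 2))  # one shared iterator, consumed two items at a time
assert pairs == [('0', '0.5'), ('1', '0.0'), ('2', '0.4')]
doc = [(int(id_), float(weight)) for id_, weight in pairs if abs(float(weight)) > eps]
assert doc == [(0, 0.5), (2, 0.4)]
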
gensim/parsing/porter.py (4 changes: 2 additions & 2 deletions)

@@ -363,10 +363,10 @@ def stem(self, w):
         return self.b[:self.k + 1]

     def stem_sentence(self, txt):
-        return " ".join(map(self.stem, txt.split()))
+        return " ".join(self.stem(x) for x in txt.split())

     def stem_documents(self, docs):
-        return map(self.stem_sentence, docs)
+        return [self.stem_sentence(x) for x in docs]


 if __name__ == '__main__':
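
A hedged usage sketch for the two rewritten helpers. One behavioral nuance: on Python 3 the old map-based stem_documents returned a lazy iterator, while the comprehension returns a list on both versions. The stems in the comments are typical Porter outputs, shown as examples rather than guarantees:

from gensim.parsing.porter import PorterStemmer

p = PorterStemmer()
print(p.stem_sentence('cats running quickly'))  # e.g. 'cat run quickli'
print(p.stem_documents(['cats running', 'dogs walked']))  # a plain list on both Python 2 and 3
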
gensim/scripts/word2vec2tensor.py (2 changes: 1 addition & 1 deletion)

@@ -52,7 +52,7 @@ def word2vec2tensor(word2vec_model_path, tensor_filename, binary=False):
     with open(outfiletsvmeta, 'w+') as file_metadata:
         for word in model.index2word:
             file_metadata.write(gensim.utils.to_utf8(word) + gensim.utils.to_utf8('\n'))
-            vector_row = '\t'.join(map(str, model[word]))
+            vector_row = '\t'.join(str(x) for x in model[word])
             file_vector.write(vector_row + '\n')

     logger.info("2D tensor file saved to %s" % outfiletsv)
gensim/sklearn_api/hdp.py (2 changes: 1 addition & 1 deletion)

@@ -82,7 +82,7 @@ def transform(self, docs):
         max_num_topics = 0
         for k, v in enumerate(docs):
             X[k] = self.gensim_model[v]
-            max_num_topics = max(max_num_topics, max(list(map(lambda x: x[0], X[k]))) + 1)
+            max_num_topics = max(max_num_topics, max(x[0] for x in X[k]) + 1)

         for k, v in enumerate(X):
             # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
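
A reduced sketch of the max-topic-id computation above, with a made-up sparse document of (topic_id, probability) pairs:

doc = [(0, 0.6), (3, 0.3), (7, 0.1)]  # hypothetical sparse topic vector
max_num_topics = max(x[0] for x in doc) + 1
assert max_num_topics == 8
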
gensim/sklearn_api/text2bow.py (6 changes: 3 additions & 3 deletions)

@@ -34,7 +34,7 @@ def fit(self, X, y=None):
         """
         Fit the model according to the given training data.
         """
-        tokenized_docs = list(map(lambda x: list(self.tokenizer(x)), X))
+        tokenized_docs = [list(self.tokenizer(x)) for x in X]
         self.gensim_model = Dictionary(documents=tokenized_docs, prune_at=self.prune_at)
         return self

@@ -48,7 +48,7 @@ def transform(self, docs):
         # input as python lists
         check = lambda x: [x] if isinstance(x, string_types) else x
         docs = check(docs)
-        tokenized_docs = list(map(lambda x: list(self.tokenizer(x)), docs))
+        tokenized_docs = [list(self.tokenizer(x)) for x in docs]
         X = [[] for _ in range(0, len(tokenized_docs))]

         for k, v in enumerate(tokenized_docs):

@@ -61,6 +61,6 @@ def partial_fit(self, X):
         if self.gensim_model is None:
             self.gensim_model = Dictionary(prune_at=self.prune_at)

-        tokenized_docs = list(map(lambda x: list(self.tokenizer(x)), X))
+        tokenized_docs = [list(self.tokenizer(x)) for x in X]
         self.gensim_model.add_documents(tokenized_docs)
         return self
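
A quick check that the comprehension matches the old list(map(lambda ...)) form; tokenize is a stand-in for the transformer's self.tokenizer, assumed to return an iterator (hence the list(...) wrapper):

def tokenize(s):  # stand-in for self.tokenizer
    return iter(s.lower().split())

X = ['Human machine interface', 'Graph minors survey']
old = list(map(lambda x: list(tokenize(x)), X))
new = [list(tokenize(x)) for x in X]
assert old == new == [['human', 'machine', 'interface'], ['graph', 'minors', 'survey']]
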
gensim/summarization/bm25.py (4 changes: 2 additions & 2 deletions)

@@ -18,7 +18,7 @@ class BM25(object):
     def __init__(self, corpus):
         self.corpus_size = len(corpus)
-        self.avgdl = sum(map(lambda x: float(len(x)), corpus)) / self.corpus_size
+        self.avgdl = sum(float(len(x)) for x in corpus) / self.corpus_size
         self.corpus = corpus
         self.f = []
         self.df = {}

@@ -62,7 +62,7 @@ def get_scores(self, document, average_idf):
 def get_bm25_weights(corpus):
     bm25 = BM25(corpus)
-    average_idf = sum(map(lambda k: float(bm25.idf[k]), bm25.idf.keys())) / len(bm25.idf.keys())
+    average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)

     weights = []
     for doc in corpus:
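
A toy check that iterating values() gives the same average as mapping over keys(), which is all the get_bm25_weights change does:

idf = {'hello': 1.0, 'world': 3.0}  # made-up idf table
old = sum(map(lambda k: float(idf[k]), idf.keys())) / len(idf.keys())
new = sum(float(val) for val in idf.values()) / len(idf)
assert old == new == 2.0
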
gensim/test/test_atmodel.py (2 changes: 1 addition & 1 deletion)

@@ -436,7 +436,7 @@ def testPasses(self):
         for test_rhot in test_rhots:
             model.update(corpus, author2doc)

-            msg = ", ".join(map(str, [passes, model.num_updates, model.state.numdocs]))
+            msg = ", ".join(str(x) for x in [passes, model.num_updates, model.state.numdocs])
Contributor:

How about msg = "%d, %d, %d" % (passes, model.num_updates, model.state.numdocs)? That is simpler.

Contributor Author:

Agreed, fixed.

             self.assertAlmostEqual(final_rhot(), test_rhot, msg=msg)

         self.assertEqual(model.state.numdocs, len(corpus) * len(test_rhots))
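
A quick check of the reviewer's point that %-formatting builds the same message; the numbers are invented:

passes, num_updates, numdocs = 1, 100, 25
joined = ", ".join(str(x) for x in [passes, num_updates, numdocs])
formatted = "%d, %d, %d" % (passes, num_updates, numdocs)
assert joined == formatted == "1, 100, 25"
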
gensim/test/test_keywords.py (4 changes: 2 additions & 2 deletions)

@@ -35,7 +35,7 @@ def test_text_keywords(self):
         with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kw.txt"), mode="r") as f:
             kw = f.read().strip().split("\n")

-        self.assertEqual(set(map(str, generated_keywords)), set(map(str, kw)))
+        self.assertEqual({str(x) for x in generated_keywords}, {str(x) for x in kw})

     def test_text_keywords_words(self):
         pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

@@ -61,7 +61,7 @@ def test_text_keywords_pos(self):
         with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kwpos.txt"), mode="r") as f:
             kw = f.read().strip().split("\n")

-        self.assertEqual(set(map(str, generated_keywords_NNVBJJ)), set(map(str, kw)))
+        self.assertEqual({str(x) for x in generated_keywords_NNVBJJ}, {str(x) for x in kw})

     def test_text_summarization_raises_exception_on_short_input_text(self):
         pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
gensim/test/test_ldamodel.py (2 changes: 1 addition & 1 deletion)

@@ -355,7 +355,7 @@ def testPasses(self):
         for test_rhot in test_rhots:
             model.update(self.corpus)

-            msg = ", ".join(map(str, [passes, model.num_updates, model.state.numdocs]))
+            msg = ", ".join(str(x) for x in [passes, model.num_updates, model.state.numdocs])
             self.assertAlmostEqual(final_rhot(), test_rhot, msg=msg)

         self.assertEqual(model.state.numdocs, len(corpus) * len(test_rhots))
gensim/test/test_parsing.py (3 changes: 1 addition & 2 deletions)

@@ -36,8 +36,7 @@
 for many searching purposes, a little fuzziness would help. """


-dataset = map(lambda x: strip_punctuation2(x.lower()),
-              [doc1, doc2, doc3, doc4])
+dataset = [strip_punctuation2(x.lower()) for x in [doc1, doc2, doc3, doc4]]
 # doc1 and doc2 have class 0, doc3 and doc4 have class 1
 classes = np.array([[1, 0], [1, 0], [0, 1], [0, 1]])
gensim/test/test_sklearn_api.py (22 changes: 11 additions & 11 deletions)

@@ -190,7 +190,7 @@ def testPipeline(self):
         uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
         cache = pickle.loads(uncompressed_content)
         data = cache
-        id2word = Dictionary(map(lambda x: x.split(), data.data))
+        id2word = Dictionary([x.split() for x in data.data])
         corpus = [id2word.doc2bow(i.split()) for i in data.data]
         numpy.random.mtrand.RandomState(1)  # set seed for getting same result
         clf = linear_model.LogisticRegression(penalty='l2', C=0.1)

@@ -280,7 +280,7 @@ def testPipeline(self):
         uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
         cache = pickle.loads(uncompressed_content)
         data = cache
-        id2word = Dictionary(map(lambda x: x.split(), data.data))
+        id2word = Dictionary([x.split() for x in data.data])
         corpus = [id2word.doc2bow(i.split()) for i in data.data]
         numpy.random.mtrand.RandomState(1)  # set seed for getting same result
         clf = linear_model.LogisticRegression(penalty='l2', C=0.1)

@@ -363,7 +363,7 @@ def testPipeline(self):
         data = cache
         test_data = data.data[0:2]
         test_target = data.target[0:2]
-        id2word = Dictionary(map(lambda x: x.split(), test_data))
+        id2word = Dictionary([x.split() for x in data.data])
Contributor:

data.data needs to be changed to test_data.

Contributor Author:

Thanks, fixed.
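
A reduced sketch of the reviewer's point: the Dictionary should be built from the same slice that feeds doc2bow, otherwise it indexes documents that are never used. The sample documents are made up:

from gensim.corpora import Dictionary

data = ['human interface computer', 'graph minors survey', 'eps user interface']
test_data = data[0:2]
id2word = Dictionary([x.split() for x in test_data])  # the fix: build from test_data, not data
corpus = [id2word.doc2bow(i.split()) for i in test_data]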

         corpus = [id2word.doc2bow(i.split()) for i in test_data]
         model = LdaSeqTransformer(id2word=id2word, num_topics=2, time_slice=[1, 1, 1], initialize='gensim')
         clf = linear_model.LogisticRegression(penalty='l2', C=0.1)

@@ -433,7 +433,7 @@ def testPipeline(self):
         uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
         cache = pickle.loads(uncompressed_content)
         data = cache
-        id2word = Dictionary(map(lambda x: x.split(), data.data))
+        id2word = Dictionary([x.split() for x in data.data])
         corpus = [id2word.doc2bow(i.split()) for i in data.data]
         numpy.random.mtrand.RandomState(1)  # set seed for getting same result
         clf = linear_model.LogisticRegression(penalty='l2', C=0.1)

@@ -517,8 +517,8 @@ def testPipeline(self):
             ('calculus', 'mathematics'), ('mathematical', 'mathematics'), ('geometry', 'mathematics'), ('operations', 'mathematics'), ('curves', 'mathematics'),
             ('natural', 'physics'), ('nuclear', 'physics'), ('science', 'physics'), ('electromagnetism', 'physics'), ('natural', 'physics')
         ]
-        train_input = list(map(lambda x: x[0], train_data))
-        train_target = list(map(lambda x: class_dict[x[1]], train_data))
+        train_input = [x[0] for x in train_data]
+        train_target = [class_dict[x[1]] for x in train_data]

         clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
         clf.fit(model.transform(train_input), train_target)

@@ -682,8 +682,8 @@ def testPipeline(self):
             (['calculus', 'mathematical'], 'mathematics'), (['geometry', 'operations', 'curves'], 'mathematics'),
             (['natural', 'nuclear'], 'physics'), (['science', 'electromagnetism', 'natural'], 'physics')
         ]
-        train_input = list(map(lambda x: x[0], train_data))
-        train_target = list(map(lambda x: class_dict[x[1]], train_data))
+        train_input = [x[0] for x in train_data]
+        train_target = [class_dict[x[1]] for x in train_data]

         clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
         clf.fit(model.transform(train_input), train_target)

@@ -737,7 +737,7 @@ def testTransform(self):
         doc = ['computer system interface time computer system']
         bow_vec = self.model.transform(doc)[0]
         expected_values = [1, 1, 2, 2]  # comparing only the word-counts
-        values = list(map(lambda x: x[1], bow_vec))
+        values = [x[1] for x in bow_vec]
         self.assertEqual(sorted(expected_values), sorted(values))

     def testSetGetParams(self):

@@ -815,7 +815,7 @@ def testPipeline(self):
         uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
         cache = pickle.loads(uncompressed_content)
         data = cache
-        id2word = Dictionary(map(lambda x: x.split(), data.data))
+        id2word = Dictionary([x.split() for x in data.data])
         corpus = [id2word.doc2bow(i.split()) for i in data.data]
         tfidf_model = TfIdfTransformer()
         tfidf_model.fit(corpus)

@@ -881,7 +881,7 @@ def testPipeline(self):
         uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
         cache = pickle.loads(uncompressed_content)
         data = cache
-        id2word = Dictionary(map(lambda x: x.split(), data.data))
+        id2word = Dictionary([x.split() for x in data.data])
         corpus = [id2word.doc2bow(i.split()) for i in data.data]
         model = HdpTransformer(id2word=id2word)
         clf = linear_model.LogisticRegression(penalty='l2', C=0.1)