Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove ignore of E731 #1689

Merged
merged 11 commits into from
Nov 7, 2017
3 changes: 2 additions & 1 deletion gensim/matutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
from six.moves import xrange, zip as izip


blas = lambda name, ndarray: scipy.linalg.get_blas_funcs((name,), (ndarray,))[0]
def blas(name, ndarray):
return scipy.linalg.get_blas_funcs((name,), (ndarray,))[0]

logger = logging.getLogger(__name__)

Expand Down
9 changes: 6 additions & 3 deletions gensim/similarities/docsim.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,18 +336,21 @@ def __getitem__(self, query):
# the following uses a lot of lazy evaluation and (optionally) parallel
# processing, to improve query latency and minimize memory footprint.
offsets = numpy.cumsum([0] + [len(shard) for shard in self.shards])
convert = lambda doc, shard_no: [(doc_index + offsets[shard_no], sim) for doc_index, sim in doc]

def convert(shard_no, doc):
return [(doc_index + offsets[shard_no], sim) for doc_index, sim in doc]

is_corpus, query = utils.is_corpus(query)
is_corpus = is_corpus or hasattr(query, 'ndim') and query.ndim > 1 and query.shape[0] > 1
if not is_corpus:
# user asked for num_best most similar and query is a single doc
results = (convert(result, shard_no) for shard_no, result in enumerate(shard_results))
results = (convert(shard_no, result) for shard_no, result in enumerate(shard_results))
result = heapq.nlargest(self.num_best, itertools.chain(*results), key=lambda item: item[1])
else:
# the trickiest combination: returning num_best results when query was a corpus
results = []
for shard_no, result in enumerate(shard_results):
shard_result = [convert(doc, shard_no) for doc in result]
shard_result = [convert(shard_no, doc) for doc in result]
results.append(shard_result)
result = []
for parts in izip(*results):
Expand Down
16 changes: 5 additions & 11 deletions gensim/sklearn_api/atmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,17 +76,11 @@ def transform(self, author_names):
"This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
)

check = lambda x: [x] if not isinstance(x, list) else x
author_names = check(author_names)
X = [[] for _ in range(0, len(author_names))]

for k, v in enumerate(author_names):
transformed_author = self.gensim_model[v]
# returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
probs_author = matutils.sparse2full(transformed_author, self.num_topics)
X[k] = probs_author

return np.reshape(np.array(X), (len(author_names), self.num_topics))
if not isinstance(author_names, list):
author_names = [author_names]
# returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
topics = [matutils.sparse2full(self.gensim_model[author_name], self.num_topics) for author_name in author_names]
return np.reshape(np.array(topics), (len(author_names), self.num_topics))

def partial_fit(self, X, author2doc=None, doc2author=None):
"""
Expand Down
13 changes: 4 additions & 9 deletions gensim/sklearn_api/d2vmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,12 +87,7 @@ def transform(self, docs):
)

# The input as array of array
check = lambda x: [x] if isinstance(x[0], string_types) else x
docs = check(docs)
X = [[] for _ in range(0, len(docs))]

for k, v in enumerate(docs):
doc_vec = self.gensim_model.infer_vector(v)
X[k] = doc_vec

return np.reshape(np.array(X), (len(docs), self.gensim_model.vector_size))
if isinstance(docs[0], string_types):
docs = [docs]
vectors = [self.gensim_model.infer_vector(doc) for doc in docs]
return np.reshape(np.array(vectors), (len(docs), self.gensim_model.vector_size))
27 changes: 12 additions & 15 deletions gensim/sklearn_api/hdp.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,21 +77,18 @@ def transform(self, docs):
)

# The input as array of array
check = lambda x: [x] if isinstance(x[0], tuple) else x
docs = check(docs)
X = [[] for _ in range(0, len(docs))]

max_num_topics = 0
for k, v in enumerate(docs):
X[k] = self.gensim_model[v]
max_num_topics = max(max_num_topics, max(x[0] for x in X[k]) + 1)

for k, v in enumerate(X):
# returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
dense_vec = matutils.sparse2full(v, max_num_topics)
X[k] = dense_vec

return np.reshape(np.array(X), (len(docs), max_num_topics))
if isinstance(docs[0], tuple):
docs = [docs]
distribution, max_num_topics = [], 0

for doc in docs:
topicd = self.gensim_model[doc]
distribution.append(topicd)
max_num_topics = max(max_num_topics, max(topic[0] for topic in topicd) + 1)

# returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
distribution = [matutils.sparse2full(topicd, max_num_topics) for topicd in distribution]
return np.reshape(np.array(distribution), (len(docs), max_num_topics))

def partial_fit(self, X):
"""
Expand Down
15 changes: 5 additions & 10 deletions gensim/sklearn_api/ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,16 +83,11 @@ def transform(self, docs):
raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.")

# The input as array of array
check = lambda x: [x] if isinstance(x[0], tuple) else x
docs = check(docs)
X = [[] for _ in range(0, len(docs))]

for k, v in enumerate(docs):
doc_topics = self.gensim_model[v]
# returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
probs_docs = matutils.sparse2full(doc_topics, self.num_topics)
X[k] = probs_docs
return np.reshape(np.array(X), (len(docs), self.num_topics))
if isinstance(docs[0], tuple):
docs = [docs]
# returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
distribution = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs]
return np.reshape(np.array(distribution), (len(docs), self.num_topics))

def partial_fit(self, X):
"""
Expand Down
13 changes: 4 additions & 9 deletions gensim/sklearn_api/ldaseqmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,7 @@ def transform(self, docs):
raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.")

# The input as array of array
check = lambda x: [x] if isinstance(x[0], tuple) else x
docs = check(docs)
X = [[] for _ in range(0, len(docs))]

for k, v in enumerate(docs):
transformed_author = self.gensim_model[v]
X[k] = transformed_author

return np.reshape(np.array(X), (len(docs), self.num_topics))
if isinstance(docs[0], tuple):
docs = [docs]
proportions = [self.gensim_model[doc] for doc in docs]
return np.reshape(np.array(proportions), (len(docs), self.num_topics))
14 changes: 5 additions & 9 deletions gensim/sklearn_api/lsimodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,15 +67,11 @@ def transform(self, docs):
)

# The input as array of array
check = lambda x: [x] if isinstance(x[0], tuple) else x
docs = check(docs)
X = [[] for i in range(0, len(docs))]
for k, v in enumerate(docs):
doc_topics = self.gensim_model[v]
# returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
probs_docs = matutils.sparse2full(doc_topics, self.num_topics)
X[k] = probs_docs
return np.reshape(np.array(X), (len(docs), self.num_topics))
if isinstance(docs[0], tuple):
docs = [docs]
# returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
distribution = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs]
return np.reshape(np.array(distribution), (len(docs), self.num_topics))

def partial_fit(self, X):
"""
Expand Down
12 changes: 3 additions & 9 deletions gensim/sklearn_api/phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,9 @@ def transform(self, docs):
raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.")

# input as python lists
check = lambda x: [x] if isinstance(x[0], string_types) else x
docs = check(docs)
X = [[] for _ in range(0, len(docs))]

for k, v in enumerate(docs):
phrase_tokens = self.gensim_model[v]
X[k] = phrase_tokens

return X
if isinstance(docs[0], string_types):
docs = [docs]
return [self.gensim_model[doc] for doc in docs]

def partial_fit(self, X):
if self.gensim_model is None:
Expand Down
16 changes: 5 additions & 11 deletions gensim/sklearn_api/rpmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,8 @@ def transform(self, docs):
)

# The input as array of array
check = lambda x: [x] if isinstance(x[0], tuple) else x
docs = check(docs)
X = [[] for _ in range(0, len(docs))]

for k, v in enumerate(docs):
transformed_doc = self.gensim_model[v]
# returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
probs_docs = matutils.sparse2full(transformed_doc, self.num_topics)
X[k] = probs_docs

return np.reshape(np.array(X), (len(docs), self.num_topics))
if isinstance(docs[0], tuple):
docs = [docs]
# returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
presentation = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs]
return np.reshape(np.array(presentation), (len(docs), self.num_topics))
14 changes: 4 additions & 10 deletions gensim/sklearn_api/text2bow.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,10 @@ def transform(self, docs):
)

# input as python lists
check = lambda x: [x] if isinstance(x, string_types) else x
docs = check(docs)
tokenized_docs = [list(self.tokenizer(x)) for x in docs]
X = [[] for _ in range(0, len(tokenized_docs))]

for k, v in enumerate(tokenized_docs):
bow_val = self.gensim_model.doc2bow(v)
X[k] = bow_val

return X
if isinstance(docs, string_types):
docs = [docs]
tokenized_docs = (list(self.tokenizer(doc)) for doc in docs)
return [self.gensim_model.doc2bow(doc) for doc in tokenized_docs]

def partial_fit(self, X):
if self.gensim_model is None:
Expand Down
12 changes: 3 additions & 9 deletions gensim/sklearn_api/tfidf.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,6 @@ def transform(self, docs):
)

# input as python lists
check = lambda x: [x] if isinstance(x[0], tuple) else x
docs = check(docs)
X = [[] for _ in range(0, len(docs))]

for k, v in enumerate(docs):
transformed_doc = self.gensim_model[v]
X[k] = transformed_doc

return X
if isinstance(docs[0], tuple):
docs = [docs]
return [self.gensim_model[doc] for doc in docs]
13 changes: 4 additions & 9 deletions gensim/sklearn_api/w2vmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,15 +75,10 @@ def transform(self, words):
)

# The input as array of array
check = lambda x: [x] if isinstance(x, six.string_types) else x
words = check(words)
X = [[] for _ in range(0, len(words))]

for k, v in enumerate(words):
word_vec = self.gensim_model[v]
X[k] = word_vec

return np.reshape(np.array(X), (len(words), self.size))
if isinstance(words, six.string_types):
words = [words]
vectors = [self.gensim_model[word] for word in words]
return np.reshape(np.array(vectors), (len(words), self.size))

def partial_fit(self, X):
raise NotImplementedError(
Expand Down
19 changes: 12 additions & 7 deletions gensim/test/test_atmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,10 @@
# Test that models are compatible across versions, as done in LdaModel.

module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)


def datapath(fname):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please create a utils file and move this function there (replace the local definitions with an import everywhere).

return os.path.join(module_path, 'test_data', fname)

# set up vars used in testing ("Deerwester" from the web tutorial)
texts = [
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This text is used in many places; maybe move it to utils too?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm,

  1. It's data, not utils/functions.
  2. Maybe in another PR?

Copy link
Contributor

@menshikh-iv menshikh-iv Nov 6, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. Yeah, but it's a "common resource", so I think that's fine.
  2. No need to create a separate PR for this change.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. Ok, I'll fix it.
  2. A small PR is easier to review and faster to merge.

Expand Down Expand Up @@ -475,24 +478,26 @@ def testPasses(self):
# long message includes the original error message with a custom one
self.longMessage = True
# construct what we expect when passes aren't involved
test_rhots = list()
test_rhots = []
model = self.class_(id2word=dictionary, chunksize=1, num_topics=2)
final_rhot = lambda: pow(model.offset + (1 * model.num_updates) / model.chunksize, -model.decay)

def final_rhot(model):
return pow(model.offset + (1 * model.num_updates) / model.chunksize, -model.decay)

# generate 5 updates to test rhot on
for x in range(5):
for _ in range(5):
model.update(corpus, author2doc)
test_rhots.append(final_rhot())
test_rhots.append(final_rhot(model))

for passes in [1, 5, 10, 50, 100]:
model = self.class_(id2word=dictionary, chunksize=1, num_topics=2, passes=passes)
self.assertEqual(final_rhot(), 1.0)
self.assertEqual(final_rhot(model), 1.0)
# make sure the rhot matches the test after each update
for test_rhot in test_rhots:
model.update(corpus, author2doc)

msg = "{}, {}, {}".format(passes, model.num_updates, model.state.numdocs)
self.assertAlmostEqual(final_rhot(), test_rhot, msg=msg)
self.assertAlmostEqual(final_rhot(model), test_rhot, msg=msg)

self.assertEqual(model.state.numdocs, len(corpus) * len(test_rhots))
self.assertEqual(model.num_updates, len(corpus) * len(test_rhots))
Expand Down
16 changes: 7 additions & 9 deletions gensim/test/test_coherencemodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import unittest
from unittest import SkipTest
import multiprocessing as mp
from functools import partial

import numpy as np
from gensim.corpora.dictionary import Dictionary
Expand Down Expand Up @@ -215,20 +216,17 @@ def testErrors(self):
)

def testProcesses(self):
cpu = mp.cpu_count()
get_model = lambda p: CoherenceModel(
topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass', processes=p,
get_model = partial(CoherenceModel,
topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass'
)

model = CoherenceModel(
topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass',
)
self.assertEqual(model.processes, cpu - 1)
model, used_cpus = get_model(), mp.cpu_count() - 1
self.assertEqual(model.processes, used_cpus)
for p in range(-2, 1):
self.assertEqual(get_model(p).processes, cpu - 1)
self.assertEqual(get_model(processes=p).processes, used_cpus)

for p in range(1, 4):
self.assertEqual(get_model(p).processes, p)
self.assertEqual(get_model(processes=p).processes, p)

def testPersistence(self):
fname = testfile()
Expand Down
5 changes: 4 additions & 1 deletion gensim/test/test_doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@
from gensim.models import doc2vec, keyedvectors

module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)


def datapath(fname):
return os.path.join(module_path, 'test_data', fname)


class DocsLeeCorpus(object):
Expand Down
5 changes: 4 additions & 1 deletion gensim/test/test_dtm.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@

# needed because sample data files are located in the same folder
module_path = os.path.dirname(__file__)
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)


def datapath(fname):
return os.path.join(module_path, 'test_data', fname)


class TestDtmModel(unittest.TestCase):
Expand Down
5 changes: 4 additions & 1 deletion gensim/test/test_fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,15 @@
from gensim.models.wrappers.fasttext import FastText as FT_wrapper

module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
logger = logging.getLogger(__name__)

IS_WIN32 = (os.name == "nt") and (struct.calcsize('P') * 8 == 32)


def datapath(fname):
return os.path.join(module_path, 'test_data', fname)


class LeeCorpus(object):
def __iter__(self):
with open(datapath('lee_background.cor')) as f:
Expand Down
Loading