diff --git a/gensim/matutils.py b/gensim/matutils.py index 85118f892c..38e2431caf 100644 --- a/gensim/matutils.py +++ b/gensim/matutils.py @@ -28,7 +28,9 @@ from six.moves import xrange, zip as izip -blas = lambda name, ndarray: scipy.linalg.get_blas_funcs((name,), (ndarray,))[0] +def blas(name, ndarray): + return scipy.linalg.get_blas_funcs((name,), (ndarray,))[0] + logger = logging.getLogger(__name__) diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py index 5e93c6f8cf..20c9f8518c 100755 --- a/gensim/similarities/docsim.py +++ b/gensim/similarities/docsim.py @@ -336,18 +336,21 @@ def __getitem__(self, query): # the following uses a lot of lazy evaluation and (optionally) parallel # processing, to improve query latency and minimize memory footprint. offsets = numpy.cumsum([0] + [len(shard) for shard in self.shards]) - convert = lambda doc, shard_no: [(doc_index + offsets[shard_no], sim) for doc_index, sim in doc] + + def convert(shard_no, doc): + return [(doc_index + offsets[shard_no], sim) for doc_index, sim in doc] + is_corpus, query = utils.is_corpus(query) is_corpus = is_corpus or hasattr(query, 'ndim') and query.ndim > 1 and query.shape[0] > 1 if not is_corpus: # user asked for num_best most similar and query is a single doc - results = (convert(result, shard_no) for shard_no, result in enumerate(shard_results)) + results = (convert(shard_no, result) for shard_no, result in enumerate(shard_results)) result = heapq.nlargest(self.num_best, itertools.chain(*results), key=lambda item: item[1]) else: # the trickiest combination: returning num_best results when query was a corpus results = [] for shard_no, result in enumerate(shard_results): - shard_result = [convert(doc, shard_no) for doc in result] + shard_result = [convert(shard_no, doc) for doc in result] results.append(shard_result) result = [] for parts in izip(*results): diff --git a/gensim/sklearn_api/atmodel.py b/gensim/sklearn_api/atmodel.py index d3128243a6..8845bdd816 100644 --- 
a/gensim/sklearn_api/atmodel.py +++ b/gensim/sklearn_api/atmodel.py @@ -76,17 +76,11 @@ def transform(self, author_names): "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." ) - check = lambda x: [x] if not isinstance(x, list) else x - author_names = check(author_names) - X = [[] for _ in range(0, len(author_names))] - - for k, v in enumerate(author_names): - transformed_author = self.gensim_model[v] - # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future - probs_author = matutils.sparse2full(transformed_author, self.num_topics) - X[k] = probs_author - - return np.reshape(np.array(X), (len(author_names), self.num_topics)) + if not isinstance(author_names, list): + author_names = [author_names] + # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future + topics = [matutils.sparse2full(self.gensim_model[author_name], self.num_topics) for author_name in author_names] + return np.reshape(np.array(topics), (len(author_names), self.num_topics)) def partial_fit(self, X, author2doc=None, doc2author=None): """ diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index 14163f1600..245231ad45 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -87,12 +87,7 @@ def transform(self, docs): ) # The input as array of array - check = lambda x: [x] if isinstance(x[0], string_types) else x - docs = check(docs) - X = [[] for _ in range(0, len(docs))] - - for k, v in enumerate(docs): - doc_vec = self.gensim_model.infer_vector(v) - X[k] = doc_vec - - return np.reshape(np.array(X), (len(docs), self.gensim_model.vector_size)) + if isinstance(docs[0], string_types): + docs = [docs] + vectors = [self.gensim_model.infer_vector(doc) for doc in docs] + return np.reshape(np.array(vectors), (len(docs), self.gensim_model.vector_size)) diff --git 
a/gensim/sklearn_api/hdp.py b/gensim/sklearn_api/hdp.py index d1dcec01a5..acadd6f459 100644 --- a/gensim/sklearn_api/hdp.py +++ b/gensim/sklearn_api/hdp.py @@ -77,21 +77,18 @@ def transform(self, docs): ) # The input as array of array - check = lambda x: [x] if isinstance(x[0], tuple) else x - docs = check(docs) - X = [[] for _ in range(0, len(docs))] - - max_num_topics = 0 - for k, v in enumerate(docs): - X[k] = self.gensim_model[v] - max_num_topics = max(max_num_topics, max(x[0] for x in X[k]) + 1) - - for k, v in enumerate(X): - # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future - dense_vec = matutils.sparse2full(v, max_num_topics) - X[k] = dense_vec - - return np.reshape(np.array(X), (len(docs), max_num_topics)) + if isinstance(docs[0], tuple): + docs = [docs] + distribution, max_num_topics = [], 0 + + for doc in docs: + topicd = self.gensim_model[doc] + distribution.append(topicd) + max_num_topics = max(max_num_topics, max(topic[0] for topic in topicd) + 1) + + # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future + distribution = [matutils.sparse2full(t, max_num_topics) for t in distribution] + return np.reshape(np.array(distribution), (len(docs), max_num_topics)) def partial_fit(self, X): """ diff --git a/gensim/sklearn_api/ldamodel.py b/gensim/sklearn_api/ldamodel.py index 77d539e616..40d7c52db0 100644 --- a/gensim/sklearn_api/ldamodel.py +++ b/gensim/sklearn_api/ldamodel.py @@ -83,16 +83,11 @@ def transform(self, docs): raise NotFittedError("This model has not been fitted yet. 
Call 'fit' with appropriate arguments before using this method.") # The input as array of array - check = lambda x: [x] if isinstance(x[0], tuple) else x - docs = check(docs) - X = [[] for _ in range(0, len(docs))] - - for k, v in enumerate(docs): - doc_topics = self.gensim_model[v] - # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future - probs_docs = matutils.sparse2full(doc_topics, self.num_topics) - X[k] = probs_docs - return np.reshape(np.array(X), (len(docs), self.num_topics)) + if isinstance(docs[0], tuple): + docs = [docs] + # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future + distribution = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs] + return np.reshape(np.array(distribution), (len(docs), self.num_topics)) def partial_fit(self, X): """ diff --git a/gensim/sklearn_api/ldaseqmodel.py b/gensim/sklearn_api/ldaseqmodel.py index 6b96d8d6fa..2c5d0879d4 100644 --- a/gensim/sklearn_api/ldaseqmodel.py +++ b/gensim/sklearn_api/ldaseqmodel.py @@ -69,12 +69,7 @@ def transform(self, docs): raise NotFittedError("This model has not been fitted yet. 
Call 'fit' with appropriate arguments before using this method.") # The input as array of array - check = lambda x: [x] if isinstance(x[0], tuple) else x - docs = check(docs) - X = [[] for _ in range(0, len(docs))] - - for k, v in enumerate(docs): - transformed_author = self.gensim_model[v] - X[k] = transformed_author - - return np.reshape(np.array(X), (len(docs), self.num_topics)) + if isinstance(docs[0], tuple): + docs = [docs] + proportions = [self.gensim_model[doc] for doc in docs] + return np.reshape(np.array(proportions), (len(docs), self.num_topics)) diff --git a/gensim/sklearn_api/lsimodel.py b/gensim/sklearn_api/lsimodel.py index 776af6f5da..30263b02af 100644 --- a/gensim/sklearn_api/lsimodel.py +++ b/gensim/sklearn_api/lsimodel.py @@ -67,15 +67,11 @@ def transform(self, docs): ) # The input as array of array - check = lambda x: [x] if isinstance(x[0], tuple) else x - docs = check(docs) - X = [[] for i in range(0, len(docs))] - for k, v in enumerate(docs): - doc_topics = self.gensim_model[v] - # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future - probs_docs = matutils.sparse2full(doc_topics, self.num_topics) - X[k] = probs_docs - return np.reshape(np.array(X), (len(docs), self.num_topics)) + if isinstance(docs[0], tuple): + docs = [docs] + # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future + distribution = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs] + return np.reshape(np.array(distribution), (len(docs), self.num_topics)) def partial_fit(self, X): """ diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py index 2eab84b95e..fcd7d4c5f1 100644 --- a/gensim/sklearn_api/phrases.py +++ b/gensim/sklearn_api/phrases.py @@ -50,15 +50,9 @@ def transform(self, docs): raise NotFittedError("This model has not been fitted yet. 
Call 'fit' with appropriate arguments before using this method.") # input as python lists - check = lambda x: [x] if isinstance(x[0], string_types) else x - docs = check(docs) - X = [[] for _ in range(0, len(docs))] - - for k, v in enumerate(docs): - phrase_tokens = self.gensim_model[v] - X[k] = phrase_tokens - - return X + if isinstance(docs[0], string_types): + docs = [docs] + return [self.gensim_model[doc] for doc in docs] def partial_fit(self, X): if self.gensim_model is None: diff --git a/gensim/sklearn_api/rpmodel.py b/gensim/sklearn_api/rpmodel.py index 62395e0bce..59d4c87a45 100644 --- a/gensim/sklearn_api/rpmodel.py +++ b/gensim/sklearn_api/rpmodel.py @@ -52,14 +52,8 @@ def transform(self, docs): ) # The input as array of array - check = lambda x: [x] if isinstance(x[0], tuple) else x - docs = check(docs) - X = [[] for _ in range(0, len(docs))] - - for k, v in enumerate(docs): - transformed_doc = self.gensim_model[v] - # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future - probs_docs = matutils.sparse2full(transformed_doc, self.num_topics) - X[k] = probs_docs - - return np.reshape(np.array(X), (len(docs), self.num_topics)) + if isinstance(docs[0], tuple): + docs = [docs] + # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future + presentation = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs] + return np.reshape(np.array(presentation), (len(docs), self.num_topics)) diff --git a/gensim/sklearn_api/text2bow.py b/gensim/sklearn_api/text2bow.py index 6beb126d0d..e71a954c32 100644 --- a/gensim/sklearn_api/text2bow.py +++ b/gensim/sklearn_api/text2bow.py @@ -48,16 +48,10 @@ def transform(self, docs): ) # input as python lists - check = lambda x: [x] if isinstance(x, string_types) else x - docs = check(docs) - tokenized_docs = [list(self.tokenizer(x)) for x in docs] - X = [[] for _ in range(0, 
len(tokenized_docs))] - - for k, v in enumerate(tokenized_docs): - bow_val = self.gensim_model.doc2bow(v) - X[k] = bow_val - - return X + if isinstance(docs, string_types): + docs = [docs] + tokenized_docs = (list(self.tokenizer(doc)) for doc in docs) + return [self.gensim_model.doc2bow(doc) for doc in tokenized_docs] def partial_fit(self, X): if self.gensim_model is None: diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py index 414c597dc1..7952d11e75 100644 --- a/gensim/sklearn_api/tfidf.py +++ b/gensim/sklearn_api/tfidf.py @@ -51,12 +51,6 @@ def transform(self, docs): ) # input as python lists - check = lambda x: [x] if isinstance(x[0], tuple) else x - docs = check(docs) - X = [[] for _ in range(0, len(docs))] - - for k, v in enumerate(docs): - transformed_doc = self.gensim_model[v] - X[k] = transformed_doc - - return X + if isinstance(docs[0], tuple): + docs = [docs] + return [self.gensim_model[doc] for doc in docs] diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index 6ddea2eb90..317842ee07 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -75,15 +75,10 @@ def transform(self, words): ) # The input as array of array - check = lambda x: [x] if isinstance(x, six.string_types) else x - words = check(words) - X = [[] for _ in range(0, len(words))] - - for k, v in enumerate(words): - word_vec = self.gensim_model[v] - X[k] = word_vec - - return np.reshape(np.array(X), (len(words), self.size)) + if isinstance(words, six.string_types): + words = [words] + vectors = [self.gensim_model[word] for word in words] + return np.reshape(np.array(vectors), (len(words), self.size)) def partial_fit(self, X): raise NotImplementedError( diff --git a/gensim/test/test_atmodel.py b/gensim/test/test_atmodel.py index 97bffde623..5947280f59 100644 --- a/gensim/test/test_atmodel.py +++ b/gensim/test/test_atmodel.py @@ -14,9 +14,6 @@ import logging import unittest -import os -import os.path -import tempfile 
import numbers from os import remove @@ -27,6 +24,8 @@ from gensim.models import atmodel from gensim import matutils from gensim.test import basetmtests +from gensim.test.utils import (datapath, + get_tmpfile, common_texts, common_dictionary as dictionary, common_corpus as corpus) # TODO: # Test that computing the bound on new unseen documents works as expected (this is somewhat different @@ -36,23 +35,6 @@ # increases the bound. # Test that models are compatiple across versions, as done in LdaModel. -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) - -# set up vars used in testing ("Deerwester" from the web tutorial) -texts = [ - ['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'] -] -dictionary = Dictionary(texts) -corpus = [dictionary.doc2bow(text) for text in texts] # Assign some authors randomly to the documents above. author2doc = { @@ -61,6 +43,7 @@ 'jack': [0, 2, 4, 6, 8], 'jill': [1, 3, 5, 7] } + doc2author = { 0: ['john', 'jack'], 1: ['john', 'jill'], @@ -76,18 +59,12 @@ # More data with new and old authors (to test update method). # Although the text is just a subset of the previous, the model # just sees it as completely new data. 
-texts_new = texts[0:3] +texts_new = common_texts[0:3] author2doc_new = {'jill': [0], 'bob': [0, 1], 'sally': [1, 2]} dictionary_new = Dictionary(texts_new) corpus_new = [dictionary_new.doc2bow(text) for text in texts_new] -def testfile(test_fname=''): - # temporary data will be stored to this file - fname = 'gensim_models_' + test_fname + '.tst' - return os.path.join(tempfile.gettempdir(), fname) - - class TestAuthorTopicModel(unittest.TestCase, basetmtests.TestBaseTopicModel): def setUp(self): self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) @@ -475,30 +452,32 @@ def testPasses(self): # long message includes the original error message with a custom one self.longMessage = True # construct what we expect when passes aren't involved - test_rhots = list() + test_rhots = [] model = self.class_(id2word=dictionary, chunksize=1, num_topics=2) - final_rhot = lambda: pow(model.offset + (1 * model.num_updates) / model.chunksize, -model.decay) + + def final_rhot(model): + return pow(model.offset + (1 * model.num_updates) / model.chunksize, -model.decay) # generate 5 updates to test rhot on - for x in range(5): + for _ in range(5): model.update(corpus, author2doc) - test_rhots.append(final_rhot()) + test_rhots.append(final_rhot(model)) for passes in [1, 5, 10, 50, 100]: model = self.class_(id2word=dictionary, chunksize=1, num_topics=2, passes=passes) - self.assertEqual(final_rhot(), 1.0) + self.assertEqual(final_rhot(model), 1.0) # make sure the rhot matches the test after each update for test_rhot in test_rhots: model.update(corpus, author2doc) msg = "{}, {}, {}".format(passes, model.num_updates, model.state.numdocs) - self.assertAlmostEqual(final_rhot(), test_rhot, msg=msg) + self.assertAlmostEqual(final_rhot(model), test_rhot, msg=msg) self.assertEqual(model.state.numdocs, len(corpus) * len(test_rhots)) self.assertEqual(model.num_updates, len(corpus) * len(test_rhots)) def testPersistence(self): - fname = testfile() + fname = 
get_tmpfile('gensim_models_atmodel.tst') model = self.model model.save(fname) model2 = self.class_.load(fname) @@ -507,7 +486,7 @@ def testPersistence(self): self.assertTrue(np.allclose(model.state.gamma, model2.state.gamma)) def testPersistenceIgnore(self): - fname = testfile('testPersistenceIgnore') + fname = get_tmpfile('gensim_models_atmodel_testPersistenceIgnore.tst') model = atmodel.AuthorTopicModel(corpus, author2doc=author2doc, num_topics=2) model.save(fname, ignore='id2word') model2 = atmodel.AuthorTopicModel.load(fname) @@ -518,7 +497,7 @@ def testPersistenceIgnore(self): self.assertTrue(model2.id2word is None) def testPersistenceCompressed(self): - fname = testfile() + '.gz' + fname = get_tmpfile('gensim_models_atmodel.tst.gz') model = self.model model.save(fname) model2 = self.class_.load(fname, mmap=None) @@ -533,7 +512,7 @@ def testPersistenceCompressed(self): self.assertTrue(np.allclose(jill_topics, jill_topics2)) def testLargeMmap(self): - fname = testfile() + fname = get_tmpfile('gensim_models_atmodel.tst') model = self.model # simulate storing large arrays separately @@ -553,7 +532,7 @@ def testLargeMmap(self): self.assertTrue(np.allclose(jill_topics, jill_topics2)) def testLargeMmapCompressed(self): - fname = testfile() + '.gz' + fname = get_tmpfile('gensim_models_atmodel.tst.gz') model = self.model # simulate storing large arrays separately diff --git a/gensim/test/test_big.py b/gensim/test/test_big.py index abf19c63c7..f422953d18 100644 --- a/gensim/test/test_big.py +++ b/gensim/test/test_big.py @@ -12,16 +12,11 @@ import logging import unittest import os -import tempfile import numpy as np import gensim - - -def testfile(): - # temporary data will be stored to this file - return os.path.join(tempfile.gettempdir(), 'gensim_big.tst') +from gensim.test.utils import get_tmpfile class BigCorpus(object): @@ -50,24 +45,27 @@ class TestLargeData(unittest.TestCase): def testWord2Vec(self): corpus = BigCorpus(words_only=True, num_docs=100000, 
num_terms=3000000, doc_len=200) + tmpf = get_tmpfile('gensim_big.tst') model = gensim.models.Word2Vec(corpus, size=300, workers=4) - model.save(testfile(), ignore=['syn1']) + model.save(tmpf, ignore=['syn1']) del model - gensim.models.Word2Vec.load(testfile()) + gensim.models.Word2Vec.load(tmpf) def testLsiModel(self): corpus = BigCorpus(num_docs=50000) + tmpf = get_tmpfile('gensim_big.tst') model = gensim.models.LsiModel(corpus, num_topics=500, id2word=corpus.dictionary) - model.save(testfile()) + model.save(tmpf) del model - gensim.models.LsiModel.load(testfile()) + gensim.models.LsiModel.load(tmpf) def testLdaModel(self): corpus = BigCorpus(num_docs=5000) + tmpf = get_tmpfile('gensim_big.tst') model = gensim.models.LdaModel(corpus, num_topics=500, id2word=corpus.dictionary) - model.save(testfile()) + model.save(tmpf) del model - gensim.models.LdaModel.load(testfile()) + gensim.models.LdaModel.load(tmpf) if __name__ == '__main__': diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py index 787d661f89..523e658e66 100644 --- a/gensim/test/test_coherencemodel.py +++ b/gensim/test/test_coherencemodel.py @@ -10,44 +10,26 @@ import logging import os -import tempfile import unittest from unittest import SkipTest import multiprocessing as mp +from functools import partial import numpy as np -from gensim.corpora.dictionary import Dictionary from gensim.matutils import argsort from gensim.models.coherencemodel import CoherenceModel, BOOLEAN_DOCUMENT_BASED from gensim.models.ldamodel import LdaModel from gensim.models.wrappers import LdaMallet from gensim.models.wrappers import LdaVowpalWabbit - - -def testfile(): - # temporary data will be stored to this file - return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') +from gensim.test.utils import get_tmpfile, common_texts, common_dictionary, common_corpus class TestCoherenceModel(unittest.TestCase): # set up vars used in testing ("Deerwester" from the web tutorial) - texts = [ - 
['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'] - ] - dictionary = Dictionary(texts) - - @classmethod - def setUpClass(cls): - cls.corpus = [cls.dictionary.doc2bow(text) for text in cls.texts] + texts = common_texts + dictionary = common_dictionary + corpus = common_corpus def setUp(self): # Suppose given below are the topics which two different LdaModels come up with. @@ -215,23 +197,20 @@ def testErrors(self): ) def testProcesses(self): - cpu = mp.cpu_count() - get_model = lambda p: CoherenceModel( - topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass', processes=p, + get_model = partial(CoherenceModel, + topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass' ) - model = CoherenceModel( - topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass', - ) - self.assertEqual(model.processes, cpu - 1) + model, used_cpus = get_model(), mp.cpu_count() - 1 + self.assertEqual(model.processes, used_cpus) for p in range(-2, 1): - self.assertEqual(get_model(p).processes, cpu - 1) + self.assertEqual(get_model(processes=p).processes, used_cpus) for p in range(1, 4): - self.assertEqual(get_model(p).processes, p) + self.assertEqual(get_model(processes=p).processes, p) def testPersistence(self): - fname = testfile() + fname = get_tmpfile('gensim_models_coherence.tst') model = CoherenceModel( topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass' ) @@ -240,7 +219,7 @@ def testPersistence(self): self.assertTrue(model.get_coherence() == model2.get_coherence()) def testPersistenceCompressed(self): - fname = testfile() + '.gz' + fname = get_tmpfile('gensim_models_coherence.tst.gz') model = 
CoherenceModel( topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass' ) @@ -249,7 +228,7 @@ def testPersistenceCompressed(self): self.assertTrue(model.get_coherence() == model2.get_coherence()) def testPersistenceAfterProbabilityEstimationUsingCorpus(self): - fname = testfile() + fname = get_tmpfile('gensim_similarities.tst.pkl') model = CoherenceModel( topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass' ) @@ -260,7 +239,7 @@ def testPersistenceAfterProbabilityEstimationUsingCorpus(self): self.assertTrue(model.get_coherence() == model2.get_coherence()) def testPersistenceAfterProbabilityEstimationUsingTexts(self): - fname = testfile() + fname = get_tmpfile('gensim_similarities.tst.pkl') model = CoherenceModel( topics=self.topics1, texts=self.texts, dictionary=self.dictionary, coherence='c_v' ) diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py index a990c6be94..4ddc16e0cf 100644 --- a/gensim/test/test_corpora.py +++ b/gensim/test/test_corpora.py @@ -23,18 +23,7 @@ ucicorpus, malletcorpus, textcorpus, indexedcorpus) from gensim.interfaces import TransformedCorpus from gensim.utils import to_unicode - -# needed because sample data files are located in the same folder -module_path = os.path.dirname(__file__) - - -def datapath(fname): - return os.path.join(module_path, 'test_data', fname) - - -def testfile(): - # temporary data will be stored to this file - return os.path.join(tempfile.gettempdir(), 'gensim_corpus.tst') +from gensim.test.utils import datapath, get_tmpfile class DummyTransformer(object): @@ -61,7 +50,7 @@ def run(self, result=None): def tearDown(self): # remove all temporary test files - fname = testfile() + fname = get_tmpfile('gensim_corpus.tst') extensions = ['', '', '.bz2', '.gz', '.index', '.vocab'] for ext in itertools.permutations(extensions, 2): try: @@ -93,13 +82,14 @@ def test_len(self): self.assertEqual(len(corpus), 9) def test_empty_input(self): - with 
open(testfile(), 'w') as f: + tmpf = get_tmpfile('gensim_corpus.tst') + with open(tmpf, 'w') as f: f.write('') - with open(testfile() + '.vocab', 'w') as f: + with open(tmpf + '.vocab', 'w') as f: f.write('') - corpus = self.corpus_class(testfile()) + corpus = self.corpus_class(tmpf) self.assertEqual(len(corpus), 0) docs = list(corpus) @@ -107,22 +97,24 @@ def test_empty_input(self): def test_save(self): corpus = self.TEST_CORPUS + tmpf = get_tmpfile('gensim_corpus.tst') # make sure the corpus can be saved - self.corpus_class.save_corpus(testfile(), corpus) + self.corpus_class.save_corpus(tmpf, corpus) # and loaded back, resulting in exactly the same corpus - corpus2 = list(self.corpus_class(testfile())) + corpus2 = list(self.corpus_class(tmpf)) self.assertEqual(corpus, corpus2) def test_serialize(self): corpus = self.TEST_CORPUS + tmpf = get_tmpfile('gensim_corpus.tst') # make sure the corpus can be saved - self.corpus_class.serialize(testfile(), corpus) + self.corpus_class.serialize(tmpf, corpus) # and loaded back, resulting in exactly the same corpus - corpus2 = self.corpus_class(testfile()) + corpus2 = self.corpus_class(tmpf) self.assertEqual(corpus, list(corpus2)) # make sure the indexing corpus[i] works @@ -137,9 +129,10 @@ def test_serialize(self): def test_serialize_compressed(self): corpus = self.TEST_CORPUS + tmpf = get_tmpfile('gensim_corpus.tst') for extension in ['.gz', '.bz2']: - fname = testfile() + extension + fname = tmpf + extension # make sure the corpus can be saved self.corpus_class.serialize(fname, corpus) @@ -252,7 +245,7 @@ def setUp(self): def test_save_format_for_dtm(self): corpus = [[(1, 1.0)], [], [(0, 5.0), (2, 1.0)], []] - test_file = testfile() + test_file = get_tmpfile('gensim_corpus.tst') self.corpus_class.save_corpus(test_file, corpus) with open(test_file) as f: for line in f: @@ -493,7 +486,7 @@ def test_non_trivial_structure(self): . 
├── 0.txt ├── a_folder - │   └── 1.txt + │ └── 1.txt └── b_folder ├── 2.txt ├── 3.txt diff --git a/gensim/test/test_corpora_dictionary.py b/gensim/test/test_corpora_dictionary.py index f6c7d8b43c..e0b8d1e426 100644 --- a/gensim/test/test_corpora_dictionary.py +++ b/gensim/test/test_corpora_dictionary.py @@ -10,7 +10,6 @@ from collections import Mapping import logging -import tempfile import unittest import codecs import os @@ -20,31 +19,14 @@ import gensim from gensim.corpora import Dictionary from gensim.utils import to_utf8 +from gensim.test.utils import get_tmpfile, common_texts from six import PY3 from six.moves import zip -# sample data files are located in the same folder -module_path = os.path.dirname(__file__) - - -def get_tmpfile(suffix): - return os.path.join(tempfile.gettempdir(), suffix) - - class TestDictionary(unittest.TestCase): def setUp(self): - self.texts = [ - ['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'] - ] + self.texts = common_texts def testDocFreqOneDoc(self): texts = [['human', 'interface', 'computer']] diff --git a/gensim/test/test_corpora_hashdictionary.py b/gensim/test/test_corpora_hashdictionary.py index 808246dc59..74f28eb0bb 100644 --- a/gensim/test/test_corpora_hashdictionary.py +++ b/gensim/test/test_corpora_hashdictionary.py @@ -9,35 +9,17 @@ import logging -import tempfile import unittest import os import zlib from gensim.corpora.hashdictionary import HashDictionary - - -# sample data files are located in the same folder -module_path = os.path.dirname(__file__) - - -def get_tmpfile(suffix): - return os.path.join(tempfile.gettempdir(), suffix) +from gensim.test.utils import get_tmpfile, common_texts class TestHashDictionary(unittest.TestCase): def setUp(self): - 
self.texts = [ - ['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'] - ] + self.texts = common_texts def testDocFreqOneDoc(self): texts = [['human', 'interface', 'computer']] diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index 6feeab3bd2..0d49c9e2e3 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -14,7 +14,6 @@ import logging import unittest import os -import tempfile from six.moves import zip as izip from collections import namedtuple @@ -24,9 +23,7 @@ from gensim import utils from gensim.models import doc2vec, keyedvectors - -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) +from gensim.test.utils import datapath, get_tmpfile, common_texts as raw_sentences class DocsLeeCorpus(object): @@ -49,56 +46,42 @@ def __iter__(self): list_corpus = list(DocsLeeCorpus()) -raw_sentences = [ - ['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'] - ] sentences = [doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(raw_sentences)] -def testfile(): - # temporary data will be stored to this file - return os.path.join(tempfile.gettempdir(), 'gensim_doc2vec.tst') - - def load_on_instance(): # Save and load a Doc2Vec Model on instance for test + tmpf = get_tmpfile('gensim_doc2vec.tst') model = doc2vec.Doc2Vec(DocsLeeCorpus(), min_count=1) - model.save(testfile()) + model.save(tmpf) 
model = doc2vec.Doc2Vec() # should fail at this point - return model.load(testfile()) + return model.load(tmpf) class TestDoc2VecModel(unittest.TestCase): def test_persistence(self): """Test storing/loading the entire model.""" + tmpf = get_tmpfile('gensim_doc2vec.tst') model = doc2vec.Doc2Vec(DocsLeeCorpus(), min_count=1) - model.save(testfile()) - self.models_equal(model, doc2vec.Doc2Vec.load(testfile())) + model.save(tmpf) + self.models_equal(model, doc2vec.Doc2Vec.load(tmpf)) def testPersistenceWord2VecFormat(self): """Test storing the entire model in word2vec format.""" model = doc2vec.Doc2Vec(DocsLeeCorpus(), min_count=1) # test saving both document and word embedding - test_doc_word = os.path.join(tempfile.gettempdir(), 'gensim_doc2vec.dw') + test_doc_word = get_tmpfile('gensim_doc2vec.dw') model.save_word2vec_format(test_doc_word, doctag_vec=True, word_vec=True, binary=True) binary_model_dv = keyedvectors.KeyedVectors.load_word2vec_format(test_doc_word, binary=True) self.assertEqual(len(model.wv.vocab) + len(model.docvecs), len(binary_model_dv.vocab)) # test saving document embedding only - test_doc = os.path.join(tempfile.gettempdir(), 'gensim_doc2vec.d') + test_doc = get_tmpfile('gensim_doc2vec.d') model.save_word2vec_format(test_doc, doctag_vec=True, word_vec=False, binary=True) binary_model_dv = keyedvectors.KeyedVectors.load_word2vec_format(test_doc, binary=True) self.assertEqual(len(model.docvecs), len(binary_model_dv.vocab)) # test saving word embedding only - test_word = os.path.join(tempfile.gettempdir(), 'gensim_doc2vec.w') + test_word = get_tmpfile('gensim_doc2vec.w') model.save_word2vec_format(test_word, doctag_vec=False, word_vec=True, binary=True) binary_model_dv = keyedvectors.KeyedVectors.load_word2vec_format(test_word, binary=True) self.assertEqual(len(model.wv.vocab), len(binary_model_dv.vocab)) @@ -106,21 +89,23 @@ def testPersistenceWord2VecFormat(self): def test_unicode_in_doctag(self): """Test storing document vectors of a model with 
unicode titles.""" model = doc2vec.Doc2Vec(DocsLeeCorpus(unicode_tags=True), min_count=1) + tmpf = get_tmpfile('gensim_doc2vec.tst') try: - model.save_word2vec_format(testfile(), doctag_vec=True, word_vec=True, binary=True) + model.save_word2vec_format(tmpf, doctag_vec=True, word_vec=True, binary=True) except UnicodeEncodeError: self.fail('Failed storing unicode title.') def test_load_mmap(self): """Test storing/loading the entire model.""" model = doc2vec.Doc2Vec(sentences, min_count=1) + tmpf = get_tmpfile('gensim_doc2vec.tst') # test storing the internal arrays into separate files - model.save(testfile(), sep_limit=0) - self.models_equal(model, doc2vec.Doc2Vec.load(testfile())) + model.save(tmpf, sep_limit=0) + self.models_equal(model, doc2vec.Doc2Vec.load(tmpf)) # make sure mmaping the arrays back works, too - self.models_equal(model, doc2vec.Doc2Vec.load(testfile(), mmap='r')) + self.models_equal(model, doc2vec.Doc2Vec.load(tmpf, mmap='r')) def test_int_doctags(self): """Test doc2vec doctag alternatives""" @@ -217,8 +202,9 @@ def model_sanity(self, model, keep_training=True): # keep training after save if keep_training: - model.save(testfile()) - loaded = doc2vec.Doc2Vec.load(testfile()) + tmpf = get_tmpfile('gensim_doc2vec.tst') + model.save(tmpf) + loaded = doc2vec.Doc2Vec.load(tmpf) loaded.train(sentences, total_examples=loaded.corpus_count, epochs=loaded.iter) def test_training(self): diff --git a/gensim/test/test_dtm.py b/gensim/test/test_dtm.py index 231bbb1932..a52766b8cd 100644 --- a/gensim/test/test_dtm.py +++ b/gensim/test/test_dtm.py @@ -13,11 +13,7 @@ import sys import unittest from gensim import corpora - - -# needed because sample data files are located in the same folder -module_path = os.path.dirname(__file__) -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) +from gensim.test.utils import datapath class TestDtmModel(unittest.TestCase): diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 
710c5a500d..d56272b4e1 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -3,7 +3,6 @@ import logging import unittest -import tempfile import os import struct @@ -14,9 +13,8 @@ from gensim.models.fasttext import FastText as FT_gensim from gensim.models.wrappers.fasttext import FastTextKeyedVectors from gensim.models.wrappers.fasttext import FastText as FT_wrapper +from gensim.test.utils import datapath, get_tmpfile, common_texts as sentences -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) logger = logging.getLogger(__name__) IS_WIN32 = (os.name == "nt") and (struct.calcsize('P') * 8 == 32) @@ -31,18 +29,6 @@ def __iter__(self): list_corpus = list(LeeCorpus()) -sentences = [ - ['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'] -] - new_sentences = [ ['computer', 'artificial', 'intelligence'], ['artificial', 'trees'], @@ -53,11 +39,6 @@ def __iter__(self): ] -def testfile(): - # temporary data will be stored to this file - return os.path.join(tempfile.gettempdir(), 'gensim_fasttext.tst') - - class TestFastTextModel(unittest.TestCase): def setUp(self): @@ -113,29 +94,31 @@ def models_equal(self, model, model2): @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32") def test_persistence(self): + tmpf = get_tmpfile('gensim_fasttext.tst') model = FT_gensim(sentences, min_count=1) - model.save(testfile()) - self.models_equal(model, FT_gensim.load(testfile())) + model.save(tmpf) + self.models_equal(model, FT_gensim.load(tmpf)) # test persistence of the KeyedVectors of a model wv = model.wv - wv.save(testfile()) - loaded_wv = 
FastTextKeyedVectors.load(testfile()) + wv.save(tmpf) + loaded_wv = FastTextKeyedVectors.load(tmpf) self.assertTrue(np.allclose(wv.syn0_ngrams, loaded_wv.syn0_ngrams)) self.assertEqual(len(wv.vocab), len(loaded_wv.vocab)) self.assertEqual(len(wv.ngrams), len(loaded_wv.ngrams)) @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32") def test_norm_vectors_not_saved(self): + tmpf = get_tmpfile('gensim_fasttext.tst') model = FT_gensim(sentences, min_count=1) model.init_sims() - model.save(testfile()) - loaded_model = FT_gensim.load(testfile()) + model.save(tmpf) + loaded_model = FT_gensim.load(tmpf) self.assertTrue(loaded_model.wv.syn0norm is None) self.assertTrue(loaded_model.wv.syn0_ngrams_norm is None) wv = model.wv - wv.save(testfile()) - loaded_kv = FastTextKeyedVectors.load(testfile()) + wv.save(tmpf) + loaded_kv = FastTextKeyedVectors.load(tmpf) self.assertTrue(loaded_kv.syn0norm is None) self.assertTrue(loaded_kv.syn0_ngrams_norm is None) @@ -362,8 +345,9 @@ def test_cbow_hs_against_wrapper(self): logger.info("FT_HOME env variable not set, skipping test") return + tmpf = get_tmpfile('gensim_fasttext.tst') model_wrapper = FT_wrapper.train(ft_path=self.ft_path, corpus_file=datapath('lee_background.cor'), - output_file=testfile(), model='cbow', size=50, alpha=0.05, window=5, min_count=5, word_ngrams=1, + output_file=tmpf, model='cbow', size=50, alpha=0.05, window=5, min_count=5, word_ngrams=1, loss='hs', sample=1e-3, negative=0, iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12) model_gensim = FT_gensim(size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0, @@ -382,8 +366,9 @@ def test_sg_hs_against_wrapper(self): logger.info("FT_HOME env variable not set, skipping test") return + tmpf = get_tmpfile('gensim_fasttext.tst') model_wrapper = FT_wrapper.train(ft_path=self.ft_path, corpus_file=datapath('lee_background.cor'), - output_file=testfile(), model='skipgram', size=50, alpha=0.025, window=5, min_count=5, word_ngrams=1, + 
output_file=tmpf, model='skipgram', size=50, alpha=0.025, window=5, min_count=5, word_ngrams=1, loss='hs', sample=1e-3, negative=0, iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12) model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0, @@ -411,9 +396,10 @@ def test_online_learning(self): self.assertTrue('tif' in model_hs.wv.ngrams) # ngram added because of the word `artificial` def test_online_learning_after_save(self): + tmpf = get_tmpfile('gensim_fasttext.tst') model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5) - model_neg.save(testfile()) - model_neg = FT_gensim.load(testfile()) + model_neg.save(tmpf) + model_neg = FT_gensim.load(tmpf) self.assertTrue(len(model_neg.wv.vocab), 12) self.assertTrue(len(model_neg.wv.ngrams), 202) model_neg.build_vocab(new_sentences, update=True) # update vocab diff --git a/gensim/test/test_fasttext_wrapper.py b/gensim/test/test_fasttext_wrapper.py index 77e10bdf99..cc6b9b9519 100644 --- a/gensim/test/test_fasttext_wrapper.py +++ b/gensim/test/test_fasttext_wrapper.py @@ -11,23 +11,16 @@ import logging import unittest import os -import tempfile import numpy from gensim.models.wrappers import fasttext from gensim.models import keyedvectors +from gensim.test.utils import datapath, get_tmpfile -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) logger = logging.getLogger(__name__) -def testfile(): - # temporary data will be stored to this file - return os.path.join(tempfile.gettempdir(), 'gensim_fasttext.tst') - - class TestFastText(unittest.TestCase): def setUp(self): ft_home = os.environ.get('FT_HOME', None) @@ -55,8 +48,9 @@ def testTraining(self): logger.info("FT_HOME env variable not set, skipping test") return # Use self.skipTest once python < 2.7 is no longer supported vocab_size, model_size = 1763, 10 + tmpf = 
get_tmpfile('gensim_fasttext_wrapper.tst') trained_model = fasttext.FastText.train( - self.ft_path, self.corpus_file, size=model_size, output_file=testfile() + self.ft_path, self.corpus_file, size=model_size, output_file=tmpf ) self.assertEqual(trained_model.wv.syn0.shape, (vocab_size, model_size)) @@ -65,20 +59,21 @@ def testTraining(self): self.model_sanity(trained_model) # Tests temporary training files deleted - self.assertFalse(os.path.exists('%s.bin' % testfile())) + self.assertFalse(os.path.exists('%s.bin' % tmpf)) def testMinCount(self): """Tests words with frequency less than `min_count` absent from vocab""" if self.ft_path is None: logger.info("FT_HOME env variable not set, skipping test") return # Use self.skipTest once python < 2.7 is no longer supported + tmpf = get_tmpfile('gensim_fasttext_wrapper.tst') test_model_min_count_5 = fasttext.FastText.train( - self.ft_path, self.corpus_file, output_file=testfile(), size=10, min_count=5 + self.ft_path, self.corpus_file, output_file=tmpf, size=10, min_count=5 ) self.assertTrue('forests' not in test_model_min_count_5.wv.vocab) test_model_min_count_1 = fasttext.FastText.train( - self.ft_path, self.corpus_file, output_file=testfile(), size=10, min_count=1 + self.ft_path, self.corpus_file, output_file=tmpf, size=10, min_count=1 ) self.assertTrue('forests' in test_model_min_count_1.wv.vocab) @@ -87,8 +82,9 @@ def testModelSize(self): if self.ft_path is None: logger.info("FT_HOME env variable not set, skipping test") return # Use self.skipTest once python < 2.7 is no longer supported + tmpf = get_tmpfile('gensim_fasttext_wrapper.tst') test_model_size_20 = fasttext.FastText.train( - self.ft_path, self.corpus_file, output_file=testfile(), size=20 + self.ft_path, self.corpus_file, output_file=tmpf, size=20 ) self.assertEqual(test_model_size_20.vector_size, 20) self.assertEqual(test_model_size_20.wv.syn0.shape[1], 20) @@ -96,24 +92,26 @@ def testModelSize(self): def testPersistence(self): """Test storing/loading the 
entire model.""" - self.test_model.save(testfile()) - loaded = fasttext.FastText.load(testfile()) + tmpf = get_tmpfile('gensim_fasttext_wrapper.tst') + self.test_model.save(tmpf) + loaded = fasttext.FastText.load(tmpf) self.models_equal(self.test_model, loaded) - self.test_model.save(testfile(), sep_limit=0) - self.models_equal(self.test_model, fasttext.FastText.load(testfile())) + self.test_model.save(tmpf, sep_limit=0) + self.models_equal(self.test_model, fasttext.FastText.load(tmpf)) def testNormalizedVectorsNotSaved(self): """Test syn0norm/syn0_ngrams_norm aren't saved in model file""" + tmpf = get_tmpfile('gensim_fasttext_wrapper.tst') self.test_model.init_sims() - self.test_model.save(testfile()) - loaded = fasttext.FastText.load(testfile()) + self.test_model.save(tmpf) + loaded = fasttext.FastText.load(tmpf) self.assertTrue(loaded.wv.syn0norm is None) self.assertTrue(loaded.wv.syn0_ngrams_norm is None) wv = self.test_model.wv - wv.save(testfile()) - loaded_kv = keyedvectors.KeyedVectors.load(testfile()) + wv.save(tmpf) + loaded_kv = keyedvectors.KeyedVectors.load(tmpf) self.assertTrue(loaded_kv.syn0norm is None) self.assertTrue(loaded_kv.syn0_ngrams_norm is None) diff --git a/gensim/test/test_glove2word2vec.py b/gensim/test/test_glove2word2vec.py index 07c46a7332..44548529ac 100644 --- a/gensim/test/test_glove2word2vec.py +++ b/gensim/test/test_glove2word2vec.py @@ -10,26 +10,18 @@ import unittest import os import sys -import tempfile import numpy import gensim from gensim.utils import check_output - -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) - - -def testfile(): - # temporary model will be stored to this file - return os.path.join(tempfile.gettempdir(), 'glove2word2vec.test') +from gensim.test.utils import datapath, get_tmpfile class TestGlove2Word2Vec(unittest.TestCase): def setUp(self): self.datapath = 
datapath('test_glove.txt') - self.output_file = testfile() + self.output_file = get_tmpfile('glove2word2vec.test') def testConversion(self): check_output(args=[ diff --git a/gensim/test/test_hdpmodel.py b/gensim/test/test_hdpmodel.py index b3cf8bdde1..a1ed0d6dcd 100644 --- a/gensim/test/test_hdpmodel.py +++ b/gensim/test/test_hdpmodel.py @@ -11,39 +11,16 @@ import logging import unittest -import os -import os.path -import tempfile from gensim.corpora import mmcorpus, Dictionary from gensim.models import hdpmodel from gensim.test import basetmtests +from gensim.test.utils import datapath, common_texts import numpy as np -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) - - -# set up vars used in testing ("Deerwester" from the web tutorial) -texts = [ - ['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'] -] -dictionary = Dictionary(texts) -corpus = [dictionary.doc2bow(text) for text in texts] - - -def testfile(): - # temporary data will be stored to this file - return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') +dictionary = Dictionary(common_texts) +corpus = [dictionary.doc2bow(text) for text in common_texts] class TestHdpModel(unittest.TestCase, basetmtests.TestBaseTopicModel): diff --git a/gensim/test/test_keras_integration.py b/gensim/test/test_keras_integration.py index 41bc74a967..3d3abd6f3b 100644 --- a/gensim/test/test_keras_integration.py +++ b/gensim/test/test_keras_integration.py @@ -1,5 +1,4 @@ import unittest -import os import numpy as np from gensim.models import word2vec @@ -21,25 +20,12 @@ except ImportError: raise unittest.SkipTest("Test requires Keras to 
be installed, which is not available") -sentences = [ - ['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'] -] - -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) +from gensim.test.utils import common_texts class TestKerasWord2VecWrapper(unittest.TestCase): def setUp(self): - self.model_cos_sim = word2vec.Word2Vec(sentences, size=100, min_count=1, hs=1) + self.model_cos_sim = word2vec.Word2Vec(common_texts, size=100, min_count=1, hs=1) # self.model_twenty_ng = word2vec.Word2Vec(word2vec.LineSentence(datapath('20_newsgroup_keras_w2v_data.txt')), min_count=1) self.model_twenty_ng = word2vec.Word2Vec(min_count=1) diff --git a/gensim/test/test_ldamallet_wrapper.py b/gensim/test/test_ldamallet_wrapper.py index 5ed4486e16..b780ad42f6 100644 --- a/gensim/test/test_ldamallet_wrapper.py +++ b/gensim/test/test_ldamallet_wrapper.py @@ -13,7 +13,6 @@ import unittest import os import os.path -import tempfile import numpy as np @@ -22,30 +21,10 @@ from gensim import matutils from gensim.models import ldamodel from gensim.test import basetmtests +from gensim.test.utils import datapath, get_tmpfile, common_texts -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) - -# set up vars used in testing ("Deerwester" from the web tutorial) -texts = [ - ['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - 
['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'] -] - -dictionary = Dictionary(texts) -corpus = [dictionary.doc2bow(text) for text in texts] - - -def testfile(): - # temporary data will be stored to this file - return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') +dictionary = Dictionary(common_texts) +corpus = [dictionary.doc2bow(text) for text in common_texts] class TestLdaMallet(unittest.TestCase, basetmtests.TestBaseTopicModel): @@ -125,7 +104,7 @@ def testMallet2Model(self): def testPersistence(self): if not self.mallet_path: return - fname = testfile() + fname = get_tmpfile('gensim_models_lda_mallet.tst') model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100) model.save(fname) model2 = ldamallet.LdaMallet.load(fname) @@ -137,7 +116,7 @@ def testPersistence(self): def testPersistenceCompressed(self): if not self.mallet_path: return - fname = testfile() + '.gz' + fname = get_tmpfile('gensim_models_lda_mallet.tst.gz') model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100) model.save(fname) model2 = ldamallet.LdaMallet.load(fname, mmap=None) @@ -149,7 +128,7 @@ def testPersistenceCompressed(self): def testLargeMmap(self): if not self.mallet_path: return - fname = testfile() + fname = get_tmpfile('gensim_models_lda_mallet.tst') model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100) # simulate storing large arrays separately @@ -166,7 +145,7 @@ def testLargeMmap(self): def testLargeMmapCompressed(self): if not self.mallet_path: return - fname = testfile() + '.gz' + fname = get_tmpfile('gensim_models_lda_mallet.tst.gz') model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100) # simulate storing large arrays separately diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py index c1d35c2661..df9e0c1c72 100644 --- a/gensim/test/test_ldamodel.py +++ 
b/gensim/test/test_ldamodel.py @@ -11,9 +11,6 @@ import logging import unittest -import os -import os.path -import tempfile import numbers import six @@ -23,31 +20,10 @@ from gensim.models import ldamodel, ldamulticore from gensim import matutils, utils from gensim.test import basetmtests +from gensim.test.utils import datapath, get_tmpfile, common_texts -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) - - -# set up vars used in testing ("Deerwester" from the web tutorial) -texts = [ - ['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'] -] -dictionary = Dictionary(texts) -corpus = [dictionary.doc2bow(text) for text in texts] - - -def testfile(test_fname=''): - # temporary data will be stored to this file - fname = 'gensim_models_' + test_fname + '.tst' - return os.path.join(tempfile.gettempdir(), fname) +dictionary = Dictionary(common_texts) +corpus = [dictionary.doc2bow(text) for text in common_texts] def testRandomState(): @@ -348,22 +324,24 @@ def testPasses(self): # construct what we expect when passes aren't involved test_rhots = list() model = self.class_(id2word=dictionary, chunksize=1, num_topics=2) - final_rhot = lambda: pow(model.offset + (1 * model.num_updates) / model.chunksize, -model.decay) + + def final_rhot(model): + return pow(model.offset + (1 * model.num_updates) / model.chunksize, -model.decay) # generate 5 updates to test rhot on for x in range(5): model.update(self.corpus) - test_rhots.append(final_rhot()) + test_rhots.append(final_rhot(model)) for passes in [1, 5, 10, 50, 100]: model = self.class_(id2word=dictionary, chunksize=1, num_topics=2, passes=passes) 
- self.assertEqual(final_rhot(), 1.0) + self.assertEqual(final_rhot(model), 1.0) # make sure the rhot matches the test after each update for test_rhot in test_rhots: model.update(self.corpus) msg = ", ".join(str(x) for x in [passes, model.num_updates, model.state.numdocs]) - self.assertAlmostEqual(final_rhot(), test_rhot, msg=msg) + self.assertAlmostEqual(final_rhot(model), test_rhot, msg=msg) self.assertEqual(model.state.numdocs, len(corpus) * len(test_rhots)) self.assertEqual(model.num_updates, len(corpus) * len(test_rhots)) @@ -402,7 +380,7 @@ def testPasses(self): # self.assertTrue(passed) def testPersistence(self): - fname = testfile() + fname = get_tmpfile('gensim_models_lda.tst') model = self.model model.save(fname) model2 = self.class_.load(fname) @@ -425,7 +403,7 @@ def testModelCompatibilityWithPythonVersions(self): self.assertEqual(set(id2word_2_7.keys()), set(id2word_3_5.keys())) def testPersistenceIgnore(self): - fname = testfile('testPersistenceIgnore') + fname = get_tmpfile('gensim_models_lda_testPersistenceIgnore.tst') model = ldamodel.LdaModel(self.corpus, num_topics=2) model.save(fname, ignore='id2word') model2 = ldamodel.LdaModel.load(fname) @@ -436,7 +414,7 @@ def testPersistenceIgnore(self): self.assertTrue(model2.id2word is None) def testPersistenceCompressed(self): - fname = testfile() + '.gz' + fname = get_tmpfile('gensim_models_lda.tst.gz') model = self.model model.save(fname) model2 = self.class_.load(fname, mmap=None) @@ -446,7 +424,7 @@ def testPersistenceCompressed(self): self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector def testLargeMmap(self): - fname = testfile() + fname = get_tmpfile('gensim_models_lda.tst') model = self.model # simulate storing large arrays separately @@ -461,7 +439,7 @@ def testLargeMmap(self): self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector def testLargeMmapCompressed(self): - fname = testfile() + '.gz' + fname = 
get_tmpfile('gensim_models_lda.tst.gz') model = self.model # simulate storing large arrays separately @@ -483,7 +461,7 @@ def testRandomStateBackwardCompatibility(self): self.assertTrue(isinstance(i[1], six.string_types)) # save back the loaded model using a post-0.13.2 version of Gensim - post_0_13_2_fname = testfile('post_0_13_2_model') + post_0_13_2_fname = get_tmpfile('gensim_models_lda_post_0_13_2_model.tst') model_pre_0_13_2.save(post_0_13_2_fname) # load a model saved using a post-0.13.2 version of Gensim diff --git a/gensim/test/test_ldaseqmodel.py b/gensim/test/test_ldaseqmodel.py index d38c01868c..eac238dcdc 100644 --- a/gensim/test/test_ldaseqmodel.py +++ b/gensim/test/test_ldaseqmodel.py @@ -3,17 +3,13 @@ Tests to check DTM math functions and Topic-Word, Doc-Topic proportions. """ +import unittest +import logging import numpy as np # for arrays, array broadcasting etc. from gensim.models import ldaseqmodel from gensim.corpora import Dictionary -import os.path -import unittest -import logging - - -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data/DTM', fname) +from gensim.test.utils import datapath class TestLdaSeq(unittest.TestCase): @@ -203,7 +199,7 @@ def setUp(self): ['bank', 'loan', 'sell'] ] # initializing using own LDA sufficient statistics so that we get same results each time. 
- sstats = np.loadtxt(datapath('sstats_test.txt')) + sstats = np.loadtxt(datapath('DTM/sstats_test.txt')) dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] self.ldaseq = ldaseqmodel.LdaSeqModel( diff --git a/gensim/test/test_ldavowpalwabbit_wrapper.py b/gensim/test/test_ldavowpalwabbit_wrapper.py index d14723de59..5f898246e4 100644 --- a/gensim/test/test_ldavowpalwabbit_wrapper.py +++ b/gensim/test/test_ldavowpalwabbit_wrapper.py @@ -25,10 +25,7 @@ import gensim.models.wrappers.ldavowpalwabbit as ldavowpalwabbit from gensim.models.wrappers.ldavowpalwabbit import LdaVowpalWabbit - - -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) +from gensim.test.utils import datapath # set up vars used in testing ("Deerwester" from the web tutorial) diff --git a/gensim/test/test_lee.py b/gensim/test/test_lee.py index 33cce71e52..ff0de9dc3f 100644 --- a/gensim/test/test_lee.py +++ b/gensim/test/test_lee.py @@ -27,6 +27,7 @@ import logging import os.path import unittest +from functools import partial import numpy as np @@ -50,7 +51,7 @@ def setUp(self): sim_file = 'similarities0-1.txt' # read in the corpora - latin1 = lambda line: utils.to_unicode(line, encoding='latin1') + latin1 = partial(utils.to_unicode, encoding='latin1') with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f: bg_corpus = preprocess_documents(latin1(line) for line in f) with utils.smart_open(os.path.join(pre_path, corpus_file)) as f: diff --git a/gensim/test/test_logentropy_model.py b/gensim/test/test_logentropy_model.py index 22ca09be0d..bc64f1b2d1 100644 --- a/gensim/test/test_logentropy_model.py +++ b/gensim/test/test_logentropy_model.py @@ -11,44 +11,17 @@ import logging import unittest -import os -import os.path -import tempfile - import numpy as np -from gensim.corpora import mmcorpus, Dictionary +from gensim.corpora.mmcorpus 
import MmCorpus from gensim.models import logentropy_model - -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) - - -# set up vars used in testing ("Deerwester" from the web tutorial) -texts = [ - ['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'] -] -dictionary = Dictionary(texts) -corpus = [dictionary.doc2bow(text) for text in texts] - - -def testfile(): - # temporary data will be stored to this file - return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') +from gensim.test.utils import datapath, get_tmpfile class TestLogEntropyModel(unittest.TestCase): def setUp(self): - self.corpus_small = mmcorpus.MmCorpus(datapath('test_corpus_small.mm')) - self.corpus_ok = mmcorpus.MmCorpus(datapath('test_corpus_ok.mm')) + self.corpus_small = MmCorpus(datapath('test_corpus_small.mm')) + self.corpus_ok = MmCorpus(datapath('test_corpus_ok.mm')) def testTransform(self): # create the transformation model @@ -66,7 +39,7 @@ def testTransform(self): self.assertTrue(np.allclose(transformed, expected)) def testPersistence(self): - fname = testfile() + fname = get_tmpfile('gensim_models_logentry.tst') model = logentropy_model.LogEntropyModel(self.corpus_ok, normalize=True) model.save(fname) model2 = logentropy_model.LogEntropyModel.load(fname) @@ -75,7 +48,7 @@ def testPersistence(self): self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) def testPersistenceCompressed(self): - fname = testfile() + '.gz' + fname = get_tmpfile('gensim_models_logentry.tst.gz') model = logentropy_model.LogEntropyModel(self.corpus_ok, normalize=True) model.save(fname) model2 = 
logentropy_model.LogEntropyModel.load(fname, mmap=None) diff --git a/gensim/test/test_lsimodel.py b/gensim/test/test_lsimodel.py index ed537feaa8..3ff59c1e43 100644 --- a/gensim/test/test_lsimodel.py +++ b/gensim/test/test_lsimodel.py @@ -10,50 +10,21 @@ import logging -import os -import os.path -import tempfile import unittest import numpy as np import scipy.linalg from gensim import matutils -from gensim.corpora import mmcorpus, Dictionary +from gensim.corpora.mmcorpus import MmCorpus from gensim.models import lsimodel from gensim.test import basetmtests - -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder - - -def datapath(fname): - return os.path.join(module_path, 'test_data', fname) - - -# set up vars used in testing ("Deerwester" from the web tutorial) -texts = [ - ['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'] -] -dictionary = Dictionary(texts) -corpus = [dictionary.doc2bow(text) for text in texts] - - -def testfile(): - # temporary data will be stored to this file - return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') +from gensim.test.utils import datapath, get_tmpfile class TestLsiModel(unittest.TestCase, basetmtests.TestBaseTopicModel): def setUp(self): - self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) + self.corpus = MmCorpus(datapath('testcorpus.mm')) self.model = lsimodel.LsiModel(self.corpus, num_topics=2) def testTransform(self): @@ -143,7 +114,7 @@ def testOnlineTransform(self): self.assertTrue(np.allclose(abs(vec1), abs(vec2), atol=1e-5)) # the two LSI representations must equal up to sign def testPersistence(self): - fname = testfile() + fname = get_tmpfile('gensim_models_lsi.tst') model = self.model 
model.save(fname) model2 = lsimodel.LsiModel.load(fname) @@ -154,7 +125,7 @@ def testPersistence(self): self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector def testPersistenceCompressed(self): - fname = testfile() + '.gz' + fname = get_tmpfile('gensim_models_lsi.tst.gz') model = self.model model.save(fname) model2 = lsimodel.LsiModel.load(fname, mmap=None) @@ -165,7 +136,7 @@ def testPersistenceCompressed(self): self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector def testLargeMmap(self): - fname = testfile() + fname = get_tmpfile('gensim_models_lsi.tst') model = self.model # test storing the internal arrays into separate files @@ -182,7 +153,7 @@ def testLargeMmap(self): self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector def testLargeMmapCompressed(self): - fname = testfile() + '.gz' + fname = get_tmpfile('gensim_models_lsi.tst.gz') model = self.model # test storing the internal arrays into separate files diff --git a/gensim/test/test_miislita.py b/gensim/test/test_miislita.py index dd660f629f..344da1adb3 100644 --- a/gensim/test/test_miislita.py +++ b/gensim/test/test_miislita.py @@ -17,22 +17,14 @@ import logging import os -import tempfile import unittest from gensim import utils, corpora, models, similarities - -# sample data files are located in the same folder -module_path = os.path.dirname(__file__) -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) +from gensim.test.utils import datapath, get_tmpfile logger = logging.getLogger('test_miislita') -def get_tmpfile(suffix): - return os.path.join(tempfile.gettempdir(), suffix) - - class CorpusMiislita(corpora.TextCorpus): stoplist = set('for a of the and to in on'.split()) diff --git a/gensim/test/test_normmodel.py b/gensim/test/test_normmodel.py index 339680d085..fa7a4096fd 100644 --- a/gensim/test/test_normmodel.py +++ b/gensim/test/test_normmodel.py @@ -11,9 +11,6 @@ 
import logging import unittest -import os -import os.path -import tempfile import numpy as np from scipy.sparse import csr_matrix @@ -21,14 +18,7 @@ from gensim.corpora import mmcorpus from gensim.models import normmodel - -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) - - -def testfile(): - # temporary data will be stored to this file - return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') +from gensim.test.utils import datapath, get_tmpfile class TestNormModel(unittest.TestCase): @@ -140,7 +130,7 @@ def testInit(self): self.assertRaises(ValueError, normmodel.NormModel, self.corpus, 'l0') def testPersistence(self): - fname = testfile() + fname = get_tmpfile('gensim_models.tst') model = normmodel.NormModel(self.corpus) model.save(fname) model2 = normmodel.NormModel.load(fname) @@ -149,7 +139,7 @@ def testPersistence(self): self.assertTrue(np.allclose(model.normalize(tstvec), model2.normalize(tstvec))) # try projecting an empty vector def testPersistenceCompressed(self): - fname = testfile() + '.gz' + fname = get_tmpfile('gensim_models.tst.gz') model = normmodel.NormModel(self.corpus) model.save(fname) model2 = normmodel.NormModel.load(fname, mmap=None) diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index faf0127297..e3a69760ca 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -15,13 +15,11 @@ from gensim import utils from gensim.models.phrases import SentenceAnalyzer, Phrases, Phraser, pseudocorpus +from gensim.test.utils import common_texts if sys.version_info[0] >= 3: unicode = str -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) - class TestUtils(unittest.TestCase): @@ -137,17 +135,8 @@ def test_analysis_common_terms_in_between(self): class 
PhrasesData: - sentences = [ - ['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'], - ['graph', 'minors', 'survey', 'human', 'interface'] # test bigrams within same sentence + sentences = common_texts + [ + ['graph', 'minors', 'survey', 'human', 'interface'] ] unicode_sentences = [[utils.to_unicode(w) for w in sentence] for sentence in sentences] common_terms = frozenset() diff --git a/gensim/test/test_rpmodel.py b/gensim/test/test_rpmodel.py index 94c1abce84..6d09dbcb84 100644 --- a/gensim/test/test_rpmodel.py +++ b/gensim/test/test_rpmodel.py @@ -11,44 +11,18 @@ import logging import unittest -import os -import os.path -import tempfile import numpy as np -from gensim.corpora import mmcorpus, Dictionary +from gensim.corpora.mmcorpus import MmCorpus from gensim.models import rpmodel from gensim import matutils - -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) - - -# set up vars used in testing ("Deerwester" from the web tutorial) -texts = [ - ['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'] -] -dictionary = Dictionary(texts) -corpus = [dictionary.doc2bow(text) for text in texts] - - -def testfile(): - # temporary data will be stored to this file - return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') +from gensim.test.utils import datapath, get_tmpfile class TestRpModel(unittest.TestCase): def setUp(self): - 
self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) + self.corpus = MmCorpus(datapath('testcorpus.mm')) def testTransform(self): # create the transformation model @@ -64,7 +38,7 @@ def testTransform(self): self.assertTrue(np.allclose(vec, expected)) # transformed entries must be equal up to sign def testPersistence(self): - fname = testfile() + fname = get_tmpfile('gensim_models.tst') model = rpmodel.RpModel(self.corpus, num_topics=2) model.save(fname) model2 = rpmodel.RpModel.load(fname) @@ -74,7 +48,7 @@ def testPersistence(self): self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector def testPersistenceCompressed(self): - fname = testfile() + '.gz' + fname = get_tmpfile('gensim_models.tst.gz') model = rpmodel.RpModel(self.corpus, num_topics=2) model.save(fname) model2 = rpmodel.RpModel.load(fname, mmap=None) diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py index 93c0f8a3f7..5c54685c8e 100644 --- a/gensim/test/test_similarities.py +++ b/gensim/test/test_similarities.py @@ -12,18 +12,18 @@ import logging import unittest import os -import tempfile import numpy import scipy -from gensim.corpora import Dictionary from gensim.models import word2vec from gensim.models import doc2vec from gensim.models import KeyedVectors from gensim.models.wrappers import fasttext from gensim import matutils, similarities from gensim.models import Word2Vec +from gensim.test.utils import (datapath, get_tmpfile, + common_texts as texts, common_dictionary as dictionary, common_corpus as corpus) try: from pyemd import emd # noqa:F401 @@ -31,33 +31,9 @@ except ImportError: PYEMD_EXT = False -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) - - -# set up vars used in testing ("Deerwester" from the web tutorial) -texts = [ - ['human', 'interface', 'computer'], - ['survey', 'user', 
'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'] -] -dictionary = Dictionary(texts) -corpus = [dictionary.doc2bow(text) for text in texts] - sentences = [doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(texts)] -def testfile(): - # temporary data will be stored to this file - return os.path.join(tempfile.gettempdir(), 'gensim_similarities.tst.pkl') - - class _TestSimilarityABC(object): """ Base class for SparseMatrixSimilarity and MatrixSimilarity unit tests. @@ -178,7 +154,7 @@ def testPersistency(self): if self.cls == similarities.WmdSimilarity and not PYEMD_EXT: return - fname = testfile() + fname = get_tmpfile('gensim_similarities.tst.pkl') if self.cls == similarities.Similarity: index = self.cls(None, corpus, num_features=len(dictionary), shardsize=5) elif self.cls == similarities.WmdSimilarity: @@ -203,7 +179,7 @@ def testPersistencyCompressed(self): if self.cls == similarities.WmdSimilarity and not PYEMD_EXT: return - fname = testfile() + '.gz' + fname = get_tmpfile('gensim_similarities.tst.pkl.gz') if self.cls == similarities.Similarity: index = self.cls(None, corpus, num_features=len(dictionary), shardsize=5) elif self.cls == similarities.WmdSimilarity: @@ -228,7 +204,7 @@ def testLarge(self): if self.cls == similarities.WmdSimilarity and not PYEMD_EXT: return - fname = testfile() + fname = get_tmpfile('gensim_similarities.tst.pkl') if self.cls == similarities.Similarity: index = self.cls(None, corpus, num_features=len(dictionary), shardsize=5) elif self.cls == similarities.WmdSimilarity: @@ -255,7 +231,7 @@ def testLargeCompressed(self): if self.cls == similarities.WmdSimilarity and not PYEMD_EXT: return - fname = testfile() + '.gz' + fname = get_tmpfile('gensim_similarities.tst.pkl.gz') if self.cls == similarities.Similarity: index = 
self.cls(None, corpus, num_features=len(dictionary), shardsize=5) elif self.cls == similarities.WmdSimilarity: @@ -282,7 +258,7 @@ def testMmap(self): if self.cls == similarities.WmdSimilarity and not PYEMD_EXT: return - fname = testfile() + fname = get_tmpfile('gensim_similarities.tst.pkl') if self.cls == similarities.Similarity: index = self.cls(None, corpus, num_features=len(dictionary), shardsize=5) elif self.cls == similarities.WmdSimilarity: @@ -310,7 +286,7 @@ def testMmapCompressed(self): if self.cls == similarities.WmdSimilarity and not PYEMD_EXT: return - fname = testfile() + '.gz' + fname = get_tmpfile('gensim_similarities.tst.pkl.gz') if self.cls == similarities.Similarity: index = self.cls(None, corpus, num_features=len(dictionary), shardsize=5) elif self.cls == similarities.WmdSimilarity: @@ -545,7 +521,7 @@ def assertApproxNeighborsMatchExact(self, model, wv, index): self.assertEqual(approx_words, exact_words) def assertIndexSaved(self, index): - fname = testfile() + fname = get_tmpfile('gensim_similarities.tst.pkl') index.save(fname) self.assertTrue(os.path.exists(fname)) self.assertTrue(os.path.exists(fname + '.d')) @@ -553,7 +529,7 @@ def assertIndexSaved(self, index): def assertLoadedIndexEqual(self, index, model): from gensim.similarities.index import AnnoyIndexer - fname = testfile() + fname = get_tmpfile('gensim_similarities.tst.pkl') index.save(fname) index2 = AnnoyIndexer() @@ -598,7 +574,7 @@ def testApproxNeighborsMatchExact(self): self.assertEqual(approx_words, exact_words) def testSave(self): - fname = testfile() + fname = get_tmpfile('gensim_similarities.tst.pkl') self.index.save(fname) self.assertTrue(os.path.exists(fname)) self.assertTrue(os.path.exists(fname + '.d')) @@ -612,7 +588,7 @@ def testLoadNotExist(self): def testSaveLoad(self): from gensim.similarities.index import AnnoyIndexer - fname = testfile() + fname = get_tmpfile('gensim_similarities.tst.pkl') self.index.save(fname) self.index2 = AnnoyIndexer() diff --git 
a/gensim/test/test_similarity_metrics.py b/gensim/test/test_similarity_metrics.py index 27066ff09d..2f7b39857c 100644 --- a/gensim/test/test_similarity_metrics.py +++ b/gensim/test/test_similarity_metrics.py @@ -16,27 +16,9 @@ from scipy.sparse import csr_matrix import numpy as np import math -import os -from gensim.corpora import mmcorpus, Dictionary +from gensim.corpora.mmcorpus import MmCorpus from gensim.models import ldamodel - -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) - -# set up vars used in testing ("Deerwester" from the web tutorial) -texts = [ - ['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'] -] -dictionary = Dictionary(texts) -corpus = [dictionary.doc2bow(text) for text in texts] +from gensim.test.utils import datapath, common_dictionary, common_corpus class TestIsBow(unittest.TestCase): @@ -94,9 +76,9 @@ def test_bow(self): class TestHellinger(unittest.TestCase): def setUp(self): - self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) + self.corpus = MmCorpus(datapath('testcorpus.mm')) self.class_ = ldamodel.LdaModel - self.model = self.class_(corpus, id2word=dictionary, num_topics=2, passes=100) + self.model = self.class_(common_corpus, id2word=common_dictionary, num_topics=2, passes=100) def test_inputs(self): @@ -146,7 +128,7 @@ def test_distributions(self): # testing LDA distribution vectors np.random.seed(0) - model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes=100) + model = self.class_(self.corpus, id2word=common_dictionary, num_topics=2, passes=100) lda_vec1 = model[[(1, 2), (2, 3)]] lda_vec2 = model[[(2, 2), (1, 3)]] result 
= matutils.hellinger(lda_vec1, lda_vec2) @@ -156,9 +138,9 @@ def test_distributions(self): class TestKL(unittest.TestCase): def setUp(self): - self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) + self.corpus = MmCorpus(datapath('testcorpus.mm')) self.class_ = ldamodel.LdaModel - self.model = self.class_(corpus, id2word=dictionary, num_topics=2, passes=100) + self.model = self.class_(common_corpus, id2word=common_dictionary, num_topics=2, passes=100) def test_inputs(self): @@ -214,7 +196,7 @@ def test_distributions(self): # testing LDA distribution vectors np.random.seed(0) - model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes=100) + model = self.class_(self.corpus, id2word=common_dictionary, num_topics=2, passes=100) lda_vec1 = model[[(1, 2), (2, 3)]] lda_vec2 = model[[(2, 2), (1, 3)]] result = matutils.kullback_leibler(lda_vec1, lda_vec2) diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index c3209ece10..ff4a6a2202 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -1,6 +1,5 @@ import unittest import numpy -import os import codecs import pickle @@ -25,10 +24,7 @@ from gensim.sklearn_api.phrases import PhrasesTransformer from gensim.corpora import mmcorpus, Dictionary from gensim import matutils, models - -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) -datapath_ldaseq = lambda fname: os.path.join(module_path, 'test_data/DTM', fname) +from gensim.test.utils import datapath, common_texts texts = [ ['complier', 'system', 'computer'], @@ -114,28 +110,9 @@ d2v_sentences = [models.doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(w2v_texts)] -dict_texts = [ - 'human interface computer', - 'survey user computer system response time', - 'eps user interface system', - 'system human system eps', - 'user response time', - 'trees', - 'graph 
trees', - 'graph minors trees', - 'graph minors survey' -] +dict_texts = [' '.join(text) for text in common_texts] -phrases_sentences = [ - ['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'], +phrases_sentences = common_texts + [ ['graph', 'minors', 'survey', 'human', 'interface'] ] diff --git a/gensim/test/test_text_analysis.py b/gensim/test/test_text_analysis.py index 93f00ae3a8..83df8ece57 100644 --- a/gensim/test/test_text_analysis.py +++ b/gensim/test/test_text_analysis.py @@ -5,6 +5,7 @@ from gensim.topic_coherence.text_analysis import ( InvertedIndexAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator, CorpusAccumulator) +from gensim.test.utils import common_texts class BaseTestCases(object): @@ -28,18 +29,7 @@ class TextAnalyzerTestBase(unittest.TestCase): dictionary.id2token = {v: k for k, v in token2id.items()} top_ids = set(token2id.values()) - texts2 = [ - ['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'], - ['user', 'user'] - ] + texts2 = common_texts + [['user', 'user']] dictionary2 = Dictionary(texts2) dictionary2.id2token = {v: k for k, v in dictionary2.token2id.items()} top_ids2 = set(dictionary2.token2id.values()) diff --git a/gensim/test/test_tfidfmodel.py b/gensim/test/test_tfidfmodel.py index 65e2939857..c308923c29 100644 --- a/gensim/test/test_tfidfmodel.py +++ b/gensim/test/test_tfidfmodel.py @@ -11,43 +11,17 @@ import logging import unittest -import os -import os.path -import tempfile import numpy as np 
-from gensim.corpora import mmcorpus, Dictionary +from gensim.corpora.mmcorpus import MmCorpus from gensim.models import tfidfmodel - -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) - - -# set up vars used in testing ("Deerwester" from the web tutorial) -texts = [ - ['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'] -] -dictionary = Dictionary(texts) -corpus = [dictionary.doc2bow(text) for text in texts] - - -def testfile(): - # temporary data will be stored to this file - return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') +from gensim.test.utils import datapath, get_tmpfile, common_dictionary, common_corpus class TestTfidfModel(unittest.TestCase): def setUp(self): - self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) + self.corpus = MmCorpus(datapath('testcorpus.mm')) def testTransform(self): # create the transformation model @@ -63,19 +37,20 @@ def testTransform(self): def testInit(self): # create the transformation model by analyzing a corpus # uses the global `corpus`! - model1 = tfidfmodel.TfidfModel(corpus) + model1 = tfidfmodel.TfidfModel(common_corpus) + dfs = common_dictionary.dfs # make sure the dfs<->idfs transformation works - self.assertEqual(model1.dfs, dictionary.dfs) - self.assertEqual(model1.idfs, tfidfmodel.precompute_idfs(model1.wglobal, dictionary.dfs, len(corpus))) + self.assertEqual(model1.dfs, dfs) + self.assertEqual(model1.idfs, tfidfmodel.precompute_idfs(model1.wglobal, dfs, len(common_corpus))) # create the transformation model by directly supplying a term->docfreq # mapping from the global var `dictionary`. 
- model2 = tfidfmodel.TfidfModel(dictionary=dictionary) + model2 = tfidfmodel.TfidfModel(dictionary=common_dictionary) self.assertEqual(model1.idfs, model2.idfs) def testPersistence(self): - fname = testfile() + fname = get_tmpfile('gensim_models.tst') model = tfidfmodel.TfidfModel(self.corpus, normalize=True) model.save(fname) model2 = tfidfmodel.TfidfModel.load(fname) @@ -84,7 +59,7 @@ def testPersistence(self): self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector def testPersistenceCompressed(self): - fname = testfile() + '.gz' + fname = get_tmpfile('gensim_models.tst.gz') model = tfidfmodel.TfidfModel(self.corpus, normalize=True) model.save(fname) model2 = tfidfmodel.TfidfModel.load(fname, mmap=None) diff --git a/gensim/test/test_tmdiff.py b/gensim/test/test_tmdiff.py index 67ba174361..f49c930a63 100644 --- a/gensim/test/test_tmdiff.py +++ b/gensim/test/test_tmdiff.py @@ -8,25 +8,14 @@ import unittest import numpy as np -from gensim.corpora import Dictionary from gensim.models import LdaModel +from gensim.test.utils import common_dictionary, common_corpus class TestLdaDiff(unittest.TestCase): def setUp(self): - texts = [ - ['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'], - ] - self.dictionary = Dictionary(texts) - self.corpus = [self.dictionary.doc2bow(text) for text in texts] + self.dictionary = common_dictionary + self.corpus = common_corpus self.num_topics = 5 self.n_ann_terms = 10 self.model = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=self.num_topics, passes=10) diff --git a/gensim/test/test_translation_matrix.py b/gensim/test/test_translation_matrix.py index b6c24d1c4b..2c68f2c5c1 100644 --- a/gensim/test/test_translation_matrix.py 
+++ b/gensim/test/test_translation_matrix.py @@ -1,25 +1,17 @@ #!/usr/bin/env python # encoding: utf-8 -import os +from collections import namedtuple import unittest -import tempfile -import numpy as np import math +import numpy as np + from scipy.spatial.distance import cosine -from collections import namedtuple from gensim.models.doc2vec import Doc2Vec from gensim import utils from gensim.models import translation_matrix from gensim.models import KeyedVectors - -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) - - -def temp_save_file(): - # temporary data will be stored to this file - return os.path.join(tempfile.gettempdir(), 'transmat-en-it.pkl') +from gensim.test.utils import datapath, get_tmpfile class TestTranslationMatrix(unittest.TestCase): @@ -45,11 +37,13 @@ def test_translation_matrix(self): def testPersistence(self): """Test storing/loading the entire model.""" + tmpf = get_tmpfile('transmat-en-it.pkl') + model = translation_matrix.TranslationMatrix(self.source_word_vec, self.target_word_vec, self.word_pairs) model.train(self.word_pairs) - model.save(temp_save_file()) + model.save(tmpf) - loaded_model = translation_matrix.TranslationMatrix.load(temp_save_file()) + loaded_model = translation_matrix.TranslationMatrix.load(tmpf) self.assertTrue(np.allclose(model.translation_matrix, loaded_model.translation_matrix)) def test_translate_nn(self): diff --git a/gensim/test/test_varembed_wrapper.py b/gensim/test/test_varembed_wrapper.py index 2053f7ffc3..c94c2bbcdb 100644 --- a/gensim/test/test_varembed_wrapper.py +++ b/gensim/test/test_varembed_wrapper.py @@ -11,7 +11,6 @@ """ import logging -import os import sys import numpy as np @@ -19,15 +18,14 @@ import unittest from gensim.models.wrappers import varembed +from gensim.test.utils import datapath try: import morfessor # noqa: F401 except ImportError: raise unittest.SkipTest("Test 
requires Morfessor to be installed, which is not available") -# needed because sample data files are located in the same folder -module_path = os.path.dirname(__file__) -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) + varembed_model_vector_file = datapath('varembed_vectors.pkl') varembed_model_morfessor_file = datapath('varembed_morfessor.bin') diff --git a/gensim/test/test_wikicorpus.py b/gensim/test/test_wikicorpus.py index ede577a52b..e7b7b14011 100644 --- a/gensim/test/test_wikicorpus.py +++ b/gensim/test/test_wikicorpus.py @@ -9,15 +9,13 @@ """ -import os import logging import unittest from gensim.corpora.wikicorpus import WikiCorpus from gensim import utils +from gensim.test.utils import datapath -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) FILENAME = 'enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2' FILENAME_U = 'bgwiki-latest-pages-articles-shortened.xml.bz2' diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index 4c642ce5d2..a3720237c3 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -12,7 +12,6 @@ import logging import unittest import os -import tempfile import bz2 import sys @@ -20,6 +19,7 @@ from gensim import utils from gensim.models import word2vec, keyedvectors +from gensim.test.utils import datapath, get_tmpfile, common_texts as sentences from testfixtures import log_capture try: @@ -28,9 +28,6 @@ except ImportError: PYEMD_EXT = False -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) - class LeeCorpus(object): def __iter__(self): @@ -41,18 +38,6 @@ def __iter__(self): list_corpus = list(LeeCorpus()) -sentences = [ - ['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 
'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'] -] - new_sentences = [ ['computer', 'artificial', 'intelligence'], ['artificial', 'trees'], @@ -63,11 +48,6 @@ def __iter__(self): ] -def testfile(): - # temporary data will be stored to this file - return os.path.join(tempfile.gettempdir(), 'gensim_word2vec.tst') - - def _rule(word, count, min_count): if word == "human": return utils.RULE_DISCARD # throw out @@ -77,10 +57,11 @@ def _rule(word, count, min_count): def load_on_instance(): # Save and load a Word2Vec Model on instance for test + tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) - model.save(testfile()) + model.save(tmpf) model = word2vec.Word2Vec() # should fail at this point - return model.load(testfile()) + return model.load(tmpf) class TestWord2VecModel(unittest.TestCase): @@ -148,9 +129,10 @@ def testOnlineLearning(self): def testOnlineLearningAfterSave(self): """Test that the algorithm is able to add new words to the vocabulary and to a trained model when using a sorted vocabulary""" + tmpf = get_tmpfile('gensim_word2vec.tst') model_neg = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=0, negative=5) - model_neg.save(testfile()) - model_neg = word2vec.Word2Vec.load(testfile()) + model_neg.save(tmpf) + model_neg = word2vec.Word2Vec.load(tmpf) self.assertTrue(len(model_neg.wv.vocab), 12) model_neg.build_vocab(new_sentences, update=True) model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter) @@ -203,21 +185,23 @@ def test_cbow_neg_online(self): def testPersistence(self): """Test storing/loading the entire model.""" + tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) - model.save(testfile()) - self.models_equal(model, 
word2vec.Word2Vec.load(testfile())) + model.save(tmpf) + self.models_equal(model, word2vec.Word2Vec.load(tmpf)) # test persistence of the KeyedVectors of a model wv = model.wv - wv.save(testfile()) - loaded_wv = keyedvectors.KeyedVectors.load(testfile()) + wv.save(tmpf) + loaded_wv = keyedvectors.KeyedVectors.load(tmpf) self.assertTrue(np.allclose(wv.syn0, loaded_wv.syn0)) self.assertEqual(len(wv.vocab), len(loaded_wv.vocab)) def testPersistenceWithConstructorRule(self): """Test storing/loading the entire model with a vocab trimming rule passed in the constructor.""" + tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1, trim_rule=_rule) - model.save(testfile()) - self.models_equal(model, word2vec.Word2Vec.load(testfile())) + model.save(tmpf) + self.models_equal(model, word2vec.Word2Vec.load(tmpf)) def testRuleWithMinCount(self): """Test that returning RULE_DEFAULT from trim_rule triggers min_count.""" @@ -234,21 +218,24 @@ def testRule(self): def testLambdaRule(self): """Test that lambda trim_rule works.""" - rule = lambda word, count, min_count: utils.RULE_DISCARD if word == "human" else utils.RULE_DEFAULT + def rule(word, count, min_count): + return utils.RULE_DISCARD if word == "human" else utils.RULE_DEFAULT + model = word2vec.Word2Vec(sentences, min_count=1, trim_rule=rule) self.assertTrue("human" not in model.wv.vocab) def testSyn0NormNotSaved(self): """Test syn0norm isn't saved in model file""" + tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() - model.save(testfile()) - loaded_model = word2vec.Word2Vec.load(testfile()) + model.save(tmpf) + loaded_model = word2vec.Word2Vec.load(tmpf) self.assertTrue(loaded_model.wv.syn0norm is None) wv = model.wv - wv.save(testfile()) - loaded_kv = keyedvectors.KeyedVectors.load(testfile()) + wv.save(tmpf) + loaded_kv = keyedvectors.KeyedVectors.load(tmpf) self.assertTrue(loaded_kv.syn0norm is None) def 
testLoadPreKeyedVectorModel(self): @@ -280,34 +267,36 @@ def testLoadPreKeyedVectorModelCFormat(self): def testPersistenceWord2VecFormat(self): """Test storing/loading the entire model in word2vec format.""" + tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() - model.wv.save_word2vec_format(testfile(), binary=True) - binary_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True) + model.wv.save_word2vec_format(tmpf, binary=True) + binary_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True) binary_model_kv.init_sims(replace=False) self.assertTrue(np.allclose(model['human'], binary_model_kv['human'])) - norm_only_model = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True) + norm_only_model = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True) norm_only_model.init_sims(replace=True) self.assertFalse(np.allclose(model['human'], norm_only_model['human'])) self.assertTrue(np.allclose(model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human'])) - limited_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True, limit=3) + limited_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True, limit=3) self.assertEqual(len(limited_model_kv.syn0), 3) half_precision_model_kv = keyedvectors.KeyedVectors.load_word2vec_format( - testfile(), binary=True, datatype=np.float16 + tmpf, binary=True, datatype=np.float16 ) self.assertEqual(binary_model_kv.syn0.nbytes, half_precision_model_kv.syn0.nbytes * 2) def testNoTrainingCFormat(self): + tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() - model.wv.save_word2vec_format(testfile(), binary=True) - kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True) + model.wv.save_word2vec_format(tmpf, binary=True) + kv = 
keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True) binary_model = word2vec.Word2Vec() binary_model.wv = kv self.assertRaises(ValueError, binary_model.train, sentences) def testTooShortBinaryWord2VecFormat(self): - tfile = testfile() + tfile = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() model.wv.save_word2vec_format(tfile, binary=True) @@ -317,7 +306,7 @@ def testTooShortBinaryWord2VecFormat(self): self.assertRaises(EOFError, keyedvectors.KeyedVectors.load_word2vec_format, tfile, binary=True) def testTooShortTextWord2VecFormat(self): - tfile = testfile() + tfile = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() model.wv.save_word2vec_format(tfile, binary=False) @@ -328,13 +317,14 @@ def testTooShortTextWord2VecFormat(self): def testPersistenceWord2VecFormatNonBinary(self): """Test storing/loading the entire model in word2vec non-binary format.""" + tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() - model.wv.save_word2vec_format(testfile(), binary=False) - text_model = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=False) + model.wv.save_word2vec_format(tmpf, binary=False) + text_model = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=False) text_model.init_sims(False) self.assertTrue(np.allclose(model['human'], text_model['human'], atol=1e-6)) - norm_only_model = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=False) + norm_only_model = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=False) norm_only_model.init_sims(True) self.assertFalse(np.allclose(model['human'], norm_only_model['human'], atol=1e-6)) self.assertTrue(np.allclose( @@ -343,44 +333,48 @@ def testPersistenceWord2VecFormatNonBinary(self): def testPersistenceWord2VecFormatWithVocab(self): """Test storing/loading the entire model and vocabulary in word2vec 
format.""" + tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() - testvocab = os.path.join(tempfile.gettempdir(), 'gensim_word2vec.vocab') - model.wv.save_word2vec_format(testfile(), testvocab, binary=True) - binary_model_with_vocab_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), testvocab, binary=True) + testvocab = get_tmpfile('gensim_word2vec.vocab') + model.wv.save_word2vec_format(tmpf, testvocab, binary=True) + binary_model_with_vocab_kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, testvocab, binary=True) self.assertEqual(model.wv.vocab['human'].count, binary_model_with_vocab_kv.vocab['human'].count) def testPersistenceKeyedVectorsFormatWithVocab(self): """Test storing/loading the entire model and vocabulary in word2vec format.""" + tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() - testvocab = os.path.join(tempfile.gettempdir(), 'gensim_word2vec.vocab') - model.wv.save_word2vec_format(testfile(), testvocab, binary=True) - kv_binary_model_with_vocab = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), testvocab, binary=True) + testvocab = get_tmpfile('gensim_word2vec.vocab') + model.wv.save_word2vec_format(tmpf, testvocab, binary=True) + kv_binary_model_with_vocab = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, testvocab, binary=True) self.assertEqual(model.wv.vocab['human'].count, kv_binary_model_with_vocab.vocab['human'].count) def testPersistenceWord2VecFormatCombinationWithStandardPersistence(self): """Test storing/loading the entire model and vocabulary in word2vec format chained with saving and loading via `save` and `load` methods`. 
It was possible prior to 1.0.0 release, now raises Exception""" + tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() - testvocab = os.path.join(tempfile.gettempdir(), 'gensim_word2vec.vocab') - model.wv.save_word2vec_format(testfile(), testvocab, binary=True) - binary_model_with_vocab_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), testvocab, binary=True) - binary_model_with_vocab_kv.save(testfile()) - self.assertRaises(AttributeError, word2vec.Word2Vec.load, testfile()) + testvocab = get_tmpfile('gensim_word2vec.vocab') + model.wv.save_word2vec_format(tmpf, testvocab, binary=True) + binary_model_with_vocab_kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, testvocab, binary=True) + binary_model_with_vocab_kv.save(tmpf) + self.assertRaises(AttributeError, word2vec.Word2Vec.load, tmpf) def testLargeMmap(self): """Test storing/loading the entire model.""" + tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) # test storing the internal arrays into separate files - model.save(testfile(), sep_limit=0) - self.models_equal(model, word2vec.Word2Vec.load(testfile())) + model.save(tmpf, sep_limit=0) + self.models_equal(model, word2vec.Word2Vec.load(tmpf)) # make sure mmaping the arrays back works, too - self.models_equal(model, word2vec.Word2Vec.load(testfile(), mmap='r')) + self.models_equal(model, word2vec.Word2Vec.load(tmpf, mmap='r')) def testVocab(self): """Test word2vec vocabulary building.""" @@ -672,9 +666,10 @@ def testDeleteTemporaryTrainingData(self): self.assertTrue(not hasattr(model, 'syn0_lockf')) def testNormalizeAfterTrainingData(self): + tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) - model.save(testfile()) - norm_only_model = word2vec.Word2Vec.load(testfile()) + model.save(tmpf) + norm_only_model = word2vec.Word2Vec.load(tmpf) 
norm_only_model.delete_temporary_training_data(replace_word_vectors_with_normalized=True) self.assertFalse(np.allclose(model['human'], norm_only_model['human'])) @@ -690,9 +685,10 @@ def testPredictOutputWord(self): self.assertEqual(predictions_out_of_vocab, None) # when required model parameters have been deleted + tmpf = get_tmpfile('gensim_word2vec.tst') model_with_neg.init_sims() - model_with_neg.wv.save_word2vec_format(testfile(), binary=True) - kv_model_with_neg = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True) + model_with_neg.wv.save_word2vec_format(tmpf, binary=True) + kv_model_with_neg = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True) binary_model_with_neg = word2vec.Word2Vec() binary_model_with_neg.wv = kv_model_with_neg self.assertRaises(RuntimeError, binary_model_with_neg.predict_output_word, ['system', 'human']) diff --git a/gensim/test/test_wordrank_wrapper.py b/gensim/test/test_wordrank_wrapper.py index 4ecb9f7c70..634afa71cc 100644 --- a/gensim/test/test_wordrank_wrapper.py +++ b/gensim/test/test_wordrank_wrapper.py @@ -12,19 +12,11 @@ import logging import unittest import os -import tempfile import numpy from gensim.models.wrappers import wordrank - -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) - - -def testfile(): - # temporary model will be stored to this file - return os.path.join(tempfile.gettempdir(), 'gensim_wordrank.test') +from gensim.test.utils import datapath, get_tmpfile class TestWordrank(unittest.TestCase): @@ -61,8 +53,9 @@ def testPersistence(self): """Test storing/loading the entire model""" if not self.wr_path: return - self.test_model.save(testfile()) - loaded = wordrank.Wordrank.load(testfile()) + tmpf = get_tmpfile('gensim_wordrank.test') + self.test_model.save(tmpf) + loaded = wordrank.Wordrank.load(tmpf) self.models_equal(self.test_model, loaded) def 
testSimilarity(self): diff --git a/gensim/test/utils.py b/gensim/test/utils.py new file mode 100644 index 0000000000..89fae9226e --- /dev/null +++ b/gensim/test/utils.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python +# encoding: utf-8 + + +""" +Common utils for tests +""" +import tempfile +import os + +from gensim.corpora import Dictionary + +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder + + +def datapath(fname): + """Return full path to the pre created file with test data (basically corpus).""" + return os.path.join(module_path, 'test_data', fname) + + +def get_tmpfile(suffix): + """ + Return the full path to a temporary file whose name is `suffix`, located in the system temporary directory. + + The file itself is not created. Store and reuse the returned path: repeated calls with the same suffix are not guaranteed to return the same path. + """ + return os.path.join(tempfile.gettempdir(), suffix) + + +# set up vars used in testing ("Deerwester" from the web tutorial) +common_texts = [ + ['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'] +] + +common_dictionary = Dictionary(common_texts) +common_corpus = [common_dictionary.doc2bow(text) for text in common_texts] diff --git a/gensim/utils.py b/gensim/utils.py index bca29e73cc..0627f4703c 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -288,10 +288,11 @@ def _load_specials(self, fname, mmap, compress, subname): opportunity to recursively included SaveLoad instances. """ - mmap_error = lambda x, y: IOError( - 'Cannot mmap compressed object %s in file %s. ' % (x, y) + - 'Use `load(fname, mmap=None)` or uncompress files manually.' - ) + def mmap_error(obj, filename): + return IOError( + 'Cannot mmap compressed object %s in file %s. 
' % (obj, filename) + + 'Use `load(fname, mmap=None)` or uncompress files manually.' + ) for attrib in getattr(self, '__recursive_saveloads', []): cfname = '.'.join((fname, attrib)) @@ -336,13 +337,8 @@ def _load_specials(self, fname, mmap, compress, subname): @staticmethod def _adapt_by_suffix(fname): """Give appropriate compress setting and filename formula""" - if fname.endswith('.gz') or fname.endswith('.bz2'): - compress = True - subname = lambda *args: '.'.join(list(args) + ['npz']) - else: - compress = False - subname = lambda *args: '.'.join(list(args) + ['npy']) - return compress, subname + compress, suffix = (True, 'npz') if fname.endswith('.gz') or fname.endswith('.bz2') else (False, 'npy') + return compress, lambda *args: '.'.join(args + (suffix,)) def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2): """ diff --git a/setup.cfg b/setup.cfg index a5d10487ef..26a4aa0132 100644 --- a/setup.cfg +++ b/setup.cfg @@ -8,4 +8,4 @@ artifact_indexes= http://17a25141cb7f75c18ee4-676a79255544e7711e0dd8bccdcdd1cb.r23.cf2.rackcdn.com [flake8] -ignore = E501,E731,E12,W503,E402 +ignore = E501,E12,W503