diff --git a/ez_setup.py b/ez_setup.py index 4da59fcd76..4251063fc0 100644 --- a/ez_setup.py +++ b/ez_setup.py @@ -32,19 +32,25 @@ DEFAULT_VERSION = "1.3.2" DEFAULT_URL = "https://pypi.python.org/packages/source/s/setuptools/" + def _python_cmd(*args): args = (sys.executable,) + args return subprocess.call(args) == 0 + def _check_call_py24(cmd, *args, **kwargs): res = subprocess.call(cmd, *args, **kwargs) + class CalledProcessError(Exception): pass if not res == 0: msg = "Command '%s' return non-zero exit status %d" % (cmd, res) raise CalledProcessError(msg) + + vars(subprocess).setdefault('check_call', _check_call_py24) + def _install(tarball, install_args=()): # extracting the tarball tmpdir = tempfile.mkdtemp() @@ -151,6 +157,7 @@ def use_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, return _do_download(version, download_base, to_dir, download_delay) + def _clean_check(cmd, target): """ Run the command to download target. If the command fails, clean up before @@ -163,6 +170,7 @@ def _clean_check(cmd, target): os.unlink(target) raise + def download_file_powershell(url, target): """ Download the file at url to target using Powershell (which will validate @@ -176,6 +184,7 @@ def download_file_powershell(url, target): ] _clean_check(cmd, target) + def has_powershell(): if platform.system() != 'Windows': return False @@ -184,50 +193,58 @@ def has_powershell(): try: try: subprocess.check_call(cmd, stdout=devnull, stderr=devnull) - except: + except Exception: return False finally: devnull.close() return True + download_file_powershell.viable = has_powershell + def download_file_curl(url, target): cmd = ['curl', url, '--silent', '--output', target] _clean_check(cmd, target) + def has_curl(): cmd = ['curl', '--version'] devnull = open(os.path.devnull, 'wb') try: try: subprocess.check_call(cmd, stdout=devnull, stderr=devnull) - except: + except Exception: return False finally: devnull.close() return True + download_file_curl.viable = has_curl + def download_file_wget(url, target): cmd = ['wget', url, '--quiet', '--output-document', target] _clean_check(cmd, target) + def has_wget(): cmd = ['wget', '--version'] devnull = open(os.path.devnull, 'wb') try: try: subprocess.check_call(cmd, stdout=devnull, stderr=devnull) - except: + except Exception: return False finally: devnull.close() return True + download_file_wget.viable = has_wget + def download_file_insecure(url, target): """ Use Python to download the file, even though it cannot authenticate the @@ -251,8 +268,10 @@ def download_file_insecure(url, target): if dst: dst.close() + download_file_insecure.viable = lambda: True + def get_best_downloader(): downloaders = [ download_file_powershell, @@ -265,6 +284,7 @@ def get_best_downloader(): if dl.viable(): return dl + def download_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, delay=15, downloader_factory=get_best_downloader): @@ -317,7 +337,7 @@ def _extractall(self, path=".", members=None): # Reverse sort directories. 
if sys.version_info < (2, 4): def sorter(dir1, dir2): - return cmp(dir1.name, dir2.name) + return cmp(dir1.name, dir2.name) # noqa:F821 directories.sort(sorter) directories.reverse() else: @@ -350,6 +370,7 @@ def _build_install_args(options): install_args.append('--user') return install_args + def _parse_args(): """ Parse the command line for options @@ -371,6 +392,7 @@ def _parse_args(): # positional arguments are ignored return options + def main(version=DEFAULT_VERSION): """Install or upgrade setuptools and EasyInstall""" options = _parse_args() @@ -378,5 +400,6 @@ def main(version=DEFAULT_VERSION): downloader_factory=options.downloader_factory) return _install(tarball, _build_install_args(options)) + if __name__ == '__main__': sys.exit(main()) diff --git a/gensim/__init__.py b/gensim/__init__.py index aa95da4a6e..c267afe4de 100644 --- a/gensim/__init__.py +++ b/gensim/__init__.py @@ -3,16 +3,19 @@ similarities within a corpus of documents. """ -from gensim import parsing, matutils, interfaces, corpora, models, similarities, summarization +from gensim import parsing, matutils, interfaces, corpora, models, similarities, summarization # noqa:F401 import logging __version__ = '2.3.0' + class NullHandler(logging.Handler): """For python versions <= 2.6; same as `logging.NullHandler` in 2.7.""" + def emit(self, record): pass + logger = logging.getLogger('gensim') -if len(logger.handlers) == 0: # To ensure reload() doesn't add another one +if len(logger.handlers) == 0: # To ensure reload() doesn't add another one logger.addHandler(NullHandler()) diff --git a/gensim/corpora/__init__.py b/gensim/corpora/__init__.py index a11a0df229..a5c54a65ff 100644 --- a/gensim/corpora/__init__.py +++ b/gensim/corpora/__init__.py @@ -3,15 +3,15 @@ """ # bring corpus classes directly into package namespace, to save some typing -from .indexedcorpus import IndexedCorpus # must appear before the other classes +from .indexedcorpus import IndexedCorpus # noqa:F401 must appear before the other classes -from .mmcorpus import MmCorpus -from .bleicorpus import BleiCorpus -from .svmlightcorpus import SvmLightCorpus -from .lowcorpus import LowCorpus -from .dictionary import Dictionary -from .hashdictionary import HashDictionary -from .wikicorpus import WikiCorpus -from .textcorpus import TextCorpus -from .ucicorpus import UciCorpus -from .malletcorpus import MalletCorpus +from .mmcorpus import MmCorpus # noqa:F401 +from .bleicorpus import BleiCorpus # noqa:F401 +from .svmlightcorpus import SvmLightCorpus # noqa:F401 +from .lowcorpus import LowCorpus # noqa:F401 +from .dictionary import Dictionary # noqa:F401 +from .hashdictionary import HashDictionary # noqa:F401 +from .wikicorpus import WikiCorpus # noqa:F401 +from .textcorpus import TextCorpus # noqa:F401 +from .ucicorpus import UciCorpus # noqa:F401 +from .malletcorpus import MalletCorpus # noqa:F401 diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index b84d080c40..327a36fc14 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -14,7 +14,7 @@ from os import path import logging -from gensim import interfaces, utils +from gensim import utils from gensim.corpora import IndexedCorpus from six.moves import xrange diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index a0b3f8d73e..d32276688b 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -333,14 +333,14 @@ def merge_with(self, other): old2new[other_id] = new_id try: self.dfs[new_id] += other.dfs[other_id] - except: + 
except Exception: # `other` isn't a Dictionary (probably just a dict) => ignore dfs, keep going pass try: self.num_docs += other.num_docs self.num_nnz += other.num_nnz self.num_pos += other.num_pos - except: + except Exception: pass import gensim.models diff --git a/gensim/corpora/hashdictionary.py b/gensim/corpora/hashdictionary.py index 0b15de5df6..63f966b3cd 100644 --- a/gensim/corpora/hashdictionary.py +++ b/gensim/corpora/hashdictionary.py @@ -118,7 +118,7 @@ def add_documents(self, documents): for docno, document in enumerate(documents): if docno % 10000 == 0: logger.info("adding document #%i to %s" % (docno, self)) - _ = self.doc2bow(document, allow_update=True) # ignore the result, here we only care about updating token ids + self.doc2bow(document, allow_update=True) # ignore the result, here we only care about updating token ids logger.info( "built %s from %i documents (total %i corpus positions)", self, self.num_docs, self.num_pos) diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py index e0e00e7663..62f29b25ed 100644 --- a/gensim/corpora/indexedcorpus.py +++ b/gensim/corpora/indexedcorpus.py @@ -51,7 +51,7 @@ def __init__(self, fname, index_fname=None): # change self.index into a numpy.ndarray to support fancy indexing self.index = numpy.asarray(self.index) logger.info("loaded corpus index from %s" % index_fname) - except: + except Exception: self.index = None self.length = None @@ -130,5 +130,4 @@ def __getitem__(self, docno): raise ValueError('Unrecognised value for docno, use either a single integer, a slice or a numpy.ndarray') - # endclass IndexedCorpus diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index b87f1108a2..315490cdcc 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -65,24 +65,24 @@ def __init__(self, fname, id2word=None, line2words=split_on_space): IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s" % fname) - self.fname = fname # input file, see class doc for format - self.line2words = line2words # how to translate lines into words (simply split on space by default) + self.fname = fname # input file, see class doc for format + self.line2words = line2words # how to translate lines into words (simply split on space by default) self.num_docs = self._calculate_num_docs() if not id2word: # build a list of all word types in the corpus (distinct words) logger.info("extracting vocabulary from the corpus") all_terms = set() - self.use_wordids = False # return documents as (word, wordCount) 2-tuples + self.use_wordids = False # return documents as (word, wordCount) 2-tuples for doc in self: all_terms.update(word for word, wordCnt in doc) - all_terms = sorted(all_terms) # sort the list of all words; rank in that list = word's integer id - self.id2word = dict(izip(xrange(len(all_terms)), all_terms)) # build a mapping of word id(int) -> word (string) + all_terms = sorted(all_terms) # sort the list of all words; rank in that list = word's integer id + self.id2word = dict(izip(xrange(len(all_terms)), all_terms)) # build a mapping of word id(int) -> word (string) else: logger.info("using provided word mapping (%i ids)" % len(id2word)) self.id2word = id2word self.num_terms = len(self.word2id) - self.use_wordids = True # return documents as (wordIndex, wordCount) 2-tuples + self.use_wordids = True # return documents as (wordIndex, wordCount) 2-tuples logger.info("loaded corpus with %i documents and %i terms from %s" % (self.num_docs, self.num_terms, fname)) @@ -135,7 +135,7 @@ def 
__iter__(self): """ with utils.smart_open(self.fname) as fin: for lineno, line in enumerate(fin): - if lineno > 0: # ignore the first line = number of documents + if lineno > 0: # ignore the first line = number of documents yield self.line2doc(line) @staticmethod diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py index f8410845e6..00333e9358 100644 --- a/gensim/corpora/malletcorpus.py +++ b/gensim/corpora/malletcorpus.py @@ -56,8 +56,8 @@ def __iter__(self): yield self.line2doc(line) def line2doc(self, line): - l = [word for word in utils.to_unicode(line).strip().split(' ') if word] - docid, doclang, words = l[0], l[1], l[2:] + splited_line = [word for word in utils.to_unicode(line).strip().split(' ') if word] + docid, doclang, words = splited_line[0], splited_line[1], splited_line[2:] doc = super(MalletCorpus, self).line2doc(' '.join(words)) diff --git a/gensim/corpora/mmcorpus.py b/gensim/corpora/mmcorpus.py index a9a879db3e..08e809443b 100644 --- a/gensim/corpora/mmcorpus.py +++ b/gensim/corpora/mmcorpus.py @@ -12,7 +12,7 @@ import logging -from gensim import interfaces, matutils +from gensim import matutils from gensim.corpora import IndexedCorpus @@ -23,6 +23,7 @@ class MmCorpus(matutils.MmReader, IndexedCorpus): """ Corpus in the Matrix Market format. """ + def __init__(self, fname): # avoid calling super(), too confusing IndexedCorpus.__init__(self, fname) diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py index 16ecaf3d12..255fc2b7fe 100644 --- a/gensim/corpora/sharded_corpus.py +++ b/gensim/corpora/sharded_corpus.py @@ -26,6 +26,12 @@ import scipy.sparse as sparse import time +from six.moves import xrange + +import gensim +from gensim.corpora import IndexedCorpus +from gensim.interfaces import TransformedCorpus + logger = logging.getLogger(__name__) #: Specifies which dtype should be used for serializing the shards. @@ -37,13 +43,6 @@ logger.info('Could not import Theano, will use standard float for default ShardedCorpus dtype.') -from six.moves import xrange - -import gensim -from gensim.corpora import IndexedCorpus -from gensim.interfaces import TransformedCorpus - - class ShardedCorpus(IndexedCorpus): """ This corpus is designed for situations where you need to train a model @@ -233,7 +232,7 @@ def __init__(self, output_prefix, corpus, dim=None, self.current_shard = None # The current shard itself (numpy ndarray) self.current_shard_n = None # Current shard is the current_shard_n-th self.current_offset = None # The index into the dataset which - # corresponds to index 0 of current shard + # corresponds to index 0 of current shard logger.info('Initializing sharded corpus with prefix ' '{0}'.format(output_prefix)) @@ -331,7 +330,7 @@ def save_shard(self, shard, n=None, filename=None): """ new_shard = False if n is None: - n = self.n_shards # Saving the *next* one by default. + n = self.n_shards # Saving the *next* one by default. new_shard = True if not filename: @@ -347,7 +346,7 @@ def load_shard(self, n): """ Load (unpickle) the n-th shard as the "live" part of the dataset into the Dataset object.""" - #logger.debug('ShardedCorpus loading shard {0}, ' + # logger.debug('ShardedCorpus loading shard {0}, ' # 'current shard: {1}'.format(n, self.current_shard_n)) # No-op if the shard is already open. @@ -416,7 +415,7 @@ def in_next(self, offset): """ if self.current_shard_n == self.n_shards: - return False # There's no next shard. + return False # There's no next shard. 
return (self.offsets[self.current_shard_n + 1] <= offset) \ and (offset < self.offsets[self.current_shard_n + 2]) @@ -611,7 +610,7 @@ def __getitem__(self, offset): # This fails on one-past # slice indexing; that's why there's a code branch here. - #logger.debug('ShardedCorpus: Retrieving slice {0}: ' + # logger.debug('ShardedCorpus: Retrieving slice {0}: ' # 'shard {1}'.format((offset.start, offset.stop), # (first_shard, last_shard))) @@ -656,13 +655,13 @@ def __getitem__(self, offset): shard_stop = self.offsets[self.current_shard_n + 1] - \ self.current_offset - #s_result[result_start:result_stop] = self.current_shard[ + # s_result[result_start:result_stop] = self.current_shard[ # shard_start:shard_stop] s_result = self.__add_to_slice(s_result, result_start, result_stop, shard_start, shard_stop) # First and last get special treatment, these are in between - for shard_n in xrange(first_shard+1, last_shard): + for shard_n in xrange(first_shard + 1, last_shard): self.load_shard(shard_n) result_start = result_stop @@ -753,7 +752,7 @@ def _getitem_sparse2gensim(self, result): """ def row_sparse2gensim(row_idx, csr_matrix): - indices = csr_matrix.indices[csr_matrix.indptr[row_idx]:csr_matrix.indptr[row_idx+1]] + indices = csr_matrix.indices[csr_matrix.indptr[row_idx]:csr_matrix.indptr[row_idx + 1]] g_row = [(col_idx, csr_matrix[row_idx, col_idx]) for col_idx in indices] return g_row diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py index 4fdc764b16..5e24419421 100644 --- a/gensim/corpora/svmlightcorpus.py +++ b/gensim/corpora/svmlightcorpus.py @@ -58,7 +58,7 @@ def __init__(self, fname, store_labels=True): IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s" % fname) - self.fname = fname # input file, see class doc for format + self.fname = fname # input file, see class doc for format self.length = None self.store_labels = store_labels self.labels = [] @@ -94,7 +94,7 @@ def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): offsets = [] with utils.smart_open(fname, 'wb') as fout: for docno, doc in enumerate(corpus): - label = labels[docno] if labels else 0 # target class is 0 by default + label = labels[docno] if labels else 0 # target class is 0 by default offsets.append(fout.tell()) fout.write(utils.to_utf8(SvmLightCorpus.doc2line(doc, label))) return offsets @@ -114,12 +114,12 @@ def line2doc(self, line): line = utils.to_unicode(line) line = line[: line.find('#')].strip() if not line: - return None # ignore comments and empty lines + return None # ignore comments and empty lines parts = line.split() if not parts: raise ValueError('invalid line format in %s' % self.fname) target, fields = parts[0], [part.rsplit(':', 1) for part in parts[1:]] - doc = [(int(p1) - 1, float(p2)) for p1, p2 in fields if p1 != 'qid'] # ignore 'qid' features, convert 1-based feature ids to 0-based + doc = [(int(p1) - 1, float(p2)) for p1, p2 in fields if p1 != 'qid'] # ignore 'qid' features, convert 1-based feature ids to 0-based return doc, target @staticmethod @@ -127,7 +127,7 @@ def doc2line(doc, label=0): """ Output the document in SVMlight format, as a string. Inverse function to `line2doc`. 
""" - pairs = ' '.join("%i:%s" % (termid + 1, termval) for termid, termval in doc) # +1 to convert 0-base to 1-base + pairs = ' '.join("%i:%s" % (termid + 1, termval) for termid, termval in doc) # +1 to convert 0-base to 1-base return "%s %s\n" % (label, pairs) # endclass SvmLightCorpus diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py index 44b2a772d9..0c09cc7e34 100644 --- a/gensim/corpora/ucicorpus.py +++ b/gensim/corpora/ucicorpus.py @@ -21,7 +21,7 @@ from gensim.corpora import IndexedCorpus from gensim.matutils import MmReader from gensim.matutils import MmWriter -from six import iteritems, string_types +from six import iteritems from six.moves import xrange @@ -118,7 +118,7 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False): offsets.append(posnow) poslast = posnow - vector = [(x, int(y)) for (x, y) in bow if int(y) != 0] # integer count, not floating weights + vector = [(x, int(y)) for (x, y) in bow if int(y) != 0] # integer count, not floating weights max_id, veclen = writer.write_vector(docno, vector) num_terms = max(num_terms, 1 + max_id) num_nnz += veclen @@ -165,7 +165,7 @@ def __iter__(self): (yielding one document at a time). """ for docId, doc in super(UciCorpus, self).__iter__(): - yield doc # get rid of docId, return the sparse vector only + yield doc # get rid of docId, return the sparse vector only def create_dictionary(self): """ diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index ec032067f1..ea87cce4a2 100755 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -187,6 +187,8 @@ def get_namespace(tag): raise ValueError("%s not recognized as MediaWiki dump namespace" % namespace) return namespace + + _get_namespace = get_namespace @@ -233,6 +235,8 @@ def extract_pages(f, filter_namespaces=False): # ./revision/text element. The pages comprise the bulk of the # file, so in practice we prune away enough. elem.clear() + + _extract_pages = extract_pages # for backward compatibility @@ -266,6 +270,7 @@ class WikiCorpus(TextCorpus): >>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format plus file with id->word """ + def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',)): """ diff --git a/gensim/examples/dmlcz/__init__.py b/gensim/examples/dmlcz/__init__.py index 8b13789179..e69de29bb2 100644 --- a/gensim/examples/dmlcz/__init__.py +++ b/gensim/examples/dmlcz/__init__.py @@ -1 +0,0 @@ - diff --git a/gensim/examples/dmlcz/dmlcorpus.py b/gensim/examples/dmlcz/dmlcorpus.py index 63c9f16855..d8fb8c4cb5 100644 --- a/gensim/examples/dmlcz/dmlcorpus.py +++ b/gensim/examples/dmlcz/dmlcorpus.py @@ -11,11 +11,10 @@ import logging -import itertools import os.path from gensim import interfaces, matutils -import dictionary # for constructing word->id mappings +import dictionary # for constructing word->id mappings logger = logging.getLogger('gensim.corpora.dmlcorpus') @@ -34,39 +33,35 @@ class DmlConfig(object): output files and which articles to accept for the corpus (= an additional filter over the sources). 
""" - def __init__(self, configId, resultDir, acceptLangs = None): - self.resultDir = resultDir # output files will be stored in this directory - self.configId = configId # configId is a string that is used as filename prefix for all files, so keep it simple - self.sources = {} # all article sources; see sources.DmlSource class for an example of source - if acceptLangs is None: # which languages to accept - acceptLangs = set(['any']) # if not specified, accept all languages (including unknown/unspecified) + def __init__(self, configId, resultDir, acceptLangs=None): + self.resultDir = resultDir # output files will be stored in this directory + self.configId = configId # configId is a string that is used as filename prefix for all files, so keep it simple + self.sources = {} # all article sources; see sources.DmlSource class for an example of source + + if acceptLangs is None: # which languages to accept + acceptLangs = set(['any']) # if not specified, accept all languages (including unknown/unspecified) self.acceptLangs = set(acceptLangs) logger.info('initialized %s' % self) - def resultFile(self, fname): return os.path.join(self.resultDir, self.configId + '_' + fname) - def acceptArticle(self, metadata): - lang = metadata.get('language', 'unk') # if there was no language field in the article metadata, set language to 'unk' = unknown + lang = metadata.get('language', 'unk') # if there was no language field in the article metadata, set language to 'unk' = unknown if 'any' not in self.acceptLangs and lang not in self.acceptLangs: return False return True - def addSource(self, source): sourceId = str(source) assert sourceId not in self.sources, "source %s already present in the config!" % sourceId self.sources[sourceId] = source - def __str__(self): return ("DmlConfig(id=%s, sources=[%s], acceptLangs=[%s])" % (self.configId, ', '.join(self.sources.iterkeys()), ', '.join(self.acceptLangs))) -#endclass DmlConfig - +# endclass DmlConfig class DmlCorpus(interfaces.CorpusABC): @@ -79,16 +74,15 @@ class DmlCorpus(interfaces.CorpusABC): DmlCorpus has methods for building a dictionary (mapping between words and their ids). """ + def __init__(self): self.documents = [] self.config = None self.dictionary = dictionary.Dictionary() - def __len__(self): return len(self.documents) - def __iter__(self): """ The function that defines a corpus -- iterating over the corpus yields @@ -101,8 +95,7 @@ def __iter__(self): contents = source.getContent(docUri) words = [source.normalizeWord(word) for word in source.tokenize(contents)] - yield self.dictionary.doc2bow(words, allowUpdate = False) - + yield self.dictionary.doc2bow(words, allowUpdate=False) def buildDictionary(self): """ @@ -125,12 +118,11 @@ def buildDictionary(self): numPositions += len(words) # convert to bag-of-words, but ignore the result -- here we only care about updating token ids - _ = self.dictionary.doc2bow(words, allowUpdate = True) + _ = self.dictionary.doc2bow(words, allowUpdate=True) # noqa:F841 logger.info("built %s from %i documents (total %i corpus positions)" % (self.dictionary, len(self.documents), numPositions)) - - def processConfig(self, config, shuffle = False): + def processConfig(self, config, shuffle=False): """ Parse the directories specified in the config, looking for suitable articles. 
@@ -148,8 +140,8 @@ def processConfig(self, config, shuffle = False): logger.info("processing source '%s'" % sourceId) accepted = [] for articleUri in source.findArticles(): - meta = source.getMeta(articleUri) # retrieve metadata (= dictionary of key->value) - if config.acceptArticle(meta): # do additional filtering on articles, based on the article's metadata + meta = source.getMeta(articleUri) # retrieve metadata (= dictionary of key->value) + if config.acceptArticle(meta): # do additional filtering on articles, based on the article's metadata accepted.append((sourceId, articleUri)) logger.info("accepted %i articles for source '%s'" % (len(accepted), sourceId)) @@ -166,7 +158,6 @@ def processConfig(self, config, shuffle = False): logger.info("accepted total of %i articles for %s" % (len(self.documents), str(config))) - def saveDictionary(self, fname): logger.info("saving dictionary mapping to %s" % fname) fout = open(fname, 'w') @@ -194,7 +185,6 @@ def saveDocuments(self, fname): fout.write("%i\t%s\n" % (docNo, repr(docId))) fout.close() - def saveAsText(self): """ Store the corpus to disk, in a human-readable text format. @@ -211,7 +201,6 @@ def saveAsText(self): self.saveDocuments(self.config.resultFile('docids.txt')) matutils.MmWriter.writeCorpus(self.config.resultFile('bow.mm'), self) - def articleDir(self, docNo): """ Return absolute normalized path on filesystem to article no. `docNo`. @@ -220,7 +209,6 @@ def articleDir(self, docNo): source = self.config.sources[sourceId] return os.path.join(source.baseDir, outPath) - def getMeta(self, docNo): """ Return metadata for article no. `docNo`. @@ -228,5 +216,4 @@ def getMeta(self, docNo): sourceId, uri = self.documents[docNo] source = self.config.sources[sourceId] return source.getMeta(uri) -#endclass DmlCorpus - +# endclass DmlCorpus diff --git a/gensim/examples/dmlcz/gensim_build.py b/gensim/examples/dmlcz/gensim_build.py index 4e258ada8d..9695241fb3 100755 --- a/gensim/examples/dmlcz/gensim_build.py +++ b/gensim/examples/dmlcz/gensim_build.py @@ -15,12 +15,9 @@ import logging import sys import os.path -import re - from gensim.corpora import sources, dmlcorpus - PREFIX = 'dmlcz' AT_HOME = False @@ -51,12 +48,12 @@ def buildDmlCorpus(config): dml = dmlcorpus.DmlCorpus() - dml.processConfig(config, shuffle = True) + dml.processConfig(config, shuffle=True) dml.buildDictionary() - dml.dictionary.filterExtremes(noBelow=5, noAbove=0.3) # ignore too (in)frequent words + dml.dictionary.filterExtremes(noBelow=5, noAbove=0.3) # ignore too (in)frequent words - dml.save(config.resultFile('.pkl')) # save the mappings as binary data (actual documents are not saved, only their URIs) - dml.saveAsText() # save id mappings and documents as text data (matrix market format) + dml.save(config.resultFile('.pkl')) # save the mappings as binary data (actual documents are not saved, only their URIs) + dml.saveAsText() # save id mappings and documents as text data (matrix market format) return dml diff --git a/gensim/examples/dmlcz/gensim_genmodel.py b/gensim/examples/dmlcz/gensim_genmodel.py index 428f8b5536..df11f9696c 100755 --- a/gensim/examples/dmlcz/gensim_genmodel.py +++ b/gensim/examples/dmlcz/gensim_genmodel.py @@ -15,25 +15,22 @@ import logging import sys import os.path -import re - -from gensim.corpora import sources, dmlcorpus, MmCorpus +from gensim.corpora import dmlcorpus, MmCorpus from gensim.models import lsimodel, ldamodel, tfidfmodel, rpmodel import gensim_build # internal method parameters -DIM_RP = 300 # dimensionality for random projections 
-DIM_LSI = 200 # for lantent semantic indexing -DIM_LDA = 100 # for latent dirichlet allocation - +DIM_RP = 300 # dimensionality for random projections +DIM_LSI = 200 # for lantent semantic indexing +DIM_LDA = 100 # for latent dirichlet allocation if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') - logging.root.setLevel(level = logging.INFO) + logging.root.setLevel(level=logging.INFO) logging.info("running %s" % ' '.join(sys.argv)) program = os.path.basename(sys.argv[0]) @@ -56,22 +53,22 @@ corpus = MmCorpus(config.resultFile('bow.mm')) if method == 'tfidf': - model = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True) + model = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True) model.save(config.resultFile('model_tfidf.pkl')) elif method == 'lda': - model = ldamodel.LdaModel(corpus, id2word = id2word, numTopics = DIM_LDA) + model = ldamodel.LdaModel(corpus, id2word=id2word, numTopics=DIM_LDA) model.save(config.resultFile('model_lda.pkl')) elif method == 'lsi': # first, transform word counts to tf-idf weights - tfidf = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True) + tfidf = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True) # then find the transformation from tf-idf to latent space - model = lsimodel.LsiModel(tfidf[corpus], id2word = id2word, numTopics = DIM_LSI) + model = lsimodel.LsiModel(tfidf[corpus], id2word=id2word, numTopics=DIM_LSI) model.save(config.resultFile('model_lsi.pkl')) elif method == 'rp': # first, transform word counts to tf-idf weights - tfidf = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True) + tfidf = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True) # then find the transformation from tf-idf to latent space - model = rpmodel.RpModel(tfidf[corpus], id2word = id2word, numTopics = DIM_RP) + model = rpmodel.RpModel(tfidf[corpus], id2word=id2word, numTopics=DIM_RP) model.save(config.resultFile('model_rp.pkl')) else: raise ValueError('unknown topic extraction method: %s' % repr(method)) @@ -79,4 +76,3 @@ MmCorpus.saveCorpus(config.resultFile('%s.mm' % method), model[corpus]) logging.info("finished running %s" % program) - diff --git a/gensim/examples/dmlcz/gensim_xml.py b/gensim/examples/dmlcz/gensim_xml.py index 8ac2b265c2..f810d045d4 100755 --- a/gensim/examples/dmlcz/gensim_xml.py +++ b/gensim/examples/dmlcz/gensim_xml.py @@ -14,10 +14,8 @@ import logging import sys import os.path -import re - -from gensim.corpora import sources, dmlcorpus, MmCorpus +from gensim.corpora import dmlcorpus, MmCorpus from gensim.similarities import MatrixSimilarity, SparseMatrixSimilarity import gensim_build @@ -28,8 +26,8 @@ DRY_RUN = False # how many 'most similar' documents to store in each similar.xml? -MIN_SCORE = 0.0 # prune based on similarity score (all below MIN_SCORE are ignored) -MAX_SIMILAR = 10 # prune based on rank (at most MAX_SIMILAR are stored). set to 0 to store all of them (no limit). +MIN_SCORE = 0.0 # prune based on similarity score (all below MIN_SCORE are ignored) +MAX_SIMILAR = 10 # prune based on rank (at most MAX_SIMILAR are stored). set to 0 to store all of them (no limit). # if there are no similar articles (after the pruning), do we still want to generate similar.xml? 
SAVE_EMPTY = True @@ -55,29 +53,28 @@ """ - def generateSimilar(corpus, index, method): - for docNo, topSims in enumerate(index): # for each document + for docNo, topSims in enumerate(index): # for each document # store similarities to the following file outfile = os.path.join(corpus.articleDir(docNo), 'similar_%s.xml' % method) - articles = [] # collect similars in this list - for docNo2, score in topSims: # for each most similar article - if score > MIN_SCORE and docNo != docNo2: # if similarity is above MIN_SCORE and not identity (=always maximum similarity, boring) + articles = [] # collect similars in this list + for docNo2, score in topSims: # for each most similar article + if score > MIN_SCORE and docNo != docNo2: # if similarity is above MIN_SCORE and not identity (=always maximum similarity, boring) source, (intId, pathId) = corpus.documents[docNo2] meta = corpus.getMeta(docNo2) suffix, author, title = '', meta.get('author', ''), meta.get('title', '') - articles.append(ARTICLE % locals()) # add the similar article to output + articles.append(ARTICLE % locals()) # add the similar article to output if len(articles) >= MAX_SIMILAR: break # now `articles` holds multiple strings in similar_*.xml format if SAVE_EMPTY or articles: - output = ''.join(articles) # concat all similars to one string - if not DRY_RUN: # only open output files for writing if DRY_RUN is false + output = ''.join(articles) # concat all similars to one string + if not DRY_RUN: # only open output files for writing if DRY_RUN is false logging.info("generating %s (%i similars)" % (outfile, len(articles))) outfile = open(outfile, 'w') - outfile.write(SIMILAR % output) # add xml headers and print to file + outfile.write(SIMILAR % output) # add xml headers and print to file outfile.close() else: logging.info("would be generating %s (%i similars):%s\n" % (outfile, len(articles), output)) @@ -85,7 +82,6 @@ def generateSimilar(corpus, index, method): logging.debug("skipping %s (no similar found)" % outfile) - if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') logging.root.setLevel(level=logging.INFO) @@ -112,14 +108,13 @@ def generateSimilar(corpus, index, method): input = MmCorpus(config.resultFile('_%s.mm' % method)) assert len(input) == len(corpus), "corpus size mismatch (%i vs %i): run ./gensim_genmodel.py again" % (len(input), len(corpus)) - # initialize structure for similarity queries - if method == 'lsi' or method == 'rp': # for these methods, use dense vectors + # initialize structure for similarity queries + if method == 'lsi' or method == 'rp': # for these methods, use dense vectors index = MatrixSimilarity(input, numBest=MAX_SIMILAR + 1, numFeatures=input.numTerms) else: index = SparseMatrixSimilarity(input, numBest=MAX_SIMILAR + 1) - index.normalize = False # do not normalize query vectors during similarity queries (the index is already built normalized, so it would be a no-op) - generateSimilar(corpus, index, method) # for each document, print MAX_SIMILAR nearest documents to a xml file, in dml-cz specific format + index.normalize = False # do not normalize query vectors during similarity queries (the index is already built normalized, so it would be a no-op) + generateSimilar(corpus, index, method) # for each document, print MAX_SIMILAR nearest documents to a xml file, in dml-cz specific format logging.info("finished running %s" % program) - diff --git a/gensim/examples/dmlcz/sources.py b/gensim/examples/dmlcz/sources.py index f6244ad361..da4e0ac0b0 100644 --- 
a/gensim/examples/dmlcz/sources.py +++ b/gensim/examples/dmlcz/sources.py @@ -20,10 +20,13 @@ import os.path import re -import xml.sax # for parsing arxmliv articles +import xml.sax # for parsing arxmliv articles from gensim import utils +import sys +if sys.version_info[0] >= 3: + unicode = str PAT_TAG = re.compile('<(.*?)>(.*)') logger = logging.getLogger('gensim.corpora.sources') @@ -44,6 +47,7 @@ class ArticleSource(object): This class is just an ABC interface; see eg. DmlSource or ArxmlivSource classes for concrete instances. """ + def __init__(self, sourceId): self.sourceId = sourceId @@ -64,8 +68,7 @@ def tokenize(self, content): def normalizeWord(self, word): raise NotImplementedError('Abstract Base Class') -#endclass ArticleSource - +# endclass ArticleSource class DmlSource(ArticleSource): @@ -79,6 +82,7 @@ class DmlSource(ArticleSource): See the ArticleSource class for general info on sources. """ + def __init__(self, sourceId, baseDir): self.sourceId = sourceId self.baseDir = os.path.normpath(baseDir) @@ -94,12 +98,12 @@ def parseDmlMeta(cls, xmlfile): result = {} xml = open(xmlfile) for line in xml: - if line.find('
<article>') >= 0: # skip until the beginning of <article> tag
+            if line.find('<article>') >= 0:  # skip until the beginning of <article> tag
                 break
         for line in xml:
-            if line.find('</article>') >= 0: # end of <article>, we're done
+            if line.find('</article>') >= 0:  # end of <article>
, we're done break - p = re.search(PAT_TAG, line) # HAX assumes one element = one line; proper xml parsing probably better... but who cares + p = re.search(PAT_TAG, line) # HAX assumes one element = one line; proper xml parsing probably better... but who cares if p: name, cont = p.groups() name = name.split()[0] @@ -110,20 +114,18 @@ def parseDmlMeta(cls, xmlfile): result.setdefault('msc', []).append(cont) continue if name == 'idMR': - cont = cont[2:] # omit MR from MR123456 + cont = cont[2:] # omit MR from MR123456 if name and cont: result[name] = cont xml.close() return result - def idFromDir(self, path): assert len(path) > len(self.baseDir) - intId = path[1 + path.rfind('#') : ] - pathId = path[len(self.baseDir) + 1 : ] + intId = path[1 + path.rfind('#'):] + pathId = path[1 + len(self.baseDir):] return (intId, pathId) - def isArticle(self, path): # in order to be valid, the article directory must start with '#' if not os.path.basename(path).startswith('#'): @@ -138,7 +140,6 @@ def isArticle(self, path): return False return True - def findArticles(self): dirTotal = artAccepted = 0 logger.info("looking for '%s' articles inside %s" % (self.sourceId, self.baseDir)) @@ -151,7 +152,6 @@ def findArticles(self): logger.info('%i directories processed, found %i articles' % (dirTotal, artAccepted)) - def getContent(self, uri): """ Return article content as a single large string. @@ -160,7 +160,6 @@ def getContent(self, uri): filename = os.path.join(self.baseDir, pathId, 'fulltext.txt') return open(filename).read() - def getMeta(self, uri): """ Return article metadata as a attribute->value dictionary. @@ -169,15 +168,13 @@ def getMeta(self, uri): filename = os.path.join(self.baseDir, pathId, 'meta.xml') return DmlSource.parseDmlMeta(filename) - def tokenize(self, content): - return [token.encode('utf8') for token in utils.tokenize(content, errors = 'ignore') if not token.isdigit()] - + return [token.encode('utf8') for token in utils.tokenize(content, errors='ignore') if not token.isdigit()] def normalizeWord(self, word): wordU = unicode(word, 'utf8') - return wordU.lower().encode('utf8') # lowercase and then convert back to bytestring -#endclass DmlSource + return wordU.lower().encode('utf8') # lowercase and then convert back to bytestring +# endclass DmlSource class DmlCzSource(DmlSource): @@ -190,13 +187,13 @@ class DmlCzSource(DmlSource): See the ArticleSource class for general info on sources. """ + def idFromDir(self, path): assert len(path) > len(self.baseDir) dmlczId = open(os.path.join(path, 'dspace_id')).read().strip() - pathId = path[len(self.baseDir) + 1 : ] + pathId = path[1 + len(self.baseDir):] return (dmlczId, pathId) - def isArticle(self, path): # in order to be valid, the article directory must start with '#' if not os.path.basename(path).startswith('#'): @@ -215,7 +212,6 @@ def isArticle(self, path): return False return True - def getContent(self, uri): """ Return article content as a single large string. 
@@ -236,8 +232,7 @@ def getContent(self, uri): assert os.path.exists(filename2) filename = filename2 return open(filename).read() -#endclass DmlCzSource - +# endclass DmlCzSource class ArxmlivSource(ArticleSource): @@ -253,8 +248,8 @@ class ArxmlivSource(ArticleSource): """ class ArxmlivContentHandler(xml.sax.handler.ContentHandler): def __init__(self): - self.path = [''] # help structure for sax event parsing - self.tokens = [] # will contain tokens once parsing is finished + self.path = [''] # help structure for sax event parsing + self.tokens = [] # will contain tokens once parsing is finished def startElement(self, name, attr): # for math tokens, we only care about Math elements directly below
<p>
@@ -270,10 +265,9 @@ def endElement(self, name): def characters(self, text): # for text, we only care about tokens directly within the
<p>
tag if self.path[-1] == 'p': - tokens = [token.encode('utf8') for token in utils.tokenize(text, errors = 'ignore') if not token.isdigit()] + tokens = [token.encode('utf8') for token in utils.tokenize(text, errors='ignore') if not token.isdigit()] self.tokens.extend(tokens) - #endclass ArxmlivHandler - + # endclass ArxmlivHandler class ArxmlivErrorHandler(xml.sax.handler.ErrorHandler): # Python2.5 implementation of xml.sax is broken -- character streams and @@ -287,25 +281,21 @@ def error(self, exception): # logger.debug("SAX error parsing xml: %s" % exception) warning = fatalError = error - #endclass ArxmlivErrorHandler - + # endclass ArxmlivErrorHandler def __init__(self, sourceId, baseDir): self.sourceId = sourceId self.baseDir = os.path.normpath(baseDir) - def __str__(self): return self.sourceId - def idFromDir(self, path): assert len(path) > len(self.baseDir) - intId = path[1 + path.rfind('#') : ] - pathId = path[len(self.baseDir) + 1 : ] + intId = path[1 + path.rfind('#'):] + pathId = path[1 + len(self.baseDir):] return (intId, pathId) - def isArticle(self, path): # in order to be valid, the article directory must start with '#' if not os.path.basename(path).startswith('#'): @@ -316,7 +306,6 @@ def isArticle(self, path): return False return True - def findArticles(self): dirTotal = artAccepted = 0 logger.info("looking for '%s' articles inside %s" % (self.sourceId, self.baseDir)) @@ -329,7 +318,6 @@ def findArticles(self): logger.info('%i directories processed, found %i articles' % (dirTotal, artAccepted)) - def getContent(self, uri): """ Return article content as a single large string. @@ -338,15 +326,13 @@ def getContent(self, uri): filename = os.path.join(self.baseDir, pathId, 'tex.xml') return open(filename).read() - def getMeta(self, uri): """ Return article metadata as an attribute->value dictionary. """ # intId, pathId = uri # filename = os.path.join(self.baseDir, pathId, 'tex.xml') - return {'language': 'eng'} # TODO maybe parse out some meta; but currently not needed for anything... - + return {'language': 'eng'} # TODO maybe parse out some meta; but currently not needed for anything... def tokenize(self, content): """ @@ -361,12 +347,9 @@ def tokenize(self, content): xml.sax.parseString(content, handler, ArxmlivSource.ArxmlivErrorHandler()) return handler.tokens - def normalizeWord(self, word): - if word[0] == '$': # ignore math tokens + if word[0] == '$': # ignore math tokens return word wordU = unicode(word, 'utf8') - return wordU.lower().encode('utf8') # lowercase and then convert back to bytestring -#endclass ArxmlivSource - - + return wordU.lower().encode('utf8') # lowercase and then convert back to bytestring +# endclass ArxmlivSource diff --git a/gensim/interfaces.py b/gensim/interfaces.py index 58e6f45b13..4087fd8893 100644 --- a/gensim/interfaces.py +++ b/gensim/interfaces.py @@ -14,7 +14,6 @@ from __future__ import with_statement import logging -import itertools from gensim import utils, matutils from six.moves import xrange @@ -48,13 +47,13 @@ class CorpusABC(utils.SaveLoad): state, and **not** the documents themselves. See the `save_corpus` static method for serializing the actual stream content. """ + def __iter__(self): """ Iterate over the corpus, yielding one document at a time. 
""" raise NotImplementedError('cannot instantiate abstract base class') - def save(self, *args, **kwargs): import warnings warnings.warn("corpus.save() stores only the (tiny) iteration object; " @@ -98,16 +97,16 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): # example code: logger.info("converting corpus to ??? format: %s" % fname) with utils.smart_open(fname, 'wb') as fout: - for doc in corpus: # iterate over the document stream - fmt = str(doc) # format the document appropriately... - fout.write(utils.to_utf8("%s\n" % fmt)) # serialize the formatted document to disk -#endclass CorpusABC + for doc in corpus: # iterate over the document stream + fmt = str(doc) # format the document appropriately... + fout.write(utils.to_utf8("%s\n" % fmt)) # serialize the formatted document to disk +# endclass CorpusABC class TransformedCorpus(CorpusABC): def __init__(self, obj, corpus, chunksize=None, **kwargs): self.obj, self.corpus, self.chunksize = obj, corpus, chunksize - for key, value in kwargs.items(): #add the new parameters like per_word_topics to base class object of LdaModel + for key, value in kwargs.items(): # add the new parameters like per_word_topics to base class object of LdaModel setattr(self.obj, key, value) self.metadata = False @@ -125,10 +124,10 @@ def __iter__(self): def __getitem__(self, docno): if hasattr(self.corpus, '__getitem__'): - return self.obj[self.corpus[docno]] + return self.obj[self.corpus[docno]] else: raise RuntimeError('Type {} does not support slicing.'.format(type(self.corpus))) -#endclass TransformedCorpus +# endclass TransformedCorpus class TransformationABC(utils.SaveLoad): @@ -157,14 +156,13 @@ def __getitem__(self, vec): """ raise NotImplementedError('cannot instantiate abstract base class') - def _apply(self, corpus, chunksize=None, **kwargs): """ Apply the transformation to a whole corpus (as opposed to a single document) and return the result as another corpus. """ return TransformedCorpus(self, corpus, chunksize, **kwargs) -#endclass TransformationABC +# endclass TransformationABC class SimilarityABC(utils.SaveLoad): @@ -183,16 +181,15 @@ class SimilarityABC(utils.SaveLoad): similarities of each document in the corpus against the whole corpus (ie., the query is each corpus document in turn). """ + def __init__(self, corpus): raise NotImplementedError("cannot instantiate Abstract Base Class") - def get_similarities(self, doc): # (Sparse)MatrixSimilarity override this method so that they both use the # same __getitem__ method, defined below raise NotImplementedError("cannot instantiate Abstract Base Class") - def __getitem__(self, query): """Get similarities of document `query` to all documents in the corpus. @@ -210,7 +207,7 @@ def __getitem__(self, query): # as well, but in that case assume tricks are happening and don't normalize # anything (self.normalize has no effect). if matutils.ismatrix(query): - import warnings + import warnings # noqa:F401 # warnings.warn("non-gensim input must already come normalized") else: if is_corpus: @@ -225,7 +222,7 @@ def __getitem__(self, query): # if maintain_sparity is True, result is scipy sparse. Sort, clip the # topn and return as a scipy sparse matrix. 
if getattr(self, 'maintain_sparsity', False): - return matutils.scipy2scipy_clipped(result, self.num_best) + return matutils.scipy2scipy_clipped(result, self.num_best) # if the input query was a corpus (=more documents), compute the top-n # most similar for each document in turn @@ -235,7 +232,6 @@ def __getitem__(self, query): # otherwise, return top-n of the single input document return matutils.full2sparse_clipped(result, self.num_best) - def __iter__(self): """ For each index document, compute cosine similarity against all other @@ -276,4 +272,4 @@ def __iter__(self): # restore old normalization value self.normalize = norm -#endclass SimilarityABC +# endclass SimilarityABC diff --git a/gensim/matutils.py b/gensim/matutils.py index a8e427eb50..58904e1657 100644 --- a/gensim/matutils.py +++ b/gensim/matutils.py @@ -217,6 +217,7 @@ class Scipy2Corpus(object): This is the mirror function to `corpus2csc`. """ + def __init__(self, vecs): """ `vecs` is a sequence of dense and/or sparse vectors, such as a 2d np array, @@ -267,6 +268,7 @@ def full2sparse(vec, eps=1e-9): nnz = np.nonzero(abs(vec) > eps)[0] return list(zip(nnz, vec.take(nnz))) + dense2vec = full2sparse @@ -318,6 +320,7 @@ class Dense2Corpus(object): This is the mirror function to `corpus2dense`. """ + def __init__(self, dense, documents_columns=True): if documents_columns: self.dense = dense.T @@ -330,7 +333,7 @@ def __iter__(self): def __len__(self): return len(self.dense) -#endclass DenseCorpus +# endclass DenseCorpus class Sparse2Corpus(object): @@ -340,6 +343,7 @@ class Sparse2Corpus(object): This is the mirror function to `corpus2csc`. """ + def __init__(self, sparse, documents_columns=True): if documents_columns: self.sparse = sparse.tocsc() @@ -352,7 +356,7 @@ def __iter__(self): def __len__(self): return self.sparse.shape[1] -#endclass Sparse2Corpus +# endclass Sparse2Corpus def veclen(vec): @@ -431,7 +435,7 @@ def unitvec(vec, norm='l2'): try: first = next(iter(vec)) # is there at least one element? - except: + except Exception: return vec if isinstance(first, (tuple, list)) and len(first) == 2: # gensim sparse format @@ -544,7 +548,7 @@ def hellinger(vec1, vec2): vec1, vec2 = dict(vec1), dict(vec2) if len(vec2) < len(vec1): vec1, vec2 = vec2, vec1 # swap references so that we iterate over the shorter vector - sim = np.sqrt(0.5*sum((np.sqrt(value) - np.sqrt(vec2.get(index, 0.0)))**2 for index, value in iteritems(vec1))) + sim = np.sqrt(0.5 * sum((np.sqrt(value) - np.sqrt(vec2.get(index, 0.0)))**2 for index, value in iteritems(vec1))) return sim else: sim = np.sqrt(0.5 * ((np.sqrt(vec1) - np.sqrt(vec2))**2).sum()) @@ -778,7 +782,7 @@ def close(self): logger.debug("closing %s" % self.fname) if hasattr(self, 'fout'): self.fout.close() -#endclass MmWriter +# endclass MmWriter class MmReader(object): @@ -790,6 +794,7 @@ class MmReader(object): matrix at once (unlike scipy.io.mmread). This allows us to process corpora which are larger than the available RAM. """ + def __init__(self, input, transposed=True): """ Initialize the matrix reader. 
@@ -864,7 +869,7 @@ def __iter__(self): if docid != previd: # change of document: return the document read so far (its id is prevId) if previd >= 0: - yield previd, document + yield previd, document # noqa:F821 # return implicit (empty) documents between previous id and new id # too, to keep consistent document numbering and corpus length @@ -912,4 +917,4 @@ def docbyoffset(self, offset): document.append((termid, val,)) # add another field to the current document return document -#endclass MmReader +# endclass MmReader diff --git a/gensim/models/__init__.py b/gensim/models/__init__.py index 47638c6b41..5c25a86fd5 100644 --- a/gensim/models/__init__.py +++ b/gensim/models/__init__.py @@ -4,23 +4,23 @@ """ # bring model classes directly into package namespace, to save some typing -from .coherencemodel import CoherenceModel -from .hdpmodel import HdpModel -from .ldamodel import LdaModel -from .lsimodel import LsiModel -from .tfidfmodel import TfidfModel -from .rpmodel import RpModel -from .logentropy_model import LogEntropyModel -from .word2vec import Word2Vec -from .doc2vec import Doc2Vec -from .keyedvectors import KeyedVectors -from .ldamulticore import LdaMulticore -from .phrases import Phrases -from .normmodel import NormModel -from .atmodel import AuthorTopicModel -from .ldaseqmodel import LdaSeqModel - -from . import wrappers +from .coherencemodel import CoherenceModel # noqa:F401 +from .hdpmodel import HdpModel # noqa:F401 +from .ldamodel import LdaModel # noqa:F401 +from .lsimodel import LsiModel # noqa:F401 +from .tfidfmodel import TfidfModel # noqa:F401 +from .rpmodel import RpModel # noqa:F401 +from .logentropy_model import LogEntropyModel # noqa:F401 +from .word2vec import Word2Vec # noqa:F401 +from .doc2vec import Doc2Vec # noqa:F401 +from .keyedvectors import KeyedVectors # noqa:F401 +from .ldamulticore import LdaMulticore # noqa:F401 +from .phrases import Phrases # noqa:F401 +from .normmodel import NormModel # noqa:F401 +from .atmodel import AuthorTopicModel # noqa:F401 +from .ldaseqmodel import LdaSeqModel # noqa:F401 + +from . import wrappers # noqa:F401 from gensim import interfaces, utils @@ -42,12 +42,12 @@ class VocabTransform(interfaces.TransformationABC): >>> ... """ + def __init__(self, old2new, id2token=None): # id2word = dict((newid, oldid2word[oldid]) for oldid, newid in old2new.iteritems()) self.old2new = old2new self.id2token = id2token - def __getitem__(self, bow): """ Return representation with the ids transformed. @@ -58,4 +58,4 @@ def __getitem__(self, bow): return self._apply(bow) return sorted((self.old2new[oldid], weight) for oldid, weight in bow if oldid in self.old2new) -#endclass VocabTransform +# endclass VocabTransform diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 335f2af7f0..7e1ee02d62 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -64,6 +64,7 @@ class AuthorTopicState(LdaState): reduce traffic. """ + def __init__(self, eta, lambda_shape, gamma_shape): self.eta = eta self.sstats = np.zeros(lambda_shape) @@ -361,8 +362,8 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c """ try: - _ = len(chunk) - except: + _ = len(chunk) # noqa:F841 + except Exception: # convert iterators/generators to plain list, so we have len() etc. 
chunk = list(chunk) if len(chunk) > 1: @@ -596,7 +597,7 @@ def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, try: len_input_corpus = len(corpus) - except: + except Exception: logger.warning("input corpus stream has no len(); counting documents") len_input_corpus = sum(1 for _ in corpus) if len_input_corpus == 0: diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 95a5117eee..4fa80ea15e 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -108,6 +108,7 @@ class CoherenceModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods. """ + def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, window_size=None, coherence='c_v', topn=10, processes=-1): """ diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index f859e261a2..5e0dc20418 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -48,7 +48,7 @@ try: from queue import Queue except ImportError: - from Queue import Queue + from Queue import Queue # noqa:F401 from collections import namedtuple, defaultdict from timeit import default_timer @@ -158,7 +158,7 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N word2_indexes = [word2.index for pos2, word2 in window_pos if pos2 != pos] l1 = np_sum(word_vectors[word2_indexes], axis=0) + np_sum(doctag_vectors[doctag_indexes], axis=0) count = len(word2_indexes) + len(doctag_indexes) - if model.cbow_mean and count > 1 : + if model.cbow_mean and count > 1: l1 /= count neu1e = train_cbow_pair(model, word, word2_indexes, l1, alpha, learn_vectors=False, learn_hidden=learn_hidden) @@ -223,7 +223,6 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, padded_document_indexes[(pos - pre_pad_count): pos] # preceding words + padded_document_indexes[(pos + 1):(pos + 1 + post_pad_count)] # following words ) - word_context_len = len(word_context_indexes) predict_word = model.wv.vocab[model.wv.index2word[padded_document_indexes[pos]]] # numpy advanced-indexing copies; concatenate, flatten to 1d l1 = concatenate((doctag_vectors[doctag_indexes], word_vectors[word_context_indexes])).ravel() @@ -253,6 +252,7 @@ class TaggedDocument(namedtuple('TaggedDocument', 'words tags')): Replaces "sentence as a list of words" from Word2Vec. """ + def __str__(self): return '%s(%s, %s)' % (self.__class__.__name__, self.words, self.tags) @@ -288,6 +288,7 @@ class DocvecsArray(utils.SaveLoad): implementation, based on another persistence mechanism like LMDB, LevelDB, or SQLite, should also be possible. 
""" + def __init__(self, mapfile_path=None): self.doctags = {} # string -> Doctag (only filled if necessary) self.max_rawint = -1 # highest rawint-indexed doctag @@ -381,9 +382,9 @@ def estimated_lookup_memory(self): def reset_weights(self, model): length = max(len(self.doctags), self.count) if self.mapfile_path: - self.doctag_syn0 = np_memmap(self.mapfile_path+'.doctag_syn0', dtype=REAL, + self.doctag_syn0 = np_memmap(self.mapfile_path + '.doctag_syn0', dtype=REAL, mode='w+', shape=(length, model.vector_size)) - self.doctag_syn0_lockf = np_memmap(self.mapfile_path+'.doctag_syn0_lockf', dtype=REAL, + self.doctag_syn0_lockf = np_memmap(self.mapfile_path + '.doctag_syn0_lockf', dtype=REAL, mode='w+', shape=(length,)) self.doctag_syn0_lockf.fill(1.0) else: @@ -416,7 +417,7 @@ def init_sims(self, replace=False): else: if self.mapfile_path: self.doctag_syn0norm = np_memmap( - self.mapfile_path+'.doctag_syn0norm', dtype=REAL, + self.mapfile_path + '.doctag_syn0norm', dtype=REAL, mode='w+', shape=self.doctag_syn0.shape) else: self.doctag_syn0norm = empty(self.doctag_syn0.shape, dtype=REAL) @@ -549,6 +550,7 @@ def repeat(self, word_count): class Doc2Vec(Word2Vec): """Class for training, using and evaluating neural networks described in http://arxiv.org/pdf/1405.4053v2.pdf""" + def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, **kwargs): @@ -863,6 +865,7 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='* class TaggedBrownCorpus(object): """Iterate over documents from the Brown corpus (part of NLTK data), yielding each document out as a TaggedDocument object.""" + def __init__(self, dirname): self.dirname = dirname @@ -888,6 +891,7 @@ class TaggedLineDocument(object): Words are expected to be already preprocessed and separated by whitespace, tags are constructed automatically from the document line number.""" + def __init__(self, source): """ `source` can be either a string (filename) or a file object. diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py index 46995549f8..b26c8fa639 100755 --- a/gensim/models/hdpmodel.py +++ b/gensim/models/hdpmodel.py @@ -50,6 +50,7 @@ meanchangethresh = 0.00001 rhot_bound = 0.0 + def expect_log_sticks(sticks): """ For stick-breaking hdp, return the E[log(sticks)] @@ -121,6 +122,7 @@ class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel): Model persistency is achieved through its `load`/`save` methods. 
""" + def __init__(self, corpus, id2word, max_chunks=None, max_time=None, chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1, gamma=1, eta=0.01, scale=1.0, var_converge=0.0001, @@ -302,7 +304,7 @@ def doc_e_step(self, doc, ss, Elogsticks_1st, word_list, chunkids = [unique_words[id] for id in doc_word_ids] Elogbeta_doc = self.m_Elogbeta[:, doc_word_ids] - ## very similar to the hdp equations + # very similar to the hdp equations v = np.zeros((2, self.m_K - 1)) v[0] = 1.0 v[1] = self.m_alpha @@ -313,21 +315,20 @@ def doc_e_step(self, doc, ss, Elogsticks_1st, word_list, likelihood = 0.0 old_likelihood = -1e200 converge = 1.0 - eps = 1e-100 iter = 0 max_iter = 100 # not yet support second level optimization yet, to be done in the future while iter < max_iter and (converge < 0.0 or converge > var_converge): - ### update variational parameters + # update variational parameters # var_phi if iter < 3: - var_phi = np.dot(phi.T, (Elogbeta_doc * doc_word_counts).T) + var_phi = np.dot(phi.T, (Elogbeta_doc * doc_word_counts).T) (log_var_phi, log_norm) = matutils.ret_log_normalize_vec(var_phi) var_phi = np.exp(log_var_phi) else: - var_phi = np.dot(phi.T, (Elogbeta_doc * doc_word_counts).T) + Elogsticks_1st + var_phi = np.dot(phi.T, (Elogbeta_doc * doc_word_counts).T) + Elogsticks_1st (log_var_phi, log_norm) = matutils.ret_log_normalize_vec(var_phi) var_phi = np.exp(log_var_phi) @@ -337,7 +338,7 @@ def doc_e_step(self, doc, ss, Elogsticks_1st, word_list, (log_phi, log_norm) = matutils.ret_log_normalize_vec(phi) phi = np.exp(log_phi) else: - phi = np.dot(var_phi, Elogbeta_doc).T + Elogsticks_2nd + phi = np.dot(var_phi, Elogbeta_doc).T + Elogsticks_2nd # noqa:F821 (log_phi, log_norm) = matutils.ret_log_normalize_vec(phi) phi = np.exp(log_phi) @@ -406,7 +407,7 @@ def update_lambda(self, sstats, word_list, opt_o): if opt_o: self.optimal_ordering() - ## update top level sticks + # update top level sticks self.m_var_sticks[0] = self.m_varphi_ss[:self.m_T - 1] + 1.0 var_phi_sum = np.flipud(self.m_varphi_ss[1:]) self.m_var_sticks[1] = np.flipud(np.cumsum(var_phi_sum)) + self.m_gamma @@ -529,7 +530,7 @@ def hdp_to_lda(self): alpha = alpha * self.m_alpha # beta - beta = (self.m_lambda + self.m_eta) / (self.m_W * self.m_eta + \ + beta = (self.m_lambda + self.m_eta) / (self.m_W * self.m_eta + self.m_lambda_sum[:, np.newaxis]) return (alpha, beta) @@ -564,7 +565,7 @@ def evaluate_test_corpus(self, corpus): total_words += sum(doc_word_counts) logger.info('TEST: average score: %.5f, total score: %.5f, test docs: %d' % (score / total_words, score, len(corpus))) return score -#endclass HdpModel +# endclass HdpModel class HdpTopicFormatter(object): @@ -624,14 +625,14 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): return shown - def print_topic(self, topic_id, topn= None, num_words=None): + def print_topic(self, topic_id, topn=None, num_words=None): if num_words is not None: # deprecated num_words is used warnings.warn("The parameter num_words for print_topic() would be deprecated in the updated version. Please use topn instead.") topn = num_words return self.show_topic(topic_id, topn, formatted=True) - def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words= None,): + def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words=None,): if num_words is not None: # deprecated num_words is used warnings.warn("The parameter num_words for show_topic() would be deprecated in the updated version. 
Please use topn instead.") topn = num_words @@ -656,7 +657,6 @@ def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words= N # we only return the topic_terms return topic[1] - def show_topic_terms(self, topic_data, num_words): return [(self.dictionary[wid], weight) for (weight, wid) in topic_data[:num_words]] diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 3568f43ab5..c2dfd419cb 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -59,7 +59,7 @@ try: from queue import Queue, Empty except ImportError: - from Queue import Queue, Empty + from Queue import Queue, Empty # noqa:F401 # If pyemd C extension is available, import it. # If pyemd is attempted to be used, but isn't installed, ImportError will be raised in wmdistance @@ -69,9 +69,9 @@ except ImportError: PYEMD_EXT = False -from numpy import exp, log, dot, zeros, outer, random, dtype, float32 as REAL,\ - double, uint32, seterr, array, uint8, vstack, fromstring, sqrt, newaxis,\ - ndarray, empty, sum as np_sum, prod, ones, ascontiguousarray +from numpy import dot, zeros, dtype, float32 as REAL,\ + double, array, vstack, fromstring, sqrt, newaxis,\ + ndarray, sum as np_sum, prod, ascontiguousarray from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc from gensim.corpora.dictionary import Dictionary @@ -94,6 +94,7 @@ class Vocab(object): and for constructing binary trees (incl. both word leaves and inner nodes). """ + def __init__(self, **kwargs): self.count = 0 self.__dict__.update(kwargs) @@ -111,6 +112,7 @@ class KeyedVectors(utils.SaveLoad): Class to contain vectors and vocab for the Word2Vec training class and other w2v methods not directly involved in training such as most_similar() """ + def __init__(self): self.syn0 = [] self.syn0norm = None @@ -160,7 +162,6 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None) else: fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row)))) - @classmethod def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', limit=None, datatype=REAL): @@ -422,7 +423,7 @@ def wmdistance(self, document1, document2): distance_matrix = zeros((vocab_len, vocab_len), dtype=double) for i, t1 in dictionary.items(): for j, t2 in dictionary.items(): - if not t1 in docset1 or not t2 in docset2: + if t1 not in docset1 or t2 not in docset2: continue # Compute Euclidean distance between word vectors. distance_matrix[i, j] = sqrt(np_sum((self[t1] - self[t2])**2)) @@ -480,7 +481,7 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10): # allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog']) positive = [positive] - all_words = set([self.vocab[word].index for word in positive+negative + all_words = set([self.vocab[word].index for word in positive + negative if not isinstance(word, ndarray) and word in self.vocab]) positive = [ @@ -572,7 +573,6 @@ def doesnt_match(self, words): return sorted(zip(dists, used_words))[0][1] def __getitem__(self, words): - """ Accept a single word or a list of words as input. 
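# --- editor's sketch (illustrative addition, not part of the diff) ---
# Exercising the KeyedVectors query paths cleaned up here (__getitem__, most_similar,
# doesnt_match); wmdistance() additionally needs the optional `pyemd` package, per the
# import guard above. Parameter names follow the gensim 2.x API this diff targets.
from gensim.models import Word2Vec

sentences = [['human', 'computer', 'interface'],
             ['human', 'system', 'interface'],
             ['graph', 'minors', 'trees']]
w2v = Word2Vec(sentences, size=10, min_count=1, seed=1)

vec = w2v.wv['human']                                    # single word -> 1D vector
sims = w2v.wv.most_similar(positive=['human'], topn=2)   # cosine-similarity neighbours
odd = w2v.wv.doesnt_match(['human', 'interface', 'trees'])
# --- end of sketch ---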
@@ -692,7 +692,7 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c a, b, c, expected = [word.upper() for word in line.split()] else: a, b, c, expected = [word for word in line.split()] - except: + except Exception: logger.info("skipping invalid line #%i in %s" % (line_no, questions)) continue if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab: @@ -783,7 +783,7 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case else: a, b, sim = [word for word in line.split(delimiter)] sim = float(sim) - except: + except Exception: logger.info('skipping invalid line #%d in %s', line_no, pairs) continue if a not in ok_vocab or b not in ok_vocab: @@ -814,7 +814,6 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case self.log_evaluate_word_pairs(pearson, spearman, oov_ratio, pairs) return pearson, spearman, oov_ratio - def init_sims(self, replace=False): """ Precompute L2-normalized vectors. diff --git a/gensim/models/lda_dispatcher.py b/gensim/models/lda_dispatcher.py index ea54a9c18b..91e7f237c7 100755 --- a/gensim/models/lda_dispatcher.py +++ b/gensim/models/lda_dispatcher.py @@ -44,7 +44,7 @@ # timeout for the Queue object put/get blocking methods. # it should theoretically be infinity, but then keyboard interrupts don't work. # so this is really just a hack, see http://bugs.python.org/issue1360 -HUGE_TIMEOUT = 365 * 24 * 60 * 60 # one year +HUGE_TIMEOUT = 365 * 24 * 60 * 60 # one year LDA_DISPATCHER_PREFIX = 'gensim.lda_dispatcher' @@ -123,7 +123,7 @@ def getstate(self): logger.info("end of input, assigning all remaining jobs") logger.debug("jobs done: %s, jobs received: %s" % (self._jobsdone, self._jobsreceived)) while self._jobsdone < self._jobsreceived: - time.sleep(0.5) # check every half a second + time.sleep(0.5) # check every half a second logger.info("merging states from %i workers" % len(self.workers)) workers = list(self.workers.values()) @@ -159,14 +159,12 @@ def jobdone(self, workerid): """ self._jobsdone += 1 logger.info("worker #%s finished job #%i" % (workerid, self._jobsdone)) - self.workers[workerid].requestjob() # tell the worker to ask for another job, asynchronously (one-way) - + self.workers[workerid].requestjob() # tell the worker to ask for another job, asynchronously (one-way) def jobsdone(self): """Wrap self._jobsdone, needed for remote access through Pyro proxies""" return self._jobsdone - @Pyro4.oneway def exit(self): """ @@ -176,8 +174,8 @@ def exit(self): logger.info("terminating worker %s" % workerid) worker.exit() logger.info("terminating dispatcher") - os._exit(0) # exit the whole process (not just this thread ala sys.exit()) -#endclass Dispatcher + os._exit(0) # exit the whole process (not just this thread ala sys.exit()) +# endclass Dispatcher def main(): diff --git a/gensim/models/lda_worker.py b/gensim/models/lda_worker.py index fbae4c0fff..ec87c29148 100755 --- a/gensim/models/lda_worker.py +++ b/gensim/models/lda_worker.py @@ -46,8 +46,8 @@ def __init__(self): @Pyro4.expose def initialize(self, myid, dispatcher, **model_params): self.lock_update = threading.Lock() - self.jobsdone = 0 # how many jobs has this worker completed? - self.myid = myid # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove? + self.jobsdone = 0 # how many jobs has this worker completed? + self.myid = myid # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove? 
self.dispatcher = dispatcher self.finished = False logger.info("initializing worker #%s" % myid) @@ -76,7 +76,6 @@ def requestjob(self): else: logger.info("worker #%i stopping asking for jobs" % self.myid) - @utils.synchronous('lock_update') def processjob(self, job): logger.debug("starting to process job #%i" % self.jobsdone) @@ -94,7 +93,7 @@ def getstate(self): (self.myid, self.jobsdone)) result = self.model.state assert isinstance(result, ldamodel.LdaState) - self.model.clear() # free up mem in-between two EM cycles + self.model.clear() # free up mem in-between two EM cycles self.finished = True return result @@ -108,12 +107,11 @@ def reset(self, state): self.model.state.reset() self.finished = False - @Pyro4.oneway def exit(self): logger.info("terminating worker #%i" % self.myid) os._exit(0) -#endclass Worker +# endclass Worker def main(): diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 4dc1a024b8..9ed01d84c8 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -33,7 +33,6 @@ import logging import numbers import os -from random import sample import numpy as np import six @@ -92,6 +91,7 @@ class LdaState(utils.SaveLoad): reduce traffic. """ + def __init__(self, eta, shape): self.eta = eta self.sstats = np.zeros(shape) @@ -191,6 +191,7 @@ class LdaModel(interfaces.TransformationABC, basemodel.BaseTopicModel): Model persistency is achieved through its `load`/`save` methods. """ + def __init__(self, corpus=None, num_topics=100, id2word=None, distributed=False, chunksize=2000, passes=1, update_every=1, alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10, @@ -412,7 +413,7 @@ def inference(self, chunk, collect_sstats=False): """ try: _ = len(chunk) - except: + except Exception: # convert iterators/generators to plain list, so we have len() etc. chunk = list(chunk) if len(chunk) > 1: @@ -588,7 +589,7 @@ def update(self, corpus, chunksize=None, decay=None, offset=None, try: lencorpus = len(corpus) - except: + except Exception: logger.warning("input corpus stream has no len(); counting documents") lencorpus = sum(1 for _ in corpus) if lencorpus == 0: diff --git a/gensim/models/ldamulticore.py b/gensim/models/ldamulticore.py index e6cdd4ccec..39c3c40666 100644 --- a/gensim/models/ldamulticore.py +++ b/gensim/models/ldamulticore.py @@ -77,6 +77,7 @@ class LdaMulticore(LdaModel): Model persistency is achieved through its `load`/`save` methods. """ + def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None, chunksize=2000, passes=1, batch=False, alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, @@ -125,7 +126,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None, `decay` and `offset` parameters are the same as Kappa and Tau_0 in Hoffman et al, respectively. 
- + `random_state` can be a numpy.random.RandomState object or the seed for one Example: @@ -145,10 +146,9 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None, super(LdaMulticore, self).__init__(corpus=corpus, num_topics=num_topics, id2word=id2word, chunksize=chunksize, passes=passes, alpha=alpha, eta=eta, decay=decay, offset=offset, eval_every=eval_every, iterations=iterations, - gamma_threshold=gamma_threshold, random_state=random_state, minimum_probability= minimum_probability, + gamma_threshold=gamma_threshold, random_state=random_state, minimum_probability=minimum_probability, minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics) - def update(self, corpus, chunks_as_numpy=False): """ Train the model with new documents, by EM-iterating over `corpus` until @@ -169,7 +169,7 @@ def update(self, corpus, chunks_as_numpy=False): """ try: lencorpus = len(corpus) - except: + except Exception: logger.warning("input corpus stream has no len(); counting documents") lencorpus = sum(1 for _ in corpus) if lencorpus == 0: @@ -249,7 +249,7 @@ def process_result_queue(force=False): process_result_queue() process_result_queue() - #endfor single corpus pass + # endfor single corpus pass # wait for all outstanding jobs to finish while queue_size[0] > 0: @@ -257,7 +257,7 @@ def process_result_queue(force=False): if reallen != lencorpus: raise RuntimeError("input corpus size changed during training (don't use generators as input)") - #endfor entire update + # endfor entire update pool.terminate() diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index f6a512e3b4..6399e17aae 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -32,6 +32,7 @@ logger = logging.getLogger('gensim.models.ldaseqmodel') + class LdaSeqModel(utils.SaveLoad): """ The constructor estimates Dynamic Topic Model parameters based @@ -90,7 +91,7 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ if corpus is not None: try: self.corpus_len = len(corpus) - except: + except Exception: logger.warning("input corpus stream has no len(); counting documents") self.corpus_len = sum(1 for _ in corpus) @@ -137,7 +138,6 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ # fit DTM self.fit_lda_seq(corpus, lda_inference_max_iter, em_min_iter, em_max_iter, chunksize) - def init_ldaseq_ss(self, topic_chain_variance, topic_obs_variance, alpha, init_suffstats): """ Method to initialize State Space Language Model, topic wise. @@ -152,7 +152,6 @@ def init_ldaseq_ss(self, topic_chain_variance, topic_obs_variance, alpha, init_s # ldaseq.topic_chains[k].w_phi_sum = np.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) # ldaseq.topic_chains[k].w_phi_sq = np.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) - def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter, chunksize): """ fit an lda sequence model: @@ -228,7 +227,6 @@ def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter, return bound - def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods, iter_, lda_inference_max_iter, chunksize): """ Inference or E- Step. 
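# --- editor's sketch (illustrative addition, not part of the diff) ---
# Constructing the LdaSeqModel whose E-step and fitting methods are tidied here;
# `time_slice` lists how many documents fall into each consecutive time period.
from gensim.corpora import Dictionary
from gensim.models.ldaseqmodel import LdaSeqModel

texts = [['bank', 'river', 'shore'], ['bank', 'loan', 'money'],
         ['river', 'water', 'flow'], ['loan', 'interest', 'money']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

ldaseq = LdaSeqModel(corpus=corpus, id2word=dictionary, time_slice=[2, 2], num_topics=2)
print(ldaseq.print_topics(time=0, top_terms=5))   # per-slice topics, see print_topics() below
# --- end of sketch ---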
@@ -252,17 +250,15 @@ def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods, iter_, lda_infe return bound, gammas - def inferDTMseq(self, corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound, lda_inference_max_iter, chunksize): """ Computes the likelihood of a sequential corpus under an LDA seq model, and return the likelihood bound. Need to pass the LdaSeq model, corpus, sufficient stats, gammas and lhoods matrices previously created, and LdaModel and LdaPost class objects. """ - doc_index = 0 # overall doc_index in corpus - time = 0 # current time-slice - doc_num = 0 # doc-index in current time-lice - num_topics = self.num_topics + doc_index = 0 # overall doc_index in corpus + time = 0 # current time-slice + doc_num = 0 # doc-index in current time-slice lda = self.make_lda_seq_slice(lda, time) # create lda_seq slice time_slice = np.cumsum(np.array(self.time_slice)) @@ -299,7 +295,6 @@ def inferDTMseq(self, corpus, topic_suffstats, gammas, lhoods, lda, ldapost, ite return bound, gammas - def make_lda_seq_slice(self, lda, time): """ set up the LDA model topic-word values with that of ldaseq. @@ -310,7 +305,6 @@ def make_lda_seq_slice(self, lda, time): lda.alpha = np.copy(self.alphas) return lda - def fit_lda_seq_topics(self, topic_suffstats): """ Fit lda sequence topic wise. @@ -325,7 +319,6 @@ def fit_lda_seq_topics(self, topic_suffstats): return lhood - def print_topic_times(self, topic, top_terms=20): """ Prints one topic showing each time-slice. @@ -336,17 +329,15 @@ def print_topic_times(self, topic, top_terms=20): return topics - def print_topics(self, time=0, top_terms=20): """ Prints all topics in a particular time-slice. """ - topics =[] + topics = [] for topic in range(0, self.num_topics): topics.append(self.print_topic(topic, time, top_terms)) return topics - def print_topic(self, topic, time=0, top_terms=20): """ Topic is the topic number @@ -361,7 +352,6 @@ def print_topic(self, topic, time=0, top_terms=20): beststr = [(self.id2word[id_], topic[id_]) for id_ in bestn] return beststr - def doc_topics(self, doc_number): """ On passing the LdaSeqModel trained ldaseq object, the doc_number of your document in the corpus, @@ -371,7 +361,6 @@ def doc_topics(self, doc_number): doc_topic /= doc_topic.sum(axis=1)[:, np.newaxis] return doc_topic[doc_number] - def dtm_vis(self, time, corpus): """ returns term_frequency, vocab, doc_lengths, topic-term distributions and doc_topic distributions, specified by pyLDAvis format. @@ -395,7 +384,6 @@ def dtm_vis(self, time, corpus): # these should be passed to the `pyLDAvis.prepare` method to visualise one time-slice of DTM topics. return doc_topic, np.array(topic_term), doc_lengths, term_frequency, vocab - def dtm_coherence(self, time): """ returns all topics of a particular time-slice without probabilitiy values for it to be used @@ -441,11 +429,12 @@ class sslm(utils.SaveLoad): `fwd_mean`, `fwd_variance` are the forward posterior values. 
`zeta` is an extra variational parameter with a value for each time-slice """ + def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_variance=0.5, chain_variance=0.005): self.vocab_len = vocab_len self.num_time_slices = num_time_slices self.obs_variance = obs_variance - self.chain_variance= chain_variance + self.chain_variance = chain_variance self.num_topics = num_topics # setting up matrices @@ -467,7 +456,6 @@ def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_va self.w_phi_l_sq = None self.m_update_coeff_g = None - def update_zeta(self): """ Updates the Zeta Variational Parameter. @@ -478,7 +466,6 @@ def update_zeta(self): self.zeta[j] = np.sum(np.exp(self.mean[:, j + 1] + self.variance[:, j + 1] / 2)) return self.zeta - def compute_post_variance(self, word, chain_variance): """ Based on the Variational Kalman Filtering approach for Approximate Inference [https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf] @@ -514,12 +501,11 @@ def compute_post_variance(self, word, chain_variance): if fwd_variance[t] > 0.0: c = np.power((fwd_variance[t] / (fwd_variance[t] + chain_variance)), 2) else: - c = 0 + c = 0 variance[t] = (c * (variance[t + 1] - chain_variance)) + ((1 - c) * fwd_variance[t]) return variance, fwd_variance - def compute_post_mean(self, word, chain_variance): """ Based on the Variational Kalman Filtering approach for Approximate Inference [https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf] @@ -555,7 +541,6 @@ def compute_post_mean(self, word, chain_variance): mean[t] = c * fwd_mean[t] + (1 - c) * mean[t + 1] return mean, fwd_mean - def compute_expected_log_prob(self): """ Compute the expected log probability given values of m. @@ -566,7 +551,6 @@ def compute_expected_log_prob(self): self.e_log_prob[w][t] = self.mean[w][t + 1] - np.log(self.zeta[t]) return self.e_log_prob - def sslm_counts_init(self, obs_variance, chain_variance, sstats): """ Initialize State Space Language Model with LDA sufficient statistics. @@ -595,7 +579,6 @@ def sslm_counts_init(self, obs_variance, chain_variance, sstats): self.zeta = self.update_zeta() self.e_log_prob = self.compute_expected_log_prob() - def fit_sslm(self, sstats): """ Fits variational distribution. @@ -643,7 +626,6 @@ def fit_sslm(self, sstats): self.e_log_prob = self.compute_expected_log_prob() return bound - def compute_bound(self, sstats, totals): """ Compute log probability bound. @@ -694,7 +676,6 @@ def compute_bound(self, sstats, totals): return val - def update_obs(self, sstats, totals): """ Function to perform optimization of obs. Parameters are suff_stats set up in the fit_sslm method. @@ -758,7 +739,6 @@ def update_obs(self, sstats, totals): return self.obs, self.zeta - def compute_mean_deriv(self, word, time, deriv): """ Used in helping find the optimum function. @@ -791,7 +771,6 @@ def compute_mean_deriv(self, word, time, deriv): return deriv - def compute_obs_deriv(self, word, word_counts, totals, mean_deriv_mtx, deriv): """ Derivation of obs which is used in derivative function [df_obs] while optimizing. 
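# --- editor's sketch (illustrative addition, not part of the diff) ---
# The update_zeta() step shown earlier, written out standalone: for each time
# slice t, zeta_t = sum_w exp(mean[w, t + 1] + variance[w, t + 1] / 2). The
# mean/variance matrices carry one extra leading column for the chain's starting
# state, which is why the loop indexes j + 1.
import numpy as np

vocab_len, num_time_slices = 4, 3
mean = np.zeros((vocab_len, num_time_slices + 1))
variance = np.full((vocab_len, num_time_slices + 1), 0.1)

zeta = np.zeros(num_time_slices)
for j in range(num_time_slices):
    zeta[j] = np.sum(np.exp(mean[:, j + 1] + variance[:, j + 1] / 2))
# --- end of sketch ---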
@@ -824,7 +803,7 @@ def compute_obs_deriv(self, word, word_counts, totals, mean_deriv_mtx, deriv): for u in range(1, T + 1): mean_u = mean[u] - variance_u_prev = variance[u - 1] + variance_u_prev = variance[u - 1] # noqa:F841 mean_u_prev = mean[u - 1] dmean_u = mean_deriv[u] dmean_u_prev = mean_deriv[u - 1] @@ -848,6 +827,7 @@ def compute_obs_deriv(self, word, word_counts, totals, mean_deriv_mtx, deriv): return deriv # endclass sslm + class LdaPost(utils.SaveLoad): """ @@ -876,7 +856,6 @@ def __init__(self, doc=None, lda=None, max_doc_len=None, num_topics=None, gamma= self.doc_weight = None self.renormalized_doc_weight = None - def update_phi(self, doc_number, time): """ Update variational multinomial parameters, based on a document and a time-slice. @@ -910,18 +889,17 @@ def update_phi(self, doc_number, time): phi_row = np.exp(log_phi_row) self.log_phi[n] = log_phi_row self.phi[n] = phi_row - n +=1 # increase iteration + n += 1 # increase iteration return self.phi, self.log_phi - def update_gamma(self): """ update variational dirichlet parameters as described in the original Blei LDA paper: gamma = alpha + sum(phi), over every topic for every word. """ self.gamma = np.copy(self.lda.alpha) - n = 0 # keep track of number of iterations for phi, log_phi + n = 0 # keep track of number of iterations for phi, log_phi for word_id, count in self.doc: phi_row = self.phi[n] for k in range(0, self.lda.num_topics): @@ -930,18 +908,16 @@ def update_gamma(self): return self.gamma - def init_lda_post(self): """ Initialize variational posterior, does not return anything. """ total = sum(count for word_id, count in self.doc) self.gamma.fill(self.lda.alpha[0] + float(total) / self.lda.num_topics) - self.phi[:len(self.doc),:] = 1.0 / self.lda.num_topics + self.phi[:len(self.doc), :] = 1.0 / self.lda.num_topics # doc_weight used during DIM # ldapost.doc_weight = None - def compute_lda_lhood(self): """ compute the likelihood bound @@ -959,7 +935,7 @@ def compute_lda_lhood(self): # influence_term = 0 digsum = digamma(gamma_sum) - model = "DTM" + model = "DTM" # noqa:F841 for k in range(0, num_topics): # below code only to be used in DIM mode # if ldapost.doc_weight is not None and (model == "DIM" or model == "fixed"): @@ -1034,7 +1010,6 @@ def fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED=1e-8, return lhood - def update_lda_seq_ss(self, time, doc, topic_suffstats): """ Update lda sequence sufficient statistics from an lda posterior. @@ -1085,7 +1060,7 @@ def f_obs(x, *args): for t in range(1, T + 1): mean_t = mean[t] mean_t_prev = mean[t - 1] - var_t_prev = variance[t - 1] + var_t_prev = variance[t - 1] # noqa:F841 val = mean_t - mean_t_prev term1 += val * val @@ -1107,8 +1082,8 @@ def f_obs(x, *args): return final -def df_obs(x, *args): +def df_obs(x, *args): """ Derivative of function which optimises obs. 
""" @@ -1121,6 +1096,6 @@ def df_obs(x, *args): if model == "DTM": deriv = sslm.compute_obs_deriv(word, word_counts, totals, mean_deriv_mtx, deriv) elif model == "DIM": - deriv = sslm.compute_obs_deriv_fixed(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, deriv) + deriv = sslm.compute_obs_deriv_fixed(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, deriv) # noqa:F821 return np.negative(deriv) diff --git a/gensim/models/lsi_dispatcher.py b/gensim/models/lsi_dispatcher.py index 8c4fb78dd3..5a69327522 100755 --- a/gensim/models/lsi_dispatcher.py +++ b/gensim/models/lsi_dispatcher.py @@ -15,7 +15,11 @@ from __future__ import with_statement -import os, sys, logging, threading, time +import os +import sys +import logging +import threading +import time from six import iteritems, itervalues try: from Queue import Queue @@ -37,8 +41,7 @@ # timeout for the Queue object put/get blocking methods. # it should really be infinity, but then keyboard interrupts don't work. # so this is really just a hack, see http://bugs.python.org/issue1360 -HUGE_TIMEOUT = 365 * 24 * 60 * 60 # one year - +HUGE_TIMEOUT = 365 * 24 * 60 * 60 # one year class Dispatcher(object): @@ -55,7 +58,7 @@ def __init__(self, maxsize=0): """ self.maxsize = maxsize self.workers = {} - self.callback = None # a pyro proxy to this object (unknown at init time, but will be set later) + self.callback = None # a pyro proxy to this object (unknown at init time, but will be set later) @Pyro4.expose def initialize(self, **model_params): @@ -71,7 +74,7 @@ def initialize(self, **model_params): # locate all available workers and store their proxies, for subsequent RMI calls self.workers = {} with utils.getNS() as ns: - self.callback = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher') # = self + self.callback = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher') # = self for name, uri in iteritems(ns.list(prefix='gensim.lsi_worker')): try: worker = Pyro4.Proxy(uri) @@ -115,7 +118,7 @@ def getstate(self): logger.info("end of input, assigning all remaining jobs") logger.debug("jobs done: %s, jobs received: %s" % (self._jobsdone, self._jobsreceived)) while self._jobsdone < self._jobsreceived: - time.sleep(0.5) # check every half a second + time.sleep(0.5) # check every half a second # TODO: merge in parallel, so that we're done in `log_2(workers)` merges, # and not `workers - 1` merges! 
@@ -156,14 +159,12 @@ def jobdone(self, workerid): self._jobsdone += 1 logger.info("worker #%s finished job #%i" % (workerid, self._jobsdone)) worker = self.workers[workerid] - worker.requestjob() # tell the worker to ask for another job, asynchronously (one-way) - + worker.requestjob() # tell the worker to ask for another job, asynchronously (one-way) def jobsdone(self): """Wrap self._jobsdone, needed for remote access through proxies""" return self._jobsdone - @Pyro4.oneway def exit(self): """ @@ -173,9 +174,8 @@ def exit(self): logger.info("terminating worker %s" % workerid) worker.exit() logger.info("terminating dispatcher") - os._exit(0) # exit the whole process (not just this thread ala sys.exit()) -#endclass Dispatcher - + os._exit(0) # exit the whole process (not just this thread ala sys.exit()) +# endclass Dispatcher def main(): @@ -197,6 +197,5 @@ def main(): logger.info("finished running %s" % program) - if __name__ == '__main__': main() diff --git a/gensim/models/lsi_worker.py b/gensim/models/lsi_worker.py index b9de939962..4cae372ffd 100755 --- a/gensim/models/lsi_worker.py +++ b/gensim/models/lsi_worker.py @@ -17,7 +17,9 @@ from __future__ import with_statement -import os, sys, logging +import os +import sys +import logging import threading import tempfile try: @@ -31,8 +33,7 @@ logger = logging.getLogger('gensim.models.lsi_worker') -SAVE_DEBUG = 0 # save intermediate models after every SAVE_DEBUG updates (0 for never) - +SAVE_DEBUG = 0 # save intermediate models after every SAVE_DEBUG updates (0 for never) class Worker(object): @@ -42,8 +43,8 @@ def __init__(self): @Pyro4.expose def initialize(self, myid, dispatcher, **model_params): self.lock_update = threading.Lock() - self.jobsdone = 0 # how many jobs has this worker completed? - self.myid = myid # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove? + self.jobsdone = 0 # how many jobs has this worker completed? + self.myid = myid # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove? 
self.dispatcher = dispatcher self.finished = False logger.info("initializing worker #%s" % myid) @@ -72,7 +73,6 @@ def requestjob(self): else: logger.info("worker #%i stopping asking for jobs" % self.myid) - @utils.synchronous('lock_update') def processjob(self, job): self.model.add_documents(job) @@ -97,17 +97,15 @@ def reset(self): self.model.projection = self.model.projection.empty_like() self.finished = False - @Pyro4.oneway def exit(self): logger.info("terminating worker #%i" % self.myid) os._exit(0) -#endclass Worker - +# endclass Worker def main(): - logging.basicConfig(format = '%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) logger.info("running %s" % " ".join(sys.argv)) program = os.path.basename(sys.argv[0]) @@ -121,6 +119,5 @@ def main(): logger.info("finished running %s" % program) - if __name__ == '__main__': main() diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index 8e326fdc6c..f9e16cd23b 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -218,7 +218,7 @@ def merge(self, other, decay=1.0): self.u[:, i] *= -1.0 # diff = np.dot(self.u.T, self.u) - np.eye(self.u.shape[1]) # logger.info('orth error after=%f' % np.sum(diff * diff)) -#endclass Projection +# endclass Projection class LsiModel(interfaces.TransformationABC, basemodel.BaseTopicModel): @@ -242,6 +242,7 @@ class LsiModel(interfaces.TransformationABC, basemodel.BaseTopicModel): .. [2] https://github.com/piskvorky/gensim/wiki/Recipes-&-FAQ#q4-how-do-you-output-the-u-s-vt-matrices-of-lsi """ + def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000, decay=1.0, distributed=False, onepass=True, power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS): @@ -328,7 +329,6 @@ def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000, if corpus is not None: self.add_documents(corpus) - def add_documents(self, corpus, chunksize=None, decay=None): """ Update singular value decomposition to take into account a new @@ -581,7 +581,7 @@ def load(cls, fname, *args, **kwargs): except Exception as e: logging.warning("failed to load projection from %s: %s" % (projection_fname, e)) return result -#endclass LsiModel +# endclass LsiModel def print_debug(id2token, u, s, topics, num_words=10, num_neg=None): diff --git a/gensim/models/normmodel.py b/gensim/models/normmodel.py index 07bdcce650..a78dc604dc 100644 --- a/gensim/models/normmodel.py +++ b/gensim/models/normmodel.py @@ -29,6 +29,7 @@ class NormModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods """ + def __init__(self, corpus=None, norm='l2'): """ Compute the 'l1' or 'l2' normalization by normalizing separately @@ -72,4 +73,4 @@ def normalize(self, bow): def __getitem__(self, bow): return self.normalize(bow) -#endclass NormModel +# endclass NormModel diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 33390fc08e..1f0826258c 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -107,6 +107,7 @@ class Phrases(interfaces.TransformationABC): and `phrases[corpus]` syntax. """ + def __init__(self, sentences=None, min_count=5, threshold=10.0, max_vocab_size=40000000, delimiter=b'_', progress_per=10000, scoring='default'): @@ -390,6 +391,7 @@ class Phraser(interfaces.TransformationABC): other values.) 
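# --- editor's sketch (illustrative addition, not part of the diff) ---
# The Phrases -> Phraser hand-off described in the docstring above: Phrases
# accumulates bigram statistics, while Phraser keeps only the state needed to
# apply them via __getitem__.
from gensim.models import Phrases
from gensim.models.phrases import Phraser

sentences = [['new', 'york', 'taxi'], ['new', 'york', 'subway'], ['paris', 'metro']] * 5
bigram = Phrases(sentences, min_count=1, threshold=0.5)
bigram_phraser = Phraser(bigram)
print(bigram_phraser[['new', 'york', 'city']])   # -> ['new_york', 'city'] once the pair's score clears the threshold
# --- end of sketch ---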
""" + def __init__(self, phrases_model): self.threshold = phrases_model.threshold self.min_count = phrases_model.min_count @@ -460,7 +462,7 @@ def __getitem__(self, sentence): sys.exit(1) infile = sys.argv[1] - from gensim.models import Phrases # for pickle + from gensim.models import Phrases # noqa:F811 for pickle from gensim.models.word2vec import Text8Corpus sentences = Text8Corpus(infile) diff --git a/gensim/models/rpmodel.py b/gensim/models/rpmodel.py index e4fef5acb4..1186a041a0 100644 --- a/gensim/models/rpmodel.py +++ b/gensim/models/rpmodel.py @@ -34,6 +34,7 @@ class RpModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods. """ + def __init__(self, corpus, id2word=None, num_topics=300): """ `id2word` is a mapping from word ids (integers) to words (strings). It is @@ -94,4 +95,4 @@ def __getitem__(self, bow): def __setstate__(self, state): self.__dict__ = state self.freshly_loaded = True -#endclass RpModel +# endclass RpModel diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 1b4aea863b..4b5ba02e02 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -48,6 +48,7 @@ class TfidfModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods. """ + def __init__( self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity, wglobal=df2idf, normalize=True): @@ -101,11 +102,9 @@ def __init__( # be initialized in some other way pass - def __str__(self): return "TfidfModel(num_docs=%s, num_nnz=%s)" % (self.num_docs, self.num_nnz) - def initialize(self, corpus): """ Compute inverse document weights, which will be used to modify term @@ -133,7 +132,6 @@ def initialize(self, corpus): self.num_docs, n_features, self.num_nnz) self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) - def __getitem__(self, bow, eps=1e-12): """ Return tf-idf representation of the input vector and/or corpus. 
@@ -160,4 +158,4 @@ def __getitem__(self, bow, eps=1e-12): # make sure there are no explicit zeroes in the vector (must be sparse) vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps] return vector -#endclass TfidfModel +# endclass TfidfModel diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index a37de25158..255b9c553f 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -115,17 +115,15 @@ from Queue import Queue, Empty from numpy import exp, log, dot, zeros, outer, random, dtype, float32 as REAL,\ - double, uint32, seterr, array, uint8, vstack, fromstring, sqrt, newaxis,\ - ndarray, empty, sum as np_sum, prod, ones, ascontiguousarray, vstack, logaddexp + uint32, seterr, array, uint8, vstack, fromstring, sqrt,\ + empty, sum as np_sum, ones, logaddexp from scipy.special import expit from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc -from gensim.corpora.dictionary import Dictionary from six import iteritems, itervalues, string_types from six.moves import xrange from types import GeneratorType -from scipy import stats logger = logging.getLogger(__name__) @@ -215,7 +213,7 @@ def score_sentence_sg(model, sentence, work=None): # now go over all words from the window, predicting each one in turn start = max(0, pos - model.window) - for pos2, word2 in enumerate(word_vocabs[start : pos + model.window + 1], start): + for pos2, word2 in enumerate(word_vocabs[start: pos + model.window + 1], start): # don't train on OOV words and on the `word` itself if word2 is not None and pos2 != pos: log_prob_sentence += score_sg_pair(model, word, word2) @@ -372,7 +370,6 @@ def score_cbow_pair(model, word, l1): return sum(lprob) - class Word2Vec(utils.SaveLoad): """ Class for training, using and evaluating neural networks described in https://code.google.com/p/word2vec/ @@ -503,10 +500,10 @@ def __init__( self.build_vocab(sentences, trim_rule=trim_rule) self.train(sentences, total_examples=self.corpus_count, epochs=self.iter, start_alpha=self.alpha, end_alpha=self.min_alpha) - else : - if trim_rule is not None : + else: + if trim_rule is not None: logger.warning("The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. ") - logger.warning("Model initialized without sentences. trim_rule provided, if any, will be ignored." ) + logger.warning("Model initialized without sentences. trim_rule provided, if any, will be ignored.") def initialize_word_vectors(self): self.wv = KeyedVectors() @@ -1140,12 +1137,12 @@ def update_weights(self): # randomize the remaining words for i in xrange(len(self.wv.syn0), len(self.wv.vocab)): # construct deterministic seed from word AND seed argument - newsyn0[i-len(self.wv.syn0)] = self.seeded_vector(self.wv.index2word[i] + str(self.seed)) + newsyn0[i - len(self.wv.syn0)] = self.seeded_vector(self.wv.index2word[i] + str(self.seed)) # Raise an error if an online update is run before initial training on a corpus if not len(self.wv.syn0): - raise RuntimeError("You cannot do an online vocabulary-update of a model which has no prior vocabulary. " \ - "First build the vocabulary of your model with a corpus " \ + raise RuntimeError("You cannot do an online vocabulary-update of a model which has no prior vocabulary. 
" + "First build the vocabulary of your model with a corpus " "before doing an online update.") self.wv.syn0 = vstack([self.wv.syn0, newsyn0]) @@ -1322,10 +1319,10 @@ def predict_output_word(self, context_words_list, topn=10): if word2_indices and self.cbow_mean: l1 /= len(word2_indices) - prob_values = exp(dot(l1, self.syn1neg.T)) # propagate hidden -> output and take softmax to get probabilities + prob_values = exp(dot(l1, self.syn1neg.T)) # propagate hidden -> output and take softmax to get probabilities prob_values /= sum(prob_values) top_indices = matutils.argsort(prob_values, topn=topn, reverse=True) - return [(self.wv.index2word[index1], prob_values[index1]) for index1 in top_indices] #returning the most probable output words with their probabilities + return [(self.wv.index2word[index1], prob_values[index1]) for index1 in top_indices] # returning the most probable output words with their probabilities def init_sims(self, replace=False): """ @@ -1377,7 +1374,7 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case def __str__(self): return "%s(vocab=%s, size=%s, alpha=%s)" % (self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha) - def _minimize_model(self, save_syn1 = False, save_syn1neg = False, save_syn0_lockf = False): + def _minimize_model(self, save_syn1=False, save_syn1neg=False, save_syn0_lockf=False): warnings.warn("This method would be deprecated in the future. Keep just_word_vectors = model.wv to retain just the KeyedVectors instance for read-only querying of word vectors.") if save_syn1 and save_syn1neg and save_syn0_lockf: return @@ -1459,6 +1456,7 @@ def get_latest_training_loss(self): class BrownCorpus(object): """Iterate over sentences from the Brown corpus (part of NLTK data).""" + def __init__(self, dirname): self.dirname = dirname @@ -1481,6 +1479,7 @@ def __iter__(self): class Text8Corpus(object): """Iterate over sentences from the "text8" corpus, unzipped from http://mattmahoney.net/dc/text8.zip .""" + def __init__(self, fname, max_sentence_length=MAX_WORDS_IN_BATCH): self.fname = fname self.max_sentence_length = max_sentence_length @@ -1541,7 +1540,7 @@ def __iter__(self): line = utils.to_unicode(line).split() i = 0 while i < len(line): - yield line[i : i + self.max_sentence_length] + yield line[i: i + self.max_sentence_length] i += self.max_sentence_length except AttributeError: # If it didn't work like a file, use it as a string filename @@ -1550,7 +1549,7 @@ def __iter__(self): line = utils.to_unicode(line).split() i = 0 while i < len(line): - yield line[i:i + self.max_sentence_length] + yield line[i: i + self.max_sentence_length] i += self.max_sentence_length @@ -1619,7 +1618,7 @@ def __iter__(self): print(globals()['__doc__'] % locals()) sys.exit(1) - from gensim.models.word2vec import Word2Vec # avoid referencing __main__ in pickle + from gensim.models.word2vec import Word2Vec # noqa:F811 avoid referencing __main__ in pickle seterr(all='raise') # don't ignore numpy errors diff --git a/gensim/models/wrappers/__init__.py b/gensim/models/wrappers/__init__.py index a833ae0c58..9cd14ea8e7 100644 --- a/gensim/models/wrappers/__init__.py +++ b/gensim/models/wrappers/__init__.py @@ -2,9 +2,9 @@ This package contains wrappers for other topic modeling programs. 
""" -from .ldamallet import LdaMallet -from .dtmmodel import DtmModel -from .ldavowpalwabbit import LdaVowpalWabbit -from .fasttext import FastText -from .wordrank import Wordrank -from .varembed import VarEmbed \ No newline at end of file +from .ldamallet import LdaMallet # noqa:F401 +from .dtmmodel import DtmModel # noqa:F401 +from .ldavowpalwabbit import LdaVowpalWabbit # noqa:F401 +from .fasttext import FastText # noqa:F401 +from .wordrank import Wordrank # noqa:F401 +from .varembed import VarEmbed # noqa:F401 diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py index 5eff091417..bc02663e04 100644 --- a/gensim/models/wrappers/dtmmodel.py +++ b/gensim/models/wrappers/dtmmodel.py @@ -27,7 +27,6 @@ import os from subprocess import PIPE import numpy as np -import six from gensim import utils, corpora, matutils from gensim.utils import check_output @@ -89,7 +88,7 @@ def __init__( try: lencorpus = len(corpus) - except: + except Exception: logger.warning("input corpus stream has no len(); counting documents") lencorpus = sum(1 for _ in corpus) if lencorpus == 0: @@ -222,9 +221,9 @@ def train(self, corpus, time_slices, mode, model): self.obs_ = np.zeros((self.num_topics, self.num_terms * len(self.time_slices))) for t in range(self.num_topics): - topic = "%03d" % t - self.lambda_[t, :] = np.loadtxt(self.fout_prob().format(i=topic)) - self.obs_[t, :] = np.loadtxt(self.fout_observations().format(i=topic)) + topic = "%03d" % t + self.lambda_[t, :] = np.loadtxt(self.fout_prob().format(i=topic)) + self.obs_[t, :] = np.loadtxt(self.fout_observations().format(i=topic)) # cast to correct shape, lambda[5,10,0] is the proportion of the 10th # topic in doc 5 at time 0 self.lambda_.shape = (self.num_topics, self.num_terms, len(self.time_slices)) @@ -256,7 +255,7 @@ def show_topics(self, num_topics=10, times=5, num_words=10, log=False, formatted else: num_topics = min(num_topics, self.num_topics) chosen_topics = range(num_topics) - # add a little random jitter, to randomize results around the same + # add a little random jitter, to randomize results around the same # alpha # sort_alpha = self.alpha + 0.0001 * \ # numpy.random.rand(len(self.alpha)) @@ -320,7 +319,7 @@ def dtm_vis(self, corpus, time): all of these are needed to visualise topics for DTM for a particular time-slice via pyLDAvis. input parameter is the year to do the visualisation. """ - topic_term = np.exp(self.lambda_[:,:,time]) / np.exp(self.lambda_[:,:,time]).sum() + topic_term = np.exp(self.lambda_[:, :, time]) / np.exp(self.lambda_[:, :, time]).sum() topic_term = topic_term * self.num_topics doc_topic = self.gamma_ @@ -339,7 +338,7 @@ def dtm_vis(self, corpus, time): def dtm_coherence(self, time, num_words=20): """ - returns all topics of a particular time-slice without probabilitiy values for it to be used + returns all topics of a particular time-slice without probabilitiy values for it to be used for either "u_mass" or "c_v" coherence. TODO: because of print format right now can only return for 1st time-slice. 
diff --git a/gensim/models/wrappers/fasttext.py b/gensim/models/wrappers/fasttext.py index 9f68d67ca0..839cb46633 100644 --- a/gensim/models/wrappers/fasttext.py +++ b/gensim/models/wrappers/fasttext.py @@ -31,6 +31,7 @@ import tempfile import os import struct +from six.moves import xrange import numpy as np from numpy import float32 as REAL, sqrt, newaxis @@ -38,8 +39,6 @@ from gensim.models.keyedvectors import KeyedVectors, Vocab from gensim.models.word2vec import Word2Vec -from six import string_types - logger = logging.getLogger(__name__) FASTTEXT_FILEFORMAT_MAGIC = 793712314 @@ -52,6 +51,7 @@ class FastTextKeyedVectors(KeyedVectors): Subclasses KeyedVectors to implement oov lookups, storing ngrams and other FastText specific methods """ + def __init__(self): super(FastTextKeyedVectors, self).__init__() self.syn0_all_norm = None @@ -90,7 +90,7 @@ def word_vec(self, word, use_norm=False): word_vec += ngram_weights[self.ngrams[ngram]] if word_vec.any(): return word_vec / len(ngrams) - else: # No ngrams of the word are present in self.ngrams + else: # No ngrams of the word are present in self.ngrams raise KeyError('all ngrams for word %s absent from model' % word) def init_sims(self, replace=False): @@ -212,7 +212,7 @@ def train(cls, ft_path, corpus_file, output_file=None, model='cbow', size=100, a cmd.append("-%s" % option) cmd.append(str(value)) - output = utils.check_output(args=cmd) + output = utils.check_output(args=cmd) # noqa:F841 model = cls.load_fasttext_format(output_file) cls.delete_training_files(output_file) return model @@ -390,7 +390,6 @@ def init_ngrams(self): @staticmethod def compute_ngrams(word, min_n, max_n): - ngram_indices = [] BOW, EOW = ('<', '>') # Used by FastText to attach to all words as prefix and suffix extended_word = BOW + word + EOW ngrams = [] @@ -414,4 +413,3 @@ def ft_hash(string): h = h * np.uint32(16777619) np.seterr(**old_settings) return h - diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py index c93af78a1a..a4e435810f 100644 --- a/gensim/models/wrappers/ldamallet.py +++ b/gensim/models/wrappers/ldamallet.py @@ -53,6 +53,7 @@ class LdaMallet(utils.SaveLoad, basemodel.BaseTopicModel): takes place by passing around data files on disk and calling Java with subprocess.call(). """ + def __init__(self, mallet_path, corpus=None, num_topics=100, alpha=50, id2word=None, workers=4, prefix=None, optimize_interval=0, iterations=1000, topic_threshold=0.0): """ @@ -84,7 +85,7 @@ def __init__(self, mallet_path, corpus=None, num_topics=100, alpha=50, id2word=N if self.num_terms == 0: raise ValueError("cannot compute LDA over an empty collection (no terms)") self.num_topics = num_topics - self.topic_threshold=topic_threshold + self.topic_threshold = topic_threshold self.alpha = alpha if prefix is None: rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_' @@ -193,7 +194,7 @@ def load_word_topics(self): _ = next(fin) # header self.alpha = numpy.array([float(val) for val in next(fin).split()[2:]]) assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. 
requested topics" - _ = next(fin) # beta + _ = next(fin) # noqa:F841 beta for lineno, line in enumerate(fin): line = utils.to_unicode(line) doc, source, pos, typeindex, token, topic = line.split(" ") @@ -233,9 +234,9 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): chosen_topics = range(num_topics) else: num_topics = min(num_topics, self.num_topics) - sort_alpha = self.alpha + 0.0001 * numpy.random.rand(len(self.alpha)) # add a little random jitter, to randomize results around the same alpha + sort_alpha = self.alpha + 0.0001 * numpy.random.rand(len(self.alpha)) # add a little random jitter, to randomize results around the same alpha sorted_topics = list(matutils.argsort(sort_alpha)) - chosen_topics = sorted_topics[:num_topics//2] + sorted_topics[-num_topics//2 : ] + chosen_topics = sorted_topics[: num_topics // 2] + sorted_topics[-num_topics // 2:] shown = [] for i in chosen_topics: if formatted: @@ -288,8 +289,6 @@ def get_version(self, direc_path): except Exception: return "Can't parse pom.xml version file" - - def read_doctopics(self, fname, eps=1e-6, renorm=True): """ Yield document topic vectors from MALLET's "doc-topics" format, as sparse gensim vectors. diff --git a/gensim/models/wrappers/ldavowpalwabbit.py b/gensim/models/wrappers/ldavowpalwabbit.py index 8fd0582bae..afa19c4327 100644 --- a/gensim/models/wrappers/ldavowpalwabbit.py +++ b/gensim/models/wrappers/ldavowpalwabbit.py @@ -76,6 +76,7 @@ class LdaVowpalWabbit(utils.SaveLoad): between Vowpal Wabbit and Python takes place by passing around data files on disk and calling the 'vw' binary with the subprocess module. """ + def __init__(self, vw_path, corpus=None, num_topics=100, id2word=None, chunksize=256, passes=1, alpha=0.1, eta=0.1, decay=0.5, offset=1, gamma_threshold=0.001, random_seed=None, @@ -197,7 +198,7 @@ def train(self, corpus): _run_vw_command(cmd) - # ensure that future updates of this model use correct offset + # ensure that future updates of this model use correct offset self.offset += corpus_size def update(self, corpus): @@ -216,7 +217,7 @@ def update(self, corpus): _run_vw_command(cmd) - # ensure that future updates of this model use correct offset + # ensure that future updates of this model use correct offset self.offset += corpus_size def log_perplexity(self, chunk): @@ -312,7 +313,7 @@ def load(cls, fname, *args, **kwargs): LOG.debug("Writing model bytes to '%s'", lda_vw._model_filename) with utils.smart_open(lda_vw._model_filename, 'wb') as fhandle: fhandle.write(lda_vw._model_data) - lda_vw._model_data = None # no need to keep in memory after this + lda_vw._model_data = None # no need to keep in memory after this if lda_vw._topics_data: LOG.debug("Writing topic bytes to '%s'", lda_vw._topics_filename) @@ -336,11 +337,11 @@ def _init_temp_dir(self, prefix='tmp'): def _get_vw_predict_command(self, corpus_size): """Get list of command line arguments for running prediction.""" cmd = [self.vw_path, - '--testonly', # don't update model with this data + '--testonly', # don't update model with this data '--lda_D', str(corpus_size), - '-i', self._model_filename, # load existing binary model + '-i', self._model_filename, # load existing binary model '-d', self._corpus_filename, - '--learning_rate', '0', # possibly not needed, but harmless + '--learning_rate', '0', # possibly not needed, but harmless '-p', self._predict_filename] if self.random_seed is not None: @@ -364,7 +365,7 @@ def _get_vw_train_command(self, corpus_size, update=False): '--cache_file', self._cache_filename, 
'--lda_epsilon', str(self.gamma_threshold), '--readable_model', self._topics_filename, - '-k', # clear cache + '-k', # clear cache '-f', self._model_filename] if update: @@ -570,6 +571,7 @@ def _bit_length(num): """Return number of bits needed to encode given number.""" return len(bin(num).lstrip('-0b')) + def vwmodel2ldamodel(vw_model, iterations=50): """ Function to convert vowpal wabbit model to gensim LdaModel. This works by diff --git a/gensim/models/wrappers/varembed.py b/gensim/models/wrappers/varembed.py index 6012be2233..eab1e0217c 100644 --- a/gensim/models/wrappers/varembed.py +++ b/gensim/models/wrappers/varembed.py @@ -63,7 +63,7 @@ def load_varembed_format(cls, vectors, morfessor_model=None): morpho_embeddings = D['morpheme_embeddings'] result.load_word_embeddings(word_embeddings, word_to_ix) if morfessor_model: - if sys.version_info >= (2, 7): #Morfessor is only supported for Python 2.7 and above. + if sys.version_info >= (2, 7): # Morfessor is only supported for Python 2.7 and above. try: import morfessor morfessor_model = morfessor.MorfessorIO().read_binary_model_file(morfessor_model) @@ -90,7 +90,7 @@ def load_word_embeddings(self, word_embeddings, word_to_ix): self.vocab_size = len(counts) self.vector_size = word_embeddings.shape[1] self.syn0 = np.zeros((self.vocab_size, self.vector_size)) - self.index2word = [None]*self.vocab_size + self.index2word = [None] * self.vocab_size logger.info("Corpus has %i words", len(self.vocab)) for word_id, word in enumerate(counts): self.vocab[word] = Vocab(index=word_id, count=counts[word]) @@ -99,7 +99,6 @@ def load_word_embeddings(self, word_embeddings, word_to_ix): assert((len(self.vocab), self.vector_size) == self.syn0.shape) logger.info("Loaded matrix of %d size and %d dimensions", self.vocab_size, self.vector_size) - def add_morphemes_to_embeddings(self, morfessor_model, morpho_embeddings, morpho_to_ix): """ Method to include morpheme embeddings into varembed vectors Allowed only in Python versions 2.7 and above. @@ -109,4 +108,3 @@ def add_morphemes_to_embeddings(self, morfessor_model, morpho_embeddings, morpho [morpho_embeddings[morpho_to_ix.get(m, -1)] for m in morfessor_model.viterbi_segment(word)[0]]).sum(axis=0) self.syn0[self.vocab[word].index] += morpheme_embedding logger.info("Added morphemes to word vectors") - diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py index 38ab25f92b..8426af1d82 100644 --- a/gensim/models/wrappers/wordrank.py +++ b/gensim/models/wrappers/wordrank.py @@ -21,17 +21,13 @@ import logging import os -import sys import copy import multiprocessing -import numpy as np - from gensim import utils from gensim.models.keyedvectors import KeyedVectors from gensim.scripts.glove2word2vec import glove2word2vec -from six import string_types from smart_open import smart_open from shutil import copyfile, rmtree @@ -45,7 +41,7 @@ class Wordrank(KeyedVectors): takes place by working with data files on disk and calling the Wordrank binary and glove's helper binaries (for preparing training data) with subprocess module. 
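# --- editor's sketch (illustrative addition, not part of the diff) ---
# Loading VarEmbed vectors through the wrapper touched above. The pickle path is
# hypothetical; passing a `morfessor_model` additionally requires Python >= 2.7
# and the morfessor package, per the version check above. Since the wrapper fills
# vocab/syn0/index2word, KeyedVectors-style queries should work on the result.
from gensim.models.wrappers import VarEmbed

varembed_vectors = 'varembed_vectors.pkl'   # hypothetical path to a VarEmbed output file
model = VarEmbed.load_varembed_format(vectors=varembed_vectors)
print(model.most_similar('language', topn=5))
# --- end of sketch ---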
""" - + @classmethod def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0, sgd_num=100, lrate=0.001, period=10, iter=90, epsilon=0.75, dump_period=10, reg=0, alpha=100, @@ -126,7 +122,7 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, with smart_open(meta_file, 'wb') as f: meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format(numwords, numwords, numlines, cooccurrence_shuf_file.split('/')[-1], numwords, vocab_file.split('/')[-1]) f.write(meta_info.encode('utf-8')) - + if iter % dump_period == 0: iter += 1 else: @@ -162,7 +158,7 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, cmd.append('--%s' % option) cmd.append(str(value)) logger.info("Running wordrank binary") - output = utils.check_output(args=cmd) + output = utils.check_output(args=cmd) # noqa:F841 # use embeddings from max. iteration's dump max_iter_dump = iter - (iter % dump_period) @@ -176,7 +172,7 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, @classmethod def load_wordrank_model(cls, model_file, vocab_file=None, context_file=None, sorted_vocab=1, ensemble=1): - glove2word2vec(model_file, model_file+'.w2vformat') + glove2word2vec(model_file, model_file + '.w2vformat') model = cls.load_word2vec_format('%s.w2vformat' % model_file) if ensemble and context_file: model.ensemble_embedding(model_file, context_file) @@ -209,7 +205,7 @@ def sort_embeddings(self, vocab_file): def ensemble_embedding(self, word_embedding, context_embedding): """Replace syn0 with the sum of context and word embeddings.""" - glove2word2vec(context_embedding, context_embedding+'.w2vformat') + glove2word2vec(context_embedding, context_embedding + '.w2vformat') w_emb = KeyedVectors.load_word2vec_format('%s.w2vformat' % word_embedding) c_emb = KeyedVectors.load_word2vec_format('%s.w2vformat' % context_embedding) # compare vocab words using keys of dict vocab @@ -223,4 +219,3 @@ def ensemble_embedding(self, word_embedding, context_embedding): new_emb = w_emb.syn0 + c_emb.syn0 self.syn0 = new_emb return new_emb - diff --git a/gensim/nosy.py b/gensim/nosy.py index 3536965b68..2913e1e694 100644 --- a/gensim/nosy.py +++ b/gensim/nosy.py @@ -24,7 +24,7 @@ EXTENSIONS = ['*.py'] EXECUTABLE = 'nosetests test/' -DEFAULTARGS = '--with-color -exe'# -w tests' +DEFAULTARGS = '--with-color -exe' # -w tests' def checkSum(): @@ -39,6 +39,7 @@ def checkSum(): val += stats[stat.ST_SIZE] + stats[stat.ST_MTIME] return val + if __name__ == '__main__': val = 0 try: diff --git a/gensim/parsing/__init__.py b/gensim/parsing/__init__.py index 2af45d7477..5dcc010aec 100644 --- a/gensim/parsing/__init__.py +++ b/gensim/parsing/__init__.py @@ -3,5 +3,9 @@ """ # bring model classes directly into package namespace, to save some typing -from .porter import PorterStemmer -from .preprocessing import * +from .porter import PorterStemmer # noqa:F401 +from .preprocessing import (remove_stopwords, strip_punctuation, strip_punctuation2, # noqa:F401 + strip_tags, strip_short, strip_numeric, + strip_non_alphanum, strip_multiple_whitespaces, + split_alphanum, stem_text, preprocess_string, + preprocess_documents, read_file, read_files) diff --git a/gensim/parsing/porter.py b/gensim/parsing/porter.py index 539271f58c..a22b8b94d1 100644 --- a/gensim/parsing/porter.py +++ b/gensim/parsing/porter.py @@ -103,7 +103,7 @@ def _vowelinstem(self): def _doublec(self, j): """True <=> j,(j-1) contain a double consonant.""" - return j > 0 and self.b[j] == 
self.b[j-1] and self._cons(j) + return j > 0 and self.b[j] == self.b[j - 1] and self._cons(j) def _cvc(self, i): """True <=> i-2,i-1,i has the form consonant - vowel - consonant @@ -113,25 +113,25 @@ def _cvc(self, i): cav(e), lov(e), hop(e), crim(e), but snow, box, tray. """ - if i < 2 or not self._cons(i) or self._cons(i-1) or not self._cons(i-2): + if i < 2 or not self._cons(i) or self._cons(i - 1) or not self._cons(i - 2): return False return self.b[i] not in "wxy" def _ends(self, s): """True <=> 0,...k ends with the string s.""" - if s[-1] != self.b[self.k]: # tiny speed-up + if s[-1] != self.b[self.k]: # tiny speed-up return 0 length = len(s) if length > (self.k + 1): return 0 - if self.b[self.k-length+1:self.k+1] != s: + if self.b[self.k - length + 1:self.k + 1] != s: return 0 self.j = self.k - length return 1 def _setto(self, s): """Set (j+1),...k to the characters in the string s, adjusting k.""" - self.b = self.b[:self.j+1] + s + self.b = self.b[:self.j + 1] + s self.k = len(self.b) - 1 def _r(self, s): @@ -171,9 +171,12 @@ def _step1ab(self): self.k -= 1 elif (self._ends("ed") or self._ends("ing")) and self._vowelinstem(): self.k = self.j - if self._ends("at"): self._setto("ate") - elif self._ends("bl"): self._setto("ble") - elif self._ends("iz"): self._setto("ize") + if self._ends("at"): + self._setto("ate") + elif self._ends("bl"): + self._setto("ble") + elif self._ends("iz"): + self._setto("ize") elif self._doublec(self.k): if self.b[self.k - 1] not in "lsz": self.k -= 1 @@ -193,87 +196,133 @@ def _step2(self): """ ch = self.b[self.k - 1] if ch == 'a': - if self._ends("ational"): self._r("ate") - elif self._ends("tional"): self._r("tion") + if self._ends("ational"): + self._r("ate") + elif self._ends("tional"): + self._r("tion") elif ch == 'c': - if self._ends("enci"): self._r("ence") - elif self._ends("anci"): self._r("ance") + if self._ends("enci"): + self._r("ence") + elif self._ends("anci"): + self._r("ance") elif ch == 'e': - if self._ends("izer"): self._r("ize") + if self._ends("izer"): + self._r("ize") elif ch == 'l': - if self._ends("bli"): self._r("ble") # --DEPARTURE-- + if self._ends("bli"): + self._r("ble") # --DEPARTURE-- # To match the published algorithm, replace this phrase with # if self._ends("abli"): self._r("able") - elif self._ends("alli"): self._r("al") - elif self._ends("entli"): self._r("ent") - elif self._ends("eli"): self._r("e") - elif self._ends("ousli"): self._r("ous") + elif self._ends("alli"): + self._r("al") + elif self._ends("entli"): + self._r("ent") + elif self._ends("eli"): + self._r("e") + elif self._ends("ousli"): + self._r("ous") elif ch == 'o': - if self._ends("ization"): self._r("ize") - elif self._ends("ation"): self._r("ate") - elif self._ends("ator"): self._r("ate") + if self._ends("ization"): + self._r("ize") + elif self._ends("ation"): + self._r("ate") + elif self._ends("ator"): + self._r("ate") elif ch == 's': - if self._ends("alism"): self._r("al") - elif self._ends("iveness"): self._r("ive") - elif self._ends("fulness"): self._r("ful") - elif self._ends("ousness"): self._r("ous") + if self._ends("alism"): + self._r("al") + elif self._ends("iveness"): + self._r("ive") + elif self._ends("fulness"): + self._r("ful") + elif self._ends("ousness"): + self._r("ous") elif ch == 't': - if self._ends("aliti"): self._r("al") - elif self._ends("iviti"): self._r("ive") - elif self._ends("biliti"): self._r("ble") - elif ch == 'g': # --DEPARTURE-- - if self._ends("logi"): self._r("log") + if self._ends("aliti"): + self._r("al") + elif 
self._ends("iviti"): + self._r("ive") + elif self._ends("biliti"): + self._r("ble") + elif ch == 'g': # --DEPARTURE-- + if self._ends("logi"): + self._r("log") # To match the published algorithm, delete this phrase def _step3(self): """Deal with -ic-, -full, -ness etc. Similar strategy to _step2.""" ch = self.b[self.k] if ch == 'e': - if self._ends("icate"): self._r("ic") - elif self._ends("ative"): self._r("") - elif self._ends("alize"): self._r("al") + if self._ends("icate"): + self._r("ic") + elif self._ends("ative"): + self._r("") + elif self._ends("alize"): + self._r("al") elif ch == 'i': - if self._ends("iciti"): self._r("ic") + if self._ends("iciti"): + self._r("ic") elif ch == 'l': - if self._ends("ical"): self._r("ic") - elif self._ends("ful"): self._r("") + if self._ends("ical"): + self._r("ic") + elif self._ends("ful"): + self._r("") elif ch == 's': - if self._ends("ness"): self._r("") + if self._ends("ness"): + self._r("") def _step4(self): """_step4() takes off -ant, -ence etc., in context vcvc.""" ch = self.b[self.k - 1] if ch == 'a': - if not self._ends("al"): return + if not self._ends("al"): + return elif ch == 'c': - if not self._ends("ance") and not self._ends("ence"): return + if not self._ends("ance") and not self._ends("ence"): + return elif ch == 'e': - if not self._ends("er"): return + if not self._ends("er"): + return elif ch == 'i': - if not self._ends("ic"): return + if not self._ends("ic"): + return elif ch == 'l': - if not self._ends("able") and not self._ends("ible"): return + if not self._ends("able") and not self._ends("ible"): + return elif ch == 'n': - if self._ends("ant"): pass - elif self._ends("ement"): pass - elif self._ends("ment"): pass - elif self._ends("ent"): pass - else: return + if self._ends("ant"): + pass + elif self._ends("ement"): + pass + elif self._ends("ment"): + pass + elif self._ends("ent"): + pass + else: + return elif ch == 'o': - if self._ends("ion") and self.b[self.j] in "st": pass - elif self._ends("ou"): pass + if self._ends("ion") and self.b[self.j] in "st": + pass + elif self._ends("ou"): + pass # takes care of -ous - else: return + else: + return elif ch == 's': - if not self._ends("ism"): return + if not self._ends("ism"): + return elif ch == 't': - if not self._ends("ate") and not self._ends("iti"): return + if not self._ends("ate") and not self._ends("iti"): + return elif ch == 'u': - if not self._ends("ous"): return + if not self._ends("ous"): + return elif ch == 'v': - if not self._ends("ive"): return + if not self._ends("ive"): + return elif ch == 'z': - if not self._ends("ize"): return + if not self._ends("ize"): + return else: return if self._m() > 1: @@ -295,7 +344,7 @@ def stem(self, w): w = w.lower() k = len(w) - 1 if k <= 1: - return w # --DEPARTURE-- + return w # --DEPARTURE-- # With this line, strings of length 1 or 2 don't go through the # stemming process, although no mention is made of this in the @@ -311,7 +360,7 @@ def stem(self, w): self._step3() self._step4() self._step5() - return self.b[:self.k+1] + return self.b[:self.k + 1] def stem_sentence(self, txt): return " ".join(map(self.stem, txt.split())) diff --git a/gensim/parsing/preprocessing.py b/gensim/parsing/preprocessing.py index 367f0b02ad..a92eb98656 100644 --- a/gensim/parsing/preprocessing.py +++ b/gensim/parsing/preprocessing.py @@ -45,6 +45,8 @@ def remove_stopwords(s): RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE) + + def strip_punctuation(s): s = utils.to_unicode(s) return RE_PUNCT.sub(" ", s) @@ -58,9 +60,11 @@ 
def strip_punctuation(s): RE_TAGS = re.compile(r"<([^>]+)>", re.UNICODE) + + def strip_tags(s): s = utils.to_unicode(s) - return RE_TAGS.sub("",s) + return RE_TAGS.sub("", s) def strip_short(s, minsize=3): @@ -69,18 +73,24 @@ def strip_short(s, minsize=3): RE_NUMERIC = re.compile(r"[0-9]+", re.UNICODE) + + def strip_numeric(s): s = utils.to_unicode(s) return RE_NUMERIC.sub("", s) RE_NONALPHA = re.compile(r"\W", re.UNICODE) + + def strip_non_alphanum(s): s = utils.to_unicode(s) return RE_NONALPHA.sub(" ", s) RE_WHITESPACE = re.compile(r"(\s)+", re.UNICODE) + + def strip_multiple_whitespaces(s): s = utils.to_unicode(s) return RE_WHITESPACE.sub(" ", s) @@ -88,6 +98,8 @@ def strip_multiple_whitespaces(s): RE_AL_NUM = re.compile(r"([a-z]+)([0-9]+)", flags=re.UNICODE) RE_NUM_AL = re.compile(r"([0-9]+)([a-z]+)", flags=re.UNICODE) + + def split_alphanum(s): s = utils.to_unicode(s) s = RE_AL_NUM.sub(r"\1 \2", s) @@ -101,6 +113,8 @@ def stem_text(text): text = utils.to_unicode(text) p = PorterStemmer() return ' '.join(p.stem(word) for word in text.split()) + + stem = stem_text DEFAULT_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation, strip_multiple_whitespaces, diff --git a/gensim/scripts/glove2word2vec.py b/gensim/scripts/glove2word2vec.py index 8d3d1cb02f..4f13de4524 100644 --- a/gensim/scripts/glove2word2vec.py +++ b/gensim/scripts/glove2word2vec.py @@ -19,11 +19,9 @@ import os import sys -import random import logging import argparse -import gensim from smart_open import smart_open logger = logging.getLogger(__name__) @@ -73,4 +71,3 @@ def glove2word2vec(glove_input_file, word2vec_output_file): # do the actual conversion num_lines, num_dims = glove2word2vec(args.input, args.output) logger.info('Converted model with %i vectors and %i dimensions', num_lines, num_dims) - diff --git a/gensim/scripts/make_wiki_online.py b/gensim/scripts/make_wiki_online.py index 26ca5d83ff..66985a566e 100755 --- a/gensim/scripts/make_wiki_online.py +++ b/gensim/scripts/make_wiki_online.py @@ -76,20 +76,20 @@ if online: dictionary = HashDictionary(id_range=keep_words, debug=debug) - dictionary.allow_update = True # start collecting document frequencies + dictionary.allow_update = True # start collecting document frequencies wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary) - MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) + MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) dictionary.save_as_text(outp + '_wordids.txt.bz2') wiki.save(outp + '_corpus.pkl.bz2') dictionary.allow_update = False else: - wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) + wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) # only keep the most frequent words (out of total ~8.2m unique tokens) wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) # save dictionary and bag-of-words (term-document frequency matrix) - MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h + MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2') 
# load back the id->word mapping directly from file # this seems to save more memory, compared to keeping the wiki.dictionary object from above diff --git a/gensim/scripts/make_wiki_online_lemma.py b/gensim/scripts/make_wiki_online_lemma.py index 26ca5d83ff..66985a566e 100755 --- a/gensim/scripts/make_wiki_online_lemma.py +++ b/gensim/scripts/make_wiki_online_lemma.py @@ -76,20 +76,20 @@ if online: dictionary = HashDictionary(id_range=keep_words, debug=debug) - dictionary.allow_update = True # start collecting document frequencies + dictionary.allow_update = True # start collecting document frequencies wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary) - MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) + MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) dictionary.save_as_text(outp + '_wordids.txt.bz2') wiki.save(outp + '_corpus.pkl.bz2') dictionary.allow_update = False else: - wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) + wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) # only keep the most frequent words (out of total ~8.2m unique tokens) wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) # save dictionary and bag-of-words (term-document frequency matrix) - MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h + MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2') # load back the id->word mapping directly from file # this seems to save more memory, compared to keeping the wiki.dictionary object from above diff --git a/gensim/scripts/make_wiki_online_nodebug.py b/gensim/scripts/make_wiki_online_nodebug.py index 26ca5d83ff..66985a566e 100755 --- a/gensim/scripts/make_wiki_online_nodebug.py +++ b/gensim/scripts/make_wiki_online_nodebug.py @@ -76,20 +76,20 @@ if online: dictionary = HashDictionary(id_range=keep_words, debug=debug) - dictionary.allow_update = True # start collecting document frequencies + dictionary.allow_update = True # start collecting document frequencies wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary) - MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) + MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) dictionary.save_as_text(outp + '_wordids.txt.bz2') wiki.save(outp + '_corpus.pkl.bz2') dictionary.allow_update = False else: - wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) + wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) # only keep the most frequent words (out of total ~8.2m unique tokens) wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, 
keep_n=DEFAULT_DICT_SIZE) # save dictionary and bag-of-words (term-document frequency matrix) - MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h + MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2') # load back the id->word mapping directly from file # this seems to save more memory, compared to keeping the wiki.dictionary object from above diff --git a/gensim/scripts/make_wikicorpus.py b/gensim/scripts/make_wikicorpus.py index 26ca5d83ff..66985a566e 100755 --- a/gensim/scripts/make_wikicorpus.py +++ b/gensim/scripts/make_wikicorpus.py @@ -76,20 +76,20 @@ if online: dictionary = HashDictionary(id_range=keep_words, debug=debug) - dictionary.allow_update = True # start collecting document frequencies + dictionary.allow_update = True # start collecting document frequencies wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary) - MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) + MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) dictionary.save_as_text(outp + '_wordids.txt.bz2') wiki.save(outp + '_corpus.pkl.bz2') dictionary.allow_update = False else: - wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) + wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) # only keep the most frequent words (out of total ~8.2m unique tokens) wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) # save dictionary and bag-of-words (term-document frequency matrix) - MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h + MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2') # load back the id->word mapping directly from file # this seems to save more memory, compared to keeping the wiki.dictionary object from above diff --git a/gensim/scripts/word2vec2tensor.py b/gensim/scripts/word2vec2tensor.py index ce2866c5ae..88cab79d25 100644 --- a/gensim/scripts/word2vec2tensor.py +++ b/gensim/scripts/word2vec2tensor.py @@ -15,8 +15,8 @@ The script will create two TSV files. A 2d tensor format file, and a Word Embedding metadata file. Both files will us the --output file name as prefix This script is used to convert the word2vec format to Tensorflow 2D tensor and metadata formats for Embedding Visualization -To use the generated TSV 2D tensor and metadata file in the Projector Visualizer, please -1) Open http://projector.tensorflow.org/. +To use the generated TSV 2D tensor and metadata file in the Projector Visualizer, please +1) Open http://projector.tensorflow.org/. 2) Choose "Load Data" from the left menu. 3) Select "Choose file" in "Load a TSV file of vectors." and choose you local "_tensor.tsv" file 4) Select "Choose file" in "Load a TSV file of metadata." 
and choose you local "_metadata.tsv" file @@ -28,7 +28,6 @@ import os import sys -import random import logging import argparse @@ -36,28 +35,30 @@ logger = logging.getLogger(__name__) -def word2vec2tensor(word2vec_model_path,tensor_filename, binary=False): + +def word2vec2tensor(word2vec_model_path, tensor_filename, binary=False): ''' Convert Word2Vec mode to 2D tensor TSV file and metadata file Args: param1 (str): word2vec model file path param2 (str): filename prefix param2 (bool): set True to use a binary Word2Vec model, defaults to False - ''' + ''' model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_model_path, binary=binary) outfiletsv = tensor_filename + '_tensor.tsv' outfiletsvmeta = tensor_filename + '_metadata.tsv' - + with open(outfiletsv, 'w+') as file_vector: with open(outfiletsvmeta, 'w+') as file_metadata: for word in model.index2word: file_metadata.write(gensim.utils.to_utf8(word) + gensim.utils.to_utf8('\n')) vector_row = '\t'.join(map(str, model[word])) file_vector.write(vector_row + '\n') - + logger.info("2D tensor file saved to %s" % outfiletsv) logger.info("Tensor metadata file saved to %s" % outfiletsvmeta) + if __name__ == "__main__": logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) logging.root.setLevel(level=logging.INFO) @@ -76,8 +77,8 @@ def word2vec2tensor(word2vec_model_path,tensor_filename, binary=False): parser.add_argument( "-o", "--output", required=True, help="Output tensor file name prefix") - parser.add_argument( "-b", "--binary", - required=False, + parser.add_argument("-b", "--binary", + required=False, help="If word2vec model in binary format, set True, else False") args = parser.parse_args() diff --git a/gensim/scripts/word2vec_standalone.py b/gensim/scripts/word2vec_standalone.py index a8ee58645a..52baea6f4c 100644 --- a/gensim/scripts/word2vec_standalone.py +++ b/gensim/scripts/word2vec_standalone.py @@ -55,10 +55,10 @@ import argparse from numpy import seterr -logger = logging.getLogger(__name__) - from gensim.models.word2vec import Word2Vec, LineSentence # avoid referencing __main__ in pickle +logger = logging.getLogger(__name__) + if __name__ == "__main__": logging.basicConfig( diff --git a/gensim/similarities/__init__.py b/gensim/similarities/__init__.py index 9f9e03ee11..48915d89c9 100644 --- a/gensim/similarities/__init__.py +++ b/gensim/similarities/__init__.py @@ -3,4 +3,4 @@ """ # bring classes directly into package namespace, to save some typing -from .docsim import Similarity, MatrixSimilarity, SparseMatrixSimilarity, WmdSimilarity +from .docsim import Similarity, MatrixSimilarity, SparseMatrixSimilarity, WmdSimilarity # noqa:F401 diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py index 2a1a9512ea..efe71159d3 100755 --- a/gensim/similarities/docsim.py +++ b/gensim/similarities/docsim.py @@ -82,6 +82,7 @@ class Shard(utils.SaveLoad): request (query). """ + def __init__(self, fname, index): self.dirname, self.fname = os.path.split(fname) self.length = len(index) @@ -126,7 +127,7 @@ def __getitem__(self, query): try: index.num_best = self.num_best index.normalize = self.normalize - except: + except Exception: raise ValueError("num_best and normalize have to be set before querying a proxy Shard object") return index[query] @@ -149,6 +150,7 @@ class Similarity(interfaces.SimilarityABC): The shards themselves are simply stored as files to disk and mmap'ed back as needed. 
""" + def __init__(self, output_prefix, corpus, num_features, num_best=None, chunksize=256, shardsize=32768, norm='l2'): """ Construct the index from `corpus`. The index can be later extended by calling @@ -456,7 +458,7 @@ def destroy(self): for fname in glob.glob(self.output_prefix + '*'): logger.info("deleting %s", fname) os.remove(fname) -#endclass Similarity +# endclass Similarity class MatrixSimilarity(interfaces.SimilarityABC): @@ -473,6 +475,7 @@ class MatrixSimilarity(interfaces.SimilarityABC): See also `Similarity` and `SparseMatrixSimilarity` in this module. """ + def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunksize=256, corpus_len=None): """ `num_features` is the number of features in the corpus (will be determined @@ -550,7 +553,8 @@ def get_similarities(self, query): def __str__(self): return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.index.shape[1]) -#endclass MatrixSimilarity +# endclass MatrixSimilarity + class WmdSimilarity(interfaces.SimilarityABC): """ @@ -576,6 +580,7 @@ class WmdSimilarity(interfaces.SimilarityABC): >>> query = 'Very good, you should seat outdoor.' >>> sims = instance[query] """ + def __init__(self, corpus, w2v_model, num_best=None, normalize_w2v_and_replace=True, chunksize=256): """ corpus: List of lists of strings, as in gensim.models.word2vec. @@ -618,7 +623,7 @@ def get_similarities(self, query): # Compute similarity for each query. qresult = [self.w2v_model.wmdistance(document, query[qidx]) for document in self.corpus] qresult = numpy.array(qresult) - qresult = 1./(1.+qresult) # Similarity is the negative of the distance. + qresult = 1. / (1. + qresult) # Similarity is the negative of the distance. # Append single query result to list of all results. result.append(qresult) @@ -633,7 +638,8 @@ def get_similarities(self, query): def __str__(self): return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.w2v_model.wv.syn0.shape[1]) -#endclass WmdSimilarity +# endclass WmdSimilarity + class SparseMatrixSimilarity(interfaces.SimilarityABC): """ @@ -652,6 +658,7 @@ class SparseMatrixSimilarity(interfaces.SimilarityABC): See also `Similarity` and `MatrixSimilarity` in this module. """ + def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num_nnz=None, num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False): self.num_best = num_best @@ -729,4 +736,4 @@ def get_similarities(self, query): # otherwise, return a 2d matrix (#queries x #index) result = result.toarray().T return result -#endclass SparseMatrixSimilarity +# endclass SparseMatrixSimilarity diff --git a/gensim/similarities/index.py b/gensim/similarities/index.py index 05db85dfbb..d0ca879225 100644 --- a/gensim/similarities/index.py +++ b/gensim/similarities/index.py @@ -46,7 +46,7 @@ def save(self, fname, protocol=2): _pickle.dump(d, fout, protocol=protocol) def load(self, fname): - fname_dict = fname+'.d' + fname_dict = fname + '.d' if not (os.path.exists(fname) and os.path.exists(fname_dict)): raise IOError( "Can't find index files '%s' and '%s' - Unable to restore AnnoyIndexer state." 
% (fname, fname_dict)) @@ -62,8 +62,7 @@ def build_from_word2vec(self): """Build an Annoy index using word vectors from a Word2Vec model""" self.model.init_sims() - return self._build_from_model(self.model.wv.syn0norm, self.model.wv.index2word - , self.model.vector_size) + return self._build_from_model(self.model.wv.syn0norm, self.model.wv.index2word, self.model.vector_size) def build_from_doc2vec(self): """Build an Annoy index using document vectors from a Doc2Vec model""" diff --git a/gensim/summarization/__init__.py b/gensim/summarization/__init__.py index c7efb84d4a..6bca1f109a 100644 --- a/gensim/summarization/__init__.py +++ b/gensim/summarization/__init__.py @@ -1,4 +1,4 @@ # bring model classes directly into package namespace, to save some typing -from .summarizer import summarize, summarize_corpus -from .keywords import keywords +from .summarizer import summarize, summarize_corpus # noqa:F401 +from .keywords import keywords # noqa:F401 diff --git a/gensim/test/basetests.py b/gensim/test/basetests.py index 1d22d8b1a8..a22bfe2d30 100644 --- a/gensim/test/basetests.py +++ b/gensim/test/basetests.py @@ -17,14 +17,14 @@ def testPrintTopic(self): topics = self.model.show_topics(formatted=True) for topic_no, topic in topics: self.assertTrue(isinstance(topic_no, int)) - self.assertTrue(isinstance(topic, str) or isinstance(topic, unicode)) + self.assertTrue(isinstance(topic, str) or isinstance(topic, unicode)) # noqa:F821 def testPrintTopics(self): topics = self.model.print_topics() for topic_no, topic in topics: self.assertTrue(isinstance(topic_no, int)) - self.assertTrue(isinstance(topic, str) or isinstance(topic, unicode)) + self.assertTrue(isinstance(topic, str) or isinstance(topic, unicode)) # noqa:F821 def testShowTopic(self): topic = self.model.show_topic(1) diff --git a/gensim/test/simspeed.py b/gensim/test/simspeed.py index c3ced33a55..4c1ffcab6f 100755 --- a/gensim/test/simspeed.py +++ b/gensim/test/simspeed.py @@ -21,7 +21,6 @@ from time import time import numpy as np -import scipy.sparse import gensim @@ -66,7 +65,7 @@ sims.extend(sim) else: sims = [index_dense[vec] for vec in query] - assert len(sims) == len(query) # make sure we have one result for each query document + assert len(sims) == len(query) # make sure we have one result for each query document taken = time() - start queries = math.ceil(1.0 * len(query) / chunksize) logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)" % @@ -85,7 +84,7 @@ sims.extend(sim) else: sims = [index_sparse[vec] for vec in query] - assert len(sims) == len(query) # make sure we have one result for each query document + assert len(sims) == len(query) # make sure we have one result for each query document taken = time() - start queries = math.ceil(1.0 * len(query) / chunksize) logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)" % diff --git a/gensim/test/simspeed2.py b/gensim/test/simspeed2.py index 158fa3fb9e..334730a6f1 100755 --- a/gensim/test/simspeed2.py +++ b/gensim/test/simspeed2.py @@ -20,9 +20,6 @@ import math from time import time -import numpy as np -import scipy.sparse - import gensim diff --git a/gensim/test/svd_error.py b/gensim/test/svd_error.py index 33b20a017a..4f204c1147 100755 --- a/gensim/test/svd_error.py +++ b/gensim/test/svd_error.py @@ -21,7 +21,9 @@ from __future__ import print_function, with_statement import logging -import os, sys, time +import os +import sys +import time import bz2 import itertools @@ -36,22 +38,21 @@ # no SVDLIBC: install with `easy_install sparsesvd` if you want 
SVDLIBC results as well sparsesvd = None -sparsesvd = None # don't use SVDLIBC +sparsesvd = None # don't use SVDLIBC -FACTORS = [300] # which num_topics to try -CHUNKSIZE = [10000, 1000] # which chunksize to try -POWER_ITERS = [0, 1, 2, 4, 6] # extra power iterations for the randomized algo +FACTORS = [300] # which num_topics to try +CHUNKSIZE = [10000, 1000] # which chunksize to try +POWER_ITERS = [0, 1, 2, 4, 6] # extra power iterations for the randomized algo # when reporting reconstruction error, also report spectral norm error? (very slow) COMPUTE_NORM2 = False - def norm2(a): """Spectral norm ("norm 2") of a symmetric matrix `a`.""" if COMPUTE_NORM2: logging.info("computing spectral norm of a %s matrix" % str(a.shape)) - return scipy.linalg.eigvalsh(a).max() # much faster than np.linalg.norm(2) + return scipy.linalg.eigvalsh(a).max() # much faster than np.linalg.norm(2) else: return np.nan @@ -64,7 +65,7 @@ def print_error(name, aat, u, s, ideal_nf, ideal_n2): err = -np.dot(u, np.dot(np.diag(s), u.T)) err += aat nf, n2 = np.linalg.norm(err), norm2(err) - print ('%s error: norm_frobenius=%f (/ideal=%g), norm2=%f (/ideal=%g), RMSE=%g' % + print('%s error: norm_frobenius=%f (/ideal=%g), norm2=%f (/ideal=%g), RMSE=%g' % (name, nf, nf / ideal_nf, n2, n2 / ideal_n2, rmse(err))) sys.stdout.flush() @@ -79,7 +80,6 @@ def __iter__(self): yield [(f, w) for f, w in doc if f < self.max_terms] - if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info("running %s" % " ".join(sys.argv)) @@ -109,7 +109,7 @@ def __iter__(self): corpus = ClippedCorpus(mm, n, m) id2word = gensim.utils.FakeDict(m) - logging.info("computing corpus * corpus^T") # eigenvalues of this matrix are singular values of `corpus`, squared + logging.info("computing corpus * corpus^T") # eigenvalues of this matrix are singular values of `corpus`, squared aat = np.zeros((m, m), dtype=np.float64) for chunk in gensim.utils.grouper(corpus, chunksize=5000): num_nnz = sum(len(doc) for doc in chunk) @@ -122,11 +122,10 @@ def __iter__(self): logging.info("computing full decomposition of corpus * corpus^t") aat = aat.astype(np.float32) spectrum_s, spectrum_u = scipy.linalg.eigh(aat) - spectrum_s = spectrum_s[::-1] # re-order to descending eigenvalue order + spectrum_s = spectrum_s[::-1] # re-order to descending eigenvalue order spectrum_u = spectrum_u.T[::-1].T np.save(fname + '.spectrum.npy', spectrum_s) - for factors in FACTORS: err = -np.dot(spectrum_u[:, :factors], np.dot(np.diag(spectrum_s[:factors]), spectrum_u[:, :factors].T)) err += aat @@ -145,7 +144,7 @@ def __iter__(self): taken = time.time() - taken del corpus_ram del vt - u, s = ut.T.astype(np.float32), s.astype(np.float32)**2 # convert singular values to eigenvalues + u, s = ut.T.astype(np.float32), s.astype(np.float32)**2 # convert singular values to eigenvalues del ut print("SVDLIBC SVD for %i factors took %s s (spectrum %f .. %f)" % (factors, taken, s[0], s[-1])) @@ -162,7 +161,7 @@ def __iter__(self): taken = time.time() - taken u, s = model.projection.u.astype(np.float32), model.projection.s.astype(np.float32)**2 del model - print ("incremental SVD for %i factors, %i power iterations, chunksize %i took %s s (spectrum %f .. %f)" % + print("incremental SVD for %i factors, %i power iterations, chunksize %i took %s s (spectrum %f .. 
%f)" % (factors, power_iters, chunksize, taken, s[0], s[-1])) print_error('incremental SVD', aat, u, s, ideal_fro, ideal_n2) del u @@ -174,7 +173,7 @@ def __iter__(self): taken = time.time() - taken u, s = model.projection.u.astype(np.float32), model.projection.s.astype(np.float32)**2 del model - print ("multipass SVD for %i factors, %i power iterations took %s s (spectrum %f .. %f)" % + print("multipass SVD for %i factors, %i power iterations took %s s (spectrum %f .. %f)" % (factors, power_iters, taken, s[0], s[-1])) print_error('multipass SVD', aat, u, s, ideal_fro, ideal_n2) del u diff --git a/gensim/test/test_aggregation.py b/gensim/test/test_aggregation.py index 44e3d16f65..5f09c30ccd 100644 --- a/gensim/test/test_aggregation.py +++ b/gensim/test/test_aggregation.py @@ -13,6 +13,7 @@ from gensim.topic_coherence import aggregation + class TestAggregation(unittest.TestCase): def setUp(self): self.confirmed_measures = [1.1, 2.2, 3.3, 4.4] @@ -23,6 +24,7 @@ def testArithmeticMean(self): expected = 2.75 self.assertEqual(obtained, expected) + if __name__ == '__main__': logging.root.setLevel(logging.WARNING) unittest.main() diff --git a/gensim/test/test_atmodel.py b/gensim/test/test_atmodel.py index d2625f6ede..17cf1619f9 100644 --- a/gensim/test/test_atmodel.py +++ b/gensim/test/test_atmodel.py @@ -22,7 +22,6 @@ import six import numpy as np -import scipy.linalg from gensim.corpora import mmcorpus, Dictionary from gensim.models import atmodel @@ -483,10 +482,10 @@ def testLargeMmap(self): model = self.model # simulate storing large arrays separately - model.save(testfile(), sep_limit=0) + model.save(fname, sep_limit=0) # test loading the large model arrays with mmap - model2 = self.class_.load(testfile(), mmap='r') + model2 = self.class_.load(fname, mmap='r') self.assertEqual(model.num_topics, model2.num_topics) self.assertTrue(isinstance(model2.expElogbeta, np.memmap)) self.assertTrue(np.allclose(model.expElogbeta, model2.expElogbeta)) diff --git a/gensim/test/test_big.py b/gensim/test/test_big.py index ea3cc7adde..5e6972bd1f 100644 --- a/gensim/test/test_big.py +++ b/gensim/test/test_big.py @@ -12,7 +12,6 @@ import logging import unittest import os -import itertools import tempfile import numpy as np @@ -27,6 +26,7 @@ def testfile(): class BigCorpus(object): """A corpus of a large number of docs & large vocab""" + def __init__(self, words_only=False, num_terms=200000, num_docs=1000000, doc_len=100): self.dictionary = gensim.utils.FakeDict(num_terms) self.words_only = words_only @@ -47,6 +47,7 @@ def __iter__(self): if os.environ.get('GENSIM_BIG', False): class TestLargeData(unittest.TestCase): """Try common operations, using large models. 
You'll need ~8GB RAM to run these tests""" + def testWord2Vec(self): corpus = BigCorpus(words_only=True, num_docs=100000, num_terms=3000000, doc_len=200) model = gensim.models.Word2Vec(corpus, size=300, workers=4) diff --git a/gensim/test/test_corpora_hashdictionary.py b/gensim/test/test_corpora_hashdictionary.py index be19db8e39..6f314d65ee 100644 --- a/gensim/test/test_corpora_hashdictionary.py +++ b/gensim/test/test_corpora_hashdictionary.py @@ -101,7 +101,6 @@ def testDebugMode(self): expected = {} self.assertEqual(d.id2token, expected) - def testRange(self): # all words map to the same id d = HashDictionary(self.texts, id_range=1, debug=True) @@ -121,10 +120,9 @@ def testRange(self): self.assertEqual(d.id2token, id2token) self.assertEqual(d.token2id, token2id) - def testBuild(self): d = HashDictionary(self.texts, myhash=zlib.adler32) - expected = {5232: 2, + expected = {5232: 2, 5798: 3, 10608: 2, 12466: 2, @@ -178,7 +176,6 @@ def test_saveAsTextBz2(self): self.assertEqual(len(d), len(d2)) - if __name__ == '__main__': logging.basicConfig(level=logging.WARNING) unittest.main() diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index 1265edc2b7..ad4eb4c976 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -22,7 +22,7 @@ import numpy as np -from gensim import utils, matutils +from gensim import utils from gensim.models import doc2vec, keyedvectors module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder @@ -41,6 +41,7 @@ def __iter__(self): for i, line in enumerate(f): yield doc2vec.TaggedDocument(utils.simple_preprocess(line), [self._tag(i)]) + list_corpus = list(DocsLeeCorpus()) raw_sentences = [ @@ -62,13 +63,15 @@ def testfile(): # temporary data will be stored to this file return os.path.join(tempfile.gettempdir(), 'gensim_doc2vec.tst') + def load_on_instance(): # Save and load a Doc2Vec Model on instance for test model = doc2vec.Doc2Vec(DocsLeeCorpus(), min_count=1) model.save(testfile()) - model = doc2vec.Doc2Vec() # should fail at this point + model = doc2vec.Doc2Vec() # should fail at this point return model.load(testfile()) + class TestDoc2VecModel(unittest.TestCase): def test_persistence(self): """Test storing/loading the entire model.""" @@ -378,7 +381,7 @@ def testTrainWarning(self, l): def testLoadOnClassError(self): """Test if exception is raised when loading doc2vec model on instance""" self.assertRaises(AttributeError, load_on_instance) -#endclass TestDoc2VecModel +# endclass TestDoc2VecModel if not hasattr(TestDoc2VecModel, 'assertLess'): @@ -397,6 +400,7 @@ class ConcatenatedDoc2Vec(object): Models must have exactly-matching vocabulary and document IDs. (Models should be trained separately; this wrapper just returns concatenated results.) 
""" + def __init__(self, models): self.models = models if hasattr(models[0], 'docvecs'): diff --git a/gensim/test/test_dtm.py b/gensim/test/test_dtm.py index bd99136332..81b48374ab 100644 --- a/gensim/test/test_dtm.py +++ b/gensim/test/test_dtm.py @@ -69,6 +69,7 @@ def testCalledProcessError(self): id2word=self.id2word, model='dtm', initialize_lda=False, rng_seed=1) + if __name__ == '__main__': logging.basicConfig(level=logging.DEBUG) unittest.main() diff --git a/gensim/test/test_fasttext_wrapper.py b/gensim/test/test_fasttext_wrapper.py index bf6ac7db98..6a7a7b09ed 100644 --- a/gensim/test/test_fasttext_wrapper.py +++ b/gensim/test/test_fasttext_wrapper.py @@ -225,7 +225,7 @@ def testLoadModelWithNonAsciiVocab(self): model = fasttext.FastText.load_fasttext_format(datapath('non_ascii_fasttext')) self.assertTrue(u'který' in model) try: - vector = model[u'který'] + vector = model[u'který'] # noqa:F841 except UnicodeDecodeError: self.fail('Unable to access vector for utf8 encoded non-ascii word') @@ -234,7 +234,7 @@ def testLoadModelNonUtf8Encoding(self): model = fasttext.FastText.load_fasttext_format(datapath('cp852_fasttext'), encoding='cp852') self.assertTrue(u'který' in model) try: - vector = model[u'který'] + vector = model[u'který'] # noqa:F841 except KeyError: self.fail('Unable to access vector for cp-852 word') diff --git a/gensim/test/test_glove2word2vec.py b/gensim/test/test_glove2word2vec.py index 1e83e93154..b638ca927d 100644 --- a/gensim/test/test_glove2word2vec.py +++ b/gensim/test/test_glove2word2vec.py @@ -9,7 +9,6 @@ import logging import unittest import os -import sys import tempfile import numpy @@ -17,31 +16,33 @@ from gensim.utils import check_output -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) + def testfile(): # temporary model will be stored to this file return os.path.join(tempfile.gettempdir(), 'glove2word2vec.test') + class TestGlove2Word2Vec(unittest.TestCase): def setUp(self): self.datapath = datapath('test_glove.txt') self.output_file = testfile() def testConversion(self): - output = check_output(args=['python', '-m', 'gensim.scripts.glove2word2vec', '--input', self.datapath, '--output', self.output_file]) + output = check_output(args=['python', '-m', 'gensim.scripts.glove2word2vec', '--input', self.datapath, '--output', self.output_file]) # noqa:F841 # test that the converted model loads successfully try: self.test_model = gensim.models.KeyedVectors.load_word2vec_format(self.output_file) self.assertTrue(numpy.allclose(self.test_model.n_similarity(['the', 'and'], ['and', 'the']), 1.0)) - except: + except Exception: if os.path.isfile(os.path.join(self.output_file)): self.fail('model file %s was created but could not be loaded.' % self.output_file) else: self.fail('model file %s creation failed, check the parameters and input file format.' 
% self.output_file) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() - diff --git a/gensim/test/test_hdpmodel.py b/gensim/test/test_hdpmodel.py index e2b543687c..647b31ad7e 100644 --- a/gensim/test/test_hdpmodel.py +++ b/gensim/test/test_hdpmodel.py @@ -15,17 +15,13 @@ import os.path import tempfile -import six -import scipy.linalg - from gensim.corpora import mmcorpus, Dictionary from gensim.models import hdpmodel -from gensim import matutils from gensim.test import basetests import numpy as np -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) @@ -63,8 +59,8 @@ def testTopicValues(self): prob, word = results[1].split('+')[0].split('*') self.assertEqual(results[0], 0) self.assertEqual(prob, expected_prob) - self.assertEqual(word, expected_word) - + self.assertEqual(word, expected_word) + return def testLDAmodel(self): diff --git a/gensim/test/test_keras_integration.py b/gensim/test/test_keras_integration.py index ced285c77c..7f321c6565 100644 --- a/gensim/test/test_keras_integration.py +++ b/gensim/test/test_keras_integration.py @@ -112,7 +112,7 @@ def testEmbeddingLayer20NewsGroup(self): texts.append(sentence) texts_w2v.append(sentence.split(' ')) labels.append(label_id) - except: + except Exception: None # Vectorize the text samples into a 2D integer tensor diff --git a/gensim/test/test_keywords.py b/gensim/test/test_keywords.py index 952ba2fadd..8a3be3af5c 100644 --- a/gensim/test/test_keywords.py +++ b/gensim/test/test_keywords.py @@ -17,7 +17,6 @@ import unittest from gensim import utils -from gensim.corpora import Dictionary from gensim.summarization import keywords @@ -75,7 +74,6 @@ def test_text_summarization_raises_exception_on_short_input_text(self): self.assertTrue(keywords(text) is not None) - def test_keywords_ratio(self): pre_path = os.path.join(os.path.dirname(__file__), 'test_data') @@ -89,7 +87,7 @@ def test_keywords_ratio(self): selected_docs_12 = keywords(text, ratio=0.1, split=True) selected_docs_21 = keywords(text, ratio=0.2, split=True) - self.assertAlmostEqual(float(len(selected_docs_21))/len(selected_docs_12), float(21)/12, places=1) + self.assertAlmostEqual(float(len(selected_docs_21)) / len(selected_docs_12), float(21) / 12, places=1) if __name__ == '__main__': diff --git a/gensim/test/test_ldamallet_wrapper.py b/gensim/test/test_ldamallet_wrapper.py index 9fec0e8605..a7d64839bb 100644 --- a/gensim/test/test_ldamallet_wrapper.py +++ b/gensim/test/test_ldamallet_wrapper.py @@ -15,10 +15,7 @@ import os.path import tempfile -import six import numpy as np -import scipy.linalg - from gensim.corpora import mmcorpus, Dictionary from gensim.models.wrappers import ldamallet @@ -26,7 +23,7 @@ from gensim.models import ldamodel from gensim.test import basetests -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) # set up vars used in testing ("Deerwester" from the web tutorial) @@ -44,11 +41,11 @@ corpus = [dictionary.doc2bow(text) for text in texts] - def testfile(): # temporary data will be stored to this file return 
os.path.join(tempfile.gettempdir(), 'gensim_models.tst') + class TestLdaMallet(unittest.TestCase, basetests.TestBaseTopicModel): def setUp(self): mallet_home = os.environ.get('MALLET_HOME', None) @@ -60,40 +57,38 @@ def setUp(self): # self.model is used in TestBaseTopicModel self.model = ldamallet.LdaMallet(self.mallet_path, corpus, id2word=dictionary, num_topics=2, iterations=1) - def testTransform(self): if not self.mallet_path: return passed = False - for i in range(5): # restart at most 5 times + for i in range(5): # restart at most 5 times # create the transformation model model = ldamallet.LdaMallet(self.mallet_path, corpus, id2word=dictionary, num_topics=2, iterations=200) # transform one document doc = list(corpus)[0] transformed = model[doc] - vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests + vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests expected = [0.49, 0.51] - passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1) # must contain the same values, up to re-ordering + passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1) # must contain the same values, up to re-ordering if passed: break logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)" % (i, sorted(vec), sorted(expected))) self.assertTrue(passed) - def testSparseTransform(self): if not self.mallet_path: return passed = False - for i in range(5): # restart at most 5 times + for i in range(5): # restart at most 5 times # create the sparse transformation model with the appropriate topic_threshold model = ldamallet.LdaMallet(self.mallet_path, corpus, id2word=dictionary, num_topics=2, iterations=200, topic_threshold=0.5) # transform one document doc = list(corpus)[0] transformed = model[doc] - vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests + vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests expected = [1.0, 0.0] - passed = np.allclose(sorted(vec), sorted(expected), atol=1e-2) # must contain the same values, up to re-ordering + passed = np.allclose(sorted(vec), sorted(expected), atol=1e-2) # must contain the same values, up to re-ordering if passed: break logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)" % @@ -103,7 +98,7 @@ def testSparseTransform(self): def testMallet2Model(self): if not self.mallet_path: return - passed = False + tm1 = ldamallet.LdaMallet(self.mallet_path, corpus=corpus, num_topics=2, id2word=dictionary) tm2 = ldamallet.malletmodel2ldamodel(tm1) for document in corpus: @@ -129,7 +124,7 @@ def testPersistence(self): self.assertEqual(model.num_topics, model2.num_topics) self.assertTrue(np.allclose(model.word_topics, model2.word_topics)) tstvec = [] - self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector def testPersistenceCompressed(self): if not self.mallet_path: @@ -141,7 +136,7 @@ def testPersistenceCompressed(self): self.assertEqual(model.num_topics, model2.num_topics) self.assertTrue(np.allclose(model.word_topics, model2.word_topics)) tstvec = [] - self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector def testLargeMmap(self): if not self.mallet_path: @@ -150,15 +145,15 @@ def 
testLargeMmap(self): model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100) # simulate storing large arrays separately - model.save(testfile(), sep_limit=0) + model.save(fname, sep_limit=0) # test loading the large model arrays with mmap - model2 = ldamodel.LdaModel.load(testfile(), mmap='r') + model2 = ldamodel.LdaModel.load(fname, mmap='r') self.assertEqual(model.num_topics, model2.num_topics) self.assertTrue(isinstance(model2.word_topics, np.memmap)) self.assertTrue(np.allclose(model.word_topics, model2.word_topics)) tstvec = [] - self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector def testLargeMmapCompressed(self): if not self.mallet_path: @@ -171,9 +166,9 @@ def testLargeMmapCompressed(self): # test loading the large model arrays with mmap self.assertRaises(IOError, ldamodel.LdaModel.load, fname, mmap='r') -#endclass TestLdaMallet +# endclass TestLdaMallet + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() - diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py index 22d9ba530a..71cc10f70c 100644 --- a/gensim/test/test_ldamodel.py +++ b/gensim/test/test_ldamodel.py @@ -18,14 +18,13 @@ import six import numpy as np -import scipy.linalg from gensim.corpora import mmcorpus, Dictionary from gensim.models import ldamodel, ldamulticore from gensim import matutils, utils from gensim.test import basetests -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) @@ -61,13 +60,12 @@ def setUp(self): self.class_ = ldamodel.LdaModel self.model = self.class_(corpus, id2word=dictionary, num_topics=2, passes=100) - def testTransform(self): passed = False # sometimes, LDA training gets stuck at a local minimum # in that case try re-training the model from scratch, hoping for a # better random initialization - for i in range(25): # restart at most 5 times + for i in range(25): # restart at most 5 times # create the transformation model model = self.class_(id2word=dictionary, num_topics=2, passes=100) model.update(self.corpus) @@ -76,9 +74,9 @@ def testTransform(self): doc = list(corpus)[0] transformed = model[doc] - vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests + vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests expected = [0.13, 0.87] - passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1) # must contain the same values, up to re-ordering + passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1) # must contain the same values, up to re-ordering if passed: break logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)" % @@ -146,7 +144,6 @@ def testAlpha(self): kwargs['alpha'] = "gensim is cool" self.assertRaises(ValueError, self.class_, **kwargs) - def testEtaAuto(self): model1 = self.class_(corpus, id2word=dictionary, eta='symmetric', passes=10) modelauto = self.class_(corpus, id2word=dictionary, eta='auto', passes=10) @@ -193,7 +190,7 @@ def testEta(self): self.assertEqual(model.eta.shape, expected_shape) self.assertTrue(all(model.eta == np.array([0.3] * num_terms))) - # 
should be ok with num_topics x num_terms + # should be ok with num_topics x num_terms testeta = np.array([[0.5] * len(dictionary)] * 2) kwargs['eta'] = testeta self.class_(**kwargs) @@ -234,7 +231,7 @@ def testGetTopicTerms(self): def testGetDocumentTopics(self): - model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes= 100, random_state=np.random.seed(0)) + model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes=100, random_state=np.random.seed(0)) doc_topics = model.get_document_topics(self.corpus) @@ -244,26 +241,26 @@ def testGetDocumentTopics(self): self.assertTrue(isinstance(k, int)) self.assertTrue(isinstance(v, float)) - #Test case to use the get_document_topic function for the corpus + # Test case to use the get_document_topic function for the corpus all_topics = model.get_document_topics(self.corpus, per_word_topics=True) self.assertEqual(model.state.numdocs, len(corpus)) for topic in all_topics: self.assertTrue(isinstance(topic, tuple)) - for k, v in topic[0]: # list of doc_topics + for k, v in topic[0]: # list of doc_topics self.assertTrue(isinstance(k, int)) self.assertTrue(isinstance(v, float)) - for w, topic_list in topic[1]: # list of word_topics + for w, topic_list in topic[1]: # list of word_topics self.assertTrue(isinstance(w, int)) self.assertTrue(isinstance(topic_list, list)) - for w, phi_values in topic[2]: # list of word_phis + for w, phi_values in topic[2]: # list of word_phis self.assertTrue(isinstance(w, int)) self.assertTrue(isinstance(phi_values, list)) - #Test case to check the filtering effect of minimum_probability and minimum_phi_value + # Test case to check the filtering effect of minimum_probability and minimum_phi_value doc_topic_count_na = 0 word_phi_count_na = 0 @@ -273,25 +270,24 @@ def testGetDocumentTopics(self): for topic in all_topics: self.assertTrue(isinstance(topic, tuple)) - for k, v in topic[0]: # list of doc_topics + for k, v in topic[0]: # list of doc_topics self.assertTrue(isinstance(k, int)) self.assertTrue(isinstance(v, float)) if len(topic[0]) != 0: doc_topic_count_na += 1 - for w, topic_list in topic[1]: # list of word_topics + for w, topic_list in topic[1]: # list of word_topics self.assertTrue(isinstance(w, int)) self.assertTrue(isinstance(topic_list, list)) - for w, phi_values in topic[2]: # list of word_phis + for w, phi_values in topic[2]: # list of word_phis self.assertTrue(isinstance(w, int)) self.assertTrue(isinstance(phi_values, list)) if len(phi_values) != 0: word_phi_count_na += 1 self.assertTrue(model.state.numdocs > doc_topic_count_na) - self.assertTrue( sum([len(i) for i in corpus]) > word_phi_count_na) - + self.assertTrue(sum([len(i) for i in corpus]) > word_phi_count_na) doc_topics, word_topics, word_phis = model.get_document_topics(self.corpus[1], per_word_topics=True) @@ -309,8 +305,9 @@ def testGetDocumentTopics(self): # word_topics looks like this: ({word_id => [topic_id_most_probable, topic_id_second_most_probable, ...]). # we check one case in word_topics, i.e of the first word in the doc, and it's likely topics. 
- expected_word = 0 + # FIXME: Fails on osx and win + # expected_word = 0 # self.assertEqual(word_topics[0][0], expected_word) # self.assertTrue(0 in word_topics[0][1]) @@ -325,9 +322,8 @@ def testTermTopics(self): self.assertTrue(isinstance(probability, float)) # checks if topic '1' is in the result list - # FIXME: Fails on osx and win - # self.assertTrue(1 in result[0]) - + # FIXME: Fails on osx and win + # self.assertTrue(1 in result[0]) # if user has entered word instead, check with word result = model.get_term_topics(str(model.id2word[2])) @@ -336,9 +332,8 @@ def testTermTopics(self): self.assertTrue(isinstance(probability, float)) # checks if topic '1' is in the result list - # FIXME: Fails on osx and win - # self.assertTrue(1 in result[0]) - + # FIXME: Fails on osx and win + # self.assertTrue(1 in result[0]) def testPasses(self): # long message includes the original error message with a custom one @@ -407,7 +402,7 @@ def testPersistence(self): self.assertEqual(model.num_topics, model2.num_topics) self.assertTrue(np.allclose(model.expElogbeta, model2.expElogbeta)) tstvec = [] - self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector def testModelCompatibilityWithPythonVersions(self): fname_model_2_7 = datapath('ldamodel_python_2_7') @@ -417,12 +412,11 @@ def testModelCompatibilityWithPythonVersions(self): self.assertEqual(model_2_7.num_topics, model_3_5.num_topics) self.assertTrue(np.allclose(model_2_7.expElogbeta, model_3_5.expElogbeta)) tstvec = [] - self.assertTrue(np.allclose(model_2_7[tstvec], model_3_5[tstvec])) # try projecting an empty vector - id2word_2_7 = dict((k,v) for k,v in model_2_7.id2word.iteritems()) - id2word_3_5 = dict((k,v) for k,v in model_3_5.id2word.iteritems()) + self.assertTrue(np.allclose(model_2_7[tstvec], model_3_5[tstvec])) # try projecting an empty vector + id2word_2_7 = dict((k, v) for k, v in model_2_7.id2word.iteritems()) + id2word_3_5 = dict((k, v) for k, v in model_3_5.id2word.iteritems()) self.assertEqual(set(id2word_2_7.keys()), set(id2word_3_5.keys())) - def testPersistenceIgnore(self): fname = testfile('testPersistenceIgnore') model = ldamodel.LdaModel(self.corpus, num_topics=2) @@ -442,22 +436,22 @@ def testPersistenceCompressed(self): self.assertEqual(model.num_topics, model2.num_topics) self.assertTrue(np.allclose(model.expElogbeta, model2.expElogbeta)) tstvec = [] - self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector def testLargeMmap(self): fname = testfile() model = self.model # simulate storing large arrays separately - model.save(testfile(), sep_limit=0) + model.save(fname, sep_limit=0) # test loading the large model arrays with mmap - model2 = self.class_.load(testfile(), mmap='r') + model2 = self.class_.load(fname, mmap='r') self.assertEqual(model.num_topics, model2.num_topics) self.assertTrue(isinstance(model2.expElogbeta, np.memmap)) self.assertTrue(np.allclose(model.expElogbeta, model2.expElogbeta)) tstvec = [] - self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector def testLargeMmapCompressed(self): fname = testfile() + '.gz' @@ -493,7 +487,7 @@ def testRandomStateBackwardCompatibility(self): self.assertTrue(isinstance(i[0], int)) 
self.assertTrue(isinstance(i[1], six.string_types)) -#endclass TestLdaModel +# endclass TestLdaModel class TestLdaMulticore(TestLdaModel): @@ -507,7 +501,7 @@ def testAlphaAuto(self): self.assertRaises(RuntimeError, self.class_, alpha='auto') -#endclass TestLdaMulticore +# endclass TestLdaMulticore if __name__ == '__main__': diff --git a/gensim/test/test_ldaseqmodel.py b/gensim/test/test_ldaseqmodel.py index d2924921e6..4ca7b104fc 100644 --- a/gensim/test/test_ldaseqmodel.py +++ b/gensim/test/test_ldaseqmodel.py @@ -5,19 +5,19 @@ """ import numpy as np # for arrays, array broadcasting etc. -from gensim.models import ldaseqmodel, ldamodel +from gensim.models import ldaseqmodel from gensim.corpora import Dictionary import os.path import unittest import logging -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data/DTM', fname) class TestLdaSeq(unittest.TestCase): - # we are setting up a DTM model and fitting it, and checking topic-word and doc-topic results. + # we are setting up a DTM model and fitting it, and checking topic-word and doc-topic results. def setUp(self): texts = [ [u'senior', u'studios', u'studios', u'studios', u'creators', u'award', u'mobile', u'currently', u'challenges', u'senior', u'summary', u'senior', u'motivated', u'creative', u'senior'], @@ -56,7 +56,7 @@ def setUp(self): sstats = np.loadtxt(datapath('sstats_test.txt')) dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] - self.ldaseq = ldaseqmodel.LdaSeqModel(corpus = corpus , id2word= dictionary, num_topics=2, time_slice=[10, 10, 11], initialize='own', sstats=sstats) + self.ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus, id2word=dictionary, num_topics=2, time_slice=[10, 10, 11], initialize='own', sstats=sstats) # testing topic word proportions def testTopicWord(self): @@ -72,6 +72,7 @@ def testDocTopic(self): expected_doc_topic = 0.00066577896138482028 self.assertAlmostEqual(doc_topic[0], expected_doc_topic, places=2) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() diff --git a/gensim/test/test_ldavowpalwabbit_wrapper.py b/gensim/test/test_ldavowpalwabbit_wrapper.py index 74e3591f4f..3cf6f9f6bb 100644 --- a/gensim/test/test_ldavowpalwabbit_wrapper.py +++ b/gensim/test/test_ldavowpalwabbit_wrapper.py @@ -27,7 +27,7 @@ from gensim.models.wrappers.ldavowpalwabbit import LdaVowpalWabbit -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) @@ -69,7 +69,7 @@ def setUp(self): def test_save_load(self): """Test loading/saving LdaVowpalWabbit model.""" - if not self.vw_path: # for python 2.6 + if not self.vw_path: # for python 2.6 return lda = LdaVowpalWabbit(self.vw_path, corpus=self.corpus, @@ -104,7 +104,7 @@ def test_save_load(self): def test_model_update(self): """Test updating existing LdaVowpalWabbit model.""" - if not self.vw_path: # for python 2.6 + if not self.vw_path: # for python 2.6 return lda = LdaVowpalWabbit(self.vw_path, corpus=[self.corpus[0]], @@ -124,7 +124,7 @@ def test_model_update(self): def test_perplexity(self): """Test LdaVowpalWabbit 
perplexity is within expected range.""" - if not self.vw_path: # for python 2.6 + if not self.vw_path: # for python 2.6 return lda = LdaVowpalWabbit(self.vw_path, corpus=self.corpus, @@ -144,7 +144,7 @@ def test_perplexity(self): def test_topic_coherence(self): """Test LdaVowpalWabbit topic coherence.""" - if not self.vw_path: # for python 2.6 + if not self.vw_path: # for python 2.6 return corpus, dictionary = get_corpus() lda = LdaVowpalWabbit(self.vw_path, @@ -196,7 +196,7 @@ def test_topic_coherence(self): def test_corpus_to_vw(self): """Test corpus to Vowpal Wabbit format conversion.""" - if not self.vw_path: # for python 2.6 + if not self.vw_path: # for python 2.6 return corpus = [[(0, 5), (7, 1), (5, 3), (0, 2)], [(7, 2), (2, 1), (3, 11)], diff --git a/gensim/test/test_lee.py b/gensim/test/test_lee.py index 7a6dbab498..b51101c8b1 100644 --- a/gensim/test/test_lee.py +++ b/gensim/test/test_lee.py @@ -65,7 +65,6 @@ def setUp(self): sim_m_size = np.shape(sim_matrix)[0] human_sim_vector = sim_matrix[matutils.triu_indices(sim_m_size, 1)] - def test_corpus(self): """availability and integrity of corpus""" documents_in_bg_corpus = 300 @@ -75,7 +74,6 @@ def test_corpus(self): self.assertEqual(len(corpus), documents_in_corpus) self.assertEqual(len(human_sim_vector), len_sim_vector) - def test_lee(self): """correlation with human data > 0.6 (this is the value which was achieved in the original paper) @@ -108,7 +106,6 @@ def test_lee(self): logging.info("LSI correlation coefficient is %s" % cor) self.assertTrue(cor > 0.6) - # def test_lee_mallet(self): # global bg_corpus, corpus, bg_corpus2, corpus2 @@ -134,7 +131,6 @@ def test_lee(self): # self.assertTrue(cor > 0.35) - if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() diff --git a/gensim/test/test_logentropy_model.py b/gensim/test/test_logentropy_model.py index 08d4e6e713..07f982dad9 100644 --- a/gensim/test/test_logentropy_model.py +++ b/gensim/test/test_logentropy_model.py @@ -15,15 +15,12 @@ import os.path import tempfile -import six import numpy as np -import scipy.linalg from gensim.corpora import mmcorpus, Dictionary from gensim.models import logentropy_model -from gensim import matutils -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) @@ -51,7 +48,6 @@ def setUp(self): self.corpus_small = mmcorpus.MmCorpus(datapath('test_corpus_small.mm')) self.corpus_ok = mmcorpus.MmCorpus(datapath('test_corpus_ok.mm')) - def testTransform(self): # create the transformation model model = logentropy_model.LogEntropyModel(self.corpus_ok, normalize=False) @@ -65,7 +61,6 @@ def testTransform(self): (3, 1.20941755462856)] self.assertTrue(np.allclose(transformed, expected)) - def testPersistence(self): fname = testfile() model = logentropy_model.LogEntropyModel(self.corpus_ok, normalize=True) @@ -83,7 +78,7 @@ def testPersistenceCompressed(self): self.assertTrue(model.entr == model2.entr) tstvec = [] self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) -#endclass TestLogEntropyModel +# endclass TestLogEntropyModel if __name__ == '__main__': diff --git a/gensim/test/test_lsimodel.py b/gensim/test/test_lsimodel.py index 012ac6b8f5..e2c32bda66 100644 --- a/gensim/test/test_lsimodel.py +++ b/gensim/test/test_lsimodel.py @@ -76,11 +76,11 @@ 
def testCorpusTransform(self): model = self.model got = np.vstack(matutils.sparse2full(doc, 2) for doc in model[self.corpus]) expected = np.array([ - [0.65946639, 0.14211544], + [0.65946639, 0.14211544], [2.02454305, -0.42088759], - [1.54655361, 0.32358921], - [1.81114125, 0.5890525 ], - [0.9336738 , -0.27138939], + [1.54655361, 0.32358921], + [1.81114125, 0.5890525], + [0.9336738, -0.27138939], [0.01274618, -0.49016181], [0.04888203, -1.11294699], [0.08063836, -1.56345594], diff --git a/gensim/test/test_miislita.py b/gensim/test/test_miislita.py index e698cae444..dd660f629f 100644 --- a/gensim/test/test_miislita.py +++ b/gensim/test/test_miislita.py @@ -71,7 +71,6 @@ def test_textcorpus(self): miislita2 = corpora.MmCorpus(ftmp) self.assertEqual(list(miislita), list(miislita2)) - def test_save_load_ability(self): """ Make sure we can save and load (un/pickle) TextCorpus objects (as long @@ -90,7 +89,6 @@ def test_save_load_ability(self): self.assertEqual(len(miislita), len(miislita2)) self.assertEqual(miislita.dictionary.token2id, miislita2.dictionary.token2id) - def test_miislita_high_level(self): # construct corpus from file miislita = CorpusMiislita(datapath('miIslita.cor')) diff --git a/gensim/test/test_normmodel.py b/gensim/test/test_normmodel.py index 8d1315e75d..77221b4a4d 100644 --- a/gensim/test/test_normmodel.py +++ b/gensim/test/test_normmodel.py @@ -22,7 +22,7 @@ from gensim.corpora import mmcorpus from gensim.models import normmodel -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) @@ -136,7 +136,7 @@ def testPersistence(self): model2 = normmodel.NormModel.load(fname) self.assertTrue(model.norms == model2.norms) tstvec = [] - self.assertTrue(np.allclose(model.normalize(tstvec), model2.normalize(tstvec))) # try projecting an empty vector + self.assertTrue(np.allclose(model.normalize(tstvec), model2.normalize(tstvec))) # try projecting an empty vector def testPersistenceCompressed(self): fname = testfile() + '.gz' @@ -145,7 +145,7 @@ def testPersistenceCompressed(self): model2 = normmodel.NormModel.load(fname, mmap=None) self.assertTrue(model.norms == model2.norms) tstvec = [] - self.assertTrue(np.allclose(model.normalize(tstvec), model2.normalize(tstvec))) # try projecting an empty vector + self.assertTrue(np.allclose(model.normalize(tstvec), model2.normalize(tstvec))) # try projecting an empty vector if __name__ == '__main__': diff --git a/gensim/test/test_parsing.py b/gensim/test/test_parsing.py index 35dc9efabf..fd0429e9f6 100644 --- a/gensim/test/test_parsing.py +++ b/gensim/test/test_parsing.py @@ -8,8 +8,7 @@ import logging import unittest import numpy as np - -from gensim.parsing.preprocessing import * +from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation2, strip_tags, strip_short, strip_numeric, strip_non_alphanum, strip_multiple_whitespaces, split_alphanum, stem_text # several documents @@ -43,7 +42,6 @@ classes = np.array([[1, 0], [1, 0], [0, 1], [0, 1]]) - class TestPreprocessing(unittest.TestCase): def testStripNumeric(self): diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index 688f92dbd0..5397d6e4c3 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -33,7 +33,7 @@ ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey'], - ['graph', 
'minors', 'survey','human','interface'] #test bigrams within same sentence + ['graph', 'minors', 'survey', 'human', 'interface'] # test bigrams within same sentence ] unicode_sentences = [[utils.to_unicode(w) for w in sentence] for sentence in sentences] @@ -44,6 +44,7 @@ def gen_sentences(): class TestPhrasesCommon(unittest.TestCase): """ Tests that need to be run for both Prases and Phraser classes.""" + def setUp(self): self.bigram = Phrases(sentences, min_count=1, threshold=1) self.bigram_default = Phrases(sentences) @@ -195,11 +196,12 @@ def testPruning(self): """Test that max_vocab_size parameter is respected.""" bigram = Phrases(sentences, max_vocab_size=5) self.assertTrue(len(bigram.vocab) <= 5) -#endclass TestPhrasesModel +# endclass TestPhrasesModel class TestPhraserModel(TestPhrasesCommon): """ Test Phraser models.""" + def setUp(self): """Set up Phraser models for the tests.""" bigram_phrases = Phrases(sentences, min_count=1, threshold=1) diff --git a/gensim/test/test_rpmodel.py b/gensim/test/test_rpmodel.py index 2abbc745dd..2de5dd6546 100644 --- a/gensim/test/test_rpmodel.py +++ b/gensim/test/test_rpmodel.py @@ -15,15 +15,13 @@ import os.path import tempfile -import six import numpy as np -import scipy.linalg from gensim.corpora import mmcorpus, Dictionary from gensim.models import rpmodel from gensim import matutils -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) @@ -46,24 +44,22 @@ def testfile(): return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') - class TestRpModel(unittest.TestCase): def setUp(self): self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) def testTransform(self): # create the transformation model - np.random.seed(13) # HACK; set fixed seed so that we always get the same random matrix (and can compare against expected results) + np.random.seed(13) # HACK; set fixed seed so that we always get the same random matrix (and can compare against expected results) model = rpmodel.RpModel(self.corpus, num_topics=2) # transform one document doc = list(self.corpus)[0] transformed = model[doc] - vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests + vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests expected = np.array([-0.70710677, 0.70710677]) - self.assertTrue(np.allclose(vec, expected)) # transformed entries must be equal up to sign - + self.assertTrue(np.allclose(vec, expected)) # transformed entries must be equal up to sign def testPersistence(self): fname = testfile() @@ -73,7 +69,7 @@ def testPersistence(self): self.assertEqual(model.num_topics, model2.num_topics) self.assertTrue(np.allclose(model.projection, model2.projection)) tstvec = [] - self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector def testPersistenceCompressed(self): fname = testfile() + '.gz' @@ -83,8 +79,8 @@ def testPersistenceCompressed(self): self.assertEqual(model.num_topics, model2.num_topics) self.assertTrue(np.allclose(model.projection, model2.projection)) tstvec = [] - self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector -#endclass TestRpModel + 
self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector +# endclass TestRpModel if __name__ == '__main__': diff --git a/gensim/test/test_segmentation.py b/gensim/test/test_segmentation.py index c437ca8fa8..a4c9356a26 100644 --- a/gensim/test/test_segmentation.py +++ b/gensim/test/test_segmentation.py @@ -49,6 +49,7 @@ def testSOneSet(self): self.assertEqual(actual[s_i][j][0], expected[s_i][j][0]) self.assertTrue(np.allclose(actual[s_i][j][1], expected[s_i][j][1])) + if __name__ == '__main__': logging.root.setLevel(logging.WARNING) unittest.main() diff --git a/gensim/test/test_sharded_corpus.py b/gensim/test/test_sharded_corpus.py index d50f618384..871048ea4e 100644 --- a/gensim/test/test_sharded_corpus.py +++ b/gensim/test/test_sharded_corpus.py @@ -18,10 +18,8 @@ ############################################################################# - class TestShardedCorpus(unittest.TestCase): - # @classmethod # def setUpClass(cls): # cls.dim = 1000 @@ -73,7 +71,7 @@ def test_load(self): def test_getitem(self): - _ = self.corpus[130] + _ = self.corpus[130] # noqa:F841 # Does retrieving the item load the correct shard? self.assertEqual(self.corpus.current_shard_n, 1) @@ -83,13 +81,13 @@ def test_getitem(self): self.assertEqual(self.corpus.current_shard_n, 2) for i in xrange(220, 227): - self.assertTrue(np.array_equal(self.corpus[i], item[i-220])) + self.assertTrue(np.array_equal(self.corpus[i], item[i - 220])) def test_sparse_serialization(self): no_exception = True try: - dataset = ShardedCorpus(self.tmp_fname, self.data, shardsize=100, + dataset = ShardedCorpus(self.tmp_fname, self.data, shardsize=100, # noqa:F841 dim=self.dim, sparse_serialization=True) except Exception: no_exception = False @@ -236,7 +234,6 @@ def test_getitem_dense2gensim(self): i, j, str(ilist[i][j]), i, j, str(dslice[i][j]))) - iscorp, _ = is_corpus(ilist) self.assertTrue(iscorp, "Is the object returned by list notation " "a gensim corpus?") @@ -257,6 +254,7 @@ def test_resize(self): ############################################################################## + if __name__ == '__main__': suite = unittest.TestSuite() loader = unittest.TestLoader() diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py index a7a59c77d3..9ce5263df7 100644 --- a/gensim/test/test_similarities.py +++ b/gensim/test/test_similarities.py @@ -17,21 +17,21 @@ import numpy import scipy -from gensim.corpora import mmcorpus, Dictionary +from gensim.corpora import Dictionary from gensim.models import word2vec from gensim.models import doc2vec from gensim.models import KeyedVectors from gensim.models.wrappers import fasttext -from gensim import matutils, utils, similarities +from gensim import matutils, similarities from gensim.models import Word2Vec try: - from pyemd import emd + from pyemd import emd # noqa:F401 PYEMD_EXT = True except ImportError: PYEMD_EXT = False -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) @@ -61,6 +61,7 @@ class _TestSimilarityABC(object): """ Base class for SparseMatrixSimilarity and MatrixSimilarity unit tests. 
""" + def testFull(self, num_best=None, shardsize=100): if self.cls == similarities.Similarity: index = self.cls(None, corpus, num_features=len(dictionary), shardsize=shardsize) @@ -83,18 +84,17 @@ def testFull(self, num_best=None, shardsize=100): index.num_best = num_best query = corpus[0] sims = index[query] - expected = [(0, 0.99999994), (2, 0.28867513), (3, 0.23570226), (1, 0.23570226)][ : num_best] + expected = [(0, 0.99999994), (2, 0.28867513), (3, 0.23570226), (1, 0.23570226)][: num_best] # convert sims to full numpy arrays, so we can use allclose() and ignore # ordering of items with the same similarity value expected = matutils.sparse2full(expected, len(index)) - if num_best is not None: # when num_best is None, sims is already a numpy array + if num_best is not None: # when num_best is None, sims is already a numpy array sims = matutils.sparse2full(sims, len(index)) self.assertTrue(numpy.allclose(expected, sims)) if self.cls == similarities.Similarity: index.destroy() - def testNumBest(self): if self.cls == similarities.WmdSimilarity and not PYEMD_EXT: @@ -126,7 +126,6 @@ def test_scipy2scipy_clipped(self): self.assertTrue(scipy.sparse.issparse(matrix_scipy_clipped)) self.assertTrue([matutils.scipy2sparse(x) for x in matrix_scipy_clipped], [expected] * 3) - def testChunking(self): if self.cls == similarities.Similarity: index = self.cls(None, corpus, num_features=len(dictionary), shardsize=5) @@ -135,9 +134,9 @@ def testChunking(self): query = corpus[:3] sims = index[query] expected = numpy.array([ - [0.99999994, 0.23570226, 0.28867513, 0.23570226, 0.0, 0.0, 0.0, 0.0, 0.0 ], - [0.23570226, 1.0, 0.40824831, 0.33333334, 0.70710677, 0.0, 0.0, 0.0, 0.23570226 ], - [0.28867513, 0.40824831, 1.0, 0.61237246, 0.28867513, 0.0, 0.0, 0.0, 0.0 ] + [0.99999994, 0.23570226, 0.28867513, 0.23570226, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.23570226, 1.0, 0.40824831, 0.33333334, 0.70710677, 0.0, 0.0, 0.0, 0.23570226], + [0.28867513, 0.40824831, 1.0, 0.61237246, 0.28867513, 0.0, 0.0, 0.0, 0.0] ], dtype=numpy.float32) self.assertTrue(numpy.allclose(expected, sims)) @@ -151,7 +150,6 @@ def testChunking(self): if self.cls == similarities.Similarity: index.destroy() - def testIter(self): if self.cls == similarities.Similarity: index = self.cls(None, corpus, num_features=len(dictionary), shardsize=5) @@ -159,21 +157,20 @@ def testIter(self): index = self.cls(corpus, num_features=len(dictionary)) sims = [sim for sim in index] expected = numpy.array([ - [ 0.99999994, 0.23570226, 0.28867513, 0.23570226, 0.0, 0.0, 0.0, 0.0, 0.0 ], - [ 0.23570226, 1.0, 0.40824831, 0.33333334, 0.70710677, 0.0, 0.0, 0.0, 0.23570226 ], - [ 0.28867513, 0.40824831, 1.0, 0.61237246, 0.28867513, 0.0, 0.0, 0.0, 0.0 ], - [ 0.23570226, 0.33333334, 0.61237246, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0 ], - [ 0.0, 0.70710677, 0.28867513, 0.0, 0.99999994, 0.0, 0.0, 0.0, 0.0 ], - [ 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.70710677, 0.57735026, 0.0 ], - [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.70710677, 0.99999994, 0.81649655, 0.40824828 ], - [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.57735026, 0.81649655, 0.99999994, 0.66666663 ], - [ 0.0, 0.23570226, 0.0, 0.0, 0.0, 0.0, 0.40824828, 0.66666663, 0.99999994 ] + [0.99999994, 0.23570226, 0.28867513, 0.23570226, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.23570226, 1.0, 0.40824831, 0.33333334, 0.70710677, 0.0, 0.0, 0.0, 0.23570226], + [0.28867513, 0.40824831, 1.0, 0.61237246, 0.28867513, 0.0, 0.0, 0.0, 0.0], + [0.23570226, 0.33333334, 0.61237246, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.70710677, 0.28867513, 0.0, 0.99999994, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 
0.0, 0.0, 0.0, 1.0, 0.70710677, 0.57735026, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.70710677, 0.99999994, 0.81649655, 0.40824828], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.57735026, 0.81649655, 0.99999994, 0.66666663], + [0.0, 0.23570226, 0.0, 0.0, 0.0, 0.0, 0.40824828, 0.66666663, 0.99999994] ], dtype=numpy.float32) self.assertTrue(numpy.allclose(expected, sims)) if self.cls == similarities.Similarity: index.destroy() - def testPersistency(self): if self.cls == similarities.WmdSimilarity and not PYEMD_EXT: return @@ -278,7 +275,6 @@ def testLargeCompressed(self): self.assertTrue(numpy.allclose(index.index, index2.index)) self.assertEqual(index.num_best, index2.num_best) - def testMmap(self): if self.cls == similarities.WmdSimilarity and not PYEMD_EXT: return @@ -324,10 +320,12 @@ def testMmapCompressed(self): # same thing, but use mmap to load arrays self.assertRaises(IOError, self.cls.load, fname, mmap='r') + class TestMatrixSimilarity(unittest.TestCase, _TestSimilarityABC): def setUp(self): self.cls = similarities.MatrixSimilarity + class TestWmdSimilarity(unittest.TestCase, _TestSimilarityABC): def setUp(self): self.cls = similarities.WmdSimilarity @@ -437,7 +435,6 @@ def testMaintainSparsityWithNumBest(self): self.assertEqual(dense_topn_sims, [matutils.scipy2sparse(v) for v in scipy_topn_sims]) - class TestSimilarity(unittest.TestCase, _TestSimilarityABC): def setUp(self): self.cls = similarities.Similarity @@ -450,7 +447,7 @@ def testSharding(self): def testReopen(self): """test re-opening partially full shards""" index = similarities.Similarity(None, corpus[:5], num_features=len(dictionary), shardsize=9) - _ = index[corpus[0]] # forces shard close + _ = index[corpus[0]] # noqa:F841 forces shard close index.add_documents(corpus[5:]) query = corpus[0] sims = index[query] @@ -464,7 +461,6 @@ def testMmapCompressed(self): # turns out this test doesn't exercise this because there are no arrays # to be mmaped! 
- def testChunksize(self): index = self.cls(None, corpus, num_features=len(dictionary), shardsize=5) expected = [sim for sim in index] @@ -478,7 +474,7 @@ class TestWord2VecAnnoyIndexer(unittest.TestCase): def setUp(self): try: - import annoy + import annoy # noqa:F401 except ImportError: raise unittest.SkipTest("Annoy library is not available") @@ -565,11 +561,12 @@ def assertLoadedIndexEqual(self, index, model): self.assertEqual(index.labels, index2.labels) self.assertEqual(index.num_trees, index2.num_trees) + class TestDoc2VecAnnoyIndexer(unittest.TestCase): def setUp(self): try: - import annoy + import annoy # noqa:F401 except ImportError: raise unittest.SkipTest("Annoy library is not available") diff --git a/gensim/test/test_similarity_metrics.py b/gensim/test/test_similarity_metrics.py index 858e388e77..23bf0e60f7 100644 --- a/gensim/test/test_similarity_metrics.py +++ b/gensim/test/test_similarity_metrics.py @@ -20,7 +20,7 @@ from gensim.corpora import mmcorpus, Dictionary from gensim.models import ldamodel -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) # set up vars used in testing ("Deerwester" from the web tutorial) @@ -37,7 +37,6 @@ corpus = [dictionary.doc2bow(text) for text in texts] - class TestIsBow(unittest.TestCase): def test_None(self): # test None @@ -64,7 +63,7 @@ def test_bow(self): potentialbow = [] result = matutils.isbow(potentialbow) expected = True - self.assertEqual(expected, result) + self.assertEqual(expected, result) # checking corpus; should return false potentialbow = [[(2, 1), (3, 1), (4, 1), (5, 1), (1, 1), (7, 1)]] @@ -85,11 +84,12 @@ def test_bow(self): self.assertEqual(expected, result) # checking np array format bag of words - potentialbow = np.array([[1, 0.4], [0, 0.2],[2, 0.2]]) + potentialbow = np.array([[1, 0.4], [0, 0.2], [2, 0.2]]) result = matutils.isbow(potentialbow) expected = True self.assertEqual(expected, result) + class TestHellinger(unittest.TestCase): def setUp(self): self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) @@ -128,7 +128,6 @@ def test_distributions(self): expected = 0.185241936534 self.assertAlmostEqual(expected, result) - # checking ndarray, csr_matrix as inputs vec_1 = np.array([[1, 0.3], [0, 0.4], [2, 0.3]]) vec_2 = csr_matrix([[1, 0.4], [0, 0.2], [2, 0.2]]) @@ -143,15 +142,16 @@ def test_distributions(self): expected = 0.309742984153 self.assertAlmostEqual(expected, result) - # testing LDA distribution vectors + # testing LDA distribution vectors np.random.seed(0) - model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes= 100) + model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes=100) lda_vec1 = model[[(1, 2), (2, 3)]] lda_vec2 = model[[(2, 2), (1, 3)]] result = matutils.hellinger(lda_vec1, lda_vec2) expected = 1.0406845281146034e-06 self.assertAlmostEqual(expected, result) + class TestKL(unittest.TestCase): def setUp(self): self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) @@ -212,20 +212,21 @@ def test_distributions(self): # testing LDA distribution vectors np.random.seed(0) - model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes= 100) + model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes=100) lda_vec1 = model[[(1, 2), (2, 3)]] lda_vec2 = model[[(2, 2), (1, 3)]] result = 
matutils.kullback_leibler(lda_vec1, lda_vec2) expected = 4.283407e-12 self.assertAlmostEqual(expected, result, places=5) + class TestJaccard(unittest.TestCase): def test_inputs(self): # all empty inputs will give a divide by zero exception vec_1 = [] vec_2 = [] - self.assertRaises(ZeroDivisionError, matutils.jaccard , vec_1, vec_2) + self.assertRaises(ZeroDivisionError, matutils.jaccard, vec_1, vec_2) def test_distributions(self): @@ -250,6 +251,7 @@ def test_distributions(self): expected = 1 - 0.333333333333 self.assertAlmostEqual(expected, result) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index 997b3d9fb1..ade44e900a 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -966,5 +966,6 @@ def testModelNotFitted(self): phrases_transformer = PhrasesTransformer() self.assertRaises(NotFittedError, phrases_transformer.transform, phrases_sentences[0]) + if __name__ == '__main__': unittest.main() diff --git a/gensim/test/test_summarization.py b/gensim/test/test_summarization.py index a1ba6f223a..ce44c5f0a3 100644 --- a/gensim/test/test_summarization.py +++ b/gensim/test/test_summarization.py @@ -87,7 +87,7 @@ def test_text_summarization_raises_exception_on_short_input_text(self): text = "\n".join(text.split('\n')[:8]) self.assertTrue(summarize(text) is not None) - + def test_text_summarization_returns_input_on_single_input_sentence(self): pre_path = os.path.join(os.path.dirname(__file__), 'test_data') @@ -107,7 +107,7 @@ def test_corpus_summarization_raises_exception_on_short_input_text(self): # Keeps the first 8 sentences to make the text shorter. sentences = text.split('\n')[:8] - + # Generate the corpus. tokens = [sentence.split() for sentence in sentences] dictionary = Dictionary(tokens) @@ -185,6 +185,7 @@ def test_low_distinct_words_corpus_summarization_is_none(self): self.assertTrue(summarize_corpus(corpus) is None) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() diff --git a/gensim/test/test_tfidfmodel.py b/gensim/test/test_tfidfmodel.py index e7dc62e960..bb00b5482d 100644 --- a/gensim/test/test_tfidfmodel.py +++ b/gensim/test/test_tfidfmodel.py @@ -15,15 +15,12 @@ import os.path import tempfile -import six import numpy as np -import scipy.linalg from gensim.corpora import mmcorpus, Dictionary from gensim.models import tfidfmodel -from gensim import matutils -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) @@ -46,7 +43,6 @@ def testfile(): return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') - class TestTfidfModel(unittest.TestCase): def setUp(self): self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) @@ -62,7 +58,6 @@ def testTransform(self): expected = [(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)] self.assertTrue(np.allclose(transformed, expected)) - def testInit(self): # create the transformation model by analyzing a corpus # uses the global `corpus`! 
@@ -77,7 +72,6 @@ def testInit(self): model2 = tfidfmodel.TfidfModel(dictionary=dictionary) self.assertEqual(model1.idfs, model2.idfs) - def testPersistence(self): fname = testfile() model = tfidfmodel.TfidfModel(self.corpus, normalize=True) @@ -85,7 +79,7 @@ def testPersistence(self): model2 = tfidfmodel.TfidfModel.load(fname) self.assertTrue(model.idfs == model2.idfs) tstvec = [] - self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector def testPersistenceCompressed(self): fname = testfile() + '.gz' @@ -94,9 +88,8 @@ def testPersistenceCompressed(self): model2 = tfidfmodel.TfidfModel.load(fname, mmap=None) self.assertTrue(model.idfs == model2.idfs) tstvec = [] - self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector -#endclass TestTfidfModel - + self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector +# endclass TestTfidfModel if __name__ == '__main__': diff --git a/gensim/test/test_utils.py b/gensim/test/test_utils.py index 612d55dd68..ae94b5c2ad 100644 --- a/gensim/test/test_utils.py +++ b/gensim/test/test_utils.py @@ -83,14 +83,15 @@ def test_decode_entities(self): expected = u'It\x92s the Year of the Horse. YES VIN DIESEL \U0001f64c \U0001f4af' self.assertEquals(utils.decode_htmlentities(body), expected) + class TestSampleDict(unittest.TestCase): def test_sample_dict(self): - d = {1:2,2:3,3:4,4:5} - expected_dict = [(1,2),(2,3)] - expected_dict_random = [(k,v) for k,v in iteritems(d)] - sampled_dict = utils.sample_dict(d,2,False) - self.assertEqual(sampled_dict,expected_dict) - sampled_dict_random = utils.sample_dict(d,2) + d = {1: 2, 2: 3, 3: 4, 4: 5} + expected_dict = [(1, 2), (2, 3)] + expected_dict_random = [(k, v) for k, v in iteritems(d)] + sampled_dict = utils.sample_dict(d, 2, False) + self.assertEqual(sampled_dict, expected_dict) + sampled_dict_random = utils.sample_dict(d, 2) if sampled_dict_random in expected_dict_random: self.assertTrue(True) diff --git a/gensim/test/test_varembed_wrapper.py b/gensim/test/test_varembed_wrapper.py index 5b83bbf50b..2053f7ffc3 100644 --- a/gensim/test/test_varembed_wrapper.py +++ b/gensim/test/test_varembed_wrapper.py @@ -59,6 +59,7 @@ def testAddMorphemesToEmbeddings(self): self.model_sanity(model_with_morphemes) # Check syn0 is different for both models. 
self.assertFalse(np.allclose(model.syn0, model_with_morphemes.syn0)) + def testLookup(self): """Test lookup of vector for a particular word and list""" model = varembed.VarEmbed.load_varembed_format(vectors=varembed_model_vector_file) diff --git a/gensim/test/test_wikicorpus.py b/gensim/test/test_wikicorpus.py index d8b8b721c9..9bdbcbdb8d 100644 --- a/gensim/test/test_wikicorpus.py +++ b/gensim/test/test_wikicorpus.py @@ -10,8 +10,6 @@ import os -import sys -import types import logging import unittest @@ -49,9 +47,9 @@ def test_first_element(self): """ wc = WikiCorpus(datapath(FILENAME), processes=1) - l = wc.get_texts() - self.assertTrue(u'anarchism' in next(l)) - self.assertTrue(u'autism' in next(l)) + texts = wc.get_texts() + self.assertTrue(u'anarchism' in next(texts)) + self.assertTrue(u'autism' in next(texts)) def test_unicode_element(self): """ @@ -60,8 +58,8 @@ def test_unicode_element(self): """ wc = WikiCorpus(datapath(FILENAME_U), processes=1) - l = wc.get_texts() - self.assertTrue(u'папа' in next(l)) + texts = wc.get_texts() + self.assertTrue(u'папа' in next(texts)) if __name__ == '__main__': diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index bd8b9c7668..29ae713b90 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -13,25 +13,22 @@ import unittest import os import tempfile -import itertools import bz2 import sys import numpy as np -from gensim import utils, matutils -from gensim.utils import check_output -from subprocess import PIPE +from gensim import utils from gensim.models import word2vec, keyedvectors from testfixtures import log_capture try: - from pyemd import emd + from pyemd import emd # noqa:F401 PYEMD_EXT = True except ImportError: PYEMD_EXT = False -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) @@ -41,6 +38,7 @@ def __iter__(self): for line in f: yield utils.simple_preprocess(line) + list_corpus = list(LeeCorpus()) sentences = [ @@ -64,22 +62,27 @@ def __iter__(self): ['artificial', 'intelligence', 'system'] ] + def testfile(): # temporary data will be stored to this file return os.path.join(tempfile.gettempdir(), 'gensim_word2vec.tst') + def _rule(word, count, min_count): if word == "human": return utils.RULE_DISCARD # throw out else: return utils.RULE_DEFAULT # apply default rule, i.e. 
min_count + + def load_on_instance(): # Save and load a Word2Vec Model on instance for test model = word2vec.Word2Vec(sentences, min_count=1) model.save(testfile()) - model = word2vec.Word2Vec() # should fail at this point + model = word2vec.Word2Vec() # should fail at this point return model.load(testfile()) + class TestWord2VecModel(unittest.TestCase): def testOnlineLearning(self): """Test that the algorithm is able to add new words to the @@ -106,7 +109,6 @@ def testOnlineLearningAfterSave(self): model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter) self.assertEqual(len(model_neg.wv.vocab), 14) - def onlineSanity(self, model): terro, others = [], [] for l in list_corpus: @@ -201,7 +203,7 @@ def testSyn0NormNotSaved(self): def testLoadPreKeyedVectorModel(self): """Test loading pre-KeyedVectors word2vec model""" - if sys.version_info[:2] == (3,4): + if sys.version_info[:2] == (3, 4): model_file_suffix = '_py3_4' elif sys.version_info < (3,): model_file_suffix = '_py2' @@ -251,7 +253,6 @@ def testNoTrainingCFormat(self): binary_model.wv = kv self.assertRaises(ValueError, binary_model.train, sentences) - def testTooShortBinaryWord2VecFormat(self): tfile = testfile() model = word2vec.Word2Vec(sentences, min_count=1) @@ -303,7 +304,6 @@ def testPersistenceKeyedVectorsFormatWithVocab(self): kv_binary_model_with_vocab = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), testvocab, binary=True) self.assertEqual(model.wv.vocab['human'].count, kv_binary_model_with_vocab.vocab['human'].count) - def testPersistenceWord2VecFormatCombinationWithStandardPersistence(self): """Test storing/loading the entire model and vocabulary in word2vec format chained with saving and loading via `save` and `load` methods`. @@ -316,7 +316,6 @@ def testPersistenceWord2VecFormatCombinationWithStandardPersistence(self): binary_model_with_vocab_kv.save(testfile()) self.assertRaises(AttributeError, word2vec.Word2Vec.load, testfile()) - def testLargeMmap(self): """Test storing/loading the entire model.""" model = word2vec.Word2Vec(sentences, min_count=1) @@ -352,7 +351,7 @@ def testVocab(self): self.assertRaises(RuntimeError, word2vec.Word2Vec, []) # input not empty, but rather completely filtered out - self.assertRaises(RuntimeError, word2vec.Word2Vec, corpus, min_count=total_words+1) + self.assertRaises(RuntimeError, word2vec.Word2Vec, corpus, min_count=total_words + 1) def testTraining(self): """Test word2vec training.""" @@ -573,7 +572,7 @@ def testParallel(self): for workers in [2, 4]: model = word2vec.Word2Vec(corpus, workers=workers) - sims = model.most_similar('israeli') + sims = model.most_similar('israeli') # noqa:F841 # the exact vectors and therefore similarities may differ, due to different thread collisions/randomization # so let's test only for top3 # TODO: commented out for now; find a more robust way to compare against "gold standard" @@ -622,16 +621,16 @@ def testNormalizeAfterTrainingData(self): def testPredictOutputWord(self): '''Test word2vec predict_output_word method handling for negative sampling scheme''' - #under normal circumstances + # under normal circumstances model_with_neg = word2vec.Word2Vec(sentences, min_count=1) predictions_with_neg = model_with_neg.predict_output_word(['system', 'human'], topn=5) - self.assertTrue(len(predictions_with_neg)==5) + self.assertTrue(len(predictions_with_neg) == 5) - #out-of-vobaculary scenario + # out-of-vobaculary scenario predictions_out_of_vocab = model_with_neg.predict_output_word(['some', 'random', 'words'], 
topn=5) self.assertEqual(predictions_out_of_vocab, None) - #when required model parameters have been deleted + # when required model parameters have been deleted model_with_neg.init_sims() model_with_neg.wv.save_word2vec_format(testfile(), binary=True) kv_model_with_neg = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True) @@ -639,7 +638,7 @@ def testPredictOutputWord(self): binary_model_with_neg.wv = kv_model_with_neg self.assertRaises(RuntimeError, binary_model_with_neg.predict_output_word, ['system', 'human']) - #negative sampling scheme not used + # negative sampling scheme not used model_without_neg = word2vec.Word2Vec(sentences, min_count=1, negative=0) self.assertRaises(RuntimeError, model_without_neg.predict_output_word, ['system', 'human']) @@ -679,6 +678,7 @@ def test_train_with_explicit_param(self): with self.assertRaises(ValueError): model.train(sentences) + def test_sentences_should_not_be_a_generator(self): """ Is sentences a generator object? @@ -706,7 +706,7 @@ def test_compute_training_loss(self): self.assertTrue(training_loss_val > 0.0) -#endclass TestWord2VecModel +# endclass TestWord2VecModel class TestWMD(unittest.TestCase): def testNonzero(self): @@ -791,7 +791,7 @@ def testPathLineSentencesOneFile(self): self.assertEqual(words, utils.to_unicode(orig.readline()).split()) -#endclass TestWord2VecSentenceIterators +# endclass TestWord2VecSentenceIterators # TODO: get correct path to Python binary # class TestWord2VecScripts(unittest.TestCase): diff --git a/gensim/test/test_wordrank_wrapper.py b/gensim/test/test_wordrank_wrapper.py index 5b8260fdf2..8f8d5b5f9d 100644 --- a/gensim/test/test_wordrank_wrapper.py +++ b/gensim/test/test_wordrank_wrapper.py @@ -18,13 +18,15 @@ from gensim.models.wrappers import wordrank -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) + def testfile(): # temporary model will be stored to this file return os.path.join(tempfile.gettempdir(), 'gensim_wordrank.test') + class TestWordrank(unittest.TestCase): def setUp(self): wr_home = os.environ.get('WR_HOME', None) @@ -42,7 +44,7 @@ def testLoadWordrankFormat(self): vocab_size, dim = 76, 50 self.assertEqual(model.syn0.shape, (vocab_size, dim)) self.assertEqual(len(model.vocab), vocab_size) - os.remove(self.wr_file+'.w2vformat') + os.remove(self.wr_file + '.w2vformat') def testEnsemble(self): """Test ensemble of two embeddings""" @@ -50,7 +52,7 @@ def testEnsemble(self): return new_emb = self.test_model.ensemble_embedding(self.wr_file, self.wr_file) self.assertEqual(new_emb.shape, (76, 50)) - os.remove(self.wr_file+'.w2vformat') + os.remove(self.wr_file + '.w2vformat') def testPersistence(self): """Test storing/loading the entire model""" @@ -77,8 +79,7 @@ def models_equal(self, model, model2): self.assertEqual(set(model.vocab.keys()), set(model2.vocab.keys())) self.assertTrue(numpy.allclose(model.syn0, model2.syn0)) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() - - \ No newline at end of file diff --git a/gensim/topic_coherence/aggregation.py b/gensim/topic_coherence/aggregation.py index 341834c92f..065943a28f 100644 --- a/gensim/topic_coherence/aggregation.py +++ b/gensim/topic_coherence/aggregation.py @@ -14,6 +14,7 @@ logger = logging.getLogger(__name__) 
+ def arithmetic_mean(confirmed_measures): """ This functoin performs the arithmetic mean aggregation on the output obtained from diff --git a/gensim/topic_coherence/segmentation.py b/gensim/topic_coherence/segmentation.py index 4845a26859..9097036914 100644 --- a/gensim/topic_coherence/segmentation.py +++ b/gensim/topic_coherence/segmentation.py @@ -9,10 +9,10 @@ """ import logging -import numpy as np logger = logging.getLogger(__name__) + def s_one_pre(topics): """ This function performs s_one_pre segmentation on a list of topics. @@ -40,6 +40,7 @@ def s_one_pre(topics): return s_one_pre + def s_one_one(topics): """ This function performs s_one_one segmentation on a list of topics. @@ -70,6 +71,7 @@ def s_one_one(topics): return s_one_one + def s_one_set(topics): """ This function performs s_one_set segmentation on a list of topics. diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 351f15a932..7305fe9792 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -438,7 +438,7 @@ def run(self): logger.info( "%s interrupted after processing %d documents", self.__class__.__name__, self.accumulator.num_docs) - except: + except Exception: logger.exception("worker encountered unexpected exception") finally: self.reply_to_master() diff --git a/gensim/utils.py b/gensim/utils.py index cb7a204511..47d7bc98cd 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -13,8 +13,6 @@ import logging import warnings -logger = logging.getLogger(__name__) - try: from html.entities import name2codepoint as n2cp except ImportError: @@ -41,11 +39,13 @@ import numbers import scipy.sparse +from six import iterkeys, iteritems, u, string_types, unichr +from six.moves import xrange + if sys.version_info[0] >= 3: unicode = str -from six import iterkeys, iteritems, u, string_types, unichr -from six.moves import xrange +logger = logging.getLogger(__name__) try: from smart_open import smart_open @@ -129,6 +129,8 @@ def __enter__(self): def __exit__(self, type, value, traceback): pass + + nocm = NoCM() @@ -230,6 +232,8 @@ def any2utf8(text, errors='strict', encoding='utf8'): return text.encode('utf8') # do bytestring -> unicode -> utf8 full circle, to ensure valid utf8 return unicode(text, encoding, errors=errors).encode('utf8') + + to_utf8 = any2utf8 @@ -238,6 +242,8 @@ def any2unicode(text, encoding='utf8', errors='strict'): if isinstance(text, unicode): return text return unicode(text, encoding, errors=errors) + + to_unicode = any2unicode @@ -462,7 +468,7 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, self.__dict__['__scipys'] = scipys self.__dict__['__ignoreds'] = ignoreds self.__dict__['__recursive_saveloads'] = recursive_saveloads - except: + except Exception: # restore the attributes if exception-interrupted for attrib, val in iteritems(asides): setattr(self, attrib, val) @@ -502,7 +508,7 @@ def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, except TypeError: # `fname_or_handle` does not have write attribute self._smart_save(fname_or_handle, separately, sep_limit, ignore, pickle_protocol=pickle_protocol) -#endclass SaveLoad +# endclass SaveLoad def identity(p): @@ -532,6 +538,7 @@ class FakeDict(object): is a waste of memory. 
""" + def __init__(self, num_terms): self.num_terms = num_terms @@ -599,7 +606,7 @@ def is_corpus(obj): try: if 'Corpus' in obj.__class__.__name__: # the most common case, quick hack return True, obj - except: + except Exception: pass try: if hasattr(obj, 'next') or hasattr(obj, '__next__'): @@ -637,14 +644,14 @@ def get_my_ip(): s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s.connect((ns._pyroUri.host, ns._pyroUri.port)) result, port = s.getsockname() - except: + except Exception: try: # see what ifconfig says about our default interface import commands result = commands.getoutput("ifconfig").split("\n")[1].split()[1][5:] if len(result.split('.')) != 4: raise Exception() - except: + except Exception: # give up, leave the resolution to gethostbyname result = socket.gethostbyname(socket.gethostname()) return result @@ -655,6 +662,7 @@ class RepeatCorpus(SaveLoad): Used in the tutorial on distributed computing and likely not useful anywhere else. """ + def __init__(self, corpus, reps): """ Wrap a `corpus` as another corpus of length `reps`. This is achieved by @@ -794,7 +802,7 @@ def substitute_entity(match): return safe_unichr(cp) else: return match.group() - except: + except Exception: # in case of errors, return original input return match.group() @@ -823,6 +831,7 @@ def chunkize_serial(iterable, chunksize, as_numpy=False): # memory opt: wrap the chunk and then pop(), to avoid leaving behind a dangling reference yield wrapped_chunk.pop() + grouper = chunkize_serial @@ -858,7 +867,7 @@ def run(self): logger.debug("prepared another chunk of %i documents (qsize=%s)" % (len(wrapped_chunk[0]), qsize)) self.q.put(wrapped_chunk.pop(), block=True) -#endclass InputQueue +# endclass InputQueue if os.name == 'nt': @@ -1039,7 +1048,7 @@ def has_pattern(): Function which returns a flag indicating whether pattern is installed or not """ try: - from pattern.en import parse + from pattern.en import parse # noqa:F401 return True except ImportError: return False @@ -1142,6 +1151,7 @@ def qsize(queue): # OS X doesn't support qsize return -1 + RULE_DEFAULT = 0 RULE_DISCARD = 1 RULE_KEEP = 2 diff --git a/setup.py b/setup.py index 6097e34c5a..afeff174cd 100644 --- a/setup.py +++ b/setup.py @@ -13,21 +13,22 @@ import os import sys import warnings -import io - -if sys.version_info[:2] < (2, 7) or (sys.version_info[:1] == 3 and sys.version_info[:2] < (3, 5)): - raise Exception('This version of gensim needs Python 2.7, 3.5 or later.') import ez_setup -ez_setup.use_setuptools() from setuptools import setup, find_packages, Extension from setuptools.command.build_ext import build_ext +if sys.version_info[:2] < (2, 7) or (sys.version_info[:1] == 3 and sys.version_info[:2] < (3, 5)): + raise Exception('This version of gensim needs Python 2.7, 3.5 or later.') + +ez_setup.use_setuptools() # the following code is adapted from tornado's setup.py: # https://github.com/tornadoweb/tornado/blob/master/setup.py # to support installing without the extension on platforms where # no compiler is available. + + class custom_build_ext(build_ext): """Allow C extension building to fail. @@ -89,17 +90,16 @@ def build_extension(self, ext): # importing numpy directly in this script, before it's actually installed! 
# http://stackoverflow.com/questions/19919905/how-to-bootstrap-numpy-installation-in-setup-py def finalize_options(self): - build_ext.finalize_options(self) - # Prevent numpy from thinking it is still in its setup process: - # https://docs.python.org/2/library/__builtin__.html#module-__builtin__ - if isinstance(__builtins__, dict): - __builtins__["__NUMPY_SETUP__"] = False - else: - __builtins__.__NUMPY_SETUP__ = False - - import numpy - self.include_dirs.append(numpy.get_include()) + build_ext.finalize_options(self) + # Prevent numpy from thinking it is still in its setup process: + # https://docs.python.org/2/library/__builtin__.html#module-__builtin__ + if isinstance(__builtins__, dict): + __builtins__["__NUMPY_SETUP__"] = False + else: + __builtins__.__NUMPY_SETUP__ = False + import numpy + self.include_dirs.append(numpy.get_include()) model_dir = os.path.join(os.path.dirname(__file__), 'gensim', 'models') @@ -113,7 +113,6 @@ def finalize_options(self): cmdclass.update(vars(wheelhouse_uploader.cmd)) - LONG_DESCRIPTION = u""" ============================================== gensim -- Topic Modelling in Python