diff --git a/continuous_integration/travis/flake8_diff.sh b/continuous_integration/travis/flake8_diff.sh index 2f04ab3e31..37d12de61d 100755 --- a/continuous_integration/travis/flake8_diff.sh +++ b/continuous_integration/travis/flake8_diff.sh @@ -19,18 +19,18 @@ set -e set -o pipefail PROJECT=RaRe-Technologies/gensim -PROJECT_URL=https://github.com/$PROJECT.git +PROJECT_URL=https://github.com/${PROJECT}.git # Find the remote with the project name (upstream in most cases) -REMOTE=$(git remote -v | grep $PROJECT | cut -f1 | head -1 || echo '') +REMOTE=$(git remote -v | grep ${PROJECT} | cut -f1 | head -1 || echo '') # Add a temporary remote if needed. For example this is necessary when # Travis is configured to run in a fork. In this case 'origin' is the # fork and not the reference repo we want to diff against. if [[ -z "$REMOTE" ]]; then TMP_REMOTE=tmp_reference_upstream - REMOTE=$TMP_REMOTE - git remote add $REMOTE $PROJECT_URL + REMOTE=${TMP_REMOTE} + git remote add ${REMOTE} ${PROJECT_URL} fi echo "Remotes:" @@ -56,15 +56,15 @@ if [[ "$TRAVIS" == "true" ]]; then echo "New branch, no commit range from Travis so passing this test by convention" exit 0 fi - COMMIT_RANGE=$TRAVIS_COMMIT_RANGE + COMMIT_RANGE=${TRAVIS_COMMIT_RANGE} fi else # We want to fetch the code as it is in the PR branch and not # the result of the merge into develop. This way line numbers # reported by Travis will match with the local code. - LOCAL_BRANCH_REF=travis_pr_$TRAVIS_PULL_REQUEST + LOCAL_BRANCH_REF=travis_pr_${TRAVIS_PULL_REQUEST} # In Travis the PR target is always origin - git fetch origin pull/$TRAVIS_PULL_REQUEST/head:refs/$LOCAL_BRANCH_REF + git fetch origin pull/${TRAVIS_PULL_REQUEST}/head:refs/${LOCAL_BRANCH_REF} fi fi @@ -76,34 +76,34 @@ if [[ -z "$COMMIT_RANGE" ]]; then fi echo -e "\nLast 2 commits in $LOCAL_BRANCH_REF:" echo '--------------------------------------------------------------------------------' - git log -2 $LOCAL_BRANCH_REF + git log -2 ${LOCAL_BRANCH_REF} REMOTE_MASTER_REF="$REMOTE/develop" # Make sure that $REMOTE_MASTER_REF is a valid reference echo -e "\nFetching $REMOTE_MASTER_REF" echo '--------------------------------------------------------------------------------' - git fetch $REMOTE develop:refs/remotes/$REMOTE_MASTER_REF - LOCAL_BRANCH_SHORT_HASH=$(git rev-parse --short $LOCAL_BRANCH_REF) - REMOTE_MASTER_SHORT_HASH=$(git rev-parse --short $REMOTE_MASTER_REF) + git fetch ${REMOTE} develop:refs/remotes/${REMOTE_MASTER_REF} + LOCAL_BRANCH_SHORT_HASH=$(git rev-parse --short ${LOCAL_BRANCH_REF}) + REMOTE_MASTER_SHORT_HASH=$(git rev-parse --short ${REMOTE_MASTER_REF}) - COMMIT=$(git merge-base $LOCAL_BRANCH_REF $REMOTE_MASTER_REF) || \ - echo "No common ancestor found for $(git show $LOCAL_BRANCH_REF -q) and $(git show $REMOTE_MASTER_REF -q)" + COMMIT=$(git merge-base ${LOCAL_BRANCH_REF} ${REMOTE_MASTER_REF}) || \ + echo "No common ancestor found for $(git show ${LOCAL_BRANCH_REF} -q) and $(git show ${REMOTE_MASTER_REF} -q)" if [ -z "$COMMIT" ]; then exit 1 fi - COMMIT_SHORT_HASH=$(git rev-parse --short $COMMIT) + COMMIT_SHORT_HASH=$(git rev-parse --short ${COMMIT}) echo -e "\nCommon ancestor between $LOCAL_BRANCH_REF ($LOCAL_BRANCH_SHORT_HASH)"\ "and $REMOTE_MASTER_REF ($REMOTE_MASTER_SHORT_HASH) is $COMMIT_SHORT_HASH:" echo '--------------------------------------------------------------------------------' - git show --no-patch $COMMIT_SHORT_HASH + git show --no-patch ${COMMIT_SHORT_HASH} COMMIT_RANGE="$COMMIT_SHORT_HASH..$LOCAL_BRANCH_SHORT_HASH" if [[ -n "$TMP_REMOTE" ]]; then - 
git remote remove $TMP_REMOTE + git remote remove ${TMP_REMOTE} fi else @@ -111,19 +111,19 @@ else fi echo -e '\nRunning flake8 on the diff in the range' "$COMMIT_RANGE" \ - "($(git rev-list $COMMIT_RANGE | wc -l) commit(s)):" + "($(git rev-list ${COMMIT_RANGE} | wc -l) commit(s)):" echo '--------------------------------------------------------------------------------' # We ignore files from sklearn/externals. # Excluding vec files since they contain non-utf8 content and flake8 raises exception for non-utf8 input # We need the following command to exit with 0 hence the echo in case # there is no match -MODIFIED_PY_FILES="$(git diff --name-only $COMMIT_RANGE | grep '[a-zA-Z0-9]*.py$' || echo "no_match")" -MODIFIED_IPYNB_FILES="$(git diff --name-only $COMMIT_RANGE | grep '[a-zA-Z0-9]*.ipynb$' || echo "no_match")" +MODIFIED_PY_FILES="$(git diff --name-only ${COMMIT_RANGE} | grep '[a-zA-Z0-9]*.py$' || echo "no_match")" +MODIFIED_IPYNB_FILES="$(git diff --name-only ${COMMIT_RANGE} | grep '[a-zA-Z0-9]*.ipynb$' || echo "no_match")" -echo "*.py files: " $MODIFIED_PY_FILES -echo "*.ipynb files: " $MODIFIED_IPYNB_FILES +echo "*.py files: " ${MODIFIED_PY_FILES} +echo "*.ipynb files: " ${MODIFIED_IPYNB_FILES} check_files() { @@ -133,7 +133,7 @@ check_files() { if [ -n "$files" ]; then # Conservative approach: diff without context (--unified=0) so that code # that was not changed does not create failures - git diff --unified=0 $COMMIT_RANGE -- $files | flake8 --diff --show-source $options + git diff --unified=0 ${COMMIT_RANGE} -- ${files} | flake8 --diff --show-source ${options} fi } @@ -150,6 +150,6 @@ else for fname in ${MODIFIED_IPYNB_FILES} do echo "File: $fname" - jupyter nbconvert --to script --stdout $fname | flake8 - --show-source --ignore=E501,E731,E12,W503,E402 --builtins=get_ipython || true + jupyter nbconvert --to script --stdout ${fname} | flake8 - --show-source --ignore=E501,E731,E12,W503,E402 --builtins=get_ipython || true done fi diff --git a/continuous_integration/travis/install.sh b/continuous_integration/travis/install.sh index 1ba1796a4b..c14ac86925 100755 --- a/continuous_integration/travis/install.sh +++ b/continuous_integration/travis/install.sh @@ -9,5 +9,5 @@ export PATH=/home/travis/miniconda2/bin:$PATH conda update --yes conda -conda create --yes -n gensim-test python=$PYTHON_VERSION pip atlas flake8 jupyter numpy==$NUMPY_VERSION scipy==$SCIPY_VERSION && source activate gensim-test +conda create --yes -n gensim-test python=${PYTHON_VERSION} pip atlas flake8 jupyter numpy==${NUMPY_VERSION} scipy==${SCIPY_VERSION} && source activate gensim-test pip install . 
&& pip install .[test] diff --git a/docker/start_jupyter_notebook.sh b/docker/start_jupyter_notebook.sh index 4c5946d056..7893536dd6 100644 --- a/docker/start_jupyter_notebook.sh +++ b/docker/start_jupyter_notebook.sh @@ -4,4 +4,4 @@ PORT=$1 NOTEBOOK_DIR=/gensim/docs/notebooks DEFAULT_URL=/notebooks/gensim%20Quick%20Start.ipynb -jupyter notebook --no-browser --ip=* --port=$PORT --allow-root --notebook-dir=$NOTEBOOK_DIR --NotebookApp.token=\"\" --NotebookApp.default_url=$DEFAULT_URL \ No newline at end of file +jupyter notebook --no-browser --ip=* --port=${PORT} --allow-root --notebook-dir=${NOTEBOOK_DIR} --NotebookApp.token=\"\" --NotebookApp.default_url=${DEFAULT_URL} diff --git a/docs/notebooks/test_notebooks.py b/docs/notebooks/test_notebooks.py index 05dd7082f5..77633b7037 100644 --- a/docs/notebooks/test_notebooks.py +++ b/docs/notebooks/test_notebooks.py @@ -28,16 +28,11 @@ def _notebook_run(path): print(str(e.traceback).split("\n")[-2]) else: raise e - except TimeoutError as e: + except RuntimeError as e: print(e) finally: nbformat.write(nb, fout) - #nb = nbformat.read(fout, nbformat.current_nbformat) - - #errors = errors.extend( - #[output for cell in nb.cells if "outputs" in cell - # for output in cell["outputs"] if output.output_type == "error"]) return nb, errors diff --git a/docs/src/conf.py b/docs/src/conf.py index 622db5fec7..b1557c906c 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -11,7 +11,8 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys, os +import os +import sys # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the @@ -34,7 +35,7 @@ source_suffix = '.rst' # The encoding of source files. -#source_encoding = 'utf-8' +# source_encoding = 'utf-8' # The master toctree document. master_doc = 'indextoc' @@ -58,67 +59,67 @@ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -#language = None +# language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of documents that shouldn't be included in the build. -#unused_docs = [] +# unused_docs = [] # List of directories, relative to source directory, that shouldn't be searched # for source files. exclude_trees = ['_build'] # The reST default role (used for this markup: `text`) to use for all documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # -- Options for HTML output --------------------------------------------------- # The theme to use for HTML and HTML Help pages. 
Major themes that come with # Sphinx are currently 'default' and 'sphinxdoc'. -#html_theme = 'default' +# html_theme = 'default' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -#main_colour = "#ffbbbb" +# main_colour = "#ffbbbb" html_theme_options = { -#"rightsidebar": "false", -#"stickysidebar": "true", -#"bodyfont": "'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', 'Verdana', 'sans-serif'", -#"headfont": "'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', 'Verdana', 'sans-serif'", -#"sidebarbgcolor": "fuckyou", -#"footerbgcolor": "#771111", -#"relbarbgcolor": "#993333", -#"sidebartextcolor": "#000000", -#"sidebarlinkcolor": "#330000", -#"codebgcolor": "#fffff0", -#"headtextcolor": "#000080", -#"headbgcolor": "#f0f0ff", -#"bgcolor": "#ffffff", +# "rightsidebar": "false", +# "stickysidebar": "true", +# "bodyfont": "'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', 'Verdana', 'sans-serif'", +# "headfont": "'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', 'Verdana', 'sans-serif'", +# "sidebarbgcolor": "fuckyou", +# "footerbgcolor": "#771111", +# "relbarbgcolor": "#993333", +# "sidebartextcolor": "#000000", +# "sidebarlinkcolor": "#330000", +# "codebgcolor": "#fffff0", +# "headtextcolor": "#000080", +# "headbgcolor": "#f0f0ff", +# "bgcolor": "#ffffff", } @@ -130,11 +131,11 @@ html_title = "gensim" # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = '' +# html_short_title = '' # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 @@ -152,17 +153,17 @@ # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -html_sidebars = {} #{'index': ['download.html', 'globaltoc.html', 'searchbox.html', 'indexsidebar.html']} -#html_sidebars = {'index': ['globaltoc.html', 'searchbox.html']} +html_sidebars = {} # {'index': ['download.html', 'globaltoc.html', 'searchbox.html', 'indexsidebar.html']} +# html_sidebars = {'index': ['globaltoc.html', 'searchbox.html']} # If false, no module index is generated. -#html_use_modindex = True +# html_use_modindex = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. html_split_index = False @@ -175,10 +176,10 @@ # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = '' +# html_file_suffix = '' # Output file base name for HTML help builder. htmlhelp_basename = 'gensimdoc' @@ -188,32 +189,30 @@ # -- Options for LaTeX output -------------------------------------------------- # The paper size ('letter' or 'a4'). -#latex_paper_size = 'letter' +# latex_paper_size = 'letter' # The font size ('10pt', '11pt' or '12pt'). -#latex_font_size = '10pt' +# latex_font_size = '10pt' # Grouping the document tree into LaTeX files. 
List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). -latex_documents = [ - ('index', 'gensim.tex', u'gensim Documentation', u'Radim Řehůřek', 'manual'), -] +latex_documents = [('index', 'gensim.tex', u'gensim Documentation', u'Radim Řehůřek', 'manual')] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. latex_use_parts = False # Additional stuff for the LaTeX preamble. -#latex_preamble = '' +# latex_preamble = '' # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_use_modindex = True +# latex_use_modindex = True suppress_warnings = ['image.nonlocal_uri', 'ref.citation', 'ref.footnote'] diff --git a/gensim/__init__.py b/gensim/__init__.py index c267afe4de..3923460991 100644 --- a/gensim/__init__.py +++ b/gensim/__init__.py @@ -3,7 +3,7 @@ similarities within a corpus of documents. """ -from gensim import parsing, matutils, interfaces, corpora, models, similarities, summarization # noqa:F401 +from gensim import parsing, matutils, interfaces, corpora, models, similarities, summarization, utils # noqa:F401 import logging __version__ = '2.3.0' diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index 327a36fc14..6bd96da716 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -45,7 +45,7 @@ def __init__(self, fname, fname_vocab=None): `fname.vocab`. """ IndexedCorpus.__init__(self, fname) - logger.info("loading corpus from %s" % fname) + logger.info("loading corpus from %s", fname) if fname_vocab is None: fname_base, _ = path.splitext(fname) @@ -102,7 +102,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): else: num_terms = 1 + max([-1] + id2word.keys()) - logger.info("storing corpus in Blei's LDA-C format into %s" % fname) + logger.info("storing corpus in Blei's LDA-C format into %s", fname) with utils.smart_open(fname, 'wb') as fout: offsets = [] for doc in corpus: @@ -113,7 +113,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): # write out vocabulary, in a format compatible with Blei's topics.py script fname_vocab = utils.smart_extension(fname, '.vocab') - logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab)) + logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab) with utils.smart_open(fname_vocab, 'wb') as fout: for featureid in xrange(num_terms): fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---'))) @@ -127,5 +127,3 @@ def docbyoffset(self, offset): with utils.smart_open(self.fname) as f: f.seek(offset) return self.line2doc(f.readline()) - -# endclass BleiCorpus diff --git a/gensim/corpora/csvcorpus.py b/gensim/corpora/csvcorpus.py index 6d3288bc0f..969437e571 100644 --- a/gensim/corpora/csvcorpus.py +++ b/gensim/corpora/csvcorpus.py @@ -36,7 +36,7 @@ def __init__(self, fname, labels): `labels` = are class labels present in the input file? 
=> skip the first column """ - logger.info("loading corpus from %s" % fname) + logger.info("loading corpus from %s", fname) self.fname = fname self.length = None self.labels = labels @@ -45,7 +45,7 @@ def __init__(self, fname, labels): head = ''.join(itertools.islice(utils.smart_open(self.fname), 5)) self.headers = csv.Sniffer().has_header(head) self.dialect = csv.Sniffer().sniff(head) - logger.info("sniffed CSV delimiter=%r, headers=%s" % (self.dialect.delimiter, self.headers)) + logger.info("sniffed CSV delimiter=%r, headers=%s", self.dialect.delimiter, self.headers) def __iter__(self): """ @@ -60,8 +60,6 @@ def __iter__(self): for line_no, line in enumerate(reader): if self.labels: line.pop(0) # ignore the first column = class label - yield list(enumerate(map(float, line))) + yield list(enumerate(float(x) for x in line)) self.length = line_no + 1 # store the total number of CSV rows = documents - -# endclass CsvCorpus diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index d32276688b..08c4097f03 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -61,7 +61,7 @@ def __getitem__(self, tokenid): if len(self.id2token) != len(self.token2id): # the word->id mapping has changed (presumably via add_documents); # recompute id->word accordingly - self.id2token = dict((v, k) for k, v in iteritems(self.token2id)) + self.id2token = utils.revdict(self.token2id) return self.id2token[tokenid] # will throw for non-existent ids def __iter__(self): @@ -120,7 +120,8 @@ def add_documents(self, documents, prune_at=2000000): logger.info( "built %s from %i documents (total %i corpus positions)", - self, self.num_docs, self.num_pos) + self, self.num_docs, self.num_pos + ) def doc2bow(self, document, allow_update=False, return_missing=False): """ @@ -147,14 +148,14 @@ def doc2bow(self, document, allow_update=False, return_missing=False): token2id = self.token2id if allow_update or return_missing: - missing = dict((w, freq) for w, freq in iteritems(counter) if w not in token2id) + missing = {w: freq for w, freq in iteritems(counter) if w not in token2id} if allow_update: for w in missing: # new id = number of ids made so far; # NOTE this assumes there are no gaps in the id sequence! 
token2id[w] = len(token2id) - result = dict((token2id[w], freq) for w, freq in iteritems(counter) if w in token2id) + result = {token2id[w]: freq for w, freq in iteritems(counter) if w in token2id} if allow_update: self.num_docs += 1 @@ -201,15 +202,17 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=N else: good_ids = ( v for v in itervalues(self.token2id) - if no_below <= self.dfs.get(v, 0) <= no_above_abs) + if no_below <= self.dfs.get(v, 0) <= no_above_abs + ) good_ids = sorted(good_ids, key=self.dfs.get, reverse=True) if keep_n is not None: good_ids = good_ids[:keep_n] - bad_words = [(self[id], self.dfs.get(id, 0)) for id in set(self).difference(good_ids)] + bad_words = [(self[idx], self.dfs.get(idx, 0)) for idx in set(self).difference(good_ids)] logger.info("discarding %i tokens: %s...", len(self) - len(good_ids), bad_words[:10]) logger.info( "keeping %i tokens which were in no less than %i and no more than %i (=%.1f%%) documents", - len(good_ids), no_below, no_above_abs, 100.0 * no_above) + len(good_ids), no_below, no_above_abs, 100.0 * no_above + ) # do the actual filtering, then rebuild dictionary to remove gaps in ids self.filter_tokens(good_ids=good_ids) @@ -229,11 +232,11 @@ def filter_n_most_frequent(self, remove_n): most_frequent_ids = sorted(most_frequent_ids, key=self.dfs.get, reverse=True) most_frequent_ids = most_frequent_ids[:remove_n] # do the actual filtering, then rebuild dictionary to remove gaps in ids - most_frequent_words = [(self[id], self.dfs.get(id, 0)) for id in most_frequent_ids] + most_frequent_words = [(self[idx], self.dfs.get(idx, 0)) for idx in most_frequent_ids] logger.info("discarding %i tokens: %s...", len(most_frequent_ids), most_frequent_words[:10]) self.filter_tokens(bad_ids=most_frequent_ids) - logger.info("resulting dictionary: %s" % self) + logger.info("resulting dictionary: %s", self) def filter_tokens(self, bad_ids=None, good_ids=None): """ @@ -244,20 +247,12 @@ def filter_tokens(self, bad_ids=None, good_ids=None): """ if bad_ids is not None: bad_ids = set(bad_ids) - self.token2id = dict((token, tokenid) - for token, tokenid in iteritems(self.token2id) - if tokenid not in bad_ids) - self.dfs = dict((tokenid, freq) - for tokenid, freq in iteritems(self.dfs) - if tokenid not in bad_ids) + self.token2id = {token: tokenid for token, tokenid in iteritems(self.token2id) if tokenid not in bad_ids} + self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if tokenid not in bad_ids} if good_ids is not None: good_ids = set(good_ids) - self.token2id = dict((token, tokenid) - for token, tokenid in iteritems(self.token2id) - if tokenid in good_ids) - self.dfs = dict((tokenid, freq) - for tokenid, freq in iteritems(self.dfs) - if tokenid in good_ids) + self.token2id = {token: tokenid for token, tokenid in iteritems(self.token2id) if tokenid in good_ids} + self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if tokenid in good_ids} self.compactify() def compactify(self): @@ -274,9 +269,9 @@ def compactify(self): idmap = dict(izip(itervalues(self.token2id), xrange(len(self.token2id)))) # reassign mappings to new ids - self.token2id = dict((token, idmap[tokenid]) for token, tokenid in iteritems(self.token2id)) + self.token2id = {token: idmap[tokenid] for token, tokenid in iteritems(self.token2id)} self.id2token = {} - self.dfs = dict((idmap[tokenid], freq) for tokenid, freq in iteritems(self.dfs)) + self.dfs = {idmap[tokenid]: freq for tokenid, freq in iteritems(self.dfs)} def save_as_text(self, fname, 
sort_by_word=True): """ @@ -405,15 +400,16 @@ def from_corpus(corpus, id2word=None): if id2word is None: # make sure length(result) == get_max_id(corpus) + 1 - result.token2id = dict((unicode(i), i) for i in xrange(max_id + 1)) + result.token2id = {unicode(i): i for i in xrange(max_id + 1)} else: # id=>word mapping given: simply copy it - result.token2id = dict((utils.to_unicode(token), id) for id, token in iteritems(id2word)) - for id in itervalues(result.token2id): + result.token2id = {utils.to_unicode(token): idx for idx, token in iteritems(id2word)} + for idx in itervalues(result.token2id): # make sure all token ids have a valid `dfs` entry - result.dfs[id] = result.dfs.get(id, 0) + result.dfs[idx] = result.dfs.get(idx, 0) logger.info( "built %s from %i documents (total %i corpus positions)", - result, result.num_docs, result.num_pos) + result, result.num_docs, result.num_pos + ) return result diff --git a/gensim/corpora/hashdictionary.py b/gensim/corpora/hashdictionary.py index 63f966b3cd..687ec241ac 100644 --- a/gensim/corpora/hashdictionary.py +++ b/gensim/corpora/hashdictionary.py @@ -101,7 +101,7 @@ def keys(self): return range(len(self)) def __str__(self): - return ("HashDictionary(%i id range)" % len(self)) + return "HashDictionary(%i id range)" % len(self) @staticmethod def from_documents(*args, **kwargs): @@ -117,11 +117,12 @@ def add_documents(self, documents): """ for docno, document in enumerate(documents): if docno % 10000 == 0: - logger.info("adding document #%i to %s" % (docno, self)) + logger.info("adding document #%i to %s", docno, self) self.doc2bow(document, allow_update=True) # ignore the result, here we only care about updating token ids logger.info( "built %s from %i documents (total %i corpus positions)", - self, self.num_docs, self.num_pos) + self, self.num_docs, self.num_pos + ) def doc2bow(self, document, allow_update=False, return_missing=False): """ @@ -182,24 +183,21 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000): """ no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold ok = [item for item in iteritems(self.dfs_debug) if no_below <= item[1] <= no_above_abs] - ok = frozenset(word for word, freq in sorted(ok, key=lambda item: -item[1])[:keep_n]) - - self.dfs_debug = dict((word, freq) - for word, freq in iteritems(self.dfs_debug) - if word in ok) - self.token2id = dict((token, tokenid) - for token, tokenid in iteritems(self.token2id) - if token in self.dfs_debug) - self.id2token = dict((tokenid, set(token for token in tokens if token in self.dfs_debug)) - for tokenid, tokens in iteritems(self.id2token)) - self.dfs = dict((tokenid, freq) - for tokenid, freq in iteritems(self.dfs) - if self.id2token.get(tokenid, set())) + ok = frozenset(word for word, freq in sorted(ok, key=lambda x: -x[1])[:keep_n]) + + self.dfs_debug = {word: freq for word, freq in iteritems(self.dfs_debug) if word in ok} + self.token2id = {token: tokenid for token, tokenid in iteritems(self.token2id) if token in self.dfs_debug} + self.id2token = { + tokenid: {token for token in tokens if token in self.dfs_debug} + for tokenid, tokens in iteritems(self.id2token) + } + self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if self.id2token.get(tokenid, set())} # for word->document frequency logger.info( "kept statistics for which were in no less than %i and no more than %i (=%.1f%%) documents", - no_below, no_above_abs, 100.0 * no_above) + no_below, no_above_abs, 100.0 * no_above + ) def save_as_text(self, fname): 
""" @@ -216,6 +214,6 @@ def save_as_text(self, fname): words = sorted(self[tokenid]) if words: words_df = [(word, self.dfs_debug.get(word, 0)) for word in words] - words_df = ["%s(%i)" % item for item in sorted(words_df, key=lambda item: -item[1])] + words_df = ["%s(%i)" % item for item in sorted(words_df, key=lambda x: -x[1])] words_df = '\t'.join(words_df) fout.write(utils.to_utf8("%i\t%i\t%s\n" % (tokenid, self.dfs.get(tokenid, 0), words_df))) diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py index 62f29b25ed..af79a2fd5f 100644 --- a/gensim/corpora/indexedcorpus.py +++ b/gensim/corpora/indexedcorpus.py @@ -50,7 +50,7 @@ def __init__(self, fname, index_fname=None): self.index = utils.unpickle(index_fname) # change self.index into a numpy.ndarray to support fancy indexing self.index = numpy.asarray(self.index) - logger.info("loaded corpus index from %s" % index_fname) + logger.info("loaded corpus index from %s", index_fname) except Exception: self.index = None self.length = None @@ -95,15 +95,14 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres offsets = serializer.save_corpus(fname, corpus, id2word, metadata=metadata) if offsets is None: - raise NotImplementedError("called serialize on class %s which doesn't support indexing!" % - serializer.__name__) + raise NotImplementedError("called serialize on class %s which doesn't support indexing!" % serializer.__name__) # store offsets persistently, using pickle # we shouldn't have to worry about self.index being a numpy.ndarray as the serializer will return # the offsets that are actually stored on disk - we're not storing self.index in any case, the # load just needs to turn whatever is loaded from disk back into a ndarray - this should also ensure # backwards compatibility - logger.info("saving %s index to %s" % (serializer.__name__, index_fname)) + logger.info("saving %s index to %s", serializer.__name__, index_fname) utils.pickle(offsets, index_fname) def __len__(self): @@ -115,7 +114,7 @@ def __len__(self): return len(self.index) if self.length is None: logger.info("caching corpus length") - self.length = sum(1 for doc in self) + self.length = sum(1 for _ in self) return self.length def __getitem__(self, docno): @@ -128,6 +127,3 @@ def __getitem__(self, docno): return self.docbyoffset(self.index[docno]) else: raise ValueError('Unrecognised value for docno, use either a single integer, a slice or a numpy.ndarray') - - -# endclass IndexedCorpus diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index 315490cdcc..d5265f6571 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -15,7 +15,7 @@ from gensim import utils from gensim.corpora import IndexedCorpus -from six import iteritems, iterkeys +from six import iterkeys from six.moves import xrange, zip as izip @@ -63,7 +63,7 @@ def __init__(self, fname, id2word=None, line2words=split_on_space): simple splitting on spaces. 
""" IndexedCorpus.__init__(self, fname) - logger.info("loading corpus from %s" % fname) + logger.info("loading corpus from %s", fname) self.fname = fname # input file, see class doc for format self.line2words = line2words # how to translate lines into words (simply split on space by default) @@ -79,13 +79,15 @@ def __init__(self, fname, id2word=None, line2words=split_on_space): all_terms = sorted(all_terms) # sort the list of all words; rank in that list = word's integer id self.id2word = dict(izip(xrange(len(all_terms)), all_terms)) # build a mapping of word id(int) -> word (string) else: - logger.info("using provided word mapping (%i ids)" % len(id2word)) + logger.info("using provided word mapping (%i ids)", len(id2word)) self.id2word = id2word self.num_terms = len(self.word2id) self.use_wordids = True # return documents as (wordIndex, wordCount) 2-tuples - logger.info("loaded corpus with %i documents and %i terms from %s" % - (self.num_docs, self.num_terms, fname)) + logger.info( + "loaded corpus with %i documents and %i terms from %s", + self.num_docs, self.num_terms, fname + ) def _calculate_num_docs(self): # the first line in input data is the number of documents (integer). throws exception on bad input. @@ -118,12 +120,11 @@ def line2doc(self, line): use_words.append(word) marker.add(word) # construct a list of (wordIndex, wordFrequency) 2-tuples - doc = list(zip(map(self.word2id.get, use_words), - map(words.count, use_words))) + doc = [(self.word2id.get(w), words.count(w)) for w in use_words] else: uniq_words = set(words) # construct a list of (word, wordFrequency) 2-tuples - doc = list(zip(uniq_words, map(words.count, uniq_words))) + doc = [(w, words.count(w)) for w in uniq_words] # return the document, then forget it and move on to the next one # note that this way, only one doc is stored in memory at a time, not the whole corpus @@ -165,9 +166,10 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): fout.write(utils.to_utf8('%s\n' % ' '.join(words))) if truncated: - logger.warning("List-of-words format can only save vectors with " - "integer elements; %i float entries were truncated to integer value" % - truncated) + logger.warning( + "List-of-words format can only save vectors with integer elements; " + "%i float entries were truncated to integer value", truncated + ) return offsets def docbyoffset(self, offset): @@ -185,6 +187,4 @@ def id2word(self): @id2word.setter def id2word(self, val): self._id2word = val - self.word2id = dict((v, k) for k, v in iteritems(val)) - -# endclass LowCorpus + self.word2id = utils.revdict(val) diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py index 00333e9358..cacf0074bd 100644 --- a/gensim/corpora/malletcorpus.py +++ b/gensim/corpora/malletcorpus.py @@ -42,7 +42,7 @@ def __init__(self, fname, id2word=None, metadata=False): def _calculate_num_docs(self): with utils.smart_open(self.fname) as fin: - result = sum([1 for x in fin]) + result = sum(1 for _ in fin) return result def __iter__(self): @@ -85,7 +85,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): logger.info("no word id mapping provided; initializing from corpus") id2word = utils.dict_from_corpus(corpus) - logger.info("storing corpus in Mallet format into %s" % fname) + logger.info("storing corpus in Mallet format into %s", fname) truncated = 0 offsets = [] @@ -106,9 +106,10 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): fout.write(utils.to_utf8('%s %s %s\n' % (doc_id, doc_lang, ' '.join(words)))) if truncated: - 
logger.warning("Mallet format can only save vectors with " - "integer elements; %i float entries were truncated to integer value" % - truncated) + logger.warning( + "Mallet format can only save vectors with integer elements; " + "%i float entries were truncated to integer value", truncated + ) return offsets @@ -119,5 +120,3 @@ def docbyoffset(self, offset): with utils.smart_open(self.fname) as f: f.seek(offset) return self.line2doc(f.readline()) - -# endclass MalletCorpus diff --git a/gensim/corpora/mmcorpus.py b/gensim/corpora/mmcorpus.py index 08e809443b..2158f0a526 100644 --- a/gensim/corpora/mmcorpus.py +++ b/gensim/corpora/mmcorpus.py @@ -45,8 +45,8 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): This function is automatically called by `MmCorpus.serialize`; don't call it directly, call `serialize` instead. """ - logger.info("storing corpus in Matrix Market format to %s" % fname) + logger.info("storing corpus in Matrix Market format to %s", fname) num_terms = len(id2word) if id2word is not None else None - return matutils.MmWriter.write_corpus(fname, corpus, num_terms=num_terms, index=True, progress_cnt=progress_cnt, metadata=metadata) - -# endclass MmCorpus + return matutils.MmWriter.write_corpus( + fname, corpus, num_terms=num_terms, index=True, progress_cnt=progress_cnt, metadata=metadata + ) diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py index 255fc2b7fe..e5904f2773 100644 --- a/gensim/corpora/sharded_corpus.py +++ b/gensim/corpora/sharded_corpus.py @@ -234,8 +234,7 @@ def __init__(self, output_prefix, corpus, dim=None, self.current_offset = None # The index into the dataset which # corresponds to index 0 of current shard - logger.info('Initializing sharded corpus with prefix ' - '{0}'.format(output_prefix)) + logger.info('Initializing sharded corpus with prefix %s', output_prefix) if (not os.path.isfile(output_prefix)) or overwrite: logger.info('Building from corpus...') self.init_shards(output_prefix, corpus, shardsize) @@ -243,8 +242,7 @@ def __init__(self, output_prefix, corpus, dim=None, # Save automatically, to facilitate re-loading # and retain information about how the corpus # was serialized. - logger.info('Saving ShardedCorpus object to ' - '{0}'.format(self.output_prefix)) + logger.info('Saving ShardedCorpus object to %s', self.output_prefix) self.save() else: logger.info('Cloning existing...') @@ -254,19 +252,18 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtyp """Initialize shards from the corpus.""" if not gensim.utils.is_corpus(corpus): - raise ValueError('Cannot initialize shards without a corpus to read' - ' from! (Got corpus type: {0})'.format(type(corpus))) + raise ValueError( + "Cannot initialize shards without a corpus to read from! (Got corpus type: {0})".format(type(corpus)) + ) proposed_dim = self._guess_n_features(corpus) if proposed_dim != self.dim: if self.dim is None: - logger.info('Deriving dataset dimension from corpus: ' - '{0}'.format(proposed_dim)) + logger.info('Deriving dataset dimension from corpus: %d', proposed_dim) else: logger.warning( - 'Dataset dimension derived from input corpus diffe' - 'rs from initialization argument, using corpus.' - '(corpus {0}, init arg {1})'.format(proposed_dim, self.dim) + "Dataset dimension derived from input corpus differs from initialization argument, " + "using corpus. 
(corpus %d, init arg %d)", proposed_dim, self.dim ) self.dim = proposed_dim @@ -277,11 +274,10 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtyp logger.info('Running init from corpus.') for n, doc_chunk in enumerate(gensim.utils.grouper(corpus, chunksize=shardsize)): - logger.info('Chunk no. {0} at {1} s'.format(n, time.clock() - start_time)) + logger.info('Chunk no. %d at %f s', n, time.clock() - start_time) current_shard = numpy.zeros((len(doc_chunk), self.dim), dtype=dtype) - logger.debug('Current chunk dimension: ' - '{0} x {1}'.format(len(doc_chunk), self.dim)) + logger.debug('Current chunk dimension: %d x %d', len(doc_chunk), self.dim) for i, doc in enumerate(doc_chunk): doc = dict(doc) @@ -294,7 +290,7 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtyp self.save_shard(current_shard) end_time = time.clock() - logger.info('Built {0} shards in {1} s.'.format(self.n_shards, end_time - start_time)) + logger.info('Built %d shards in %f s.', self.n_shards, end_time - start_time) def init_by_clone(self): """ @@ -309,12 +305,12 @@ def init_by_clone(self): if temp.dim != self.dim: if self.dim is None: - logger.info('Loaded dataset dimension: {0}'.format(temp.dim)) + logger.info('Loaded dataset dimension: %d', temp.dim) else: logger.warning( - 'Loaded dataset dimension differs from init arg ' - 'dimension, using loaded dim. ' - '(loaded {0}, init {1})'.format(temp.dim, self.dim) + "Loaded dataset dimension differs from init arg dimension, " + "using loaded dim. (loaded %d, init %d)", + temp.dim, self.dim ) self.dim = temp.dim # To be consistent with the loaded data! @@ -346,8 +342,6 @@ def load_shard(self, n): """ Load (unpickle) the n-th shard as the "live" part of the dataset into the Dataset object.""" - # logger.debug('ShardedCorpus loading shard {0}, ' - # 'current shard: {1}'.format(n, self.current_shard_n)) # No-op if the shard is already open. if self.current_shard_n == n: @@ -389,22 +383,12 @@ def shard_by_offset(self, offset): ' supported.'.format(offset)) return k - k = -1 - for i, o in enumerate(self.offsets): - if o > offset: # Condition should fire for every valid offset, - # since the last offset is n_docs (one-past-end). - k = i - 1 # First offset is always 0, so i is at least 1. - break - - return k - def in_current(self, offset): """ Determine whether the given offset falls within the current shard. """ - return (self.current_offset <= offset) \ - and (offset < self.offsets[self.current_shard_n + 1]) + return (self.current_offset <= offset) and (offset < self.offsets[self.current_shard_n + 1]) def in_next(self, offset): """ @@ -416,8 +400,7 @@ def in_next(self, offset): """ if self.current_shard_n == self.n_shards: return False # There's no next shard. - return (self.offsets[self.current_shard_n + 1] <= offset) \ - and (offset < self.offsets[self.current_shard_n + 2]) + return (self.offsets[self.current_shard_n + 1] <= offset) and (offset < self.offsets[self.current_shard_n + 2]) def resize_shards(self, shardsize): """ @@ -472,9 +455,7 @@ def resize_shards(self, shardsize): for old_shard_n, old_shard_name in enumerate(old_shard_names): os.remove(old_shard_name) except Exception as e: - logger.error('Exception occurred during old shard no. {0} ' - 'removal: {1}.\nAttempting to at least move ' - 'new shards in.'.format(old_shard_n, str(e))) + logger.error('Exception occurred during old shard no. 
%d removal: %s.\nAttempting to at least move new shards in.', old_shard_n, str(e)) finally: # If something happens with cleaning up - try to at least get the # new guys in. @@ -483,9 +464,8 @@ def resize_shards(self, shardsize): os.rename(new_shard_name, self._shard_name(shard_n)) # If something happens when we're in this stage, we're screwed. except Exception as e: - print(e) - raise RuntimeError('Resizing completely failed for some reason.' - ' Sorry, dataset is probably ruined...') + logger.exception(e) + raise RuntimeError('Resizing completely failed for some reason. Sorry, dataset is probably ruined...') finally: # Sets the new shard stats. self.n_shards = n_new_shards @@ -529,21 +509,20 @@ def _guess_n_features(self, corpus): return self._guess_n_features(corpus.corpus) else: if not self.dim: - raise TypeError('Couldn\'t find number of features, ' - 'refusing to guess (dimension set to {0},' - 'type of corpus: {1}).'.format(self.dim, type(corpus))) - else: - logger.warning( - 'Couldn\'t find number of features, trusting ' - 'supplied dimension ({0})'.format(self.dim) + raise TypeError( + "Couldn't find number of features, refusing to guess " + "(dimension set to {0}, type of corpus: {1})." + .format(self.dim, type(corpus)) ) + else: + logger.warning("Couldn't find number of features, trusting supplied dimension (%d)", self.dim) n_features = self.dim if self.dim and n_features != self.dim: logger.warning( - 'Discovered inconsistent dataset dim ({0}) and ' - 'feature count from corpus ({1}). Coercing to dimension' - ' given by argument.'.format(self.dim, n_features) + "Discovered inconsistent dataset dim (%d) and feature count from corpus (%d). " + "Coercing to dimension given by argument.", + self.dim, n_features ) return n_features @@ -598,8 +577,7 @@ def __getitem__(self, offset): start = offset.start stop = offset.stop if stop > self.n_docs: - raise IndexError('Requested slice offset {0} out of range' - ' ({1} docs)'.format(stop, self.n_docs)) + raise IndexError('Requested slice offset {0} out of range ({1} docs)'.format(stop, self.n_docs)) # - get range of shards over which to iterate first_shard = self.shard_by_offset(start) @@ -610,16 +588,11 @@ def __getitem__(self, offset): # This fails on one-past # slice indexing; that's why there's a code branch here. - # logger.debug('ShardedCorpus: Retrieving slice {0}: ' - # 'shard {1}'.format((offset.start, offset.stop), - # (first_shard, last_shard))) - self.load_shard(first_shard) # The easy case: both in one shard. if first_shard == last_shard: - s_result = self.current_shard[start - self.current_offset: - stop - self.current_offset] + s_result = self.current_shard[start - self.current_offset: stop - self.current_offset] # Handle different sparsity settings: s_result = self._getitem_format(s_result) @@ -627,11 +600,9 @@ def __getitem__(self, offset): # The hard case: the slice is distributed across multiple shards # - initialize numpy.zeros() - s_result = numpy.zeros((stop - start, self.dim), - dtype=self.current_shard.dtype) + s_result = numpy.zeros((stop - start, self.dim), dtype=self.current_shard.dtype) if self.sparse_serialization: - s_result = sparse.csr_matrix((0, self.dim), - dtype=self.current_shard.dtype) + s_result = sparse.csr_matrix((0, self.dim), dtype=self.current_shard.dtype) # - gradually build it up. 
We will be using three set of start:stop # indexes: @@ -652,13 +623,11 @@ # - if in ending shard, these are from 0 # to (stop - current_offset) shard_start = start - self.current_offset - shard_stop = self.offsets[self.current_shard_n + 1] - \ - self.current_offset + shard_stop = self.offsets[self.current_shard_n + 1] - self.current_offset # s_result[result_start:result_stop] = self.current_shard[ # shard_start:shard_stop] - s_result = self.__add_to_slice(s_result, result_start, result_stop, - shard_start, shard_stop) + s_result = self.__add_to_slice(s_result, result_start, result_stop, shard_start, shard_stop) # First and last get special treatment, these are in between for shard_n in xrange(first_shard + 1, last_shard): @@ -669,9 +638,7 @@ shard_start = 0 shard_stop = self.shardsize - s_result = self.__add_to_slice(s_result, result_start, - result_stop, shard_start, - shard_stop) + s_result = self.__add_to_slice(s_result, result_start, result_stop, shard_start, shard_stop) # Last shard self.load_shard(last_shard) @@ -680,9 +647,7 @@ shard_start = 0 shard_stop = stop - self.current_offset - s_result = self.__add_to_slice(s_result, result_start, result_stop, - shard_start, shard_stop) - + s_result = self.__add_to_slice(s_result, result_start, result_stop, shard_start, shard_stop) s_result = self._getitem_format(s_result) return s_result @@ -707,10 +672,7 @@ def __add_to_slice(self, s_result, result_start, result_stop, start, stop): Returns the resulting s_result. """ if (result_stop - result_start) != (stop - start): - raise ValueError('Result start/stop range different than stop/start' - 'range (%d - %d vs. %d - %d)'.format(result_start, - result_stop, - start, stop)) + raise ValueError('Result start/stop range different than stop/start range ({0} - {1} vs. {2} - {3})'.format(result_start, result_stop, start, stop)) # Dense data: just copy using numpy's slice notation if not self.sparse_serialization: @@ -722,10 +684,7 @@ def __add_to_slice(self, s_result, result_start, result_stop, start, stop): # result.
else: if s_result.shape != (result_start, self.dim): - raise ValueError('Assuption about sparse s_result shape ' - 'invalid: {0} expected rows, {1} real ' - 'rows.'.format(result_start, - s_result.shape[0])) + raise ValueError('Assumption about sparse s_result shape invalid: {0} expected rows, {1} real rows.'.format(result_start, s_result.shape[0])) tmp_matrix = self.current_shard[start:stop] s_result = sparse.vstack([s_result, tmp_matrix]) @@ -789,19 +748,12 @@ def save(self, *args, **kwargs): if len(args) == 0: args = tuple([self.output_prefix]) - attrs_to_ignore = ['current_shard', - 'current_shard_n', - 'current_offset'] + attrs_to_ignore = ['current_shard', 'current_shard_n', 'current_offset'] if 'ignore' not in kwargs: kwargs['ignore'] = frozenset(attrs_to_ignore) else: - kwargs['ignore'] = frozenset([v for v in kwargs['ignore']] - + attrs_to_ignore) + kwargs['ignore'] = frozenset([v for v in kwargs['ignore']] + attrs_to_ignore) super(ShardedCorpus, self).save(*args, **kwargs) - # - # self.reset() - # with smart_open(self.output_prefix, 'wb') as pickle_handle: - # cPickle.dump(self, pickle_handle) @classmethod def load(cls, fname, mmap=None): @@ -811,8 +763,7 @@ def load(cls, fname, mmap=None): return super(ShardedCorpus, cls).load(fname, mmap) @staticmethod - def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, - metadata=False, **kwargs): + def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs): """ Implement a serialization interface. Do not call directly; use the `serialize` method instead. @@ -834,9 +785,7 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, ShardedCorpus(fname, corpus, **kwargs) @classmethod - def serialize(serializer, fname, corpus, id2word=None, - index_fname=None, progress_cnt=None, labels=None, - metadata=False, **kwargs): + def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False, **kwargs): """ Iterate through the document stream `corpus`, saving the documents as a ShardedCorpus to `fname`. @@ -849,6 +798,4 @@ def serialize(serializer, fname, corpus, id2word=None, Ignore the parameters id2word, index_fname, progress_cnt, labels and metadata. They currently do nothing and are here only to provide a compatible method signature with superclass.""" - serializer.save_corpus(fname, corpus, id2word=id2word, - progress_cnt=progress_cnt, metadata=metadata, - **kwargs) + serializer.save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs) diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py index 5e24419421..290414836e 100644 --- a/gensim/corpora/svmlightcorpus.py +++ b/gensim/corpora/svmlightcorpus.py @@ -56,7 +56,7 @@ def __init__(self, fname, store_labels=True): """ IndexedCorpus.__init__(self, fname) - logger.info("loading corpus from %s" % fname) + logger.info("loading corpus from %s", fname) self.fname = fname # input file, see class doc for format self.length = None @@ -89,7 +89,7 @@ def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): This function is automatically called by `SvmLightCorpus.serialize`; don't call it directly, call `serialize` instead.
""" - logger.info("converting corpus to SVMlight format: %s" % fname) + logger.info("converting corpus to SVMlight format: %s", fname) offsets = [] with utils.smart_open(fname, 'wb') as fout: @@ -129,5 +129,3 @@ def doc2line(doc, label=0): """ pairs = ' '.join("%i:%s" % (termid + 1, termval) for termid, termval in doc) # +1 to convert 0-base to 1-base return "%s %s\n" % (label, pairs) - -# endclass SvmLightCorpus diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py index 4be904e613..7265d20d0c 100644 --- a/gensim/corpora/textcorpus.py +++ b/gensim/corpora/textcorpus.py @@ -112,8 +112,7 @@ class TextCorpus(interfaces.CorpusABC): 6. remove stopwords; see `gensim.parsing.preprocessing` for the list of stopwords """ - def __init__(self, input=None, dictionary=None, metadata=False, character_filters=None, - tokenizer=None, token_filters=None): + def __init__(self, input=None, dictionary=None, metadata=False, character_filters=None, tokenizer=None, token_filters=None): """ Args: input (str): path to top-level directory to traverse for corpus documents. @@ -171,9 +170,7 @@ def init_dictionary(self, dictionary): else: logger.info("Input stream provided but dictionary already initialized") else: - logger.warning( - "No input document stream provided; assuming " - "dictionary will be initialized some other way.") + logger.warning("No input document stream provided; assuming dictionary will be initialized some other way.") def __iter__(self): """The function that defines a corpus. @@ -231,8 +228,7 @@ def step_through_preprocess(self, text): yield (self.tokenizer, tokens) for token_filter in self.token_filters: - tokens = token_filter(tokens) - yield (token_filter, tokens) + yield (token_filter, token_filter(tokens)) def get_texts(self): """Iterate over the collection, yielding one document at a time. A document @@ -308,7 +304,6 @@ def __len__(self): # cache the corpus length self.length = sum(1 for _ in self.getstream()) return self.length -# endclass TextCorpus class TextDirectoryCorpus(TextCorpus): @@ -433,7 +428,6 @@ def _cache_corpus_length(self): self.length = sum(1 for _ in self.iter_filepaths()) else: self.length = sum(1 for _ in self.getstream()) -# endclass TextDirectoryCorpus def walk(top, topdown=True, onerror=None, followlinks=False, depth=0): diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py index 0c09cc7e34..a8911ee07f 100644 --- a/gensim/corpora/ucicorpus.py +++ b/gensim/corpora/ucicorpus.py @@ -21,7 +21,6 @@ from gensim.corpora import IndexedCorpus from gensim.matutils import MmReader from gensim.matutils import MmWriter -from six import iteritems from six.moves import xrange @@ -37,7 +36,7 @@ def __init__(self, input): which is expected to be in the UCI Bag-of-Words format. 
""" - logger.info('Initializing corpus reader from %s' % input) + logger.info('Initializing corpus reader from %s', input) self.input = input @@ -50,16 +49,16 @@ def __init__(self, input): except StopIteration: pass - logger.info('accepted corpus with %i documents, %i features, %i non-zero entries' % - (self.num_docs, self.num_terms, self.num_nnz)) + logger.info( + "accepted corpus with %i documents, %i features, %i non-zero entries", + self.num_docs, self.num_terms, self.num_nnz + ) def skip_headers(self, input_file): for lineno, _ in enumerate(input_file): if lineno == 2: break -# endclass UciReader - class UciWriter(MmWriter): """ @@ -110,7 +109,7 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False): offsets = [] for docno, bow in enumerate(corpus): if docno % progress_cnt == 0: - logger.info("PROGRESS: saving document #%i" % docno) + logger.info("PROGRESS: saving document #%i", docno) if index: posnow = writer.fout.tell() if posnow == poslast: @@ -125,11 +124,11 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False): num_docs = docno + 1 if num_docs * num_terms != 0: - logger.info("saved %ix%i matrix, density=%.3f%% (%i/%i)" % - (num_docs, num_terms, - 100.0 * num_nnz / (num_docs * num_terms), - num_nnz, - num_docs * num_terms)) + logger.info( + "saved %ix%i matrix, density=%.3f%% (%i/%i)", + num_docs, num_terms, 100.0 * num_nnz / (num_docs * num_terms), + num_nnz, num_docs * num_terms + ) # now write proper headers, by seeking and overwriting the spaces written earlier writer.update_headers(num_docs, num_terms, num_nnz) @@ -138,8 +137,6 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False): if index: return offsets -# endclass UciWriter - class UciCorpus(UciReader, IndexedCorpus): """ @@ -179,14 +176,14 @@ def create_dictionary(self): dictionary.dfs = defaultdict(int) dictionary.id2token = self.id2word - dictionary.token2id = dict((v, k) for k, v in iteritems(self.id2word)) + dictionary.token2id = utils.revdict(self.id2word) dictionary.num_docs = self.num_docs dictionary.num_nnz = self.num_nnz for docno, doc in enumerate(self): if docno % 10000 == 0: - logger.info('PROGRESS: processing document %i of %i' % (docno, self.num_docs)) + logger.info('PROGRESS: processing document %i of %i', docno, self.num_docs) for word, count in doc: dictionary.dfs[word] += 1 @@ -214,13 +211,11 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False) # write out vocabulary fname_vocab = utils.smart_extension(fname, '.vocab') - logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab)) + logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab) with utils.smart_open(fname_vocab, 'wb') as fout: for featureid in xrange(num_terms): fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---'))) - logger.info("storing corpus in UCI Bag-of-Words format: %s" % fname) + logger.info("storing corpus in UCI Bag-of-Words format: %s", fname) return UciWriter.write_corpus(fname, corpus, index=True, progress_cnt=progress_cnt) - -# endclass UciCorpus diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index ea87cce4a2..4a70f106f3 100755 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -57,9 +57,11 @@ # MediaWiki namespaces (https://www.mediawiki.org/wiki/Manual:Namespace) that # ought to be ignored -IGNORED_NAMESPACES = ['Wikipedia', 'Category', 'File', 'Portal', 'Template', - 'MediaWiki', 'User', 'Help', 'Book', 'Draft', - 'WikiProject', 'Special', 'Talk'] 
+IGNORED_NAMESPACES = [ + 'Wikipedia', 'Category', 'File', 'Portal', 'Template', + 'MediaWiki', 'User', 'Help', 'Book', 'Draft', 'WikiProject', + 'Special', 'Talk' +] def filter_wiki(raw): @@ -143,10 +145,7 @@ def remove_template(s): prev_c = c # Remove all the templates - s = ''.join([s[end + 1:start] for start, end in - zip(starts + [None], [-1] + ends)]) - - return s + return ''.join([s[end + 1:start] for start, end in zip(starts + [None], [-1] + ends)]) def remove_file(s): @@ -184,8 +183,7 @@ def get_namespace(tag): m = re.match("^{(.*?)}", tag) namespace = m.group(1) if m else "" if not namespace.startswith("http://www.mediawiki.org/xml/export-"): - raise ValueError("%s not recognized as MediaWiki dump namespace" - % namespace) + raise ValueError("%s not recognized as MediaWiki dump namespace" % namespace) return namespace @@ -271,8 +269,7 @@ class WikiCorpus(TextCorpus): """ - def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, - filter_namespaces=('0',)): + def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',)): """ Initialize the corpus. Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary. @@ -311,10 +308,10 @@ def get_texts(self): """ articles, articles_all = 0, 0 positions, positions_all = 0, 0 - texts = \ - ((text, self.lemmatize, title, pageid) - for title, text, pageid - in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces)) + texts = ( + (text, self.lemmatize, title, pageid) for title, text, pageid + in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces) + ) pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt) try: @@ -335,15 +332,16 @@ def get_texts(self): yield tokens except KeyboardInterrupt: logger.warn( - "user terminated iteration over Wikipedia corpus after %i documents with %i positions" - " (total %i articles, %i positions before pruning articles shorter than %i words)", - articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS) + "user terminated iteration over Wikipedia corpus after %i documents with %i positions " + "(total %i articles, %i positions before pruning articles shorter than %i words)", + articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS + ) else: logger.info( - "finished iterating over Wikipedia corpus of %i documents with %i positions" - " (total %i articles, %i positions before pruning articles shorter than %i words)", - articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS) + "finished iterating over Wikipedia corpus of %i documents with %i positions " + "(total %i articles, %i positions before pruning articles shorter than %i words)", + articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS + ) self.length = articles # cache corpus length finally: pool.terminate() -# endclass WikiCorpus diff --git a/gensim/examples/dmlcz/dmlcorpus.py b/gensim/examples/dmlcz/dmlcorpus.py index d8fb8c4cb5..07fc247f8b 100644 --- a/gensim/examples/dmlcz/dmlcorpus.py +++ b/gensim/examples/dmlcz/dmlcorpus.py @@ -40,9 +40,9 @@ def __init__(self, configId, resultDir, acceptLangs=None): self.sources = {} # all article sources; see sources.DmlSource class for an example of source if acceptLangs is None: # which languages to accept - acceptLangs = set(['any']) # if not specified, accept all languages (including unknown/unspecified) + acceptLangs = {'any'} # if not specified, accept all languages (including unknown/unspecified) self.acceptLangs = 
set(acceptLangs) - logger.info('initialized %s' % self) + logger.info('initialized %s', self) def resultFile(self, fname): return os.path.join(self.resultDir, self.configId + '_' + fname) @@ -105,13 +105,12 @@ def buildDictionary(self): them into tokens and converting tokens to their ids (creating new ids as necessary). """ - logger.info("creating dictionary from %i articles" % len(self.documents)) + logger.info("creating dictionary from %i articles", len(self.documents)) self.dictionary = dictionary.Dictionary() numPositions = 0 for docNo, (sourceId, docUri) in enumerate(self.documents): if docNo % 1000 == 0: - logger.info("PROGRESS: at document #%i/%i (%s, %s)" % - (docNo, len(self.documents), sourceId, docUri)) + logger.info("PROGRESS: at document #%i/%i (%s, %s)", docNo, len(self.documents), sourceId, docUri) source = self.config.sources[sourceId] contents = source.getContent(docUri) words = [source.normalizeWord(word) for word in source.tokenize(contents)] @@ -119,8 +118,7 @@ def buildDictionary(self): # convert to bag-of-words, but ignore the result -- here we only care about updating token ids _ = self.dictionary.doc2bow(words, allowUpdate=True) # noqa:F841 - logger.info("built %s from %i documents (total %i corpus positions)" % - (self.dictionary, len(self.documents), numPositions)) + logger.info("built %s from %i documents (total %i corpus positions)", self.dictionary, len(self.documents), numPositions) def processConfig(self, config, shuffle=False): """ @@ -135,31 +133,29 @@ def processConfig(self, config, shuffle=False): """ self.config = config self.documents = [] - logger.info("processing config %s" % config) + logger.info("processing config %s", config) for sourceId, source in config.sources.iteritems(): - logger.info("processing source '%s'" % sourceId) + logger.info("processing source '%s'", sourceId) accepted = [] for articleUri in source.findArticles(): meta = source.getMeta(articleUri) # retrieve metadata (= dictionary of key->value) if config.acceptArticle(meta): # do additional filtering on articles, based on the article's metadata accepted.append((sourceId, articleUri)) - logger.info("accepted %i articles for source '%s'" % - (len(accepted), sourceId)) + logger.info("accepted %i articles for source '%s'", len(accepted), sourceId) self.documents.extend(accepted) if not self.documents: logger.warning('no articles at all found from the config; something went wrong!') if shuffle: - logger.info("shuffling %i documents for random order" % len(self.documents)) + logger.info("shuffling %i documents for random order", len(self.documents)) import random random.shuffle(self.documents) - logger.info("accepted total of %i articles for %s" % - (len(self.documents), str(config))) + logger.info("accepted total of %i articles for %s", len(self.documents), str(config)) def saveDictionary(self, fname): - logger.info("saving dictionary mapping to %s" % fname) + logger.info("saving dictionary mapping to %s", fname) fout = open(fname, 'w') for tokenId, token in self.dictionary.id2token.iteritems(): fout.write("%i\t%s\n" % (tokenId, token)) @@ -177,7 +173,7 @@ def loadDictionary(fname): return result def saveDocuments(self, fname): - logger.info("saving documents mapping to %s" % fname) + logger.info("saving documents mapping to %s", fname) fout = open(fname, 'w') for docNo, docId in enumerate(self.documents): sourceId, docUri = docId diff --git a/gensim/examples/dmlcz/gensim_build.py b/gensim/examples/dmlcz/gensim_build.py index 9695241fb3..bb62103109 100755 --- 
a/gensim/examples/dmlcz/gensim_build.py +++ b/gensim/examples/dmlcz/gensim_build.py @@ -24,24 +24,20 @@ if AT_HOME: SOURCE_LIST = [ - sources.DmlCzSource('dmlcz', '/Users/kofola/workspace/dml/data/dmlcz/'), - sources.DmlSource('numdam', '/Users/kofola/workspace/dml/data/numdam/'), - sources.ArxmlivSource('arxmliv', '/Users/kofola/workspace/dml/data/arxmliv/'), - ] - -# SOURCE_LIST = [ -# sources.DmlCzSource('dmlcz', '/Users/kofola/workspace/dml/data/dmlcz/CzechMathJ'), -# ] + sources.DmlCzSource('dmlcz', '/Users/kofola/workspace/dml/data/dmlcz/'), + sources.DmlSource('numdam', '/Users/kofola/workspace/dml/data/numdam/'), + sources.ArxmlivSource('arxmliv', '/Users/kofola/workspace/dml/data/arxmliv/'), + ] RESULT_DIR = '/Users/kofola/workspace/dml/data/results' else: SOURCE_LIST = [ - sources.DmlCzSource('dmlcz', '/data/dmlcz/data/share'), - sources.DmlSource('numdam', '/data/dmlcz/data/numdam'), - sources.ArxmlivSource('arxmliv', '/data/dmlcz/data/arxmliv'), - ] + sources.DmlCzSource('dmlcz', '/data/dmlcz/data/share'), + sources.DmlSource('numdam', '/data/dmlcz/data/numdam'), + sources.ArxmlivSource('arxmliv', '/data/dmlcz/data/arxmliv'), + ] RESULT_DIR = '/data/dmlcz/xrehurek/results' @@ -60,7 +56,7 @@ def buildDmlCorpus(config): if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') logging.root.setLevel(level=logging.INFO) - logging.info("running %s" % ' '.join(sys.argv)) + logging.info("running %s", ' '.join(sys.argv)) program = os.path.basename(sys.argv[0]) @@ -76,4 +72,4 @@ def buildDmlCorpus(config): config.addSource(source) buildDmlCorpus(config) - logging.info("finished running %s" % program) + logging.info("finished running %s", program) diff --git a/gensim/examples/dmlcz/gensim_genmodel.py b/gensim/examples/dmlcz/gensim_genmodel.py index df11f9696c..a2f2b792e7 100755 --- a/gensim/examples/dmlcz/gensim_genmodel.py +++ b/gensim/examples/dmlcz/gensim_genmodel.py @@ -31,7 +31,7 @@ if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') logging.root.setLevel(level=logging.INFO) - logging.info("running %s" % ' '.join(sys.argv)) + logging.info("running %s", ' '.join(sys.argv)) program = os.path.basename(sys.argv[0]) @@ -46,9 +46,9 @@ config = dmlcorpus.DmlConfig('%s_%s' % (gensim_build.PREFIX, language), resultDir=gensim_build.RESULT_DIR, acceptLangs=[language]) - logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt')) + logging.info("loading word id mapping from %s", config.resultFile('wordids.txt')) id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt')) - logging.info("loaded %i word ids" % len(id2word)) + logging.info("loaded %i word ids", len(id2word)) corpus = MmCorpus(config.resultFile('bow.mm')) @@ -56,23 +56,23 @@ model = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True) model.save(config.resultFile('model_tfidf.pkl')) elif method == 'lda': - model = ldamodel.LdaModel(corpus, id2word=id2word, numTopics=DIM_LDA) + model = ldamodel.LdaModel(corpus, id2word=id2word, num_topics=DIM_LDA) model.save(config.resultFile('model_lda.pkl')) elif method == 'lsi': # first, transform word counts to tf-idf weights tfidf = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True) # then find the transformation from tf-idf to latent space - model = lsimodel.LsiModel(tfidf[corpus], id2word=id2word, numTopics=DIM_LSI) + model = lsimodel.LsiModel(tfidf[corpus], id2word=id2word, num_topics=DIM_LSI) model.save(config.resultFile('model_lsi.pkl')) 
elif method == 'rp': # first, transform word counts to tf-idf weights tfidf = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True) # then find the transformation from tf-idf to latent space - model = rpmodel.RpModel(tfidf[corpus], id2word=id2word, numTopics=DIM_RP) + model = rpmodel.RpModel(tfidf[corpus], id2word=id2word, num_topics=DIM_RP) model.save(config.resultFile('model_rp.pkl')) else: raise ValueError('unknown topic extraction method: %s' % repr(method)) MmCorpus.saveCorpus(config.resultFile('%s.mm' % method), model[corpus]) - logging.info("finished running %s" % program) + logging.info("finished running %s", program) diff --git a/gensim/examples/dmlcz/gensim_xml.py b/gensim/examples/dmlcz/gensim_xml.py index f810d045d4..0b8661ac77 100755 --- a/gensim/examples/dmlcz/gensim_xml.py +++ b/gensim/examples/dmlcz/gensim_xml.py @@ -72,20 +72,20 @@ def generateSimilar(corpus, index, method): if SAVE_EMPTY or articles: output = ''.join(articles) # concat all similars to one string if not DRY_RUN: # only open output files for writing if DRY_RUN is false - logging.info("generating %s (%i similars)" % (outfile, len(articles))) + logging.info("generating %s (%i similars)", outfile, len(articles)) outfile = open(outfile, 'w') outfile.write(SIMILAR % output) # add xml headers and print to file outfile.close() else: - logging.info("would be generating %s (%i similars):%s\n" % (outfile, len(articles), output)) + logging.info("would be generating %s (%i similars):%s\n", outfile, len(articles), output) else: - logging.debug("skipping %s (no similar found)" % outfile) + logging.debug("skipping %s (no similar found)", outfile) if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') logging.root.setLevel(level=logging.INFO) - logging.info("running %s" % ' '.join(sys.argv)) + logging.info("running %s", ' '.join(sys.argv)) program = os.path.basename(sys.argv[0]) @@ -100,9 +100,9 @@ def generateSimilar(corpus, index, method): config = dmlcorpus.DmlConfig('%s_%s' % (gensim_build.PREFIX, language), resultDir=gensim_build.RESULT_DIR, acceptLangs=[language]) - logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt')) + logging.info("loading word id mapping from %s", config.resultFile('wordids.txt')) id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt')) - logging.info("loaded %i word ids" % len(id2word)) + logging.info("loaded %i word ids", len(id2word)) corpus = dmlcorpus.DmlCorpus.load(config.resultFile('.pkl')) input = MmCorpus(config.resultFile('_%s.mm' % method)) @@ -110,11 +110,11 @@ def generateSimilar(corpus, index, method): # initialize structure for similarity queries if method == 'lsi' or method == 'rp': # for these methods, use dense vectors - index = MatrixSimilarity(input, numBest=MAX_SIMILAR + 1, numFeatures=input.numTerms) + index = MatrixSimilarity(input, num_best=MAX_SIMILAR + 1, num_features=input.numTerms) else: - index = SparseMatrixSimilarity(input, numBest=MAX_SIMILAR + 1) + index = SparseMatrixSimilarity(input, num_best=MAX_SIMILAR + 1) index.normalize = False # do not normalize query vectors during similarity queries (the index is already built normalized, so it would be a no-op) generateSimilar(corpus, index, method) # for each document, print MAX_SIMILAR nearest documents to a xml file, in dml-cz specific format - logging.info("finished running %s" % program) + logging.info("finished running %s", program) diff --git a/gensim/examples/dmlcz/runall.sh b/gensim/examples/dmlcz/runall.sh 
index 5d3f5819f2..236c1dce80 100644 --- a/gensim/examples/dmlcz/runall.sh +++ b/gensim/examples/dmlcz/runall.sh @@ -7,7 +7,7 @@ BIN_PATH=~/xrehurek/gensim/dmlcz RESULT_PATH=~/xrehurek/results # set python path, so that python can find and import gensim modules -export PYTHONPATH=~/xrehurek:$PYTHONPATH +export PYTHONPATH=~/xrehurek:${PYTHONPATH} # Language is set to 'any', meaning all articles are processed for similarity in # one go, regardless of their language. @@ -17,18 +17,18 @@ language=any # ========== parse all article sources, build article co-occurence matrix ====== -${BIN_PATH}/gensim_build.py $language 2>&1 | tee ${RESULT_PATH}/gensim_build.log +${BIN_PATH}/gensim_build.py ${language} 2>&1 | tee ${RESULT_PATH}/gensim_build.log # ========== build transformation models ======================================= for method in tfidf rp; do - ( ${BIN_PATH}/gensim_genmodel.py $language $method 2>&1 | tee ${RESULT_PATH}/gensim_genmodel_${method}.log ) & + ( ${BIN_PATH}/gensim_genmodel.py ${language} ${method} 2>&1 | tee ${RESULT_PATH}/gensim_genmodel_${method}.log ) & done wait method=lsi -${BIN_PATH}/gensim_genmodel.py $language $method 2>&1 | tee ${RESULT_PATH}/gensim_genmodel_${method}.log +${BIN_PATH}/gensim_genmodel.py ${language} ${method} 2>&1 | tee ${RESULT_PATH}/gensim_genmodel_${method}.log # =========== generate output xml files ======================================== @@ -36,6 +36,6 @@ ${BIN_PATH}/gensim_genmodel.py $language $method 2>&1 | tee ${RESULT_PATH}/gensi # NOTE if out of memory, move tfidf out of the loop (tfidf uses a lot of memory here) for method in tfidf lsi rp; do - ( ${BIN_PATH}/gensim_xml.py $language $method 2>&1 | tee ${RESULT_PATH}/gensim_xml_${method}.log ) & + ( ${BIN_PATH}/gensim_xml.py ${language} ${method} 2>&1 | tee ${RESULT_PATH}/gensim_xml_${method}.log ) & done wait diff --git a/gensim/examples/dmlcz/sources.py b/gensim/examples/dmlcz/sources.py index da4e0ac0b0..4193da0820 100644 --- a/gensim/examples/dmlcz/sources.py +++ b/gensim/examples/dmlcz/sources.py @@ -110,7 +110,7 @@ def parseDmlMeta(cls, xmlfile): name, cont = name.strip(), cont.strip() if name == 'msc': if len(cont) != 5: - logger.warning('invalid MSC=%s in %s' % (cont, xmlfile)) + logger.warning('invalid MSC=%s in %s', cont, xmlfile) result.setdefault('msc', []).append(cont) continue if name == 'idMR': @@ -132,25 +132,24 @@ def isArticle(self, path): return False # and contain the fulltext.txt file if not os.path.exists(os.path.join(path, 'fulltext.txt')): - logger.info('missing fulltext in %s' % path) + logger.info('missing fulltext in %s', path) return False # and also the meta.xml file if not os.path.exists(os.path.join(path, 'meta.xml')): - logger.info('missing meta.xml in %s' % path) + logger.info('missing meta.xml in %s', path) return False return True def findArticles(self): dirTotal = artAccepted = 0 - logger.info("looking for '%s' articles inside %s" % (self.sourceId, self.baseDir)) + logger.info("looking for '%s' articles inside %s", self.sourceId, self.baseDir) for root, dirs, files in os.walk(self.baseDir): dirTotal += 1 root = os.path.normpath(root) if self.isArticle(root): artAccepted += 1 yield self.idFromDir(root) - logger.info('%i directories processed, found %i articles' % - (dirTotal, artAccepted)) + logger.info('%i directories processed, found %i articles', dirTotal, artAccepted) def getContent(self, uri): """ @@ -200,15 +199,15 @@ def isArticle(self, path): return False # and contain a dspace_id file if not (os.path.exists(os.path.join(path, 'dspace_id'))): - 
logger.info('missing dspace_id in %s' % path) + logger.info('missing dspace_id in %s', path) return False # and contain either fulltext.txt or fulltext_dspace.txt file if not (os.path.exists(os.path.join(path, 'fulltext.txt')) or os.path.exists(os.path.join(path, 'fulltext-dspace.txt'))): - logger.info('missing fulltext in %s' % path) + logger.info('missing fulltext in %s', path) return False # and contain the meta.xml file if not os.path.exists(os.path.join(path, 'meta.xml')): - logger.info('missing meta.xml in %s' % path) + logger.info('missing meta.xml in %s', path) return False return True @@ -278,7 +277,6 @@ class ArxmlivErrorHandler(xml.sax.handler.ErrorHandler): # these errors silently. def error(self, exception): pass -# logger.debug("SAX error parsing xml: %s" % exception) warning = fatalError = error # endclass ArxmlivErrorHandler @@ -302,21 +300,20 @@ def isArticle(self, path): return False # and contain the tex.xml file if not os.path.exists(os.path.join(path, 'tex.xml')): - logger.warning('missing tex.xml in %s' % path) + logger.warning('missing tex.xml in %s', path) return False return True def findArticles(self): dirTotal = artAccepted = 0 - logger.info("looking for '%s' articles inside %s" % (self.sourceId, self.baseDir)) + logger.info("looking for '%s' articles inside %s", self.sourceId, self.baseDir) for root, dirs, files in os.walk(self.baseDir): dirTotal += 1 root = os.path.normpath(root) if self.isArticle(root): artAccepted += 1 yield self.idFromDir(root) - logger.info('%i directories processed, found %i articles' % - (dirTotal, artAccepted)) + logger.info('%i directories processed, found %i articles', dirTotal, artAccepted) def getContent(self, uri): """ diff --git a/gensim/interfaces.py b/gensim/interfaces.py index 4087fd8893..81f85a8527 100644 --- a/gensim/interfaces.py +++ b/gensim/interfaces.py @@ -56,8 +56,7 @@ def __iter__(self): def save(self, *args, **kwargs): import warnings - warnings.warn("corpus.save() stores only the (tiny) iteration object; " - "to serialize the actual corpus content, use e.g. MmCorpus.serialize(corpus)") + warnings.warn("corpus.save() stores only the (tiny) iteration object; to serialize the actual corpus content, use e.g. MmCorpus.serialize(corpus)") super(CorpusABC, self).save(*args, **kwargs) def __len__(self): @@ -95,12 +94,11 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): raise NotImplementedError('cannot instantiate abstract base class') # example code: - logger.info("converting corpus to ??? format: %s" % fname) + logger.info("converting corpus to ??? format: %s", fname) with utils.smart_open(fname, 'wb') as fout: for doc in corpus: # iterate over the document stream fmt = str(doc) # format the document appropriately... fout.write(utils.to_utf8("%s\n" % fmt)) # serialize the formatted document to disk -# endclass CorpusABC class TransformedCorpus(CorpusABC): @@ -127,7 +125,6 @@ def __getitem__(self, docno): return self.obj[self.corpus[docno]] else: raise RuntimeError('Type {} does not support slicing.'.format(type(self.corpus))) -# endclass TransformedCorpus class TransformationABC(utils.SaveLoad): @@ -162,7 +159,6 @@ def _apply(self, corpus, chunksize=None, **kwargs): and return the result as another corpus. """ return TransformedCorpus(self, corpus, chunksize, **kwargs) -# endclass TransformationABC class SimilarityABC(utils.SaveLoad): @@ -263,7 +259,7 @@ def __iter__(self): # (unlike numpy). 
so, clip the end of the chunk explicitly to make # scipy.sparse happy chunk_end = min(self.index.shape[0], chunk_start + self.chunksize) - chunk = self.index[chunk_start : chunk_end] + chunk = self.index[chunk_start: chunk_end] for sim in self[chunk]: yield sim else: @@ -272,4 +268,3 @@ def __iter__(self): # restore old normalization value self.normalize = norm -# endclass SimilarityABC diff --git a/gensim/matutils.py b/gensim/matutils.py index 58904e1657..4b072bfd4a 100644 --- a/gensim/matutils.py +++ b/gensim/matutils.py @@ -21,26 +21,12 @@ from scipy.stats import entropy import scipy.linalg from scipy.linalg.lapack import get_lapack_funcs +from scipy.linalg.special_matrices import triu from scipy.special import psi # gamma function utils from six import iteritems, itervalues, string_types from six.moves import xrange, zip as izip -# scipy is not a stable package yet, locations change, so try to work -# around differences (currently only concerns location of 'triu' in scipy 0.7 vs. 0.8) -try: - from scipy.linalg.basic import triu -except ImportError: - from scipy.linalg.special_matrices import triu - -try: - from np import triu_indices -except ImportError: - # np < 1.4 - def triu_indices(n, k=0): - m = np.ones((n, n), int) - a = triu(m, k) - return np.where(a != 0) blas = lambda name, ndarray: scipy.linalg.get_blas_funcs((name,), (ndarray,))[0] @@ -101,7 +87,7 @@ def corpus2csc(corpus, num_terms=None, dtype=np.float64, num_docs=None, num_nnz= data = np.empty((num_nnz,), dtype=dtype) for docno, doc in enumerate(corpus): if printprogress and docno % printprogress == 0: - logger.info("PROGRESS: at document #%i/%i" % (docno, num_docs)) + logger.info("PROGRESS: at document #%i/%i", docno, num_docs) posnext = posnow + len(doc) indices[posnow: posnext] = [feature_id for feature_id, _ in doc] data[posnow: posnext] = [feature_weight for _, feature_weight in doc] @@ -114,7 +100,7 @@ def corpus2csc(corpus, num_terms=None, dtype=np.float64, num_docs=None, num_nnz= num_nnz, data, indices, indptr = 0, [], [], [0] for docno, doc in enumerate(corpus): if printprogress and docno % printprogress == 0: - logger.info("PROGRESS: at document #%i" % (docno)) + logger.info("PROGRESS: at document #%i", docno) indices.extend([feature_id for feature_id, _ in doc]) data.extend([feature_weight for _, feature_weight in doc]) num_nnz += len(doc) @@ -150,7 +136,7 @@ def zeros_aligned(shape, dtype, order='C', align=128): nbytes = np.prod(shape, dtype=np.int64) * np.dtype(dtype).itemsize buffer = np.zeros(nbytes + align, dtype=np.uint8) # problematic on win64 ("maximum allowed dimension exceeded") start_index = -buffer.ctypes.data % align - return buffer[start_index : start_index + nbytes].view(dtype).reshape(shape, order=order) + return buffer[start_index: start_index + nbytes].view(dtype).reshape(shape, order=order) def ismatrix(m): @@ -333,7 +319,6 @@ def __iter__(self): def __len__(self): return len(self.dense) -# endclass DenseCorpus class Sparse2Corpus(object): @@ -356,7 +341,6 @@ def __iter__(self): def __len__(self): return self.sparse.shape[1] -# endclass Sparse2Corpus def veclen(vec): @@ -381,7 +365,7 @@ def ret_log_normalize_vec(vec, axis=1): log_shift = log_max - np.log(len(vec) + 1.0) - max_val tot = np.sum(np.exp(vec + log_shift)) log_norm = np.log(tot) - log_shift - vec = vec - log_norm + vec -= log_norm else: if axis == 1: # independently normalize each sample max_val = np.max(vec, 1) @@ -391,10 +375,10 @@ def ret_log_normalize_vec(vec, axis=1): vec = vec - log_norm[:, np.newaxis] elif axis == 0: # 
normalize each feature k = ret_log_normalize_vec(vec.T) - return (k[0].T, k[1]) + return k[0].T, k[1] else: raise ValueError("'%s' is not a supported axis" % axis) - return (vec, log_norm) + return vec, log_norm blas_nrm2 = blas('nrm2', np.array([], dtype=float)) @@ -435,7 +419,7 @@ def unitvec(vec, norm='l2'): try: first = next(iter(vec)) # is there at least one element? - except Exception: + except StopIteration: return vec if isinstance(first, (tuple, list)) and len(first) == 2: # gensim sparse format @@ -476,10 +460,10 @@ def isbow(vec): vec = vec.todense().tolist() try: id_, val_ = vec[0] # checking first value to see if it is in bag of words format by unpacking - id_, val_ = int(id_), float(val_) + int(id_), float(val_) except IndexError: return True # this is to handle the empty input case - except Exception: + except (ValueError, TypeError): return False return True @@ -610,7 +594,7 @@ def dirichlet_expectation(alpha): For a vector `theta~Dir(alpha)`, compute `E[log(theta)]`. """ - if (len(alpha.shape) == 1): + if len(alpha.shape) == 1: result = psi(alpha) - psi(np.sum(alpha)) else: result = psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis] @@ -628,7 +612,7 @@ def qr_destroy(la): del la[0], la # now `a` is the only reference to the input matrix m, n = a.shape # perform q, r = QR(a); code hacked out of scipy.linalg.qr - logger.debug("computing QR of %s dense matrix" % str(a.shape)) + logger.debug("computing QR of %s dense matrix", str(a.shape)) geqrf, = get_lapack_funcs(('geqrf',), (a,)) qr, tau, work, info = geqrf(a, lwork=-1, overwrite_a=True) qr, tau, work, info = geqrf(a, lwork=work[0], overwrite_a=True) @@ -675,12 +659,10 @@ def write_headers(self, num_docs, num_terms, num_nnz): if num_nnz < 0: # we don't know the matrix shape/density yet, so only log a general line - logger.info("saving sparse matrix to %s" % self.fname) + logger.info("saving sparse matrix to %s", self.fname) self.fout.write(utils.to_utf8(' ' * 50 + '\n')) # 48 digits must be enough for everybody else: - logger.info( - "saving sparse %sx%s matrix with %i non-zero entries to %s", - num_docs, num_terms, num_nnz, self.fname) + logger.info("saving sparse %sx%s matrix with %i non-zero entries to %s", num_docs, num_terms, num_nnz, self.fname) self.fout.write(utils.to_utf8('%s %s %s\n' % (num_docs, num_terms, num_nnz))) self.last_docno = -1 self.headers_written = True @@ -737,7 +719,7 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None, else: bow = doc if docno % progress_cnt == 0: - logger.info("PROGRESS: saving document #%i" % docno) + logger.info("PROGRESS: saving document #%i", docno) if index: posnow = mw.fout.tell() if posnow == poslast: @@ -755,11 +737,7 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None, num_terms = num_terms or _num_terms if num_docs * num_terms != 0: - logger.info("saved %ix%i matrix, density=%.3f%% (%i/%i)" % ( - num_docs, num_terms, - 100.0 * num_nnz / (num_docs * num_terms), - num_nnz, - num_docs * num_terms)) + logger.info("saved %ix%i matrix, density=%.3f%% (%i/%i)", num_docs, num_terms, 100.0 * num_nnz / (num_docs * num_terms), num_nnz, num_docs * num_terms) # now write proper headers, by seeking and overwriting the spaces written earlier mw.fake_headers(num_docs, num_terms, num_nnz) @@ -779,10 +757,9 @@ def __del__(self): self.close() # does nothing if called twice (on an already closed file), so no worries def close(self): - logger.debug("closing %s" % self.fname) + logger.debug("closing %s", self.fname) if hasattr(self, 
'fout'): self.fout.close() -# endclass MmWriter class MmReader(object): @@ -806,7 +783,7 @@ def __init__(self, input, transposed=True): `input` is either a string (file path) or a file-like object that supports `seek()` (e.g. gzip.GzipFile, bz2.BZ2File). """ - logger.info("initializing corpus reader from %s" % input) + logger.info("initializing corpus reader from %s", input) self.input, self.transposed = input, transposed with utils.file_or_filename(self.input) as lines: try: @@ -821,14 +798,12 @@ def __init__(self, input, transposed=True): for lineno, line in enumerate(lines): line = utils.to_unicode(line) if not line.startswith('%'): - self.num_docs, self.num_terms, self.num_nnz = map(int, line.split()) + self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split()) if not self.transposed: self.num_docs, self.num_terms = self.num_terms, self.num_docs break - logger.info( - "accepted corpus with %i documents, %i features, %i non-zero entries", - self.num_docs, self.num_terms, self.num_nnz) + logger.info("accepted corpus with %i documents, %i features, %i non-zero entries", self.num_docs, self.num_terms, self.num_nnz) def __len__(self): return self.num_docs @@ -917,4 +892,3 @@ def docbyoffset(self, offset): document.append((termid, val,)) # add another field to the current document return document -# endclass MmReader diff --git a/gensim/models/__init__.py b/gensim/models/__init__.py index 5c25a86fd5..530f7c4980 100644 --- a/gensim/models/__init__.py +++ b/gensim/models/__init__.py @@ -36,7 +36,7 @@ class VocabTransform(interfaces.TransformationABC): Old features that have no counterpart in the new ids are discarded. This can be used to filter vocabulary of a corpus "online":: - >>> old2new = dict((oldid, newid) for newid, oldid in enumerate(ids_you_want_to_keep)) + >>> old2new = {oldid: newid for newid, oldid in enumerate(ids_you_want_to_keep)} >>> vt = VocabTransform(old2new) >>> for vec_with_new_ids in vt[corpus_with_old_ids]: >>> ... @@ -44,7 +44,6 @@ class VocabTransform(interfaces.TransformationABC): """ def __init__(self, old2new, id2token=None): - # id2word = dict((newid, oldid2word[oldid]) for oldid, newid in old2new.iteritems()) self.old2new = old2new self.id2token = id2token @@ -58,4 +57,3 @@ def __getitem__(self, bow): return self._apply(bow) return sorted((self.old2new[oldid], weight) for oldid, weight in bow if oldid in self.old2new) -# endclass VocabTransform diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 7e1ee02d62..bcc63e8f8c 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -84,7 +84,7 @@ def construct_doc2author(corpus, author2doc): return doc2author -def construct_author2doc(corpus, doc2author): +def construct_author2doc(doc2author): """Make a mapping from author IDs to document IDs.""" # First get a set of all authors. 
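Note on the construct_author2doc signature change above: the author-to-document mapping can be derived from doc2author alone, which is why the unused corpus argument is dropped. A minimal sketch of such an inversion, assuming the same dict-of-lists shapes used in atmodel.py (illustrative only, not the library's implementation):

def invert_doc2author(doc2author):
    # doc2author: {doc_id: [author, ...]}  ->  author2doc: {author: [doc_id, ...]}
    author2doc = {}
    for doc_id, authors in doc2author.items():
        for author in authors:
            author2doc.setdefault(author, []).append(doc_id)
    return author2doc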
@@ -118,10 +118,10 @@ class AuthorTopicModel(LdaModel): """ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, doc2author=None, - chunksize=2000, passes=1, iterations=50, decay=0.5, offset=1.0, - alpha='symmetric', eta='symmetric', update_every=1, eval_every=10, - gamma_threshold=0.001, serialized=False, serialization_path=None, - minimum_probability=0.01, random_state=None): + chunksize=2000, passes=1, iterations=50, decay=0.5, offset=1.0, + alpha='symmetric', eta='symmetric', update_every=1, eval_every=10, + gamma_threshold=0.001, serialized=False, serialization_path=None, + minimum_probability=0.01, random_state=None): """ If the iterable corpus and one of author2doc/doc2author dictionaries are given, start training straight away. If not given, the model is left untrained @@ -212,7 +212,9 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d self.id2word = id2word if corpus is None and self.id2word is None: - raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality') + raise ValueError( + "at least one of corpus/id2word must be specified, to establish input space dimensionality" + ) if self.id2word is None: logger.warning("no word id mapping provided; initializing from corpus, assuming identity") @@ -252,7 +254,9 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d if serialized and not serialization_path: raise ValueError("If serialized corpora are used, a the path to a folder where the corpus should be saved must be provided (serialized_path).") if serialized and serialization_path: - assert not isfile(serialization_path), "A file already exists at the serialization_path path; choose a different serialization_path, or delete the file." + assert not isfile(serialization_path), \ + "A file already exists at the serialization_path path; " \ + "choose a different serialization_path, or delete the file." self.serialization_path = serialization_path # Initialize an empty self.corpus. @@ -260,7 +264,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha') - assert self.alpha.shape == (self.num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics) + assert self.alpha.shape == (self.num_topics,), \ + "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics) if isinstance(eta, six.string_types): if eta == 'asymmetric': @@ -272,7 +277,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d assert (self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms)), ( "Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" % - (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms)) + (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms) + ) # VB constants self.iterations = iterations @@ -300,7 +306,7 @@ def init_empty_corpus(self): """ if self.serialized: - # Tnitialize the corpus as a serialized empty list. + # Initialize the corpus as a serialized empty list. # This corpus will be extended in self.update. MmCorpus.serialize(self.serialization_path, []) # Serialize empty corpus. self.corpus = MmCorpus(self.serialization_path) # Store serialized corpus object in self.corpus. 
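The init_empty_corpus hunk above also shows the serialize-then-reload pattern the author-topic model relies on when serialized=True. A hedged, stand-alone sketch of that pattern (the path below is a placeholder, not taken from the patch):

from gensim.corpora import MmCorpus

serialization_path = '/tmp/author_topic_corpus.mm'  # placeholder location, not from the patch
MmCorpus.serialize(serialization_path, [])          # persist an empty corpus in Matrix Market format
corpus = MmCorpus(serialization_path)               # reload it as a streamed, indexable corpus

The model then grows this on-disk corpus as new documents arrive (see the extend_corpus hunk that follows).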
@@ -333,9 +339,8 @@ def extend_corpus(self, corpus): assert isinstance(corpus, list), "If serialized == False, all input corpora must be lists." self.corpus.extend(corpus) - def compute_phinorm(self, ids, authors_d, expElogthetad, expElogbetad): + def compute_phinorm(self, expElogthetad, expElogbetad): """Efficiently computes the normalizing factor in phi.""" - phinorm = np.zeros(len(ids)) expElogtheta_sum = expElogthetad.sum(axis=0) phinorm = expElogtheta_sum.dot(expElogbetad) + 1e-100 @@ -362,8 +367,8 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c """ try: - _ = len(chunk) # noqa:F841 - except Exception: + len(chunk) + except TypeError: # convert iterators/generators to plain list, so we have len() etc. chunk = list(chunk) if len(chunk) > 1: @@ -389,9 +394,9 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c # TODO: this is duplication of code in LdaModel. Refactor. if doc and not isinstance(doc[0][0], six.integer_types + (np.integer,)): # make sure the term IDs are ints, otherwise np will get upset - ids = [int(id) for id, _ in doc] + ids = [int(idx) for idx, _ in doc] else: - ids = [id for id, _ in doc] + ids = [idx for idx, _ in doc] cts = np.array([cnt for _, cnt in doc]) # Get all authors in current document, and convert the author names to integer IDs. @@ -406,11 +411,10 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c expElogbetad = self.expElogbeta[:, ids] # Compute the normalizing constant of phi for the current document. - phinorm = self.compute_phinorm(ids, authors_d, expElogthetad, expElogbetad) + phinorm = self.compute_phinorm(expElogthetad, expElogbetad) # Iterate between gamma and phi until convergence - for iteration in xrange(self.iterations): - + for _ in xrange(self.iterations): lastgamma = tilde_gamma.copy() # Update gamma. @@ -428,7 +432,7 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c expElogthetad = np.exp(Elogthetad) # Update the normalizing constant in phi. - phinorm = self.compute_phinorm(ids, authors_d, expElogthetad, expElogbetad) + phinorm = self.compute_phinorm(expElogthetad, expElogbetad) # Check for convergence. # Criterion is mean change in "local" gamma. @@ -452,8 +456,10 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c sstats[:, ids] += np.outer(expElogtheta_sum_a.T, cts / phinorm) if len(chunk) > 1: - logger.debug("%i/%i documents converged within %i iterations", - converged, len(chunk), self.iterations) + logger.debug( + "%i/%i documents converged within %i iterations", + converged, len(chunk), self.iterations + ) if collect_sstats: # This step finishes computing the sufficient statistics for the @@ -473,7 +479,10 @@ def do_estep(self, chunk, author2doc, doc2author, rhot, state=None, chunk_doc_id # TODO: this method is somewhat similar to the one in LdaModel. Refactor if possible. 
if state is None: state = self.state - gamma, sstats = self.inference(chunk, author2doc, doc2author, rhot, collect_sstats=True, chunk_doc_idx=chunk_doc_idx) + gamma, sstats = self.inference( + chunk, author2doc, doc2author, rhot, + collect_sstats=True, chunk_doc_idx=chunk_doc_idx + ) state.sstats += sstats state.numdocs += len(chunk) return gamma @@ -492,13 +501,14 @@ def log_perplexity(self, chunk, chunk_doc_idx=None, total_docs=None): corpus_words = sum(cnt for document in chunk for _, cnt in document) subsample_ratio = 1.0 * total_docs / len(chunk) perwordbound = self.bound(chunk, chunk_doc_idx, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words) - logger.info("%.3f per-word bound, %.1f perplexity estimate based on a corpus of %i documents with %i words" % - (perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words)) + logger.info( + "%.3f per-word bound, %.1f perplexity estimate based on a corpus of %i documents with %i words", + perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words + ) return perwordbound - def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, decay=None, offset=None, - passes=None, update_every=None, eval_every=None, iterations=None, - gamma_threshold=None, chunks_as_numpy=False): + def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, decay=None, offset=None, passes=None, + update_every=None, eval_every=None, iterations=None, gamma_threshold=None, chunks_as_numpy=False): """ Train the model with new documents, by EM-iterating over `corpus` until the topics converge (or until the maximum number of allowed iterations @@ -590,14 +600,14 @@ def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, if doc2author is None: doc2author = construct_doc2author(corpus, author2doc) elif author2doc is None: - author2doc = construct_author2doc(corpus, doc2author) + author2doc = construct_author2doc(doc2author) # Number of authors that need to be updated. num_input_authors = len(author2doc) try: len_input_corpus = len(corpus) - except Exception: + except TypeError: logger.warning("input corpus stream has no len(); counting documents") len_input_corpus = sum(1 for _ in corpus) if len_input_corpus == 0: @@ -650,7 +660,7 @@ def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, # Train on all documents of authors in input_corpus. train_corpus_idx = [] - for a in author2doc.keys(): # For all authors in input corpus. + for _ in author2doc.keys(): # For all authors in input corpus. for doc_ids in self.author2doc.values(): # For all documents in total corpus. 
train_corpus_idx.extend(doc_ids) @@ -674,17 +684,15 @@ def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, evalafter = min(lencorpus, (eval_every or 0) * self.numworkers * chunksize) updates_per_pass = max(1, lencorpus / updateafter) - logger.info("running %s author-topic training, %s topics, %s authors, %i passes over " - "the supplied corpus of %i documents, updating model once " - "every %i documents, evaluating perplexity every %i documents, " - "iterating %ix with a convergence threshold of %f", - updatetype, self.num_topics, num_input_authors, passes, lencorpus, - updateafter, evalafter, iterations, - gamma_threshold) + logger.info( + "running %s author-topic training, %s topics, %s authors, %i passes over the supplied corpus of %i documents, updating model once " + "every %i documents, evaluating perplexity every %i documents, iterating %ix with a convergence threshold of %f", + updatetype, self.num_topics, num_input_authors, passes, lencorpus, updateafter, + evalafter, iterations, gamma_threshold + ) if updates_per_pass * passes < 10: - logger.warning("too few updates, training might not converge; consider " - "increasing the number of passes or iterations to improve accuracy") + logger.warning("too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy") # rho is the "speed" of updating; TODO try other fncs # pass_ + num_updates handles increasing the starting t for each pass, @@ -694,7 +702,7 @@ def rho(): for pass_ in xrange(passes): if self.dispatcher: - logger.info('initializing %s workers' % self.numworkers) + logger.info('initializing %s workers', self.numworkers) self.dispatcher.reset(self.state) else: # gamma is not needed in "other", thus its shape is (0, 0). @@ -713,13 +721,17 @@ def rho(): if self.dispatcher: # add the chunk to dispatcher's job queue, so workers can munch on it - logger.info('PROGRESS: pass %i, dispatching documents up to #%i/%i', - pass_, chunk_no * chunksize + len(chunk), lencorpus) + logger.info( + "PROGRESS: pass %i, dispatching documents up to #%i/%i", + pass_, chunk_no * chunksize + len(chunk), lencorpus + ) # this will eventually block until some jobs finish, because the queue has a small finite length self.dispatcher.putjob(chunk) else: - logger.info('PROGRESS: pass %i, at document #%i/%i', - pass_, chunk_no * chunksize + len(chunk), lencorpus) + logger.info( + "PROGRESS: pass %i, at document #%i/%i", + pass_, chunk_no * chunksize + len(chunk), lencorpus + ) # do_estep requires the indexes of the documents being trained on, to know what authors # correspond to the documents. gammat = self.do_estep(chunk, self.author2doc, self.doc2author, rho(), other, chunk_doc_idx) @@ -757,8 +769,6 @@ def rho(): other = self.dispatcher.getstate() self.do_mstep(rho(), other, pass_ > 0) del other - dirty = False - # endfor entire corpus update def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, doc2author=None): """ @@ -833,11 +843,11 @@ def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, # Computing the bound requires summing over expElogtheta[a, k] * expElogbeta[k, v], which # is the same computation as in normalizing phi. 
- phinorm = self.compute_phinorm(ids, authors_d, expElogtheta[authors_d, :], expElogbeta[:, ids]) + phinorm = self.compute_phinorm(expElogtheta[authors_d, :], expElogbeta[:, ids]) word_score += np.log(1.0 / len(authors_d)) * sum(cts) + cts.dot(np.log(phinorm)) # Compensate likelihood for when `chunk` above is only a sample of the whole corpus. This ensures - # that the likelihood is always rougly on the same scale. + # that the likelihood is always roughly on the same scale. word_score *= subsample_ratio # E[log p(theta | alpha) - log q(theta | gamma)] @@ -863,12 +873,12 @@ def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, return total_score def get_document_topics(self, word_id, minimum_probability=None): - ''' + """ This method overwrites `LdaModel.get_document_topics` and simply raises an exception. `get_document_topics` is not valid for the author-topic model, use `get_author_topics` instead. - ''' + """ raise NotImplementedError('Method "get_document_topics" is not valid for the author-topic model. Use the "get_author_topics" method.') @@ -891,13 +901,12 @@ def get_author_topics(self, author_name, minimum_probability=None): topic_dist = self.state.gamma[author_id, :] / sum(self.state.gamma[author_id, :]) - author_topics = [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist) - if topicvalue >= minimum_probability] + author_topics = [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist) if topicvalue >= minimum_probability] return author_topics def __getitem__(self, author_names, eps=None): - ''' + """ Return topic distribution for input author as a list of (topic_id, topic_probabiity) 2-tuples. @@ -905,7 +914,7 @@ def __getitem__(self, author_names, eps=None): Do not call this method directly, instead use `model[author_names]`. - ''' + """ if isinstance(author_names, list): items = [] for a in author_names: @@ -914,4 +923,3 @@ def __getitem__(self, author_names, eps=None): items = self.get_author_topics(author_names, minimum_probability=eps) return items -# endclass AuthorTopicModel diff --git a/gensim/models/callbacks.py b/gensim/models/callbacks.py index d4598cca1d..3ef7fbe978 100644 --- a/gensim/models/callbacks.py +++ b/gensim/models/callbacks.py @@ -45,7 +45,8 @@ class CoherenceMetric(Metric): """ Metric class for coherence evaluation """ - def __init__(self, corpus=None, texts=None, dictionary=None, coherence=None, window_size=None, topn=10, logger=None, viz_env=None, title=None): + def __init__(self, corpus=None, texts=None, dictionary=None, coherence=None, + window_size=None, topn=10, logger=None, viz_env=None, title=None): """ Args: corpus : Gensim document corpus. 
@@ -108,7 +109,10 @@ def get_value(self, **kwargs): self.model = None self.topics = None super(CoherenceMetric, self).set_parameters(**kwargs) - cm = gensim.models.CoherenceModel(self.model, self.topics, self.texts, self.corpus, self.dictionary, self.window_size, self.coherence, self.topn) + cm = gensim.models.CoherenceModel( + self.model, self.topics, self.texts, self.corpus, self.dictionary, + self.window_size, self.coherence, self.topn + ) return cm.get_coherence() @@ -146,7 +150,8 @@ class DiffMetric(Metric): """ Metric class for topic difference evaluation """ - def __init__(self, distance="jaccard", num_words=100, n_ann_terms=10, diagonal=True, annotation=False, normed=True, logger=None, viz_env=None, title=None): + def __init__(self, distance="jaccard", num_words=100, n_ann_terms=10, diagonal=True, + annotation=False, normed=True, logger=None, viz_env=None, title=None): """ Args: distance : measure used to calculate difference between any topic pair. Available values: @@ -181,7 +186,10 @@ def get_value(self, **kwargs): other_model : second topic model instance to calculate the difference from """ super(DiffMetric, self).set_parameters(**kwargs) - diff_diagonal, _ = self.model.diff(self.other_model, self.distance, self.num_words, self.n_ann_terms, self.diagonal, self.annotation, self.normed) + diff_diagonal, _ = self.model.diff( + self.other_model, self.distance, self.num_words, self.n_ann_terms, + self.diagonal, self.annotation, self.normed + ) return diff_diagonal @@ -189,7 +197,8 @@ class ConvergenceMetric(Metric): """ Metric class for convergence evaluation """ - def __init__(self, distance="jaccard", num_words=100, n_ann_terms=10, diagonal=True, annotation=False, normed=True, logger=None, viz_env=None, title=None): + def __init__(self, distance="jaccard", num_words=100, n_ann_terms=10, diagonal=True, + annotation=False, normed=True, logger=None, viz_env=None, title=None): """ Args: distance : measure used to calculate difference between any topic pair. 
Available values: @@ -224,7 +233,10 @@ def get_value(self, **kwargs): other_model : second topic model instance to calculate the difference from """ super(ConvergenceMetric, self).set_parameters(**kwargs) - diff_diagonal, _ = self.model.diff(self.other_model, self.distance, self.num_words, self.n_ann_terms, self.diagonal, self.annotation, self.normed) + diff_diagonal, _ = self.model.diff( + self.other_model, self.distance, self.num_words, self.n_ann_terms, + self.diagonal, self.annotation, self.normed + ) return np.sum(diff_diagonal) @@ -287,23 +299,33 @@ def on_epoch_end(self, epoch, topics=None): if epoch == 0: if value.ndim > 0: diff_mat = np.array([value]) - viz_metric = self.viz.heatmap(X=diff_mat.T, env=metric.viz_env, opts=dict(xlabel='Epochs', ylabel=label, title=label)) + viz_metric = self.viz.heatmap( + X=diff_mat.T, env=metric.viz_env, opts=dict(xlabel='Epochs', ylabel=label, title=label) + ) # store current epoch's diff diagonal self.diff_mat.put(diff_mat) # saving initial plot window self.windows.append(copy.deepcopy(viz_metric)) else: - viz_metric = self.viz.line(Y=np.array([value]), X=np.array([epoch]), env=metric.viz_env, opts=dict(xlabel='Epochs', ylabel=label, title=label)) + viz_metric = self.viz.line( + Y=np.array([value]), X=np.array([epoch]), env=metric.viz_env, + opts=dict(xlabel='Epochs', ylabel=label, title=label) + ) # saving initial plot window self.windows.append(copy.deepcopy(viz_metric)) else: if value.ndim > 0: # concatenate with previous epoch's diff diagonals diff_mat = np.concatenate((self.diff_mat.get(), np.array([value]))) - self.viz.heatmap(X=diff_mat.T, env=metric.viz_env, win=self.windows[i], opts=dict(xlabel='Epochs', ylabel=label, title=label)) + self.viz.heatmap( + X=diff_mat.T, env=metric.viz_env, win=self.windows[i], + opts=dict(xlabel='Epochs', ylabel=label, title=label) + ) self.diff_mat.put(diff_mat) else: - self.viz.updateTrace(Y=np.array([value]), X=np.array([epoch]), env=metric.viz_env, win=self.windows[i]) + self.viz.updateTrace( + Y=np.array([value]), X=np.array([epoch]), env=metric.viz_env, win=self.windows[i] + ) if metric.logger == "shell": statement = "".join(("Epoch ", str(epoch), ": ", label, " estimate: ", str(value))) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 4fa80ea15e..f6781903a8 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -168,7 +168,8 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= if isinstance(model.id2word, FakeDict): raise ValueError( "The associated dictionary should be provided with the corpus or 'id2word'" - " for topic model should be set as the associated dictionary.") + " for topic model should be set as the associated dictionary." + ) else: self.dictionary = model.id2word else: @@ -236,9 +237,7 @@ def topics(self, topics): if self.model is not None: new_topics = self._get_topics() if topics is not None: - logger.warning( - "Ignoring topics you are attempting to set in favor of model's topics: %s", - self.model) + logger.warning("Ignoring topics you are attempting to set in favor of model's topics: %s", self.model) elif topics is not None: new_topics = [] for topic in topics: @@ -275,7 +274,8 @@ def _get_topics(self): except AttributeError: raise ValueError( "This topic model is not currently supported. Supported topic models" - " should implement the `get_topics` method.") + " should implement the `get_topics` method." 
+ ) def segment_topics(self): return self.measure.seg(self.topics) @@ -294,7 +294,8 @@ def estimate_probabilities(self, segmented_topics=None): self._accumulator = self.measure.prob( texts=self.texts, segmented_topics=segmented_topics, dictionary=self.dictionary, window_size=self.window_size, - processes=self.processes) + processes=self.processes + ) return self._accumulator diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 5e0dc20418..04bf923c65 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -70,9 +70,9 @@ try: from gensim.models.doc2vec_inner import train_document_dbow, train_document_dm, train_document_dm_concat from gensim.models.word2vec_inner import FAST_VERSION # blas-adaptation shared from word2vec - logger.debug('Fast version of {0} is being used'.format(__name__)) + logger.debug('Fast version of %s is being used', __name__) except ImportError: - logger.warning('Slow version of {0} is being used'.format(__name__)) + logger.warning('Slow version of %s is being used', __name__) # failed... fall back to plain numpy (20-80x slower training than the above) FAST_VERSION = -1 @@ -109,9 +109,10 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, train_batch_sg(model, [doc_words], alpha, work) for doctag_index in doctag_indexes: for word in doc_words: - train_sg_pair(model, word, doctag_index, alpha, learn_vectors=learn_doctags, - learn_hidden=learn_hidden, context_vectors=doctag_vectors, - context_locks=doctag_locks) + train_sg_pair( + model, word, doctag_index, alpha, learn_vectors=learn_doctags, learn_hidden=learn_hidden, + context_vectors=doctag_vectors, context_locks=doctag_locks + ) return len(doc_words) @@ -173,9 +174,9 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N return len(word_vocabs) - def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, - learn_doctags=True, learn_words=True, learn_hidden=True, - word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): + def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, learn_doctags=True, + learn_words=True, learn_hidden=True, word_vectors=None, word_locks=None, + doctag_vectors=None, doctag_locks=None): """ Update distributed memory model ("PV-DM") by training on a single document, using a concatenation of the context window word vectors (rather than a sum or average). 
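Most hunks in this patch, including the 'Fast version of %s' / 'Slow version of %s' lines earlier in the doc2vec.py section, swap eager string interpolation for logging's lazy %-style arguments. A small self-contained illustration of why the lazy form is preferred (logger name and values are arbitrary examples, not from gensim):

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger('example')

value = list(range(5))
logger.info('processed %s' % value)   # eager: the message is formatted even though INFO is disabled
logger.info('processed %s', value)    # lazy: formatting is deferred and skipped when INFO is disabled

Besides avoiding wasted work for suppressed levels, the lazy form keeps the raw template and its arguments available to handlers separately.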
@@ -382,10 +383,8 @@ def estimated_lookup_memory(self): def reset_weights(self, model): length = max(len(self.doctags), self.count) if self.mapfile_path: - self.doctag_syn0 = np_memmap(self.mapfile_path + '.doctag_syn0', dtype=REAL, - mode='w+', shape=(length, model.vector_size)) - self.doctag_syn0_lockf = np_memmap(self.mapfile_path + '.doctag_syn0_lockf', dtype=REAL, - mode='w+', shape=(length,)) + self.doctag_syn0 = np_memmap(self.mapfile_path + '.doctag_syn0', dtype=REAL, mode='w+', shape=(length, model.vector_size)) + self.doctag_syn0_lockf = np_memmap(self.mapfile_path + '.doctag_syn0_lockf', dtype=REAL, mode='w+', shape=(length,)) self.doctag_syn0_lockf.fill(1.0) else: self.doctag_syn0 = empty((length, model.vector_size), dtype=REAL) @@ -481,7 +480,11 @@ def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip return dists best = matutils.argsort(dists, topn=topn + len(all_docs), reverse=True) # ignore (don't return) docs from the input - result = [(self.index_to_doctag(sim + clip_start), float(dists[sim])) for sim in best if (sim + clip_start) not in all_docs] + result = [ + (self.index_to_doctag(sim + clip_start), float(dists[sim])) + for sim in best + if (sim + clip_start) not in all_docs + ] return result[:topn] def doesnt_match(self, docs): @@ -494,7 +497,7 @@ def doesnt_match(self, docs): self.init_sims() docs = [doc for doc in docs if doc in self.doctags or 0 <= doc < self.count] # filter out unknowns - logger.debug("using docs %s" % docs) + logger.debug("using docs %s", docs) if not docs: raise ValueError("cannot select a doc from an empty list") vectors = vstack(self.doctag_syn0norm[self._int_index(doc)] for doc in docs).astype(REAL) @@ -551,8 +554,7 @@ def repeat(self, word_count): class Doc2Vec(Word2Vec): """Class for training, using and evaluating neural networks described in http://arxiv.org/pdf/1405.4053v2.pdf""" - def __init__(self, documents=None, dm_mean=None, - dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, + def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, **kwargs): """ Initialize the model from an iterable of `documents`. Each document is a @@ -664,7 +666,7 @@ def reset_weights(self): if self.dm and self.dm_concat: # expand l1 size to match concatenated tags+words length self.layer1_size = (self.dm_tag_count + (2 * self.window)) * self.vector_size - logger.info("using concatenative %d-dimensional layer1" % (self.layer1_size)) + logger.info("using concatenative %d-dimensional layer1", self.layer1_size) super(Doc2Vec, self).reset_weights() self.docvecs.reset_weights(self) @@ -686,14 +688,16 @@ def scan_vocab(self, documents, progress_per=10000, trim_rule=None, update=False if not checked_string_types: if isinstance(document.words, string_types): logger.warning( - "Each 'words' should be a list of words (usually unicode strings)." - "First 'words' here is instead plain %s." % type(document.words) + "Each 'words' should be a list of words (usually unicode strings). 
First 'words' here is instead plain %s.", + type(document.words) ) checked_string_types += 1 if document_no % progress_per == 0: interval_rate = (total_words - interval_count) / (default_timer() - interval_start) - logger.info("PROGRESS: at example #%i, processed %i words (%i/s), %i word types, %i tags", - document_no, total_words, interval_rate, len(vocab), len(self.docvecs)) + logger.info( + "PROGRESS: at example #%i, processed %i words (%i/s), %i word types, %i tags", + document_no, total_words, interval_rate, len(vocab), len(self.docvecs) + ) interval_start = default_timer() interval_count = total_words document_length = len(document.words) @@ -709,8 +713,10 @@ def scan_vocab(self, documents, progress_per=10000, trim_rule=None, update=False utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) min_reduce += 1 - logger.info("collected %i word types and %i unique tags from a corpus of %i examples and %i words", - len(vocab), len(self.docvecs), document_no + 1, total_words) + logger.info( + "collected %i word types and %i unique tags from a corpus of %i examples and %i words", + len(vocab), len(self.docvecs), document_no + 1, total_words + ) self.corpus_count = document_no + 1 self.raw_vocab = vocab @@ -721,15 +727,20 @@ def _do_train_job(self, job, alpha, inits): indexed_doctags = self.docvecs.indexed_doctags(doc.tags) doctag_indexes, doctag_vectors, doctag_locks, ignored = indexed_doctags if self.sg: - tally += train_document_dbow(self, doc.words, doctag_indexes, alpha, work, - train_words=self.dbow_words, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks) + tally += train_document_dbow( + self, doc.words, doctag_indexes, alpha, work, train_words=self.dbow_words, + doctag_vectors=doctag_vectors, doctag_locks=doctag_locks + ) elif self.dm_concat: - tally += train_document_dm_concat(self, doc.words, doctag_indexes, alpha, work, neu1, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks) + tally += train_document_dm_concat( + self, doc.words, doctag_indexes, alpha, work, neu1, + doctag_vectors=doctag_vectors, doctag_locks=doctag_locks + ) else: - tally += train_document_dm(self, doc.words, doctag_indexes, alpha, work, neu1, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks) + tally += train_document_dm( + self, doc.words, doctag_indexes, alpha, work, neu1, + doctag_vectors=doctag_vectors, doctag_locks=doctag_locks + ) self.docvecs.trained_item(indexed_doctags) return tally, self._raw_word_count(job) @@ -754,17 +765,20 @@ def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5): for i in range(steps): if self.sg: - train_document_dbow(self, doc_words, doctag_indexes, alpha, work, - learn_words=False, learn_hidden=False, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks) + train_document_dbow( + self, doc_words, doctag_indexes, alpha, work, + learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks + ) elif self.dm_concat: - train_document_dm_concat(self, doc_words, doctag_indexes, alpha, work, neu1, - learn_words=False, learn_hidden=False, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks) + train_document_dm_concat( + self, doc_words, doctag_indexes, alpha, work, neu1, + learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks + ) else: - train_document_dm(self, doc_words, doctag_indexes, alpha, work, neu1, - learn_words=False, learn_hidden=False, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks) + train_document_dm( + self, doc_words, 
doctag_indexes, alpha, work, neu1, + learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks + ) alpha = ((alpha - min_alpha) / (steps - i)) + min_alpha return doctag_vectors[0] @@ -850,7 +864,7 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='* with utils.smart_open(fname, 'ab') as fout: if not word_vec: total_vec = len(self.docvecs) - logger.info("storing %sx%s projection weights into %s" % (total_vec, self.vector_size, fname)) + logger.info("storing %sx%s projection weights into %s", total_vec, self.vector_size, fname) fout.write(utils.to_utf8("%s %s\n" % (total_vec, self.vector_size))) # store as in input order for i in range(len(self.docvecs)): diff --git a/gensim/models/doc2vec_inner.pyx b/gensim/models/doc2vec_inner.pyx index 2c0b2c5655..c4fee9ed0d 100644 --- a/gensim/models/doc2vec_inner.pyx +++ b/gensim/models/doc2vec_inner.pyx @@ -640,7 +640,7 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, window_indexes[n] = null_word_index else: window_indexes[n] = indexes[m] - n = n + 1 + n += 1 for m in range(2 * window): memcpy(&_neu1[(doctag_len + m) * vector_size], &_word_vectors[window_indexes[m] * vector_size], vector_size * cython.sizeof(REAL_t)) diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py index b26c8fa639..ebae837b57 100755 --- a/gensim/models/hdpmodel.py +++ b/gensim/models/hdpmodel.py @@ -196,7 +196,7 @@ def inference(self, chunk): raise RuntimeError("model must be trained to perform inference") chunk = list(chunk) if len(chunk) > 1: - logger.debug("performing inference on a chunk of %i documents" % len(chunk)) + logger.debug("performing inference on a chunk of %i documents", len(chunk)) gamma = np.zeros((len(chunk), self.lda_beta.shape[0])) for d, doc in enumerate(chunk): @@ -214,8 +214,7 @@ def __getitem__(self, bow, eps=0.01): gamma = self.inference([bow])[0] topic_dist = gamma / sum(gamma) if sum(gamma) != 0 else [] - return [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist) - if topicvalue >= eps] + return [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist) if topicvalue >= eps] def update(self, corpus): save_freq = max(1, int(10000 / self.chunksize)) # save every 10k docs, roughly @@ -265,7 +264,7 @@ def update_chunk(self, chunk, update=True, opt_o=True): unique_words[word_id] = len(unique_words) word_list.append(word_id) - Wt = len(word_list) # length of words in these documents + wt = len(word_list) # length of words in these documents # ...and do the lazy updates on the necessary columns of lambda rw = np.array([self.m_r[t] for t in self.m_timestamp[word_list]]) @@ -274,7 +273,7 @@ def update_chunk(self, chunk, update=True, opt_o=True): psi(self.m_eta + self.m_lambda[:, word_list]) - \ psi(self.m_W * self.m_eta + self.m_lambda_sum[:, np.newaxis]) - ss = SuffStats(self.m_T, Wt, len(chunk)) + ss = SuffStats(self.m_T, wt, len(chunk)) Elogsticks_1st = expect_log_sticks(self.m_var_sticks) # global sticks @@ -285,19 +284,19 @@ def update_chunk(self, chunk, update=True, opt_o=True): if len(doc) > 0: doc_word_ids, doc_word_counts = zip(*doc) doc_score = self.doc_e_step( - doc, ss, Elogsticks_1st, - word_list, unique_words, doc_word_ids, - doc_word_counts, self.m_var_converge) + ss, Elogsticks_1st, + unique_words, doc_word_ids, + doc_word_counts, self.m_var_converge + ) count += sum(doc_word_counts) score += doc_score if update: self.update_lambda(ss, word_list, opt_o) - return (score, count) + return score, 
count - def doc_e_step(self, doc, ss, Elogsticks_1st, word_list, - unique_words, doc_word_ids, doc_word_counts, var_converge): + def doc_e_step(self, ss, Elogsticks_1st, unique_words, doc_word_ids, doc_word_counts, var_converge): """ e step for a single doc """ @@ -392,8 +391,7 @@ def update_lambda(self, sstats, word_list, opt_o): self.m_rhot = rhot # Update appropriate columns of lambda based on documents. - self.m_lambda[:, word_list] = self.m_lambda[:, word_list] * (1 - rhot) + \ - rhot * self.m_D * sstats.m_var_beta_ss / sstats.m_chunksize + self.m_lambda[:, word_list] = self.m_lambda[:, word_list] * (1 - rhot) + rhot * self.m_D * sstats.m_var_beta_ss / sstats.m_chunksize self.m_lambda_sum = (1 - rhot) * self.m_lambda_sum + \ rhot * self.m_D * np.sum(sstats.m_var_beta_ss, axis=1) / sstats.m_chunksize @@ -401,8 +399,7 @@ def update_lambda(self, sstats, word_list, opt_o): self.m_timestamp[word_list] = self.m_updatect self.m_r.append(self.m_r[-1] + np.log(1 - rhot)) - self.m_varphi_ss = (1.0 - rhot) * self.m_varphi_ss + rhot * \ - sstats.m_var_sticks_ss * self.m_D / sstats.m_chunksize + self.m_varphi_ss = (1.0 - rhot) * self.m_varphi_ss + rhot * sstats.m_var_sticks_ss * self.m_D / sstats.m_chunksize if opt_o: self.optimal_ordering() @@ -431,10 +428,8 @@ def update_expectations(self): topics we've learned we'll get the correct behavior. """ for w in xrange(self.m_W): - self.m_lambda[:, w] *= np.exp(self.m_r[-1] - - self.m_r[self.m_timestamp[w]]) - self.m_Elogbeta = psi(self.m_eta + self.m_lambda) - \ - psi(self.m_W * self.m_eta + self.m_lambda_sum[:, np.newaxis]) + self.m_lambda[:, w] *= np.exp(self.m_r[-1] - self.m_r[self.m_timestamp[w]]) + self.m_Elogbeta = psi(self.m_eta + self.m_lambda) - psi(self.m_W * self.m_eta + self.m_lambda_sum[:, np.newaxis]) self.m_timestamp[:] = self.m_updatect self.m_status_up_to_date = True @@ -448,8 +443,10 @@ def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words=No """ if num_words is not None: # deprecated num_words is used - logger.warning("The parameter num_words for show_topic() would be deprecated in the updated version.") - logger.warning("Please use topn instead.") + logger.warning( + "The parameter num_words for show_topic() would be deprecated in the updated version. " + "Please use topn instead." 
+ ) topn = num_words if not self.m_status_up_to_date: @@ -492,7 +489,7 @@ def save_topics(self, doc_count=None): else: fname = 'doc-%i' % doc_count fname = '%s/%s.topics' % (self.outputdir, fname) - logger.info("saving topics to %s" % fname) + logger.info("saving topics to %s", fname) betas = self.m_lambda + self.m_eta np.savetxt(fname, betas) @@ -527,13 +524,12 @@ def hdp_to_lda(self): alpha[i] = sticks[i] * left left = left - alpha[i] alpha[self.m_T - 1] = left - alpha = alpha * self.m_alpha + alpha *= self.m_alpha # beta - beta = (self.m_lambda + self.m_eta) / (self.m_W * self.m_eta + - self.m_lambda_sum[:, np.newaxis]) + beta = (self.m_lambda + self.m_eta) / (self.m_W * self.m_eta + self.m_lambda_sum[:, np.newaxis]) - return (alpha, beta) + return alpha, beta def suggested_lda_model(self): """ @@ -560,12 +556,14 @@ def evaluate_test_corpus(self, corpus): lda_betad = self.lda_beta[:, doc_word_ids] log_predicts = np.log(np.dot(theta, lda_betad)) doc_score = sum(log_predicts) / len(doc) - logger.info('TEST: %6d %.5f' % (i, doc_score)) + logger.info('TEST: %6d %.5f', i, doc_score) score += likelihood total_words += sum(doc_word_counts) - logger.info('TEST: average score: %.5f, total score: %.5f, test docs: %d' % (score / total_words, score, len(corpus))) + logger.info( + "TEST: average score: %.5f, total score: %.5f, test docs: %d", + score / total_words, score, len(corpus) + ) return score -# endclass HdpModel class HdpTopicFormatter(object): @@ -627,14 +625,20 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): def print_topic(self, topic_id, topn=None, num_words=None): if num_words is not None: # deprecated num_words is used - warnings.warn("The parameter num_words for print_topic() would be deprecated in the updated version. Please use topn instead.") + warnings.warn( + "The parameter num_words for print_topic() would be deprecated in the updated version. " + "Please use topn instead." + ) topn = num_words return self.show_topic(topic_id, topn, formatted=True) def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words=None,): if num_words is not None: # deprecated num_words is used - warnings.warn("The parameter num_words for show_topic() would be deprecated in the updated version. Please use topn instead.") + warnings.warn( + "The parameter num_words for show_topic() would be deprecated in the updated version. " + "Please use topn instead." 
+ ) topn = num_words lambdak = list(self.data[topic_id, :]) @@ -668,4 +672,3 @@ def format_topic(self, topic_id, topic_terms): fmt = (topic_id, fmt) return fmt -# endclass HdpTopicFormatter diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index c2dfd419cb..75cc11ee69 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -146,11 +146,11 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None) total_vec = len(self.vocab) vector_size = self.syn0.shape[1] if fvocab is not None: - logger.info("storing vocabulary in %s" % (fvocab)) + logger.info("storing vocabulary in %s", fvocab) with utils.smart_open(fvocab, 'wb') as vout: for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count): vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count))) - logger.info("storing %sx%s projection weights into %s" % (total_vec, vector_size, fname)) + logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname) assert (len(self.vocab), vector_size) == self.syn0.shape with utils.smart_open(fname, 'wb') as fout: fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size))) @@ -205,7 +205,7 @@ def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', logger.info("loading projection weights from %s", fname) with utils.smart_open(fname) as fin: header = utils.to_unicode(fin.readline(), encoding=encoding) - vocab_size, vector_size = map(int, header.split()) # throws for invalid file format + vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format if limit: vocab_size = min(vocab_size, limit) result = cls() @@ -232,7 +232,7 @@ def add_word(word, weights): if binary: binary_len = dtype(REAL).itemsize * vector_size - for line_no in xrange(vocab_size): + for _ in xrange(vocab_size): # mixed text and binary: read text first, then binary word = [] while True: @@ -253,8 +253,8 @@ def add_word(word, weights): raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?") parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ") if len(parts) != vector_size + 1: - raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no)) - word, weights = parts[0], list(map(REAL, parts[1:])) + raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no) + word, weights = parts[0], [REAL(x) for x in parts[1:]] add_word(word, weights) if result.syn0.shape[0] != len(result.vocab): logger.info( @@ -264,7 +264,7 @@ def add_word(word, weights): result.syn0 = ascontiguousarray(result.syn0[: len(result.vocab)]) assert (len(result.vocab), vector_size) == result.syn0.shape - logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname)) + logger.info("loaded %s matrix from %s", result.syn0.shape, fname) return result def word_vec(self, word, use_norm=False): @@ -400,12 +400,13 @@ def wmdistance(self, document1, document2): diff1 = len_pre_oov1 - len(document1) diff2 = len_pre_oov2 - len(document2) if diff1 > 0 or diff2 > 0: - logger.info('Removed %d and %d OOV words from document 1 and 2 (respectively).', - diff1, diff2) + logger.info('Removed %d and %d OOV words from document 1 and 2 (respectively).', diff1, diff2) if len(document1) == 0 or len(document2) == 0: - logger.info('At least one of the documents had no words that were' - 'in the vocabulary. 
Aborting (returning inf).') + logger.info( + "At least one of the documents had no words that were in the vocabulary. " + "Aborting (returning inf)." + ) return float('inf') dictionary = Dictionary(documents=[document1, document2]) @@ -481,8 +482,10 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10): # allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog']) positive = [positive] - all_words = set([self.vocab[word].index for word in positive + negative - if not isinstance(word, ndarray) and word in self.vocab]) + all_words = { + self.vocab[word].index for word in positive + negative + if not isinstance(word, ndarray) and word in self.vocab + } positive = [ self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word @@ -638,16 +641,16 @@ def n_similarity(self, ws1, ws2): raise ZeroDivisionError('Atleast one of the passed list is empty.') v1 = [self[word] for word in ws1] v2 = [self[word] for word in ws2] - return dot(matutils.unitvec(array(v1).mean(axis=0)), - matutils.unitvec(array(v2).mean(axis=0))) + return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0))) @staticmethod def log_accuracy(section): correct, incorrect = len(section['correct']), len(section['incorrect']) if correct + incorrect > 0: - logger.info("%s: %.1f%% (%i/%i)" % - (section['section'], 100.0 * correct / (correct + incorrect), - correct, correct + incorrect)) + logger.info( + "%s: %.1f%% (%i/%i)", + section['section'], 100.0 * correct / (correct + incorrect), correct, correct + incorrect + ) def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True): """ @@ -672,7 +675,7 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c """ ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]] - ok_vocab = dict((w.upper(), v) for w, v in reversed(ok_vocab)) if case_insensitive else dict(ok_vocab) + ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab) sections, section = [], None for line_no, line in enumerate(utils.smart_open(questions)): @@ -692,16 +695,16 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c a, b, c, expected = [word.upper() for word in line.split()] else: a, b, c, expected = [word for word in line.split()] - except Exception: - logger.info("skipping invalid line #%i in %s" % (line_no, questions)) + except ValueError: + logger.info("skipping invalid line #%i in %s", line_no, questions) continue if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab: - logger.debug("skipping line #%i with OOV words: %s" % (line_no, line.strip())) + logger.debug("skipping line #%i with OOV words: %s", line_no, line.strip()) continue original_vocab = self.vocab self.vocab = ok_vocab - ignore = set([a, b, c]) # input words to be ignored + ignore = {a, b, c} # input words to be ignored predicted = None # find the most likely prediction, ignoring OOV words and input words sims = most_similar(self, positive=[b, c], negative=[a], topn=False, restrict_vocab=restrict_vocab) @@ -736,8 +739,7 @@ def log_evaluate_word_pairs(pearson, spearman, oov, pairs): logger.info('Spearman rank-order correlation coefficient against %s: %.4f', pairs, spearman[0]) logger.info('Pairs with unknown words ratio: %.1f%%', oov) - def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, - dummy4unknown=False): + 
def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): """ Compute correlation of the model with human similarity judgments. `pairs` is a filename of a dataset where lines are 3-tuples, each consisting of a word pair and a similarity value, separated by `delimiter`. @@ -762,7 +764,7 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case Otherwise (default False), these pairs are skipped entirely. """ ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]] - ok_vocab = dict((w.upper(), v) for w, v in reversed(ok_vocab)) if case_insensitive else dict(ok_vocab) + ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab) similarity_gold = [] similarity_model = [] @@ -783,7 +785,7 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case else: a, b, sim = [word for word in line.split(delimiter)] sim = float(sim) - except Exception: + except (ValueError, TypeError): logger.info('skipping invalid line #%d in %s', line_no, pairs) continue if a not in ok_vocab or b not in ok_vocab: @@ -802,15 +804,12 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case pearson = stats.pearsonr(similarity_gold, similarity_model) oov_ratio = float(oov) / (len(similarity_gold) + oov) * 100 - logger.debug( - 'Pearson correlation coefficient against %s: %f with p-value %f', - pairs, pearson[0], pearson[1] - ) + logger.debug('Pearson correlation coefficient against %s: %f with p-value %f', pairs, pearson[0], pearson[1]) logger.debug( 'Spearman rank-order correlation coefficient against %s: %f with p-value %f', pairs, spearman[0], spearman[1] ) - logger.debug('Pairs with unknown words: %d' % oov) + logger.debug('Pairs with unknown words: %d', oov) self.log_evaluate_word_pairs(pearson, spearman, oov_ratio, pairs) return pearson, spearman, oov_ratio @@ -844,5 +843,8 @@ def get_embedding_layer(self, train_embeddings=False): # set `trainable` as `False` to use the pretrained word embedding # No extra mem usage here as `Embedding` layer doesn't create any new matrix for weights - layer = Embedding(input_dim=weights.shape[0], output_dim=weights.shape[1], weights=[weights], trainable=train_embeddings) + layer = Embedding( + input_dim=weights.shape[0], output_dim=weights.shape[1], + weights=[weights], trainable=train_embeddings + ) return layer diff --git a/gensim/models/lda_dispatcher.py b/gensim/models/lda_dispatcher.py index 91e7f237c7..6b3bd53c44 100755 --- a/gensim/models/lda_dispatcher.py +++ b/gensim/models/lda_dispatcher.py @@ -85,11 +85,11 @@ def initialize(self, **model_params): worker = Pyro4.Proxy(uri) workerid = len(self.workers) # make time consuming methods work asynchronously - logger.info("registering worker #%i at %s" % (workerid, uri)) + logger.info("registering worker #%i at %s", workerid, uri) worker.initialize(workerid, dispatcher=self.callback, **model_params) self.workers[workerid] = worker except Pyro4.errors.PyroError: - logger.warning("unresponsive worker at %s, deleting it from the name server" % uri) + logger.warning("unresponsive worker at %s, deleting it from the name server", uri) ns.remove(name) if not self.workers: @@ -104,16 +104,16 @@ def getworkers(self): @Pyro4.expose def getjob(self, worker_id): - logger.info("worker #%i requesting a new job" % worker_id) + logger.info("worker #%i requesting a new job", worker_id) job = self.jobs.get(block=True, timeout=1) - logger.info("worker #%i got 
a new job (%i left)" % (worker_id, self.jobs.qsize())) + logger.info("worker #%i got a new job (%i left)", worker_id, self.jobs.qsize()) return job @Pyro4.expose def putjob(self, job): self._jobsreceived += 1 self.jobs.put(job, block=True, timeout=HUGE_TIMEOUT) - logger.info("added a new job (len(queue)=%i items)" % self.jobs.qsize()) + logger.info("added a new job (len(queue)=%i items)", self.jobs.qsize()) @Pyro4.expose def getstate(self): @@ -121,11 +121,11 @@ def getstate(self): Merge states from across all workers and return the result. """ logger.info("end of input, assigning all remaining jobs") - logger.debug("jobs done: %s, jobs received: %s" % (self._jobsdone, self._jobsreceived)) + logger.debug("jobs done: %s, jobs received: %s", self._jobsdone, self._jobsreceived) while self._jobsdone < self._jobsreceived: time.sleep(0.5) # check every half a second - logger.info("merging states from %i workers" % len(self.workers)) + logger.info("merging states from %i workers", len(self.workers)) workers = list(self.workers.values()) result = workers[0].getstate() for worker in workers[1:]: @@ -140,7 +140,7 @@ def reset(self, state): Initialize all workers for a new EM iterations. """ for workerid, worker in iteritems(self.workers): - logger.info("resetting worker %s" % workerid) + logger.info("resetting worker %s", workerid) worker.reset(state) worker.requestjob() self._jobsdone = 0 @@ -158,7 +158,7 @@ def jobdone(self, workerid): and `worker.requestjob()`. """ self._jobsdone += 1 - logger.info("worker #%s finished job #%i" % (workerid, self._jobsdone)) + logger.info("worker #%s finished job #%i", workerid, self._jobsdone) self.workers[workerid].requestjob() # tell the worker to ask for another job, asynchronously (one-way) def jobsdone(self): @@ -171,7 +171,7 @@ def exit(self): Terminate all registered workers and then the dispatcher. 
""" for workerid, worker in iteritems(self.workers): - logger.info("terminating worker %s" % workerid) + logger.info("terminating worker %s", workerid) worker.exit() logger.info("terminating dispatcher") os._exit(0) # exit the whole process (not just this thread ala sys.exit()) @@ -180,27 +180,25 @@ def exit(self): def main(): parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--maxsize", help="How many jobs (=chunks of N documents) " - "to keep 'pre-fetched' in a queue (default: %(default)s)", - type=int, default=MAX_JOBS_QUEUE) + parser.add_argument("--maxsize", help="How many jobs (=chunks of N documents) to keep 'pre-fetched' in a queue (default: %(default)s)", type=int, default=MAX_JOBS_QUEUE) parser.add_argument("--host", help="Nameserver hostname (default: %(default)s)", default=None) parser.add_argument("--port", help="Nameserver port (default: %(default)s)", default=None, type=int) parser.add_argument("--no-broadcast", help="Disable broadcast (default: %(default)s)", action='store_const', default=True, const=False) parser.add_argument("--hmac", help="Nameserver hmac key (default: %(default)s)", default=None) - parser.add_argument('-v', '--verbose', help='Verbose flag', action='store_const', dest="loglevel", - const=logging.INFO, default=logging.WARNING) + parser.add_argument('-v', '--verbose', help='Verbose flag', action='store_const', dest="loglevel", const=logging.INFO, default=logging.WARNING) args = parser.parse_args() logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=args.loglevel) logger.info("running %s", " ".join(sys.argv)) - ns_conf = {"broadcast": args.no_broadcast, - "host": args.host, - "port": args.port, - "hmac_key": args.hmac} + ns_conf = { + "broadcast": args.no_broadcast, + "host": args.host, + "port": args.port, + "hmac_key": args.hmac + } utils.pyro_daemon(LDA_DISPATCHER_PREFIX, Dispatcher(maxsize=args.maxsize, ns_conf=ns_conf), ns_conf=ns_conf) - logger.info("finished running %s", " ".join(sys.argv)) diff --git a/gensim/models/lda_worker.py b/gensim/models/lda_worker.py index ec87c29148..8656672db6 100755 --- a/gensim/models/lda_worker.py +++ b/gensim/models/lda_worker.py @@ -50,7 +50,7 @@ def initialize(self, myid, dispatcher, **model_params): self.myid = myid # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove? 
self.dispatcher = dispatcher self.finished = False - logger.info("initializing worker #%s" % myid) + logger.info("initializing worker #%s", myid) self.model = ldamodel.LdaModel(**model_params) @Pyro4.expose @@ -70,27 +70,26 @@ def requestjob(self): # no new job: try again, unless we're finished with all work continue if job is not None: - logger.info("worker #%s received job #%i" % (self.myid, self.jobsdone)) + logger.info("worker #%s received job #%i", self.myid, self.jobsdone) self.processjob(job) self.dispatcher.jobdone(self.myid) else: - logger.info("worker #%i stopping asking for jobs" % self.myid) + logger.info("worker #%i stopping asking for jobs", self.myid) @utils.synchronous('lock_update') def processjob(self, job): - logger.debug("starting to process job #%i" % self.jobsdone) + logger.debug("starting to process job #%i", self.jobsdone) self.model.do_estep(job) self.jobsdone += 1 if SAVE_DEBUG and self.jobsdone % SAVE_DEBUG == 0: fname = os.path.join(tempfile.gettempdir(), 'lda_worker.pkl') self.model.save(fname) - logger.info("finished processing job #%i" % (self.jobsdone - 1)) + logger.info("finished processing job #%i", self.jobsdone - 1) @Pyro4.expose @utils.synchronous('lock_update') def getstate(self): - logger.info("worker #%i returning its state after %s jobs" % - (self.myid, self.jobsdone)) + logger.info("worker #%i returning its state after %s jobs", self.myid, self.jobsdone) result = self.model.state assert isinstance(result, ldamodel.LdaState) self.model.clear() # free up mem in-between two EM cycles @@ -101,7 +100,7 @@ def getstate(self): @utils.synchronous('lock_update') def reset(self, state): assert state is not None - logger.info("resetting worker #%i" % self.myid) + logger.info("resetting worker #%i", self.myid) self.model.state = state self.model.sync_state() self.model.state.reset() @@ -109,32 +108,29 @@ def reset(self, state): @Pyro4.oneway def exit(self): - logger.info("terminating worker #%i" % self.myid) + logger.info("terminating worker #%i", self.myid) os._exit(0) -# endclass Worker def main(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--host", help="Nameserver hostname (default: %(default)s)", default=None) parser.add_argument("--port", help="Nameserver port (default: %(default)s)", default=None, type=int) - parser.add_argument("--no-broadcast", help="Disable broadcast (default: %(default)s)", - action='store_const', default=True, const=False) + parser.add_argument("--no-broadcast", help="Disable broadcast (default: %(default)s)", action='store_const', default=True, const=False) parser.add_argument("--hmac", help="Nameserver hmac key (default: %(default)s)", default=None) - parser.add_argument('-v', '--verbose', help='Verbose flag', action='store_const', dest="loglevel", - const=logging.INFO, default=logging.WARNING) + parser.add_argument('-v', '--verbose', help='Verbose flag', action='store_const', dest="loglevel", const=logging.INFO, default=logging.WARNING) args = parser.parse_args() logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=args.loglevel) logger.info("running %s", " ".join(sys.argv)) - ns_conf = {"broadcast": args.no_broadcast, - "host": args.host, - "port": args.port, - "hmac_key": args.hmac} - + ns_conf = { + "broadcast": args.no_broadcast, + "host": args.host, + "port": args.port, + "hmac_key": args.hmac + } utils.pyro_daemon(LDA_WORKER_PREFIX, Worker(), random_suffix=True, ns_conf=ns_conf) - logger.info("finished running %s", " ".join(sys.argv)) diff --git 
a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 9ed01d84c8..91c4c450a6 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -48,12 +48,7 @@ from gensim.models.callbacks import Callback # log(sum(exp(x))) that tries to avoid overflow -try: - # try importing from here if older scipy is installed - from scipy.maxentropy import logsumexp -except ImportError: - # maxentropy has been removed in recent releases, logsumexp now in misc - from scipy.misc import logsumexp +from scipy.misc import logsumexp logger = logging.getLogger('gensim.models.ldamodel') @@ -147,8 +142,7 @@ def blend(self, rhot, other, targetsize=None): if other.numdocs == 0 or targetsize == other.numdocs: scale = 1.0 else: - logger.info("merging changes from %i documents into a model of %i documents", - other.numdocs, targetsize) + logger.info("merging changes from %i documents into a model of %i documents", other.numdocs, targetsize) scale = 1.0 * targetsize / other.numdocs self.sstats += rhot * scale * other.sstats @@ -288,7 +282,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha') - assert self.alpha.shape == (self.num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics) + assert self.alpha.shape == (self.num_topics,), \ + "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics) if isinstance(eta, six.string_types): if eta == 'asymmetric': @@ -324,10 +319,12 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, from gensim.models.lda_dispatcher import LDA_DISPATCHER_PREFIX self.dispatcher = Pyro4.Proxy(ns.list(prefix=LDA_DISPATCHER_PREFIX)[LDA_DISPATCHER_PREFIX]) logger.debug("looking for dispatcher at %s" % str(self.dispatcher._pyroUri)) - self.dispatcher.initialize(id2word=self.id2word, num_topics=self.num_topics, - chunksize=chunksize, alpha=alpha, eta=eta, distributed=False) + self.dispatcher.initialize( + id2word=self.id2word, num_topics=self.num_topics, chunksize=chunksize, + alpha=alpha, eta=eta, distributed=False + ) self.numworkers = len(self.dispatcher.getworkers()) - logger.info("using distributed version with %i workers" % self.numworkers) + logger.info("using distributed version with %i workers", self.numworkers) except Exception as err: logger.error("failed to initialize distributed LDA (%s)", err) raise RuntimeError("failed to initialize distributed LDA (%s)" % err) @@ -382,8 +379,9 @@ def init_dir_prior(self, prior, name): return init_prior, is_auto def __str__(self): - return "LdaModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % \ - (self.num_terms, self.num_topics, self.decay, self.chunksize) + return "LdaModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % ( + self.num_terms, self.num_topics, self.decay, self.chunksize + ) def sync_state(self): self.expElogbeta = np.exp(self.state.get_Elogbeta()) @@ -412,8 +410,8 @@ def inference(self, chunk, collect_sstats=False): """ try: - _ = len(chunk) - except Exception: + len(chunk) + except TypeError: # convert iterators/generators to plain list, so we have len() etc. 
chunk = list(chunk) if len(chunk) > 1: @@ -436,9 +434,9 @@ def inference(self, chunk, collect_sstats=False): for d, doc in enumerate(chunk): if len(doc) > 0 and not isinstance(doc[0][0], six.integer_types + (np.integer,)): # make sure the term IDs are ints, otherwise np will get upset - ids = [int(id) for id, _ in doc] + ids = [int(idx) for idx, _ in doc] else: - ids = [id for id, _ in doc] + ids = [idx for idx, _ in doc] cts = np.array([cnt for _, cnt in doc]) gammad = gamma[d, :] Elogthetad = Elogtheta[d, :] @@ -462,7 +460,7 @@ def inference(self, chunk, collect_sstats=False): phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100 # If gamma hasn't changed much, we're done. meanchange = np.mean(abs(gammad - lastgamma)) - if (meanchange < self.gamma_threshold): + if meanchange < self.gamma_threshold: converged += 1 break gamma[d, :] = gammad @@ -472,8 +470,7 @@ def inference(self, chunk, collect_sstats=False): sstats[:, ids] += np.outer(expElogthetad.T, cts / phinorm) if len(chunk) > 1: - logger.debug("%i/%i documents converged within %i iterations", - converged, len(chunk), self.iterations) + logger.debug("%i/%i documents converged within %i iterations", converged, len(chunk), self.iterations) if collect_sstats: # This step finishes computing the sufficient statistics for the @@ -533,8 +530,10 @@ def log_perplexity(self, chunk, total_docs=None): corpus_words = sum(cnt for document in chunk for _, cnt in document) subsample_ratio = 1.0 * total_docs / len(chunk) perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words) - logger.info("%.3f per-word bound, %.1f perplexity estimate based on a held-out corpus of %i documents with %i words" % - (perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words)) + logger.info( + "%.3f per-word bound, %.1f perplexity estimate based on a held-out corpus of %i documents with %i words", + perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words + ) return perwordbound def update(self, corpus, chunksize=None, decay=None, offset=None, @@ -621,12 +620,14 @@ def update(self, corpus, chunksize=None, decay=None, offset=None, "iterating %ix with a convergence threshold of %f", updatetype, self.num_topics, passes, lencorpus, updateafter, evalafter, iterations, - gamma_threshold) + gamma_threshold + ) if updates_per_pass * passes < 10: logger.warning( - "too few updates, training might not converge; consider " - "increasing the number of passes or iterations to improve accuracy") + "too few updates, training might not converge; " + "consider increasing the number of passes or iterations to improve accuracy" + ) # rho is the "speed" of updating; TODO try other fncs # pass_ + num_updates handles increasing the starting t for each pass, @@ -643,7 +644,7 @@ def rho(): for pass_ in xrange(passes): if self.dispatcher: - logger.info('initializing %s workers' % self.numworkers) + logger.info('initializing %s workers', self.numworkers) self.dispatcher.reset(self.state) else: other = LdaState(self.eta, self.state.sstats.shape) @@ -658,13 +659,17 @@ def rho(): if self.dispatcher: # add the chunk to dispatcher's job queue, so workers can munch on it - logger.info('PROGRESS: pass %i, dispatching documents up to #%i/%i', - pass_, chunk_no * chunksize + len(chunk), lencorpus) + logger.info( + "PROGRESS: pass %i, dispatching documents up to #%i/%i", + pass_, chunk_no * chunksize + len(chunk), lencorpus + ) # this will eventually block until some jobs finish, because the queue has a small finite length 
self.dispatcher.putjob(chunk) else: - logger.info('PROGRESS: pass %i, at document #%i/%i', - pass_, chunk_no * chunksize + len(chunk), lencorpus) + logger.info( + "PROGRESS: pass %i, at document #%i/%i", + pass_, chunk_no * chunksize + len(chunk), lencorpus + ) gammat = self.do_estep(chunk, other) if self.optimize_alpha: @@ -870,7 +875,7 @@ def get_topic_terms(self, topicid, topn=10): topic = self.get_topics()[topicid] topic = topic / topic.sum() # normalize to probability distribution bestn = matutils.argsort(topic, topn, reverse=True) - return [(id, topic[id]) for id in bestn] + return [(idx, topic[idx]) for idx in bestn] def top_topics(self, corpus=None, texts=None, dictionary=None, window_size=None, coherence='u_mass', topn=20, processes=-1): @@ -888,7 +893,8 @@ def top_topics(self, corpus=None, texts=None, dictionary=None, window_size=None, cm = CoherenceModel( model=self, corpus=corpus, texts=texts, dictionary=dictionary, window_size=window_size, coherence=coherence, topn=topn, - processes=processes) + processes=processes + ) coherence_scores = cm.get_coherence_per_topic() str_topics = [] @@ -995,7 +1001,8 @@ def get_term_topics(self, word_id, minimum_probability=None): return values - def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10, diagonal=False, annotation=True, normed=True): + def diff(self, other, distance="kullback_leibler", num_words=100, + n_ann_terms=10, diagonal=False, annotation=True, normed=True): """ Calculate difference topic2topic between two Lda models `other` instances of `LdaMulticore` or `LdaModel` @@ -1049,7 +1056,8 @@ def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10 d1, d2 = fst_topics, snd_topics if diagonal: - assert t1_size == t2_size, "Both input models should have same no. of topics, as the diagonal will only be valid in a square matrix" + assert t1_size == t2_size, \ + "Both input models should have same no. of topics, as the diagonal will only be valid in a square matrix" # initialize z and annotation array z = np.zeros(t1_size) if annotation: @@ -1096,7 +1104,7 @@ def __getitem__(self, bow, eps=None): """ return self.get_document_topics(bow, eps, self.minimum_phi_value, self.per_word_topics) - def save(self, fname, ignore=['state', 'dispatcher'], separately=None, *args, **kwargs): + def save(self, fname, ignore=('state', 'dispatcher'), separately=None, *args, **kwargs): """ Save the model to file. 
@@ -1135,7 +1143,7 @@ def save(self, fname, ignore=['state', 'dispatcher'], separately=None, *args, ** if isinstance(ignore, six.string_types): ignore = [ignore] ignore = [e for e in ignore if e] # make sure None and '' are not in the list - ignore = list(set(['state', 'dispatcher', 'id2word']) | set(ignore)) + ignore = list({'state', 'dispatcher', 'id2word'} | set(ignore)) else: ignore = ['state', 'dispatcher', 'id2word'] @@ -1188,10 +1196,9 @@ def load(cls, fname, *args, **kwargs): # check if `id2word_fname` file is present on disk # if present -> the model to be loaded was saved using a >= 0.13.2 version of Gensim, so set `result.id2word` using the `id2word_fname` file # if not present -> the model to be loaded was saved using a < 0.13.2 version of Gensim, so `result.id2word` already set after the main pickle load - if (os.path.isfile(id2word_fname)): + if os.path.isfile(id2word_fname): try: result.id2word = utils.unpickle(id2word_fname) except Exception as e: logging.warning("failed to load id2word dictionary from %s: %s", id2word_fname, e) return result -# endclass LdaModel diff --git a/gensim/models/ldamulticore.py b/gensim/models/ldamulticore.py index 39c3c40666..0c22c64f7c 100644 --- a/gensim/models/ldamulticore.py +++ b/gensim/models/ldamulticore.py @@ -143,11 +143,13 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None, if isinstance(alpha, six.string_types) and alpha == 'auto': raise NotImplementedError("auto-tuning alpha not implemented in multicore LDA; use plain LdaModel.") - super(LdaMulticore, self).__init__(corpus=corpus, num_topics=num_topics, + super(LdaMulticore, self).__init__( + corpus=corpus, num_topics=num_topics, id2word=id2word, chunksize=chunksize, passes=passes, alpha=alpha, eta=eta, decay=decay, offset=offset, eval_every=eval_every, iterations=iterations, gamma_threshold=gamma_threshold, random_state=random_state, minimum_probability=minimum_probability, - minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics) + minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics + ) def update(self, corpus, chunks_as_numpy=False): """ @@ -169,7 +171,7 @@ def update(self, corpus, chunks_as_numpy=False): """ try: lencorpus = len(corpus) - except Exception: + except TypeError: logger.warning("input corpus stream has no len(); counting documents") lencorpus = sum(1 for _ in corpus) if lencorpus == 0: @@ -187,15 +189,17 @@ def update(self, corpus, chunks_as_numpy=False): evalafter = min(lencorpus, (self.eval_every or 0) * updateafter) updates_per_pass = max(1, lencorpus / updateafter) - logger.info("running %s LDA training, %s topics, %i passes over the" - " supplied corpus of %i documents, updating every %i documents," - " evaluating every ~%i documents, iterating %ix with a convergence threshold of %f", - updatetype, self.num_topics, self.passes, lencorpus, updateafter, evalafter, - self.iterations, self.gamma_threshold) + logger.info( + "running %s LDA training, %s topics, %i passes over the supplied corpus of %i documents, " + "updating every %i documents, evaluating every ~%i documents, iterating %ix with a convergence threshold of %f", + updatetype, self.num_topics, self.passes, lencorpus, updateafter, evalafter, self.iterations, self.gamma_threshold + ) if updates_per_pass * self.passes < 10: - logger.warning("too few updates, training might not converge; consider " - "increasing the number of passes or iterations to improve accuracy") + logger.warning( + "too few updates, training might not converge; " + "consider 
increasing the number of passes or iterations to improve accuracy" + ) job_queue = Queue(maxsize=2 * self.workers) result_queue = Queue() @@ -240,9 +244,10 @@ def process_result_queue(force=False): job_queue.put((chunk_no, chunk, self), block=False, timeout=0.1) chunk_put = True queue_size[0] += 1 - logger.info('PROGRESS: pass %i, dispatched chunk #%i = ' - 'documents up to #%i/%i, outstanding queue size %i', - pass_, chunk_no, chunk_no * self.chunksize + len(chunk), lencorpus, queue_size[0]) + logger.info( + "PROGRESS: pass %i, dispatched chunk #%i = documents up to #%i/%i, outstanding queue size %i", + pass_, chunk_no, chunk_no * self.chunksize + len(chunk), lencorpus, queue_size[0] + ) except queue.Full: # in case the input job queue is full, keep clearing the # result queue, to make sure we don't deadlock diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index 6399e17aae..26709c04d6 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -50,8 +50,8 @@ class LdaSeqModel(utils.SaveLoad): """ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10, - initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10, - random_state=None, lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100): + initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10, + random_state=None, lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100): """ `corpus` is any iterable gensim corpus @@ -91,7 +91,7 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ if corpus is not None: try: self.corpus_len = len(corpus) - except Exception: + except TypeError: logger.warning("input corpus stream has no len(); counting documents") self.corpus_len = sum(1 for _ in corpus) @@ -113,7 +113,10 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ # the sslm class is described below and contains information on topic-word probabilities and doc-topic probabilities. self.topic_chains = [] for topic in range(0, num_topics): - sslm_ = sslm(num_time_slices=self.num_time_slices, vocab_len=self.vocab_len, num_topics=self.num_topics, chain_variance=chain_variance, obs_variance=obs_variance) + sslm_ = sslm( + num_time_slices=self.num_time_slices, vocab_len=self.vocab_len, num_topics=self.num_topics, + chain_variance=chain_variance, obs_variance=obs_variance + ) self.topic_chains.append(sslm_) # the following are class variables which are to be integrated during Document Influence Model @@ -125,7 +128,10 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ # if a corpus and time_slice is provided, depending on the user choice of initializing LDA, we start DTM. 
if corpus is not None and time_slice is not None: if initialize == 'gensim': - lda_model = ldamodel.LdaModel(corpus, id2word=self.id2word, num_topics=self.num_topics, passes=passes, alpha=self.alphas, random_state=random_state) + lda_model = ldamodel.LdaModel( + corpus, id2word=self.id2word, num_topics=self.num_topics, + passes=passes, alpha=self.alphas, random_state=random_state + ) self.sstats = np.transpose(lda_model.state.sstats) if initialize == 'ldamodel': self.sstats = np.transpose(lda_model.state.sstats) @@ -206,7 +212,7 @@ def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter, topic_bound = self.fit_lda_seq_topics(topic_suffstats) bound += topic_bound - if ((bound - old_bound) < 0): + if (bound - old_bound) < 0: # if max_iter is too low, increase iterations. if lda_inference_max_iter < LOWER_ITER: lda_inference_max_iter *= ITER_MULT_LOW @@ -227,7 +233,8 @@ def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter, return bound - def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods, iter_, lda_inference_max_iter, chunksize): + def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods, + iter_, lda_inference_max_iter, chunksize): """ Inference or E- Step. This is used to set up the gensim LdaModel to be used for each time-slice. @@ -243,14 +250,21 @@ def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods, iter_, lda_infe model = "DTM" if model == "DTM": - bound, gammas = self.inferDTMseq(corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound, lda_inference_max_iter, chunksize) + bound, gammas = self.inferDTMseq( + corpus, topic_suffstats, gammas, lhoods, lda, + ldapost, iter_, bound, lda_inference_max_iter, chunksize + ) elif model == "DIM": self.InfluenceTotalFixed(corpus) - bound, gammas = self.inferDIMseq(corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound, lda_inference_max_iter, chunksize) + bound, gammas = self.inferDIMseq( + corpus, topic_suffstats, gammas, lhoods, lda, + ldapost, iter_, bound, lda_inference_max_iter, chunksize + ) return bound, gammas - def inferDTMseq(self, corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound, lda_inference_max_iter, chunksize): + def inferDTMseq(self, corpus, topic_suffstats, gammas, lhoods, lda, + ldapost, iter_, bound, lda_inference_max_iter, chunksize): """ Computes the likelihood of a sequential corpus under an LDA seq model, and return the likelihood bound. Need to pass the LdaSeq model, corpus, sufficient stats, gammas and lhoods matrices previously created, @@ -281,9 +295,13 @@ def inferDTMseq(self, corpus, topic_suffstats, gammas, lhoods, lda, ldapost, ite # TODO: replace fit_lda_post with appropriate ldamodel functions, if possible. if iter_ == 0: - doc_lhood = LdaPost.fit_lda_post(ldapost, doc_num, time, None, lda_inference_max_iter=lda_inference_max_iter) + doc_lhood = LdaPost.fit_lda_post( + ldapost, doc_num, time, None, lda_inference_max_iter=lda_inference_max_iter + ) else: - doc_lhood = LdaPost.fit_lda_post(ldapost, doc_num, time, self, lda_inference_max_iter=lda_inference_max_iter) + doc_lhood = LdaPost.fit_lda_post( + ldapost, doc_num, time, self, lda_inference_max_iter=lda_inference_max_iter + ) if topic_suffstats is not None: topic_suffstats = LdaPost.update_lda_seq_ss(ldapost, time, doc, topic_suffstats) @@ -310,7 +328,6 @@ def fit_lda_seq_topics(self, topic_suffstats): Fit lda sequence topic wise. 
""" lhood = 0 - lhood_term = 0 for k, chain in enumerate(self.topic_chains): logger.info("Fitting topic number %i", k) @@ -416,8 +433,6 @@ def __getitem__(self, doc): # should even the likelihoods be returned? return doc_topic -# endclass LdaSeqModel - class sslm(utils.SaveLoad): """ @@ -593,10 +608,8 @@ def fit_sslm(self, sstats): sslm_max_iter = 2 converged = sslm_fit_threshold + 1 - totals = np.zeros(sstats.shape[1]) - # computing variance, fwd_variance - self.variance, self.fwd_variance = map(np.array, list(zip(*[self.compute_post_variance(w, self.chain_variance) for w in range(0, W)]))) + self.variance, self.fwd_variance = (np.array(x) for x in list(zip(*[self.compute_post_variance(w, self.chain_variance) for w in range(0, W)]))) # column sum of sstats totals = sstats.sum(axis=0) @@ -631,8 +644,8 @@ def compute_bound(self, sstats, totals): Compute log probability bound. Forumula is as described in appendix of DTM by Blei. (formula no. 5) """ - W = self.vocab_len - T = self.num_time_slices + w = self.vocab_len + t = self.num_time_slices term_1 = 0 term_2 = 0 @@ -643,19 +656,19 @@ def compute_bound(self, sstats, totals): chain_variance = self.chain_variance # computing mean, fwd_mean - self.mean, self.fwd_mean = map(np.array, (zip(*[self.compute_post_mean(w, self.chain_variance) for w in range(0, W)]))) + self.mean, self.fwd_mean = (np.array(x) for x in zip(*[self.compute_post_mean(w, self.chain_variance) for w in range(0, w)])) self.zeta = self.update_zeta() - for w in range(0, W): - val += (self.variance[w][0] - self.variance[w][T]) / 2 * chain_variance + for w in range(0, w): + val += (self.variance[w][0] - self.variance[w][t]) / 2 * chain_variance logger.info("Computing bound, all times") - for t in range(1, T + 1): + for t in range(1, t + 1): term_1 = 0.0 term_2 = 0.0 ent = 0.0 - for w in range(0, W): + for w in range(0, w): m = self.mean[w][t] prev_m = self.mean[w][t - 1] @@ -725,7 +738,9 @@ def update_obs(self, sstats, totals): if model == "DTM": # slowest part of method - obs = optimize.fmin_cg(f=f_obs, fprime=df_obs, x0=obs, gtol=TOL, args=args, epsilon=STEP_SIZE, disp=0) + obs = optimize.fmin_cg( + f=f_obs, fprime=df_obs, x0=obs, gtol=TOL, args=args, epsilon=STEP_SIZE, disp=0 + ) if model == "DIM": pass runs += 1 @@ -803,7 +818,6 @@ def compute_obs_deriv(self, word, word_counts, totals, mean_deriv_mtx, deriv): for u in range(1, T + 1): mean_u = mean[u] - variance_u_prev = variance[u - 1] # noqa:F841 mean_u_prev = mean[u - 1] dmean_u = mean_deriv[u] dmean_u_prev = mean_deriv[u - 1] @@ -1026,7 +1040,6 @@ def update_lda_seq_ss(self, time, doc, topic_suffstats): topic_suffstats[k] = topic_ss return topic_suffstats -# endclass LdaPost # the following functions are used in update_obs as the function to optimize @@ -1060,7 +1073,6 @@ def f_obs(x, *args): for t in range(1, T + 1): mean_t = mean[t] mean_t_prev = mean[t - 1] - var_t_prev = variance[t - 1] # noqa:F841 val = mean_t - mean_t_prev term1 += val * val diff --git a/gensim/models/logentropy_model.py b/gensim/models/logentropy_model.py index d4bfc93479..05f79ae3c2 100644 --- a/gensim/models/logentropy_model.py +++ b/gensim/models/logentropy_model.py @@ -45,7 +45,7 @@ class LogEntropyModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods. """ - def __init__(self, corpus, id2word=None, normalize=True): + def __init__(self, corpus, normalize=True): """ `normalize` dictates whether the resulting vectors will be set to unit length. 
@@ -58,8 +58,7 @@ def __init__(self, corpus, id2word=None, normalize=True): self.initialize(corpus) def __str__(self): - return "LogEntropyModel(n_docs=%s, n_words=%s)" % (self.n_docs, - self.n_words) + return "LogEntropyModel(n_docs=%s, n_words=%s)" % (self.n_docs, self.n_words) def initialize(self, corpus): """ @@ -71,7 +70,7 @@ def initialize(self, corpus): glob_num_words, doc_no = 0, -1 for doc_no, bow in enumerate(corpus): if doc_no % 10000 == 0: - logger.info("PROGRESS: processing document #%i" % doc_no) + logger.info("PROGRESS: processing document #%i", doc_no) glob_num_words += len(bow) for term_id, term_count in bow: glob_freq[term_id] = glob_freq.get(term_id, 0) + term_count @@ -81,14 +80,14 @@ def initialize(self, corpus): self.n_words = glob_num_words # and finally compute the global weights - logger.info("calculating global log entropy weights for %i " - "documents and %i features (%i matrix non-zeros)" - % (self.n_docs, len(glob_freq), self.n_words)) + logger.info( + "calculating global log entropy weights for %i documents and %i features (%i matrix non-zeros)", + self.n_docs, len(glob_freq), self.n_words + ) logger.debug('iterating over corpus') for doc_no2, bow in enumerate(corpus): for key, freq in bow: - p = (float(freq) / glob_freq[key]) * math.log(float(freq) / - glob_freq[key]) + p = (float(freq) / glob_freq[key]) * math.log(float(freq) / glob_freq[key]) self.entr[key] = self.entr.get(key, 0.0) + p if doc_no2 != doc_no: raise ValueError("LogEntropyModel doesn't support generators as training data") @@ -107,8 +106,11 @@ def __getitem__(self, bow): return self._apply(bow) # unknown (new) terms will be given zero weight (NOT infinity/huge) - vector = [(term_id, math.log(tf + 1) * self.entr.get(term_id)) - for term_id, tf in bow if term_id in self.entr] + vector = [ + (term_id, math.log(tf + 1) * self.entr.get(term_id)) + for term_id, tf in bow + if term_id in self.entr + ] if self.normalize: vector = matutils.unitvec(vector) return vector diff --git a/gensim/models/lsi_dispatcher.py b/gensim/models/lsi_dispatcher.py index 5a69327522..dd18734dfb 100755 --- a/gensim/models/lsi_dispatcher.py +++ b/gensim/models/lsi_dispatcher.py @@ -80,7 +80,7 @@ def initialize(self, **model_params): worker = Pyro4.Proxy(uri) workerid = len(self.workers) # make time consuming methods work asynchronously - logger.info("registering worker #%i from %s" % (workerid, uri)) + logger.info("registering worker #%i from %s", workerid, uri) worker.initialize(workerid, dispatcher=self.callback, **model_params) self.workers[workerid] = worker except Pyro4.errors.PyroError: @@ -99,16 +99,16 @@ def getworkers(self): @Pyro4.expose def getjob(self, worker_id): - logger.info("worker #%i requesting a new job" % worker_id) + logger.info("worker #%i requesting a new job", worker_id) job = self.jobs.get(block=True, timeout=1) - logger.info("worker #%i got a new job (%i left)" % (worker_id, self.jobs.qsize())) + logger.info("worker #%i got a new job (%i left)", worker_id, self.jobs.qsize()) return job @Pyro4.expose def putjob(self, job): self._jobsreceived += 1 self.jobs.put(job, block=True, timeout=HUGE_TIMEOUT) - logger.info("added a new job (len(queue)=%i items)" % self.jobs.qsize()) + logger.info("added a new job (len(queue)=%i items)", self.jobs.qsize()) @Pyro4.expose def getstate(self): @@ -116,7 +116,7 @@ def getstate(self): Merge projections from across all workers and return the final projection. 
""" logger.info("end of input, assigning all remaining jobs") - logger.debug("jobs done: %s, jobs received: %s" % (self._jobsdone, self._jobsreceived)) + logger.debug("jobs done: %s, jobs received: %s", self._jobsdone, self._jobsreceived) while self._jobsdone < self._jobsreceived: time.sleep(0.5) # check every half a second @@ -124,11 +124,11 @@ def getstate(self): # and not `workers - 1` merges! # but merging only takes place once, after all input data has been processed, # so the overall effect would be small... compared to the amount of coding :-) - logger.info("merging states from %i workers" % len(self.workers)) + logger.info("merging states from %i workers", self.workers) workers = list(self.workers.items()) result = workers[0][1].getstate() for workerid, worker in workers[1:]: - logger.info("pulling state from worker %s" % workerid) + logger.info("pulling state from worker %s", workerid) result.merge(worker.getstate()) logger.info("sending out merged projection") return result @@ -139,7 +139,7 @@ def reset(self): Initialize all workers for a new decomposition. """ for workerid, worker in iteritems(self.workers): - logger.info("resetting worker %s" % workerid) + logger.info("resetting worker %s", workerid) worker.reset() worker.requestjob() self._jobsdone = 0 @@ -157,7 +157,7 @@ def jobdone(self, workerid): worker.requestjob(). """ self._jobsdone += 1 - logger.info("worker #%s finished job #%i" % (workerid, self._jobsdone)) + logger.info("worker #%s finished job #%i", workerid, self._jobsdone) worker = self.workers[workerid] worker.requestjob() # tell the worker to ask for another job, asynchronously (one-way) @@ -171,7 +171,7 @@ def exit(self): Terminate all registered workers and then the dispatcher. """ for workerid, worker in iteritems(self.workers): - logger.info("terminating worker %s" % workerid) + logger.info("terminating worker %s", workerid) worker.exit() logger.info("terminating dispatcher") os._exit(0) # exit the whole process (not just this thread ala sys.exit()) @@ -180,7 +180,7 @@ def exit(self): def main(): logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) - logger.info("running %s" % " ".join(sys.argv)) + logger.info("running %s", " ".join(sys.argv)) program = os.path.basename(sys.argv[0]) # make sure we have enough cmd line parameters @@ -194,7 +194,7 @@ def main(): maxsize = int(sys.argv[1]) utils.pyro_daemon('gensim.lsi_dispatcher', Dispatcher(maxsize=maxsize)) - logger.info("finished running %s" % program) + logger.info("finished running %s", program) if __name__ == '__main__': diff --git a/gensim/models/lsi_worker.py b/gensim/models/lsi_worker.py index 4cae372ffd..ffb31eafb9 100755 --- a/gensim/models/lsi_worker.py +++ b/gensim/models/lsi_worker.py @@ -47,7 +47,7 @@ def initialize(self, myid, dispatcher, **model_params): self.myid = myid # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove? 
self.dispatcher = dispatcher self.finished = False - logger.info("initializing worker #%s" % myid) + logger.info("initializing worker #%s", myid) self.model = lsimodel.LsiModel(**model_params) @Pyro4.expose @@ -67,11 +67,11 @@ def requestjob(self): # no new job: try again, unless we're finished with all work continue if job is not None: - logger.info("worker #%s received job #%i" % (self.myid, self.jobsdone)) + logger.info("worker #%s received job #%i", self.myid, self.jobsdone) self.processjob(job) self.dispatcher.jobdone(self.myid) else: - logger.info("worker #%i stopping asking for jobs" % self.myid) + logger.info("worker #%i stopping asking for jobs", self.myid) @utils.synchronous('lock_update') def processjob(self, job): @@ -84,8 +84,7 @@ def processjob(self, job): @Pyro4.expose @utils.synchronous('lock_update') def getstate(self): - logger.info("worker #%i returning its state after %s jobs" % - (self.myid, self.jobsdone)) + logger.info("worker #%i returning its state after %s jobs", self.myid, self.jobsdone) assert isinstance(self.model.projection, lsimodel.Projection) self.finished = True return self.model.projection @@ -93,20 +92,20 @@ def getstate(self): @Pyro4.expose @utils.synchronous('lock_update') def reset(self): - logger.info("resetting worker #%i" % self.myid) + logger.info("resetting worker #%i", self.myid) self.model.projection = self.model.projection.empty_like() self.finished = False @Pyro4.oneway def exit(self): - logger.info("terminating worker #%i" % self.myid) + logger.info("terminating worker #%i", self.myid) os._exit(0) # endclass Worker def main(): logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) - logger.info("running %s" % " ".join(sys.argv)) + logger.info("running %s", " ".join(sys.argv)) program = os.path.basename(sys.argv[0]) # make sure we have enough cmd line parameters @@ -116,7 +115,7 @@ def main(): utils.pyro_daemon('gensim.lsi_worker', Worker(), random_suffix=True) - logger.info("finished running %s" % program) + logger.info("finished running %s", program) if __name__ == '__main__': diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index f9e16cd23b..e55a009ab8 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -161,8 +161,9 @@ def merge(self, other, decay=1.0): self.s = other.s.copy() return if self.m != other.m: - raise ValueError("vector space mismatch: update is using %s features, expected %s" % - (other.m, self.m)) + raise ValueError( + "vector space mismatch: update is using %s features, expected %s" % (other.m, self.m) + ) logger.info("merging projections: %s + %s", str(self.u.shape), str(other.u.shape)) m, n1, n2 = self.u.shape[0], self.u.shape[1], other.u.shape[1] # TODO Maybe keep the bases as elementary reflectors, without @@ -182,8 +183,10 @@ def merge(self, other, decay=1.0): assert not other.u # find the rotation that diagonalizes r - k = np.bmat([[np.diag(decay * self.s), np.multiply(c, other.s)], - [matutils.pad(np.array([]).reshape(0, 0), min(m, n2), n1), np.multiply(r, other.s)]]) + k = np.bmat([ + [np.diag(decay * self.s), np.multiply(c, other.s)], + [matutils.pad(np.array([]).reshape(0, 0), min(m, n2), n1), np.multiply(r, other.s)] + ]) logger.debug("computing SVD of %s dense matrix", k.shape) try: # in np < 1.1.0, running SVD sometimes results in "LinAlgError: SVD did not converge'. 
@@ -216,9 +219,6 @@ def merge(self, other, decay=1.0): for i in xrange(self.u.shape[1]): if self.u[0, i] < 0.0: self.u[:, i] *= -1.0 -# diff = np.dot(self.u.T, self.u) - np.eye(self.u.shape[1]) -# logger.info('orth error after=%f' % np.sum(diff * diff)) -# endclass Projection class LsiModel(interfaces.TransformationABC, basemodel.BaseTopicModel): @@ -300,7 +300,9 @@ def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000, self.num_terms = 1 + (max(self.id2word.keys()) if self.id2word else -1) self.docs_processed = 0 - self.projection = Projection(self.num_terms, self.num_topics, power_iters=self.power_iters, extra_dims=self.extra_samples) + self.projection = Projection( + self.num_terms, self.num_topics, power_iters=self.power_iters, extra_dims=self.extra_samples + ) self.numworkers = 1 if not distributed: @@ -308,16 +310,18 @@ def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000, self.dispatcher = None else: if not onepass: - raise NotImplementedError("distributed stochastic LSA not implemented yet; " - "run either distributed one-pass, or serial randomized.") + raise NotImplementedError( + "distributed stochastic LSA not implemented yet; " + "run either distributed one-pass, or serial randomized." + ) try: import Pyro4 dispatcher = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher') logger.debug("looking for dispatcher at %s", str(dispatcher._pyroUri)) - dispatcher.initialize(id2word=self.id2word, num_topics=num_topics, - chunksize=chunksize, decay=decay, - power_iters=self.power_iters, extra_samples=self.extra_samples, - distributed=False, onepass=onepass) + dispatcher.initialize( + id2word=self.id2word, num_topics=num_topics, chunksize=chunksize, decay=decay, + power_iters=self.power_iters, extra_samples=self.extra_samples, distributed=False, onepass=onepass + ) self.dispatcher = dispatcher self.numworkers = len(dispatcher.getworkers()) logger.info("using distributed version with %i workers", self.numworkers) @@ -359,7 +363,8 @@ def add_documents(self, corpus, chunksize=None, decay=None): update.u, update.s = stochastic_svd( corpus, self.num_topics, num_terms=self.num_terms, chunksize=chunksize, - extra_dims=self.extra_samples, power_iters=self.power_iters) + extra_dims=self.extra_samples, power_iters=self.power_iters + ) self.projection.merge(update, decay=decay) self.docs_processed += len(corpus) if hasattr(corpus, '__len__') else 0 else: @@ -385,7 +390,10 @@ def add_documents(self, corpus, chunksize=None, decay=None): logger.info("dispatched documents up to #%s", doc_no) else: # serial version, there is only one "worker" (myself) => process the job directly - update = Projection(self.num_terms, self.num_topics, job, extra_dims=self.extra_samples, power_iters=self.power_iters) + update = Projection( + self.num_terms, self.num_topics, job, extra_dims=self.extra_samples, + power_iters=self.power_iters + ) del job self.projection.merge(update, decay=decay) del update @@ -397,19 +405,21 @@ def add_documents(self, corpus, chunksize=None, decay=None): logger.info("reached the end of input; now waiting for all remaining jobs to finish") self.projection = self.dispatcher.getstate() self.docs_processed += doc_no -# logger.info("top topics after adding %i documents" % doc_no) -# self.print_debug(10) else: assert not self.dispatcher, "must be in serial mode to receive jobs" assert self.onepass, "distributed two-pass algo not supported yet" - update = Projection(self.num_terms, self.num_topics, corpus.tocsc(), extra_dims=self.extra_samples, 
power_iters=self.power_iters) + update = Projection( + self.num_terms, self.num_topics, corpus.tocsc(), extra_dims=self.extra_samples, + power_iters=self.power_iters + ) self.projection.merge(update, decay=decay) logger.info("processed sparse job of %i documents", corpus.shape[1]) self.docs_processed += corpus.shape[1] def __str__(self): return "LsiModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % ( - self.num_terms, self.num_topics, self.decay, self.chunksize) + self.num_terms, self.num_topics, self.decay, self.chunksize + ) def __getitem__(self, bow, scaled=False, chunksize=512): """ @@ -579,9 +589,8 @@ def load(cls, fname, *args, **kwargs): try: result.projection = super(LsiModel, cls).load(projection_fname, *args, **kwargs) except Exception as e: - logging.warning("failed to load projection from %s: %s" % (projection_fname, e)) + logging.warning("failed to load projection from %s: %s", projection_fname, e) return result -# endclass LsiModel def print_debug(id2token, u, s, topics, num_words=10, num_neg=None): @@ -679,7 +688,7 @@ def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None, q, _ = matutils.qr_destroy(y) # orthonormalize the range logger.debug("running %i power iterations", power_iters) - for power_iter in xrange(power_iters): + for _ in xrange(power_iters): q = corpus.T * q q = [corpus * q] q, _ = matutils.qr_destroy(q) # orthonormalize the range after each power iteration step @@ -697,8 +706,10 @@ def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None, num_docs += n logger.debug("multiplying chunk * gauss") o = np.random.normal(0.0, 1.0, (n, samples)).astype(dtype) # draw a random gaussian matrix - sparsetools.csc_matvecs(m, n, samples, chunk.indptr, chunk.indices, # y = y + chunk * o - chunk.data, o.ravel(), y.ravel()) + sparsetools.csc_matvecs( + m, n, samples, chunk.indptr, chunk.indices, # y = y + chunk * o + chunk.data, o.ravel(), y.ravel() + ) del chunk, o y = [y] q, _ = matutils.qr_destroy(y) # orthonormalize the range @@ -723,7 +734,7 @@ def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None, if scipy.sparse.issparse(corpus): b = qt * corpus - logger.info("2nd phase: running dense svd on %s matrix" % str(b.shape)) + logger.info("2nd phase: running dense svd on %s matrix", str(b.shape)) u, s, vt = scipy.linalg.svd(b, full_matrices=False) del b, vt else: diff --git a/gensim/models/normmodel.py b/gensim/models/normmodel.py index a78dc604dc..31e843beb3 100644 --- a/gensim/models/normmodel.py +++ b/gensim/models/normmodel.py @@ -55,7 +55,7 @@ def calc_norm(self, corpus): """ Calculates the norm by calling matutils.unitvec with the norm parameter. """ - logger.info("Performing %s normalization..." 
% (self.norm)) + logger.info("Performing %s normalization...", self.norm) norms = [] numnnz = 0 docno = 0 @@ -73,4 +73,3 @@ def normalize(self, bow): def __getitem__(self, bow): return self.normalize(bow) -# endclass NormModel diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 1f0826258c..263968526f 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -108,9 +108,8 @@ class Phrases(interfaces.TransformationABC): """ - def __init__(self, sentences=None, min_count=5, threshold=10.0, - max_vocab_size=40000000, delimiter=b'_', progress_per=10000, - scoring='default'): + def __init__(self, sentences=None, min_count=5, threshold=10.0, max_vocab_size=40000000, + delimiter=b'_', progress_per=10000, scoring='default'): """ Initialize the model from an iterable of `sentences`. Each sentence must be a list of words (unicode strings) that will be used for training. @@ -179,7 +178,8 @@ def __str__(self): """Get short string representation of this phrase detector.""" return "%s<%i vocab, min_count=%s, threshold=%s, max_vocab_size=%s>" % ( self.__class__.__name__, len(self.vocab), self.min_count, - self.threshold, self.max_vocab_size) + self.threshold, self.max_vocab_size + ) @staticmethod def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000): @@ -191,8 +191,10 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000): min_reduce = 1 for sentence_no, sentence in enumerate(sentences): if sentence_no % progress_per == 0: - logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" % - (sentence_no, total_words, len(vocab))) + logger.info( + "PROGRESS: at sentence #%i, processed %i words and %i word types", + sentence_no, total_words, len(vocab) + ) sentence = [utils.any2utf8(w) for w in sentence] for bigram in zip(sentence, sentence[1:]): vocab[bigram[0]] += 1 @@ -208,8 +210,10 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000): utils.prune_vocab(vocab, min_reduce) min_reduce += 1 - logger.info("collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences" % - (len(vocab), total_words, sentence_no + 1)) + logger.info( + "collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences", + len(vocab), total_words, sentence_no + 1 + ) return min_reduce, vocab, total_words def add_vocab(self, sentences): @@ -262,11 +266,9 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): corpus_word_count = self.corpus_word_count if scoring == 'default': - scoring_function = \ - partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count)) + scoring_function = partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count)) elif scoring == 'npmi': - scoring_function = \ - partial(self.npmi_scorer, corpus_word_count=corpus_word_count) + scoring_function = partial(self.npmi_scorer, corpus_word_count=corpus_word_count) # no else here to catch unknown scoring function, check is done in Phrases.__init__ for sentence in sentences: @@ -282,10 +284,6 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): count_b = float(vocab[word_b]) count_ab = float(vocab[bigram_word]) score = scoring_function(count_a, count_b, count_ab) - # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s", - # bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score) - # added mincount check because if the scorer doesn't contain min_count 
- # it would not be enforced otherwise if score > threshold and count_ab >= min_count: if as_tuples: yield ((word_a, word_b), score) @@ -336,8 +334,6 @@ def __getitem__(self, sentence): pb = float(vocab[word_b]) pab = float(vocab[bigram_word]) score = (pab - min_count) / pa / pb * len(vocab) - # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s", - # bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score) if score > threshold: new_s.append(bigram_word) last_bigram = True @@ -453,7 +449,7 @@ def __getitem__(self, sentence): if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) - logging.info("running %s" % " ".join(sys.argv)) + logging.info("running %s", " ".join(sys.argv)) # check and process cmdline input program = os.path.basename(sys.argv[0]) diff --git a/gensim/models/rpmodel.py b/gensim/models/rpmodel.py index 1186a041a0..f5753c75c5 100644 --- a/gensim/models/rpmodel.py +++ b/gensim/models/rpmodel.py @@ -61,7 +61,7 @@ def initialize(self, corpus): self.num_terms = 1 + max([-1] + self.id2word.keys()) shape = self.num_topics, self.num_terms - logger.info("constructing %s random matrix" % str(shape)) + logger.info("constructing %s random matrix", str(shape)) # Now construct the projection matrix itself. # Here i use a particular form, derived in "Achlioptas: Database-friendly random projection", # and his (1) scenario of Theorem 1.1 in particular (all entries are +1/-1). @@ -89,10 +89,11 @@ def __getitem__(self, bow): vec = matutils.sparse2full(bow, self.num_terms).reshape(self.num_terms, 1) / np.sqrt(self.num_topics) vec = np.asfortranarray(vec, dtype=np.float32) topic_dist = np.dot(self.projection, vec) # (k, d) * (d, 1) = (k, 1) - return [(topicid, float(topicvalue)) for topicid, topicvalue in enumerate(topic_dist.flat) - if np.isfinite(topicvalue) and not np.allclose(topicvalue, 0.0)] + return [ + (topicid, float(topicvalue)) for topicid, topicvalue in enumerate(topic_dist.flat) + if np.isfinite(topicvalue) and not np.allclose(topicvalue, 0.0) + ] def __setstate__(self, state): self.__dict__ = state self.freshly_loaded = True -# endclass RpModel diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 4b5ba02e02..50320ad747 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -28,7 +28,7 @@ def precompute_idfs(wglobal, dfs, total_docs): """Precompute the inverse document frequency mapping for all terms.""" # not strictly necessary and could be computed on the fly in TfidfModel__getitem__. # this method is here just to speed things up a little. - return dict((termid, wglobal(df, total_docs)) for termid, df in iteritems(dfs)) + return {termid: wglobal(df, total_docs) for termid, df in iteritems(dfs)} class TfidfModel(interfaces.TransformationABC): @@ -49,9 +49,8 @@ class TfidfModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods. """ - def __init__( - self, corpus=None, id2word=None, dictionary=None, - wlocal=utils.identity, wglobal=df2idf, normalize=True): + def __init__(self, corpus=None, id2word=None, dictionary=None, + wlocal=utils.identity, wglobal=df2idf, normalize=True): """ Compute tf-idf by multiplying a local component (term frequency) with a global component (inverse document frequency), and normalizing @@ -89,7 +88,8 @@ def __init__( # step that goes through the corpus (= an optimization). 
if corpus is not None: logger.warning( - "constructor received both corpus and explicit inverse document frequencies; ignoring the corpus") + "constructor received both corpus and explicit inverse document frequencies; ignoring the corpus" + ) self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz self.dfs = dictionary.dfs.copy() self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) @@ -129,7 +129,8 @@ def initialize(self, corpus): n_features = max(dfs) if dfs else 0 logger.info( "calculating IDF weights for %i documents and %i features (%i matrix non-zeros)", - self.num_docs, n_features, self.num_nnz) + self.num_docs, n_features, self.num_nnz + ) self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) def __getitem__(self, bow, eps=1e-12): @@ -158,4 +159,3 @@ def __getitem__(self, bow, eps=1e-12): # make sure there are no explicit zeroes in the vector (must be sparse) vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps] return vector -# endclass TfidfModel diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 255b9c553f..2111478c00 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -159,7 +159,9 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start): # don't train on the `word` itself if pos2 != pos: - train_sg_pair(model, model.wv.index2word[word.index], word2.index, alpha, compute_loss=compute_loss) + train_sg_pair( + model, model.wv.index2word[word.index], word2.index, alpha, compute_loss=compute_loss + ) result += len(word_vocabs) return result @@ -382,11 +384,10 @@ class Word2Vec(utils.SaveLoad): """ - def __init__( - self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, - max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False): + def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, + max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, + sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, + trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False): """ Initialize the model from an iterable of `sentences`. Each sentence is a list of words (unicode strings) that will be used for training. @@ -460,9 +461,9 @@ def __init__( self.load = call_on_class_only if FAST_VERSION == -1: - logger.warning('Slow version of {0} is being used'.format(__name__)) + logger.warning('Slow version of %s is being used', __name__) else: - logger.debug('Fast version of {0} is being used'.format(__name__)) + logger.debug('Fast version of %s is being used', __name__) self.initialize_word_vectors() self.sg = int(sg) @@ -498,12 +499,14 @@ def __init__( if isinstance(sentences, GeneratorType): raise TypeError("You can't pass a generator as the sentences argument. 
Try an iterator.") self.build_vocab(sentences, trim_rule=trim_rule) - self.train(sentences, total_examples=self.corpus_count, epochs=self.iter, - start_alpha=self.alpha, end_alpha=self.min_alpha) + self.train(sentences, total_examples=self.corpus_count, epochs=self.iter, start_alpha=self.alpha, end_alpha=self.min_alpha) else: if trim_rule is not None: - logger.warning("The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. ") - logger.warning("Model initialized without sentences. trim_rule provided, if any, will be ignored.") + logger.warning( + "The rule, if given, is only used to prune vocabulary during build_vocab() " + "and is not stored as part of the model. Model initialized without sentences. " + "trim_rule provided, if any, will be ignored." + ) def initialize_word_vectors(self): self.wv = KeyedVectors() @@ -546,7 +549,9 @@ def create_binary_tree(self): heapq.heapify(heap) for i in xrange(len(self.wv.vocab) - 1): min1, min2 = heapq.heappop(heap), heapq.heappop(heap) - heapq.heappush(heap, Vocab(count=min1.count + min2.count, index=i + len(self.wv.vocab), left=min1, right=min2)) + heapq.heappush( + heap, Vocab(count=min1.count + min2.count, index=i + len(self.wv.vocab), left=min1, right=min2) + ) # recurse over the tree, assigning a binary code to each vocabulary word if heap: @@ -587,13 +592,16 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): if not checked_string_types: if isinstance(sentence, string_types): logger.warning( - "Each 'sentences' item should be a list of words (usually unicode strings)." - "First item here is instead plain %s.", type(sentence) + "Each 'sentences' item should be a list of words (usually unicode strings). " + "First item here is instead plain %s.", + type(sentence) ) checked_string_types += 1 if sentence_no % progress_per == 0: - logger.info("PROGRESS: at sentence #%i, processed %i words, keeping %i word types", - sentence_no, sum(itervalues(vocab)) + total_words, len(vocab)) + logger.info( + "PROGRESS: at sentence #%i, processed %i words, keeping %i word types", + sentence_no, sum(itervalues(vocab)) + total_words, len(vocab) + ) for word in sentence: vocab[word] += 1 @@ -602,12 +610,15 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): min_reduce += 1 total_words += sum(itervalues(vocab)) - logger.info("collected %i word types from a corpus of %i raw words and %i sentences", - len(vocab), total_words, sentence_no + 1) + logger.info( + "collected %i word types from a corpus of %i raw words and %i sentences", + len(vocab), total_words, sentence_no + 1 + ) self.corpus_count = sentence_no + 1 self.raw_vocab = vocab - def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab=False, trim_rule=None, update=False): + def scale_vocab(self, min_count=None, sample=None, dry_run=False, + keep_raw_vocab=False, trim_rule=None, update=False): """ Apply vocabulary settings for `min_count` (discarding less-frequent words) and `sample` (controlling the downsampling of more-frequent words). 
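A minimal usage sketch of the `scale_vocab()` settings documented above (the toy `sentences` corpus and parameter values are hypothetical, and this follows the pre-refactor API shown in this patch; `dry_run=True` only returns the report dict built in the next hunk and does not modify the model):

    from gensim.models import Word2Vec

    sentences = [["human", "interface", "computer"], ["survey", "user", "computer", "system"]]
    model = Word2Vec(size=50, min_count=1)   # no corpus yet: only hyperparameters are set
    model.scan_vocab(sentences)              # collect raw term frequencies into model.raw_vocab
    report = model.scale_vocab(min_count=1, sample=1e-3, dry_run=True)  # preview only
    print(report['drop_unique'], report['retain_total'], report['downsample_total'], report['memory'])
    model.scale_vocab(min_count=1, sample=1e-3)  # apply the same settings for real
    model.finalize_vocab()
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)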
@@ -648,12 +659,16 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab drop_total += v original_unique_total = len(retain_words) + drop_unique retain_unique_pct = len(retain_words) * 100 / max(original_unique_total, 1) - logger.info("min_count=%d retains %i unique words (%i%% of original %i, drops %i)", - min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique) + logger.info( + "min_count=%d retains %i unique words (%i%% of original %i, drops %i)", + min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique + ) original_total = retain_total + drop_total retain_pct = retain_total * 100 / max(original_total, 1) - logger.info("min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)", - min_count, retain_total, retain_pct, original_total, drop_total) + logger.info( + "min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)", + min_count, retain_total, retain_pct, original_total, drop_total + ) else: logger.info("Updating model with new vocabulary") new_total = pre_exist_total = 0 @@ -677,10 +692,12 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab original_unique_total = len(pre_exist_words) + len(new_words) + drop_unique pre_exist_unique_pct = len(pre_exist_words) * 100 / max(original_unique_total, 1) new_unique_pct = len(new_words) * 100 / max(original_unique_total, 1) - logger.info("""New added %i unique words (%i%% of original %i) - and increased the count of %i pre-existing words (%i%% of original %i)""", - len(new_words), new_unique_pct, original_unique_total, - len(pre_exist_words), pre_exist_unique_pct, original_unique_total) + logger.info( + "New added %i unique words (%i%% of original %i) " + "and increased the count of %i pre-existing words (%i%% of original %i)", + len(new_words), new_unique_pct, original_unique_total, len(pre_exist_words), + pre_exist_unique_pct, original_unique_total + ) retain_words = new_words + pre_exist_words retain_total = new_total + pre_exist_total @@ -713,15 +730,16 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab self.raw_vocab = defaultdict(int) logger.info("sample=%g downsamples %i most-common words", sample, downsample_unique) - logger.info("downsampling leaves estimated %i word corpus (%.1f%% of prior %i)", - downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total) - - # return from each step: words-affected, resulting-corpus-size - report_values = {'drop_unique': drop_unique, 'retain_total': retain_total, - 'downsample_unique': downsample_unique, 'downsample_total': int(downsample_total)} + logger.info( + "downsampling leaves estimated %i word corpus (%.1f%% of prior %i)", + downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total + ) - # print extra memory estimates - report_values['memory'] = self.estimate_memory(vocab_size=len(retain_words)) + # return from each step: words-affected, resulting-corpus-size, extra memory estimates + report_values = { + 'drop_unique': drop_unique, 'retain_total': retain_total, 'downsample_unique': downsample_unique, + 'downsample_total': int(downsample_total), 'memory': self.estimate_memory(vocab_size=len(retain_words)) + } return report_values @@ -787,8 +805,7 @@ def _raw_word_count(self, job): return sum(len(sentence) for sentence in job) def train(self, sentences, total_examples=None, total_words=None, - epochs=None, start_alpha=None, end_alpha=None, - word_count=0, + epochs=None, 
start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0, compute_loss=None): """ Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). @@ -804,11 +821,13 @@ def train(self, sentences, total_examples=None, total_words=None, explicit `epochs` argument MUST be provided. In the common and recommended case, where `train()` is only called once, the model's cached `iter` value should be supplied as `epochs` value. """ - if (self.model_trimmed_post_training): + if self.model_trimmed_post_training: raise RuntimeError("Parameters for training were discarded using model_trimmed_post_training method") if FAST_VERSION < 0: - warnings.warn("C extension not loaded for Word2Vec, training will be slow. " - "Install a C compiler and reinstall gensim for fast training.") + warnings.warn( + "C extension not loaded for Word2Vec, training will be slow. " + "Install a C compiler and reinstall gensim for fast training." + ) self.neg_labels = [] if self.negative > 0: # precompute negative labels optimization for pure-python training @@ -822,8 +841,8 @@ def train(self, sentences, total_examples=None, total_words=None, logger.info( "training model with %i workers on %i vocabulary and %i features, " "using sg=%s hs=%s sample=%s negative=%s window=%s", - self.workers, len(self.wv.vocab), self.layer1_size, self.sg, - self.hs, self.sample, self.negative, self.window) + self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative, self.window + ) if not self.wv.vocab: raise RuntimeError("you must first build vocabulary before training the model") @@ -834,10 +853,15 @@ def train(self, sentences, total_examples=None, total_words=None, raise ValueError( "The number of sentences in the training corpus is missing. Did you load the model via KeyedVectors.load_word2vec_format?" "Models loaded via load_word2vec_format don't support further training. " - "Instead start with a blank model, scan_vocab on the new corpus, intersect_word2vec_format with the old model, then train.") + "Instead start with a blank model, scan_vocab on the new corpus, " + "intersect_word2vec_format with the old model, then train." + ) if total_words is None and total_examples is None: - raise ValueError("You must specify either total_examples or total_words, for proper alpha and progress calculations. The usual value is total_examples=model.corpus_count.") + raise ValueError( + "You must specify either total_examples or total_words, for proper alpha and progress calculations. " + "The usual value is total_examples=model.corpus_count." + ) if epochs is None: raise ValueError("You must specify an explict epochs count. 
The usual value is epochs=model.iter.") start_alpha = start_alpha or self.alpha @@ -872,9 +896,7 @@ def job_producer(): pushed_words, pushed_examples = 0, 0 next_alpha = start_alpha if next_alpha > self.min_alpha_yet_reached: - logger.warning( - "Effective 'alpha' higher than previous training cycles" - ) + logger.warning("Effective 'alpha' higher than previous training cycles") self.min_alpha_yet_reached = next_alpha job_no = 0 @@ -890,7 +912,8 @@ def job_producer(): # no => submit the existing job logger.debug( "queueing job #%i (%i words, %i sentences) at alpha %.05f", - job_no, batch_size, len(job_batch), next_alpha) + job_no, batch_size, len(job_batch), next_alpha + ) job_no += 1 job_queue.put((job_batch, next_alpha)) @@ -914,15 +937,15 @@ def job_producer(): if job_batch: logger.debug( "queueing job #%i (%i words, %i sentences) at alpha %.05f", - job_no, batch_size, len(job_batch), next_alpha) + job_no, batch_size, len(job_batch), next_alpha + ) job_no += 1 job_queue.put((job_batch, next_alpha)) if job_no == 0 and self.train_count == 0: logger.warning( "train() called with an empty iterator (if not intended, " - "be sure to provide a corpus that offers restartable " - "iteration = an iterable)." + "be sure to provide a corpus that offers restartable iteration = an iterable)." ) # give the workers heads up that they can finish -- no more work! @@ -967,34 +990,31 @@ def job_producer(): logger.info( "PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i", 100.0 * example_count / total_examples, trained_word_count / elapsed, - utils.qsize(job_queue), utils.qsize(progress_queue)) + utils.qsize(job_queue), utils.qsize(progress_queue) + ) else: # words-based progress % logger.info( "PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i", 100.0 * raw_word_count / total_words, trained_word_count / elapsed, - utils.qsize(job_queue), utils.qsize(progress_queue)) + utils.qsize(job_queue), utils.qsize(progress_queue) + ) next_report = elapsed + report_delay # all done; report the final stats elapsed = default_timer() - start logger.info( "training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s", - raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed) + raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed + ) if job_tally < 10 * self.workers: - logger.warning( - "under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay" - ) + logger.warning("under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay") # check that the input corpus hasn't changed during iteration if total_examples and total_examples != example_count: - logger.warning( - "supplied example count (%i) did not equal expected count (%i)", example_count, total_examples - ) + logger.warning("supplied example count (%i) did not equal expected count (%i)", example_count, total_examples) if total_words and total_words != raw_word_count: - logger.warning( - "supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words - ) + logger.warning("supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words) self.train_count += 1 # number of times train() has been called self.total_train_time += elapsed @@ -1021,21 +1041,25 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor """ if FAST_VERSION < 0: - warnings.warn("C extension compilation failed, scoring will be slow. 
" - "Install a C compiler and reinstall gensim for fastness.") + warnings.warn( + "C extension compilation failed, scoring will be slow. " + "Install a C compiler and reinstall gensim for fastness." + ) logger.info( "scoring sentences with %i workers on %i vocabulary and %i features, " "using sg=%s hs=%s sample=%s and negative=%s", - self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative) + self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative + ) if not self.wv.vocab: raise RuntimeError("you must first build vocabulary before scoring new data") if not self.hs: - raise RuntimeError("We have currently only implemented score \ - for the hierarchical softmax scheme, so you need to have \ - run word2vec with hs=1 and negative=0 for this to work.") + raise RuntimeError( + "We have currently only implemented score for the hierarchical softmax scheme, " + "so you need to have run word2vec with hs=1 and negative=0 for this to work." + ) def worker_loop(): """Compute log probability for each sentence, lifting lists of sentences from the jobs queue.""" @@ -1081,15 +1105,14 @@ def worker_loop(): if (job_no - 1) * chunksize > total_sentences: logger.warning( "terminating after %i sentences (set higher total_sentences if you want more).", - total_sentences) + total_sentences + ) job_no -= 1 raise StopIteration() logger.debug("putting job #%i in the queue", job_no) job_queue.put(items) except StopIteration: - logger.info( - "reached end of input; waiting to finish %i outstanding jobs", - job_no - done_jobs + 1) + logger.info("reached end of input; waiting to finish %i outstanding jobs", job_no - done_jobs + 1) for _ in xrange(self.workers): job_queue.put(None) # give the workers heads up that they can finish -- no more work! push_done = True @@ -1102,7 +1125,8 @@ def worker_loop(): if elapsed >= next_report: logger.info( "PROGRESS: at %.2f%% sentences, %.0f sentences/s", - 100.0 * sentence_count, sentence_count / elapsed) + 100.0 * sentence_count, sentence_count / elapsed + ) next_report = elapsed + report_delay # don't flood log, wait report_delay seconds else: # loop ended by job count; really done @@ -1114,7 +1138,8 @@ def worker_loop(): self.clear_sims() logger.info( "scoring %i sentences took %.1fs, %.0f sentences/s", - sentence_count, elapsed, sentence_count / elapsed) + sentence_count, elapsed, sentence_count / elapsed + ) return sentence_scores[:sentence_count] def clear_sims(self): @@ -1141,9 +1166,10 @@ def update_weights(self): # Raise an error if an online update is run before initial training on a corpus if not len(self.wv.syn0): - raise RuntimeError("You cannot do an online vocabulary-update of a model which has no prior vocabulary. " - "First build the vocabulary of your model with a corpus " - "before doing an online update.") + raise RuntimeError( + "You cannot do an online vocabulary-update of a model which has no prior vocabulary. " + "First build the vocabulary of your model with a corpus before doing an online update." + ) self.wv.syn0 = vstack([self.wv.syn0, newsyn0]) @@ -1192,16 +1218,16 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut training. Use 1.0 to allow further training updates of merged vectors. 
""" overlap_count = 0 - logger.info("loading projection weights from %s" % (fname)) + logger.info("loading projection weights from %s", fname) with utils.smart_open(fname) as fin: header = utils.to_unicode(fin.readline(), encoding=encoding) - vocab_size, vector_size = map(int, header.split()) # throws for invalid file format + vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format if not vector_size == self.vector_size: raise ValueError("incompatible vector size %d in file %s" % (vector_size, fname)) # TOCONSIDER: maybe mismatched vectors still useful enough to merge (truncating/padding)? if binary: binary_len = dtype(REAL).itemsize * vector_size - for line_no in xrange(vocab_size): + for _ in xrange(vocab_size): # mixed text and binary: read text first, then binary word = [] while True: @@ -1220,13 +1246,13 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut for line_no, line in enumerate(fin): parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ") if len(parts) != vector_size + 1: - raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no)) - word, weights = parts[0], list(map(REAL, parts[1:])) + raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no) + word, weights = parts[0], [REAL(x) for x in parts[1:]] if word in self.wv.vocab: overlap_count += 1 self.wv.syn0[self.wv.vocab[word].index] = weights self.syn0_lockf[self.wv.vocab[word].index] = lockf # lock-factor: 0.0 stops further changes - logger.info("merged %d vectors into %s matrix from %s" % (overlap_count, self.wv.syn0.shape, fname)) + logger.info("merged %d vectors into %s matrix from %s", overlap_count, self.wv.syn0.shape, fname) def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None): """ @@ -1301,9 +1327,10 @@ def n_similarity(self, ws1, ws2): def predict_output_word(self, context_words_list, topn=10): """Report the probability distribution of the center word given the context words as input to the trained model.""" if not self.negative: - raise RuntimeError("We have currently only implemented predict_output_word " - "for the negative sampling scheme, so you need to have " - "run word2vec with negative > 0 for this to work.") + raise RuntimeError( + "We have currently only implemented predict_output_word for the negative sampling scheme, " + "so you need to have run word2vec with negative > 0 for this to work." + ) if not hasattr(self.wv, 'syn0') or not hasattr(self, 'syn1neg'): raise RuntimeError("Parameters required for predicting the output words not found.") @@ -1344,8 +1371,10 @@ def estimate_memory(self, vocab_size=None, report=None): if self.negative: report['syn1neg'] = vocab_size * self.layer1_size * dtype(REAL).itemsize report['total'] = sum(report.values()) - logger.info("estimated required memory for %i words and %i dimensions: %i bytes", - vocab_size, self.vector_size, report['total']) + logger.info( + "estimated required memory for %i words and %i dimensions: %i bytes", + vocab_size, self.vector_size, report['total'] + ) return report @staticmethod @@ -1375,7 +1404,11 @@ def __str__(self): return "%s(vocab=%s, size=%s, alpha=%s)" % (self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha) def _minimize_model(self, save_syn1=False, save_syn1neg=False, save_syn0_lockf=False): - warnings.warn("This method would be deprecated in the future. 
Keep just_word_vectors = model.wv to retain just the KeyedVectors instance for read-only querying of word vectors.") + warnings.warn( + "This method would be deprecated in the future. " + "Keep just_word_vectors = model.wv to retain just the KeyedVectors instance " + "for read-only querying of word vectors." + ) if save_syn1 and save_syn1neg and save_syn0_lockf: return if hasattr(self, 'syn1') and not save_syn1: @@ -1581,19 +1614,19 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): self.input_files = [self.source] # force code compatibility with list of files elif os.path.isdir(self.source): self.source = os.path.join(self.source, '') # ensures os-specific slash at end of path - logging.debug('reading directory ' + self.source) + logging.debug('reading directory %s', self.source) self.input_files = os.listdir(self.source) self.input_files = [self.source + file for file in self.input_files] # make full paths self.input_files.sort() # makes sure it happens in filename order else: # not a file or a directory, then we can't do anything with it raise ValueError('input is neither a file nor a path') - logging.info('files read into PathLineSentences:' + '\n'.join(self.input_files)) + logging.info('files read into PathLineSentences:%s', '\n'.join(self.input_files)) def __iter__(self): - '''iterate through the files''' + """iterate through the files""" for file_name in self.input_files: - logging.info('reading file ' + file_name) + logging.info('reading file %s', file_name) with utils.smart_open(file_name) as fin: for line in itertools.islice(fin, self.limit): line = utils.to_unicode(line).split() @@ -1649,7 +1682,8 @@ def __iter__(self): model = Word2Vec( corpus, size=args.size, min_count=args.min_count, workers=args.threads, window=args.window, sample=args.sample, sg=skipgram, hs=args.hs, - negative=args.negative, cbow_mean=1, iter=args.iter) + negative=args.negative, cbow_mean=1, iter=args.iter + ) if args.output: outfile = args.output diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index b60f158c65..fe988fc24c 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -579,7 +579,7 @@ cdef void score_pair_sg_hs( row2 = word_point[b] * size f = our_dot(&size, &syn0[row1], &ONE, &syn1[row2], &ONE) sgn = (-1)**word_code[b] # ch function: 0-> 1, 1 -> -1 - f = sgn*f + f *= sgn if f <= -MAX_EXP or f >= MAX_EXP: continue f = LOG_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] @@ -673,7 +673,7 @@ cdef void score_pair_cbow_hs( row2 = word_point[b] * size f = our_dot(&size, neu1, &ONE, &syn1[row2], &ONE) sgn = (-1)**word_code[b] # ch function: 0-> 1, 1 -> -1 - f = sgn*f + f *= sgn if f <= -MAX_EXP or f >= MAX_EXP: continue f = LOG_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py index bc02663e04..1f450a457a 100644 --- a/gensim/models/wrappers/dtmmodel.py +++ b/gensim/models/wrappers/dtmmodel.py @@ -41,9 +41,9 @@ class DtmModel(utils.SaveLoad): """ - def __init__( - self, dtm_path, corpus=None, time_slices=None, mode='fit', model='dtm', num_topics=100, id2word=None, prefix=None, - lda_sequence_min_iter=6, lda_sequence_max_iter=20, lda_max_em_iter=10, alpha=0.01, top_chain_var=0.005, rng_seed=0, initialize_lda=True): + def __init__(self, dtm_path, corpus=None, time_slices=None, mode='fit', model='dtm', num_topics=100, + id2word=None, prefix=None, lda_sequence_min_iter=6, lda_sequence_max_iter=20, 
lda_max_em_iter=10, + alpha=0.01, top_chain_var=0.005, rng_seed=0, initialize_lda=True): """ `dtm_path` is path to the dtm executable, e.g. `C:/dtm/dtm-win64.exe`. @@ -88,7 +88,7 @@ def __init__( try: lencorpus = len(corpus) - except Exception: + except TypeError: logger.warning("input corpus stream has no len(); counting documents") lencorpus = sum(1 for _ in corpus) if lencorpus == 0: @@ -97,8 +97,10 @@ def __init__( raise ValueError("""There is a text without words in the input corpus. This breaks method='fixed' (The DIM model).""") if lencorpus != sum(time_slices): - raise ValueError("mismatched timeslices %{slices} for corpus of len {clen}".format( - slices=sum(time_slices), clen=lencorpus)) + raise ValueError( + "mismatched timeslices %{slices} for corpus of len {clen}" + .format(slices=sum(time_slices), clen=lencorpus) + ) self.lencorpus = lencorpus if prefix is None: rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_' @@ -171,7 +173,7 @@ def convert_input(self, corpus, time_slices): Serialize documents in LDA-C format to a temporary text file,. """ - logger.info("serializing temporary corpus to %s" % self.fcorpustxt()) + logger.info("serializing temporary corpus to %s", self.fcorpustxt()) # write out the corpus in a file format that DTM understands: corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus) @@ -187,17 +189,25 @@ def train(self, corpus, time_slices, mode, model): """ self.convert_input(corpus, time_slices) - arguments = "--ntopics={p0} --model={mofrl} --mode={p1} --initialize_lda={p2} --corpus_prefix={p3} --outname={p4} --alpha={p5}".format( - p0=self.num_topics, mofrl=model, p1=mode, p2=self.initialize_lda, p3=self.fcorpus(), p4=self.foutname(), p5=self.alpha) + arguments = \ + "--ntopics={p0} --model={mofrl} --mode={p1} --initialize_lda={p2} --corpus_prefix={p3} " \ + "--outname={p4} --alpha={p5}".format( + p0=self.num_topics, mofrl=model, p1=mode, p2=self.initialize_lda, + p3=self.fcorpus(), p4=self.foutname(), p5=self.alpha + ) - params = "--lda_max_em_iter={p0} --lda_sequence_min_iter={p1} --lda_sequence_max_iter={p2} --top_chain_var={p3} --rng_seed={p4} ".format( - p0=self.lda_max_em_iter, p1=self.lda_sequence_min_iter, p2=self.lda_sequence_max_iter, p3=self.top_chain_var, p4=self.rng_seed) + params = \ + "--lda_max_em_iter={p0} --lda_sequence_min_iter={p1} --lda_sequence_max_iter={p2} " \ + "--top_chain_var={p3} --rng_seed={p4} ".format( + p0=self.lda_max_em_iter, p1=self.lda_sequence_min_iter, p2=self.lda_sequence_max_iter, + p3=self.top_chain_var, p4=self.rng_seed + ) arguments = arguments + " " + params - logger.info("training DTM with args %s" % arguments) + logger.info("training DTM with args %s", arguments) cmd = [self.dtm_path] + arguments.split() - logger.info("Running command %s" % cmd) + logger.info("Running command %s", cmd) check_output(args=cmd, stderr=PIPE) self.em_steps = np.loadtxt(self.fem_steps()) @@ -255,13 +265,6 @@ def show_topics(self, num_topics=10, times=5, num_words=10, log=False, formatted else: num_topics = min(num_topics, self.num_topics) chosen_topics = range(num_topics) - # add a little random jitter, to randomize results around the same - # alpha - # sort_alpha = self.alpha + 0.0001 * \ - # numpy.random.rand(len(self.alpha)) - # sorted_topics = list(numpy.argsort(sort_alpha)) - # chosen_topics = sorted_topics[: topics / 2] + \ - # sorted_topics[-topics / 2:] if times < 0 or times >= len(self.time_slices): times = len(self.time_slices) @@ -278,9 +281,6 @@ def show_topics(self, num_topics=10, times=5, num_words=10, log=False, 
formatted else: topic = self.show_topic(i, time, num_words=num_words) shown.append(topic) - # if log: - # logger.info("topic #%i (%.3f): %s" % (i, self.alpha[i], - # topic)) return shown def show_topic(self, topicid, time, topn=50, num_words=None): @@ -290,25 +290,30 @@ def show_topic(self, topicid, time, topn=50, num_words=None): """ if num_words is not None: # deprecated num_words is used - logger.warning("The parameter num_words for show_topic() would be deprecated in the updated version.") - logger.warning("Please use topn instead.") + logger.warning( + "The parameter num_words for show_topic() would be deprecated in the updated version. " + "Please use topn instead." + ) topn = num_words topics = self.lambda_[:, :, time] topic = topics[topicid] - # liklihood to probability + # likelihood to probability topic = np.exp(topic) # normalize to probability dist topic = topic / topic.sum() # sort according to prob bestn = matutils.argsort(topic, topn, reverse=True) - beststr = [(topic[id], self.id2word[id]) for id in bestn] + beststr = [(topic[idx], self.id2word[idx]) for idx in bestn] return beststr def print_topic(self, topicid, time, topn=10, num_words=None): """Return the given topic, formatted as a string.""" if num_words is not None: # deprecated num_words is used - warnings.warn("The parameter num_words for print_topic() would be deprecated in the updated version. Please use topn instead.") + warnings.warn( + "The parameter num_words for print_topic() would be deprecated in the updated version. " + "Please use topn instead." + ) topn = num_words return ' + '.join(['%.3f*%s' % v for v in self.show_topic(topicid, time, topn)]) @@ -320,7 +325,7 @@ def dtm_vis(self, corpus, time): input parameter is the year to do the visualisation. """ topic_term = np.exp(self.lambda_[:, :, time]) / np.exp(self.lambda_[:, :, time]).sum() - topic_term = topic_term * self.num_topics + topic_term *= self.num_topics doc_topic = self.gamma_ diff --git a/gensim/models/wrappers/fasttext.py b/gensim/models/wrappers/fasttext.py index 839cb46633..c2c8ee0688 100644 --- a/gensim/models/wrappers/fasttext.py +++ b/gensim/models/wrappers/fasttext.py @@ -19,7 +19,7 @@ Example: >>> from gensim.models.wrappers import FastText ->>> model = fasttext.FastText.train('/Users/kofola/fastText/fasttext', corpus_file='text8') +>>> model = FastText.train('/Users/kofola/fastText/fasttext', corpus_file='text8') >>> print model['forests'] # prints vector for given out-of-vocabulary word .. [1] https://github.com/facebookresearch/fastText#enriching-word-vectors-with-subword-information @@ -41,6 +41,11 @@ logger = logging.getLogger(__name__) +try: + FileNotFoundError +except NameError: + FileNotFoundError = IOError + FASTTEXT_FILEFORMAT_MAGIC = 793712314 @@ -146,7 +151,7 @@ def initialize_word_vectors(self): @classmethod def train(cls, ft_path, corpus_file, output_file=None, model='cbow', size=100, alpha=0.025, window=5, min_count=5, - word_ngrams=1, loss='ns', sample=1e-3, negative=5, iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12): + word_ngrams=1, loss='ns', sample=1e-3, negative=5, iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12): """ `ft_path` is the path to the FastText executable, e.g. `/home/kofola/fastText/fasttext`. 
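The `FileNotFoundError` alias introduced at the top of `fasttext.py` above is the usual Python 2/3 compatibility shim; a small self-contained sketch of the pattern (the file path below is hypothetical):

    # Python 3 ships FileNotFoundError; Python 2 only raises IOError for a missing file.
    # Binding the missing name to IOError lets one except-clause work under both versions.
    try:
        FileNotFoundError
    except NameError:  # Python 2: the builtin does not exist
        FileNotFoundError = IOError

    try:
        open('/tmp/does-not-exist.bin')
    except FileNotFoundError as err:
        print('model file missing: %s' % err)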
@@ -212,7 +217,7 @@ def train(cls, ft_path, corpus_file, output_file=None, model='cbow', size=100, a cmd.append("-%s" % option) cmd.append(str(value)) - output = utils.check_output(args=cmd) # noqa:F841 + utils.check_output(args=cmd) model = cls.load_fasttext_format(output_file) cls.delete_training_files(output_file) return model @@ -265,17 +270,17 @@ def load_model_params(self, file_handle): magic, version = self.struct_unpack(file_handle, '@2i') if magic == FASTTEXT_FILEFORMAT_MAGIC: # newer format self.new_format = True - dim, ws, epoch, minCount, neg, _, loss, model, bucket, minn, maxn, _, t = self.struct_unpack(file_handle, '@12i1d') + dim, ws, epoch, min_count, neg, _, loss, model, bucket, minn, maxn, _, t = self.struct_unpack(file_handle, '@12i1d') else: # older format self.new_format = False dim = magic ws = version - epoch, minCount, neg, _, loss, model, bucket, minn, maxn, _, t = self.struct_unpack(file_handle, '@10i1d') + epoch, min_count, neg, _, loss, model, bucket, minn, maxn, _, t = self.struct_unpack(file_handle, '@10i1d') # Parameters stored by [Args::save](https://github.com/facebookresearch/fastText/blob/master/src/args.cc) self.vector_size = dim self.window = ws self.iter = epoch - self.min_count = minCount + self.min_count = min_count self.negative = neg self.hs = loss == 1 self.sg = model == 2 @@ -307,7 +312,9 @@ def load_dict(self, file_handle, encoding='utf8'): # For more info : https://github.com/facebookresearch/fastText/issues/218 assert word == "__label__", ( - 'mismatched vocab_size ({}) and nwords ({}), extra word "{}"'.format(vocab_size, nwords, word)) + 'mismatched vocab_size ({}) and nwords ({}), extra word "{}"' + .format(vocab_size, nwords, word) + ) continue # don't add word to vocab self.wv.vocab[word] = Vocab(index=i, count=count) @@ -320,7 +327,8 @@ def load_dict(self, file_handle, encoding='utf8'): # expecting to log this warning only for pretrained french vector, wiki.fr logger.warning( "mismatch between final vocab size (%s words), and expected vocab size (%s words)", - len(self.wv.vocab), vocab_size) + len(self.wv.vocab), vocab_size + ) if self.new_format: for j in range(pruneidx_size): @@ -332,7 +340,9 @@ def load_vectors(self, file_handle): num_vectors, dim = self.struct_unpack(file_handle, '@2q') # Vectors stored by [Matrix::save](https://github.com/facebookresearch/fastText/blob/master/src/matrix.cc) assert self.vector_size == dim, ( - 'mismatch between vector size in model params ({}) and model vectors ({})'.format(self.vector_size, dim)) + 'mismatch between vector size in model params ({}) and model vectors ({})' + .format(self.vector_size, dim) + ) float_size = struct.calcsize('@f') if float_size == 4: dtype = np.dtype(np.float32) @@ -343,8 +353,10 @@ def load_vectors(self, file_handle): self.wv.syn0_all = np.fromfile(file_handle, dtype=dtype, count=num_vectors * dim) self.wv.syn0_all = self.wv.syn0_all.reshape((num_vectors, dim)) assert self.wv.syn0_all.shape == (self.bucket + len(self.wv.vocab), self.vector_size), \ - 'mismatch between actual weight matrix shape {} and expected shape {}'.format( - self.wv.syn0_all.shape, (self.bucket + len(self.wv.vocab), self.vector_size)) + 'mismatch between actual weight matrix shape {} and expected shape {}'\ + .format( + self.wv.syn0_all.shape, (self.bucket + len(self.wv.vocab), self.vector_size) + ) self.init_ngrams() @@ -378,7 +390,10 @@ def init_ngrams(self): ngram_weights = self.wv.syn0_all - logger.info("loading weights for %s words for fastText model from %s", len(self.wv.vocab), 
self.file_name) + logger.info( + "loading weights for %s words for fastText model from %s", + len(self.wv.vocab), self.file_name + ) for w, vocab in self.wv.vocab.items(): word_ngrams = self.compute_ngrams(w, self.wv.min_n, self.wv.max_n) @@ -386,7 +401,10 @@ def init_ngrams(self): self.wv.syn0[vocab.index] += np.array(ngram_weights[self.wv.ngrams[word_ngram]]) self.wv.syn0[vocab.index] /= (len(word_ngrams) + 1) - logger.info("loaded %s weight matrix for fastText model from %s", self.wv.syn0.shape, self.file_name) + logger.info( + "loaded %s weight matrix for fastText model from %s", + self.wv.syn0.shape, self.file_name + ) @staticmethod def compute_ngrams(word, min_n, max_n): diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py index a4e435810f..19c93e5f6c 100644 --- a/gensim/models/wrappers/ldamallet.py +++ b/gensim/models/wrappers/ldamallet.py @@ -143,7 +143,7 @@ def convert_input(self, corpus, infer=False, serialize_corpus=True): self.corpus2mallet(corpus, fout) # convert the text file above into MALLET's internal format - cmd = self.mallet_path + ' import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input %s --output %s' + cmd = self.mallet_path + " import-file --preserve-case --keep-sequence --remove-stopwords --token-regex \"\S+\" --input %s --output %s" if infer: cmd += ' --use-pipe-from ' + self.fcorpusmallet() cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer') @@ -158,8 +158,10 @@ def train(self, corpus): '--num-threads %s --output-state %s --output-doc-topics %s --output-topic-keys %s '\ '--num-iterations %s --inferencer-filename %s --doc-topics-threshold %s' cmd = cmd % ( - self.fcorpusmallet(), self.num_topics, self.alpha, self.optimize_interval, self.workers, - self.fstate(), self.fdoctopics(), self.ftopickeys(), self.iterations, self.finferencer(), self.topic_threshold) + self.fcorpusmallet(), self.num_topics, self.alpha, self.optimize_interval, + self.workers, self.fstate(), self.fdoctopics(), self.ftopickeys(), self.iterations, + self.finferencer(), self.topic_threshold + ) # NOTE "--keep-sequence-bigrams" / "--use-ngrams true" poorer results + runs out of memory logger.info("training MALLET LDA with %s", cmd) check_output(args=cmd, shell=True) @@ -176,7 +178,10 @@ def __getitem__(self, bow, iterations=100): self.convert_input(bow, infer=True) cmd = self.mallet_path + ' infer-topics --input %s --inferencer %s --output-doc-topics %s --num-iterations %s --doc-topics-threshold %s' - cmd = cmd % (self.fcorpusmallet() + '.infer', self.finferencer(), self.fdoctopics() + '.infer', iterations, self.topic_threshold) + cmd = cmd % ( + self.fcorpusmallet() + '.infer', self.finferencer(), + self.fdoctopics() + '.infer', iterations, self.topic_threshold + ) logger.info("inferring topics with MALLET LDA '%s'", cmd) check_output(args=cmd, shell=True) result = list(self.read_doctopics(self.fdoctopics() + '.infer')) @@ -255,13 +260,11 @@ def show_topic(self, topicid, topn=10, num_words=None): topn = num_words if self.word_topics is None: - logger.warning( - "Run train or load_word_topics before showing topics." 
- ) + logger.warning("Run train or load_word_topics before showing topics.") topic = self.word_topics[topicid] topic = topic / topic.sum() # normalize to probability dist bestn = matutils.argsort(topic, topn, reverse=True) - beststr = [(self.id2word[id], topic[id]) for id in bestn] + beststr = [(self.id2word[idx], topic[idx]) for idx in bestn] return beststr def get_version(self, direc_path): @@ -305,14 +308,9 @@ def read_doctopics(self, fname, eps=1e-6, renorm=True): # the MALLET doctopic format changed in 2.0.8 to exclude the id, # this handles the file differently dependent on the pattern if len(parts) == 2 * self.num_topics: - doc = [(id_, weight) - for id_, weight in zip(map(int, parts[::2]), - map(float, parts[1::2])) - if abs(weight) > eps] + doc = [(int(id_), float(weight)) for id_, weight in zip(*[iter(parts)] * 2) if abs(float(weight)) > eps] elif len(parts) == self.num_topics and mallet_version != '2.0.7': - doc = [(id_, weight) - for id_, weight in enumerate(map(float, parts)) - if abs(weight) > eps] + doc = [(id_, float(weight)) for id_, weight in enumerate(parts) if abs(float(weight)) > eps] else: if mallet_version == "2.0.7": """ @@ -375,6 +373,7 @@ def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50): model_gensim = LdaModel( id2word=mallet_model.id2word, num_topics=mallet_model.num_topics, alpha=mallet_model.alpha, iterations=iterations, - gamma_threshold=gamma_threshold) + gamma_threshold=gamma_threshold + ) model_gensim.expElogbeta[:] = mallet_model.wordtopics return model_gensim diff --git a/gensim/models/wrappers/ldavowpalwabbit.py b/gensim/models/wrappers/ldavowpalwabbit.py index afa19c4327..ede5074b99 100644 --- a/gensim/models/wrappers/ldavowpalwabbit.py +++ b/gensim/models/wrappers/ldavowpalwabbit.py @@ -68,7 +68,7 @@ from gensim import utils, matutils from gensim.models.ldamodel import LdaModel -LOG = logging.getLogger(__name__) +logger = logging.getLogger(__name__) class LdaVowpalWabbit(utils.SaveLoad): @@ -140,11 +140,10 @@ def __init__(self, vw_path, corpus=None, num_topics=100, id2word=None, if self.id2word is None: if corpus is None: - raise ValueError('at least one of corpus/id2word must be ' - 'specified, to establish input space ' - 'dimensionality') - LOG.warning('no word id mapping provided; initializing from ' - 'corpus, assuming identity') + raise ValueError( + "at least one of corpus/id2word must be specified, to establish input space dimensionality" + ) + logger.warning("no word id mapping provided; initializing from corpus, assuming identity") self.id2word = utils.dict_from_corpus(corpus) self.num_terms = len(self.id2word) elif len(self.id2word) > 0: @@ -153,8 +152,7 @@ def __init__(self, vw_path, corpus=None, num_topics=100, id2word=None, self.num_terms = 0 if self.num_terms == 0: - raise ValueError('cannot compute LDA over an empty collection ' - '(no terms)') + raise ValueError("cannot compute LDA over an empty collection (no terms)") # LDA parameters self.num_topics = num_topics @@ -186,7 +184,7 @@ def __init__(self, vw_path, corpus=None, num_topics=100, id2word=None, def train(self, corpus): """Clear any existing model state, and train on given corpus.""" - LOG.debug('Training new model from corpus') + logger.debug('Training new model from corpus') # reset any existing offset, model, or topics generated self.offset = self._initial_offset @@ -206,7 +204,7 @@ def update(self, corpus): if not os.path.exists(self._model_filename): return self.train(corpus) - LOG.debug('Updating exiting model from corpus') + 
logger.debug('Updating exiting model from corpus') # reset any existing topics generated self._topics = None @@ -228,12 +226,10 @@ def log_perplexity(self, chunk): vw_data = self._predict(chunk)[1] corpus_words = sum(cnt for document in chunk for _, cnt in document) bound = -vw_data['average_loss'] - LOG.info("%.3f per-word bound, %.1f perplexity estimate based on a " - "held-out corpus of %i documents with %i words", - bound, - numpy.exp2(-bound), - vw_data['corpus_size'], - corpus_words) + logger.info( + "%.3f per-word bound, %.1f perplexity estimate based on a held-out corpus of %i documents with %i words", + bound, numpy.exp2(-bound), vw_data['corpus_size'], corpus_words + ) return bound def get_topics(self): @@ -267,7 +263,7 @@ def show_topics(self, num_topics=10, num_words=10, shown.append(topic) if log: - LOG.info("topic #%i (%.3f): %s", i, self.alpha, topic) + logger.info("topic #%i (%.3f): %s", i, self.alpha, topic) return shown @@ -287,12 +283,12 @@ def save(self, fname, *args, **kwargs): # Vowpal Wabbit uses its own binary model file, read this into # variable before serialising this object - keeps all data # self contained within a single serialised file - LOG.debug("Reading model bytes from '%s'", self._model_filename) + logger.debug("Reading model bytes from '%s'", self._model_filename) with utils.smart_open(self._model_filename, 'rb') as fhandle: self._model_data = fhandle.read() if os.path.exists(self._topics_filename): - LOG.debug("Reading topic bytes from '%s'", self._topics_filename) + logger.debug("Reading topic bytes from '%s'", self._topics_filename) with utils.smart_open(self._topics_filename, 'rb') as fhandle: self._topics_data = fhandle.read() @@ -310,13 +306,13 @@ def load(cls, fname, *args, **kwargs): if lda_vw._model_data: # Vowpal Wabbit operates on its own binary model file - deserialise # to file at load time, making it immediately ready for use - LOG.debug("Writing model bytes to '%s'", lda_vw._model_filename) + logger.debug("Writing model bytes to '%s'", lda_vw._model_filename) with utils.smart_open(lda_vw._model_filename, 'wb') as fhandle: fhandle.write(lda_vw._model_data) lda_vw._model_data = None # no need to keep in memory after this if lda_vw._topics_data: - LOG.debug("Writing topic bytes to '%s'", lda_vw._topics_filename) + logger.debug("Writing topic bytes to '%s'", lda_vw._topics_filename) with utils.smart_open(lda_vw._topics_filename, 'wb') as fhandle: fhandle.write(lda_vw._topics_data) lda_vw._topics_data = None @@ -326,23 +322,25 @@ def load(cls, fname, *args, **kwargs): def __del__(self): """Cleanup the temporary directory used by this wrapper.""" if self.cleanup_files and self.tmp_dir: - LOG.debug("Recursively deleting: %s", self.tmp_dir) + logger.debug("Recursively deleting: %s", self.tmp_dir) shutil.rmtree(self.tmp_dir) def _init_temp_dir(self, prefix='tmp'): """Create a working temporary directory with given prefix.""" self.tmp_dir = tempfile.mkdtemp(prefix=prefix) - LOG.info('using %s as temp dir', self.tmp_dir) + logger.info('using %s as temp dir', self.tmp_dir) def _get_vw_predict_command(self, corpus_size): """Get list of command line arguments for running prediction.""" - cmd = [self.vw_path, - '--testonly', # don't update model with this data - '--lda_D', str(corpus_size), - '-i', self._model_filename, # load existing binary model - '-d', self._corpus_filename, - '--learning_rate', '0', # possibly not needed, but harmless - '-p', self._predict_filename] + cmd = [ + self.vw_path, + '--testonly', # don't update model with this data + 
'--lda_D', str(corpus_size), + '-i', self._model_filename, # load existing binary model + '-d', self._corpus_filename, + '--learning_rate', '0', # possibly not needed, but harmless + '-p', self._predict_filename + ] if self.random_seed is not None: cmd.extend(['--random_seed', str(self.random_seed)]) @@ -355,27 +353,31 @@ def _get_vw_train_command(self, corpus_size, update=False): If 'update' is set to True, this specifies that we're further training an existing model. """ - cmd = [self.vw_path, - '-d', self._corpus_filename, - '--power_t', str(self.decay), - '--initial_t', str(self.offset), - '--minibatch', str(self.chunksize), - '--lda_D', str(corpus_size), - '--passes', str(self.passes), - '--cache_file', self._cache_filename, - '--lda_epsilon', str(self.gamma_threshold), - '--readable_model', self._topics_filename, - '-k', # clear cache - '-f', self._model_filename] + cmd = [ + self.vw_path, + '-d', self._corpus_filename, + '--power_t', str(self.decay), + '--initial_t', str(self.offset), + '--minibatch', str(self.chunksize), + '--lda_D', str(corpus_size), + '--passes', str(self.passes), + '--cache_file', self._cache_filename, + '--lda_epsilon', str(self.gamma_threshold), + '--readable_model', self._topics_filename, + '-k', # clear cache + '-f', self._model_filename + ] if update: cmd.extend(['-i', self._model_filename]) else: # these params are read from model file if updating - cmd.extend(['--lda', str(self.num_topics), - '-b', str(_bit_length(self.num_terms)), - '--lda_alpha', str(self.alpha), - '--lda_rho', str(self.eta)]) + cmd.extend([ + '--lda', str(self.num_topics), + '-b', str(_bit_length(self.num_terms)), + '--lda_alpha', str(self.alpha), + '--lda_rho', str(self.eta) + ]) if self.random_seed is not None: cmd.extend(['--random_seed', str(self.random_seed)]) @@ -393,8 +395,7 @@ def _load_vw_topics(self): of: ... """ - topics = numpy.zeros((self.num_topics, self.num_terms), - dtype=numpy.float32) + topics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float32) with utils.smart_open(self._topics_filename) as topics_file: found_data = False @@ -437,8 +438,7 @@ def _predict(self, chunk): vw_data = _parse_vw_output(_run_vw_command(cmd)) vw_data['corpus_size'] = corpus_size - predictions = numpy.zeros((corpus_size, self.num_topics), - dtype=numpy.float32) + predictions = numpy.zeros((corpus_size, self.num_topics), dtype=numpy.float32) with utils.smart_open(self._predict_filename) as fhandle: for i, line in enumerate(fhandle): @@ -524,7 +524,7 @@ def write_corpus_as_vw(corpus, filename): Returns the number of lines written. 
""" - LOG.debug("Writing corpus to: %s", filename) + logger.debug("Writing corpus to: %s", filename) corpus_size = 0 with utils.smart_open(filename, 'wb') as corpus_file: @@ -552,16 +552,14 @@ def _parse_vw_output(text): def _run_vw_command(cmd): """Execute given Vowpal Wabbit command, log stdout and stderr.""" - LOG.info("Running Vowpal Wabbit command: %s", ' '.join(cmd)) + logger.info("Running Vowpal Wabbit command: %s", ' '.join(cmd)) proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) output = proc.communicate()[0].decode('utf-8') - LOG.debug("Vowpal Wabbit output: %s", output) + logger.debug("Vowpal Wabbit output: %s", output) if proc.returncode != 0: - raise subprocess.CalledProcessError(proc.returncode, - ' '.join(cmd), - output=output) + raise subprocess.CalledProcessError(proc.returncode, ' '.join(cmd), output=output) return output @@ -588,6 +586,7 @@ def vwmodel2ldamodel(vw_model, iterations=50): model_gensim = LdaModel( num_topics=vw_model.num_topics, id2word=vw_model.id2word, chunksize=vw_model.chunksize, passes=vw_model.passes, alpha=vw_model.alpha, eta=vw_model.eta, decay=vw_model.decay, - offset=vw_model.offset, iterations=iterations, gamma_threshold=vw_model.gamma_threshold) + offset=vw_model.offset, iterations=iterations, gamma_threshold=vw_model.gamma_threshold + ) model_gensim.expElogbeta[:] = vw_model._get_topics() return model_gensim diff --git a/gensim/models/wrappers/varembed.py b/gensim/models/wrappers/varembed.py index eab1e0217c..30bf859ec7 100644 --- a/gensim/models/wrappers/varembed.py +++ b/gensim/models/wrappers/varembed.py @@ -18,14 +18,10 @@ """ import logging -import sys - import numpy as np -from gensim.models.keyedvectors import KeyedVectors - -# utility fnc for pickling, common scipy operations etc from gensim import utils +from gensim.models.keyedvectors import KeyedVectors from gensim.models.word2vec import Vocab logger = logging.getLogger(__name__) @@ -56,25 +52,21 @@ def load_varembed_format(cls, vectors, morfessor_model=None): result = cls() if vectors is None: raise Exception("Please provide vectors binary to load varembed model") - D = utils.unpickle(vectors) - word_to_ix = D['word_to_ix'] - morpho_to_ix = D['morpho_to_ix'] - word_embeddings = D['word_embeddings'] - morpho_embeddings = D['morpheme_embeddings'] + d = utils.unpickle(vectors) + word_to_ix = d['word_to_ix'] + morpho_to_ix = d['morpho_to_ix'] + word_embeddings = d['word_embeddings'] + morpho_embeddings = d['morpheme_embeddings'] result.load_word_embeddings(word_embeddings, word_to_ix) if morfessor_model: - if sys.version_info >= (2, 7): # Morfessor is only supported for Python 2.7 and above. - try: - import morfessor - morfessor_model = morfessor.MorfessorIO().read_binary_model_file(morfessor_model) - result.add_morphemes_to_embeddings(morfessor_model, morpho_embeddings, morpho_to_ix) - except ImportError: - # Morfessor Package not found. - logger.error('Could not import morfessor. Not using morpheme embeddings') - raise ImportError('Could not import morfessor.') - else: - # Raise exception in Python 2.6 or earlier. - raise Exception('Using Morphemes requires Python 2.7 and above. Morfessor is not supported in python 2.6') + try: + import morfessor + morfessor_model = morfessor.MorfessorIO().read_binary_model_file(morfessor_model) + result.add_morphemes_to_embeddings(morfessor_model, morpho_embeddings, morpho_to_ix) + except ImportError: + # Morfessor Package not found. + logger.error('Could not import morfessor. 
Not using morpheme embeddings') + raise ImportError('Could not import morfessor.') logger.info('Loaded varembed model vectors from %s', vectors) return result @@ -105,6 +97,10 @@ def add_morphemes_to_embeddings(self, morfessor_model, morpho_embeddings, morpho """ for word in self.vocab: morpheme_embedding = np.array( - [morpho_embeddings[morpho_to_ix.get(m, -1)] for m in morfessor_model.viterbi_segment(word)[0]]).sum(axis=0) + [ + morpho_embeddings[morpho_to_ix.get(m, -1)] + for m in morfessor_model.viterbi_segment(word)[0] + ] + ).sum(axis=0) self.syn0[self.vocab[word].index] += morpheme_embedding logger.info("Added morphemes to word vectors") diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py index 8426af1d82..c31cd28adc 100644 --- a/gensim/models/wrappers/wordrank.py +++ b/gensim/models/wrappers/wordrank.py @@ -96,13 +96,23 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, cooccurrence_shuf_file = os.path.join(meta_dir, 'wiki.toy') meta_file = os.path.join(meta_dir, 'meta') - cmd_vocab_count = [os.path.join(wr_path, 'glove', 'vocab_count'), '-min-count', str(min_count), '-max-vocab', str(max_vocab_size)] - cmd_cooccurence_count = [os.path.join(wr_path, 'glove', 'cooccur'), '-memory', str(memory), '-vocab-file', temp_vocab_file, '-window-size', str(window), '-symmetric', str(symmetric)] + cmd_vocab_count = [ + os.path.join(wr_path, 'glove', 'vocab_count'), + '-min-count', str(min_count), '-max-vocab', str(max_vocab_size) + ] + cmd_cooccurence_count = [ + os.path.join(wr_path, 'glove', 'cooccur'), '-memory', str(memory), + '-vocab-file', temp_vocab_file, '-window-size', str(window), '-symmetric', str(symmetric) + ] cmd_shuffle_cooccurences = [os.path.join(wr_path, 'glove', 'shuffle'), '-memory', str(memory)] cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file] commands = [cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences] - input_fnames = [os.path.join(meta_dir, os.path.split(corpus_file)[-1]), os.path.join(meta_dir, os.path.split(corpus_file)[-1]), cooccurrence_file] + input_fnames = [ + os.path.join(meta_dir, os.path.split(corpus_file)[-1]), + os.path.join(meta_dir, os.path.split(corpus_file)[-1]), + cooccurrence_file + ] output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file] logger.info("Prepare training data (%s) using glove code", ", ".join(input_fnames)) @@ -116,22 +126,24 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, utils.check_output(w, args=cmd_del_vocab_freq) with smart_open(vocab_file, 'rb') as f: - numwords = sum(1 for line in f) + numwords = sum(1 for _ in f) with smart_open(cooccurrence_shuf_file, 'rb') as f: - numlines = sum(1 for line in f) + numlines = sum(1 for _ in f) with smart_open(meta_file, 'wb') as f: - meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format(numwords, numwords, numlines, cooccurrence_shuf_file.split('/')[-1], numwords, vocab_file.split('/')[-1]) + meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format( + numwords, numwords, numlines, cooccurrence_shuf_file.split('/')[-1], + numwords, vocab_file.split('/')[-1] + ) f.write(meta_info.encode('utf-8')) if iter % dump_period == 0: iter += 1 else: logger.warning( - 'Resultant embedding will be from %d iterations rather than the input %d iterations, ' - 'as wordrank dumps the embedding only at dump_period intervals. 
' - 'Input an appropriate combination of parameters (iter, dump_period) such that ' - '"iter mod dump_period" is zero.', iter - (iter % dump_period), iter - ) + "Resultant embedding will be from %d iterations rather than the input %d iterations, as wordrank dumps the embedding only at dump_period intervals. " + "Input an appropriate combination of parameters (iter, dump_period) such that \"iter mod dump_period\" is zero.", + iter - (iter % dump_period), iter + ) wr_args = { 'path': meta_dir, @@ -151,20 +163,21 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, } # run wordrank executable with wr_args - cmd = ['mpirun', '-np'] - cmd.append(str(np)) - cmd.append(os.path.join(wr_path, 'wordrank')) + cmd = ['mpirun', '-np', str(np), os.path.join(wr_path, 'wordrank')] for option, value in wr_args.items(): cmd.append('--%s' % option) cmd.append(str(value)) logger.info("Running wordrank binary") - output = utils.check_output(args=cmd) # noqa:F841 + utils.check_output(args=cmd) # use embeddings from max. iteration's dump max_iter_dump = iter - (iter % dump_period) os.rename('model_word_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.words')) os.rename('model_context_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.contexts')) - model = cls.load_wordrank_model(os.path.join(model_dir, 'wordrank.words'), vocab_file, os.path.join(model_dir, 'wordrank.contexts'), sorted_vocab, ensemble) + model = cls.load_wordrank_model( + os.path.join(model_dir, 'wordrank.words'), vocab_file, + os.path.join(model_dir, 'wordrank.contexts'), sorted_vocab, ensemble + ) if cleanup_files: rmtree(model_dir) diff --git a/gensim/nosy.py b/gensim/nosy.py index 2913e1e694..0606166449 100644 --- a/gensim/nosy.py +++ b/gensim/nosy.py @@ -27,7 +27,7 @@ DEFAULTARGS = '--with-color -exe' # -w tests' -def checkSum(): +def check_sum(): """ Return a long which can be used to know if any .py files have changed. 
""" @@ -44,10 +44,9 @@ def checkSum(): val = 0 try: while True: - if checkSum() != val: - val = checkSum() - os.system('%s %s %s' % (EXECUTABLE, DEFAULTARGS, - ' '.join(sys.argv[1:]))) + if check_sum() != val: + val = check_sum() + os.system('%s %s %s' % (EXECUTABLE, DEFAULTARGS, ' '.join(sys.argv[1:]))) print(datetime.datetime.now().__str__()) print('=' * 77) time.sleep(1) diff --git a/gensim/parsing/porter.py b/gensim/parsing/porter.py index a22b8b94d1..048e056418 100644 --- a/gensim/parsing/porter.py +++ b/gensim/parsing/porter.py @@ -363,10 +363,10 @@ def stem(self, w): return self.b[:self.k + 1] def stem_sentence(self, txt): - return " ".join(map(self.stem, txt.split())) + return " ".join(self.stem(x) for x in txt.split()) def stem_documents(self, docs): - return map(self.stem_sentence, docs) + return [self.stem_sentence(x) for x in docs] if __name__ == '__main__': diff --git a/gensim/parsing/preprocessing.py b/gensim/parsing/preprocessing.py index a92eb98656..ab25361f60 100644 --- a/gensim/parsing/preprocessing.py +++ b/gensim/parsing/preprocessing.py @@ -117,8 +117,11 @@ def stem_text(text): stem = stem_text -DEFAULT_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation, strip_multiple_whitespaces, - strip_numeric, remove_stopwords, strip_short, stem_text] +DEFAULT_FILTERS = [ + lambda x: x.lower(), strip_tags, strip_punctuation, + strip_multiple_whitespaces, strip_numeric, + remove_stopwords, strip_short, stem_text +] def preprocess_string(s, filters=DEFAULT_FILTERS): diff --git a/gensim/scripts/glove2word2vec.py b/gensim/scripts/glove2word2vec.py index 4f13de4524..0667440f80 100644 --- a/gensim/scripts/glove2word2vec.py +++ b/gensim/scripts/glove2word2vec.py @@ -17,7 +17,6 @@ which contains the number of vectors and their dimensionality (two integers). 
""" -import os import sys import logging import argparse @@ -30,7 +29,7 @@ def get_glove_info(glove_file_name): """Return the number of vectors and dimensions in a file in GloVe format.""" with smart_open(glove_file_name) as f: - num_lines = sum(1 for line in f) + num_lines = sum(1 for _ in f) with smart_open(glove_file_name) as f: num_dims = len(f.readline().split()) - 1 return num_lines, num_dims @@ -53,19 +52,9 @@ def glove2word2vec(glove_input_file, word2vec_output_file): logging.root.setLevel(level=logging.INFO) logger.info("running %s", ' '.join(sys.argv)) - # check and process cmdline input - program = os.path.basename(sys.argv[0]) - if len(sys.argv) < 2: - print(globals()['__doc__'] % locals()) - sys.exit(1) - parser = argparse.ArgumentParser() - parser.add_argument( - "-i", "--input", required=True, - help="Input file, in gloVe format (read-only).") - parser.add_argument( - "-o", "--output", required=True, - help="Output file, in word2vec text format (will be overwritten).") + parser.add_argument("-i", "--input", required=True, help="Input file, in gloVe format (read-only).") + parser.add_argument("-o", "--output", required=True, help="Output file, in word2vec text format (will be overwritten).") args = parser.parse_args() # do the actual conversion diff --git a/gensim/scripts/make_wiki_online.py b/gensim/scripts/make_wiki_online.py index 66985a566e..37c437f3e1 100755 --- a/gensim/scripts/make_wiki_online.py +++ b/gensim/scripts/make_wiki_online.py @@ -55,7 +55,7 @@ logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') logging.root.setLevel(level=logging.INFO) - logger.info("running %s" % ' '.join(sys.argv)) + logger.info("running %s", ' '.join(sys.argv)) # check and process input arguments if len(sys.argv) < 3: @@ -107,4 +107,4 @@ # ~4h; result file is 15GB! bzip2'ed down to 4.5GB MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000) - logger.info("finished running %s" % program) + logger.info("finished running %s", program) diff --git a/gensim/scripts/make_wiki_online_lemma.py b/gensim/scripts/make_wiki_online_lemma.py index 66985a566e..37c437f3e1 100755 --- a/gensim/scripts/make_wiki_online_lemma.py +++ b/gensim/scripts/make_wiki_online_lemma.py @@ -55,7 +55,7 @@ logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') logging.root.setLevel(level=logging.INFO) - logger.info("running %s" % ' '.join(sys.argv)) + logger.info("running %s", ' '.join(sys.argv)) # check and process input arguments if len(sys.argv) < 3: @@ -107,4 +107,4 @@ # ~4h; result file is 15GB! bzip2'ed down to 4.5GB MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000) - logger.info("finished running %s" % program) + logger.info("finished running %s", program) diff --git a/gensim/scripts/make_wiki_online_nodebug.py b/gensim/scripts/make_wiki_online_nodebug.py index 66985a566e..37c437f3e1 100755 --- a/gensim/scripts/make_wiki_online_nodebug.py +++ b/gensim/scripts/make_wiki_online_nodebug.py @@ -55,7 +55,7 @@ logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') logging.root.setLevel(level=logging.INFO) - logger.info("running %s" % ' '.join(sys.argv)) + logger.info("running %s", ' '.join(sys.argv)) # check and process input arguments if len(sys.argv) < 3: @@ -107,4 +107,4 @@ # ~4h; result file is 15GB! 
bzip2'ed down to 4.5GB MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000) - logger.info("finished running %s" % program) + logger.info("finished running %s", program) diff --git a/gensim/scripts/make_wikicorpus.py b/gensim/scripts/make_wikicorpus.py index 66985a566e..37c437f3e1 100755 --- a/gensim/scripts/make_wikicorpus.py +++ b/gensim/scripts/make_wikicorpus.py @@ -55,7 +55,7 @@ logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') logging.root.setLevel(level=logging.INFO) - logger.info("running %s" % ' '.join(sys.argv)) + logger.info("running %s", ' '.join(sys.argv)) # check and process input arguments if len(sys.argv) < 3: @@ -107,4 +107,4 @@ # ~4h; result file is 15GB! bzip2'ed down to 4.5GB MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000) - logger.info("finished running %s" % program) + logger.info("finished running %s", program) diff --git a/gensim/scripts/word2vec2tensor.py b/gensim/scripts/word2vec2tensor.py index 88cab79d25..6bb1301a59 100644 --- a/gensim/scripts/word2vec2tensor.py +++ b/gensim/scripts/word2vec2tensor.py @@ -37,13 +37,13 @@ def word2vec2tensor(word2vec_model_path, tensor_filename, binary=False): - ''' + """ Convert Word2Vec mode to 2D tensor TSV file and metadata file Args: - param1 (str): word2vec model file path - param2 (str): filename prefix - param2 (bool): set True to use a binary Word2Vec model, defaults to False - ''' + word2vec_model_path (str): word2vec model file path + tensor_filename (str): filename prefix + binary (bool): set True to use a binary Word2Vec model, defaults to False + """ model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_model_path, binary=binary) outfiletsv = tensor_filename + '_tensor.tsv' outfiletsvmeta = tensor_filename + '_metadata.tsv' @@ -52,11 +52,11 @@ def word2vec2tensor(word2vec_model_path, tensor_filename, binary=False): with open(outfiletsvmeta, 'w+') as file_metadata: for word in model.index2word: file_metadata.write(gensim.utils.to_utf8(word) + gensim.utils.to_utf8('\n')) - vector_row = '\t'.join(map(str, model[word])) + vector_row = '\t'.join(str(x) for x in model[word]) file_vector.write(vector_row + '\n') - logger.info("2D tensor file saved to %s" % outfiletsv) - logger.info("Tensor metadata file saved to %s" % outfiletsvmeta) + logger.info("2D tensor file saved to %s", outfiletsv) + logger.info("Tensor metadata file saved to %s", outfiletsvmeta) if __name__ == "__main__": @@ -64,24 +64,12 @@ def word2vec2tensor(word2vec_model_path, tensor_filename, binary=False): logging.root.setLevel(level=logging.INFO) logger.info("running %s", ' '.join(sys.argv)) - # check and process cmdline input - program = os.path.basename(sys.argv[0]) - if len(sys.argv) < 2: - print(globals()['__doc__'] % locals()) - sys.exit(1) - parser = argparse.ArgumentParser() - parser.add_argument( - "-i", "--input", required=True, - help="Input word2vec model") - parser.add_argument( - "-o", "--output", required=True, - help="Output tensor file name prefix") - parser.add_argument("-b", "--binary", - required=False, - help="If word2vec model in binary format, set True, else False") + parser.add_argument("-i", "--input", required=True, help="Input word2vec model") + parser.add_argument("-o", "--output", required=True, help="Output tensor file name prefix") + parser.add_argument("-b", "--binary", required=False, help="If word2vec model in binary format, set True, else False") args = parser.parse_args() word2vec2tensor(args.input, args.output, args.binary) - logger.info("finished 
running %s", program) + logger.info("finished running %s", os.path.basename(sys.argv[0])) diff --git a/gensim/scripts/word2vec_standalone.py b/gensim/scripts/word2vec_standalone.py index 52baea6f4c..878e588613 100644 --- a/gensim/scripts/word2vec_standalone.py +++ b/gensim/scripts/word2vec_standalone.py @@ -61,17 +61,8 @@ if __name__ == "__main__": - logging.basicConfig( - format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', - level=logging.INFO) + logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) logger.info("running %s", " ".join(sys.argv)) - - # check and process cmdline input - program = os.path.basename(sys.argv[0]) - if len(sys.argv) < 2: - print(globals()['__doc__'] % locals()) - sys.exit(1) - seterr(all='raise') # don't ignore numpy errors parser = argparse.ArgumentParser() @@ -107,7 +98,8 @@ model = Word2Vec( corpus, size=args.size, min_count=args.min_count, workers=args.threads, window=args.window, sample=args.sample, alpha=args.alpha, sg=skipgram, - hs=args.hs, negative=args.negative, cbow_mean=1, iter=args.iter) + hs=args.hs, negative=args.negative, cbow_mean=1, iter=args.iter + ) if args.output: outfile = args.output @@ -124,4 +116,4 @@ questions_file = args.accuracy model.accuracy(questions_file) - logger.info("finished running %s", program) + logger.info("finished running %s", os.path.basename(sys.argv[0])) diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py index efe71159d3..5e93c6f8cf 100755 --- a/gensim/similarities/docsim.py +++ b/gensim/similarities/docsim.py @@ -105,7 +105,7 @@ def __getstate__(self): return result def __str__(self): - return ("%s Shard(%i documents in %s)" % (self.cls.__name__, len(self), self.fullname())) + return "%s Shard(%i documents in %s)" % (self.cls.__name__, len(self), self.fullname()) def get_index(self): if not hasattr(self, 'index'): @@ -209,8 +209,9 @@ def __len__(self): return len(self.fresh_docs) + sum([len(shard) for shard in self.shards]) def __str__(self): - return ("Similarity index with %i documents in %i shards (stored under %s)" % - (len(self), len(self.shards), self.output_prefix)) + return "Similarity index with %i documents in %i shards (stored under %s)" % ( + len(self), len(self.shards), self.output_prefix + ) def add_documents(self, corpus): """ @@ -262,8 +263,9 @@ def close_shard(self): # consider the shard sparse if its density is < 30% issparse = 0.3 > 1.0 * self.fresh_nnz / (len(self.fresh_docs) * self.num_features) if issparse: - index = SparseMatrixSimilarity(self.fresh_docs, num_terms=self.num_features, - num_docs=len(self.fresh_docs), num_nnz=self.fresh_nnz) + index = SparseMatrixSimilarity( + self.fresh_docs, num_terms=self.num_features, num_docs=len(self.fresh_docs), num_nnz=self.fresh_nnz + ) else: index = MatrixSimilarity(self.fresh_docs, num_features=self.num_features) logger.info("creating %s shard #%s", 'sparse' if issparse else 'dense', shardid) @@ -334,8 +336,7 @@ def __getitem__(self, query): # the following uses a lot of lazy evaluation and (optionally) parallel # processing, to improve query latency and minimize memory footprint. 
offsets = numpy.cumsum([0] + [len(shard) for shard in self.shards]) - convert = lambda doc, shard_no: [(doc_index + offsets[shard_no], sim) - for doc_index, sim in doc] + convert = lambda doc, shard_no: [(doc_index + offsets[shard_no], sim) for doc_index, sim in doc] is_corpus, query = utils.is_corpus(query) is_corpus = is_corpus or hasattr(query, 'ndim') and query.ndim > 1 and query.shape[0] > 1 if not is_corpus: @@ -370,8 +371,7 @@ def vector_by_id(self, docpos): if docpos < pos: break if not self.shards or docpos < 0 or docpos >= pos: - raise ValueError("invalid document position: %s (must be 0 <= x < %s)" % - (docpos, len(self))) + raise ValueError("invalid document position: %s (must be 0 <= x < %s)" % (docpos, len(self))) result = shard.get_document_id(docpos - pos + len(shard)) return result @@ -458,7 +458,6 @@ def destroy(self): for fname in glob.glob(self.output_prefix + '*'): logger.info("deleting %s", fname) os.remove(fname) -# endclass Similarity class MatrixSimilarity(interfaces.SimilarityABC): @@ -496,7 +495,10 @@ class for description of the other parameters. if corpus is not None: if self.num_features <= 0: - raise ValueError("cannot index a corpus with zero features (you must specify either `num_features` or a non-empty corpus in the constructor)") + raise ValueError( + "cannot index a corpus with zero features (you must specify either `num_features` " + "or a non-empty corpus in the constructor)" + ) logger.info("creating matrix with %i documents and %i features", corpus_len, num_features) self.index = numpy.empty(shape=(corpus_len, num_features), dtype=dtype) # iterate over corpus, populating the numpy index matrix with (normalized) @@ -535,7 +537,8 @@ def get_similarities(self, query): if is_corpus: query = numpy.asarray( [matutils.sparse2full(vec, self.num_features) for vec in query], - dtype=self.index.dtype) + dtype=self.index.dtype + ) else: if scipy.sparse.issparse(query): query = query.toarray() # convert sparse to dense @@ -553,7 +556,6 @@ def get_similarities(self, query): def __str__(self): return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.index.shape[1]) -# endclass MatrixSimilarity class WmdSimilarity(interfaces.SimilarityABC): @@ -638,7 +640,6 @@ def get_similarities(self, query): def __str__(self): return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.w2v_model.wv.syn0.shape[1]) -# endclass WmdSimilarity class SparseMatrixSimilarity(interfaces.SimilarityABC): @@ -689,7 +690,8 @@ def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num matutils.unitvec(v)) for v in corpus) self.index = matutils.corpus2csc( corpus, num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz, - dtype=dtype, printprogress=10000).T + dtype=dtype, printprogress=10000 + ).T # convert to Compressed Sparse Row for efficient row slicing and multiplications self.index = self.index.tocsr() # currently no-op, CSC.T is already CSR @@ -736,4 +738,3 @@ def get_similarities(self, query): # otherwise, return a 2d matrix (#queries x #index) result = result.toarray().T return result -# endclass SparseMatrixSimilarity diff --git a/gensim/similarities/index.py b/gensim/similarities/index.py index d0ca879225..e9323f6998 100644 --- a/gensim/similarities/index.py +++ b/gensim/similarities/index.py @@ -17,7 +17,9 @@ try: from annoy import AnnoyIndex except ImportError: - raise ImportError("Annoy has not been installed, if you wish to use the annoy indexer, please run `pip install annoy`") + raise ImportError( + "Annoy has not 
been installed, if you wish to use the annoy indexer, please run `pip install annoy`" + ) class AnnoyIndexer(object): @@ -49,7 +51,8 @@ def load(self, fname): fname_dict = fname + '.d' if not (os.path.exists(fname) and os.path.exists(fname_dict)): raise IOError( - "Can't find index files '%s' and '%s' - Unable to restore AnnoyIndexer state." % (fname, fname_dict)) + "Can't find index files '%s' and '%s' - Unable to restore AnnoyIndexer state." % (fname, fname_dict) + ) else: with smart_open(fname_dict) as f: d = _pickle.loads(f.read()) diff --git a/gensim/sklearn_api/atmodel.py b/gensim/sklearn_api/atmodel.py index 0217350d3a..d3128243a6 100644 --- a/gensim/sklearn_api/atmodel.py +++ b/gensim/sklearn_api/atmodel.py @@ -23,10 +23,10 @@ class AuthorTopicTransformer(TransformerMixin, BaseEstimator): """ def __init__(self, num_topics=100, id2word=None, author2doc=None, doc2author=None, - chunksize=2000, passes=1, iterations=50, decay=0.5, offset=1.0, - alpha='symmetric', eta='symmetric', update_every=1, eval_every=10, - gamma_threshold=0.001, serialized=False, serialization_path=None, - minimum_probability=0.01, random_state=None): + chunksize=2000, passes=1, iterations=50, decay=0.5, offset=1.0, + alpha='symmetric', eta='symmetric', update_every=1, eval_every=10, + gamma_threshold=0.001, serialized=False, serialization_path=None, + minimum_probability=0.01, random_state=None): """ Sklearn wrapper for AuthorTopic model. See gensim.models.AuthorTopicModel for parameter details. """ @@ -55,11 +55,14 @@ def fit(self, X, y=None): Fit the model according to the given training data. Calls gensim.models.AuthorTopicModel """ - self.gensim_model = models.AuthorTopicModel(corpus=X, num_topics=self.num_topics, id2word=self.id2word, + self.gensim_model = models.AuthorTopicModel( + corpus=X, num_topics=self.num_topics, id2word=self.id2word, author2doc=self.author2doc, doc2author=self.doc2author, chunksize=self.chunksize, passes=self.passes, iterations=self.iterations, decay=self.decay, offset=self.offset, alpha=self.alpha, eta=self.eta, - update_every=self.update_every, eval_every=self.eval_every, gamma_threshold=self.gamma_threshold, serialized=self.serialized, - serialization_path=self.serialization_path, minimum_probability=self.minimum_probability, random_state=self.random_state) + update_every=self.update_every, eval_every=self.eval_every, gamma_threshold=self.gamma_threshold, + serialized=self.serialized, serialization_path=self.serialization_path, + minimum_probability=self.minimum_probability, random_state=self.random_state + ) return self def transform(self, author_names): @@ -69,7 +72,9 @@ def transform(self, author_names): """ # The input as array of array if self.gensim_model is None: - raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.") + raise NotFittedError( + "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." + ) check = lambda x: [x] if not isinstance(x, list) else x author_names = check(author_names) @@ -88,11 +93,14 @@ def partial_fit(self, X, author2doc=None, doc2author=None): Train model over X. 
""" if self.gensim_model is None: - self.gensim_model = models.AuthorTopicModel(corpus=X, num_topics=self.num_topics, id2word=self.id2word, + self.gensim_model = models.AuthorTopicModel( + corpus=X, num_topics=self.num_topics, id2word=self.id2word, author2doc=self.author2doc, doc2author=self.doc2author, chunksize=self.chunksize, passes=self.passes, iterations=self.iterations, decay=self.decay, offset=self.offset, alpha=self.alpha, eta=self.eta, - update_every=self.update_every, eval_every=self.eval_every, gamma_threshold=self.gamma_threshold, serialized=self.serialized, - serialization_path=self.serialization_path, minimum_probability=self.minimum_probability, random_state=self.random_state) + update_every=self.update_every, eval_every=self.eval_every, gamma_threshold=self.gamma_threshold, + serialized=self.serialized, serialization_path=self.serialization_path, + minimum_probability=self.minimum_probability, random_state=self.random_state + ) self.gensim_model.update(corpus=X, author2doc=author2doc, doc2author=doc2author) return self diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index 05d496d9b1..14163f1600 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -22,13 +22,10 @@ class D2VTransformer(TransformerMixin, BaseEstimator): Base Doc2Vec module """ - def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, - dm_tag_count=1, docvecs=None, docvecs_mapfile=None, - comment=None, trim_rule=None, size=100, alpha=0.025, - window=5, min_count=5, max_vocab_size=None, sample=1e-3, - seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, - cbow_mean=1, hashfxn=hash, iter=5, sorted_vocab=1, - batch_words=10000): + def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, + docvecs_mapfile=None, comment=None, trim_rule=None, size=100, alpha=0.025, window=5, min_count=5, + max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, cbow_mean=1, + hashfxn=hash, iter=5, sorted_vocab=1, batch_words=10000): """ Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details. """ @@ -66,14 +63,16 @@ def fit(self, X, y=None): Fit the model according to the given training data. Calls gensim.models.Doc2Vec """ - self.gensim_model = models.Doc2Vec(documents=X, dm_mean=self.dm_mean, dm=self.dm, + self.gensim_model = models.Doc2Vec( + documents=X, dm_mean=self.dm_mean, dm=self.dm, dbow_words=self.dbow_words, dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count, docvecs=self.docvecs, docvecs_mapfile=self.docvecs_mapfile, comment=self.comment, trim_rule=self.trim_rule, size=self.size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, hashfxn=self.hashfxn, - iter=self.iter, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words) + iter=self.iter, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words + ) return self def transform(self, docs): @@ -83,7 +82,9 @@ def transform(self, docs): or a single document like : ['calculus', 'mathematical'] """ if self.gensim_model is None: - raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.") + raise NotFittedError( + "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." 
+ ) # The input as array of array check = lambda x: [x] if isinstance(x[0], string_types) else x diff --git a/gensim/sklearn_api/hdp.py b/gensim/sklearn_api/hdp.py index 92265a5e8f..d1dcec01a5 100644 --- a/gensim/sklearn_api/hdp.py +++ b/gensim/sklearn_api/hdp.py @@ -23,10 +23,8 @@ class HdpTransformer(TransformerMixin, BaseEstimator): Base HDP module """ - def __init__(self, id2word, max_chunks=None, max_time=None, - chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1, - gamma=1, eta=0.01, scale=1.0, var_converge=0.0001, - outputdir=None, random_state=None): + def __init__(self, id2word, max_chunks=None, max_time=None, chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, + alpha=1, gamma=1, eta=0.01, scale=1.0, var_converge=0.0001, outputdir=None, random_state=None): """ Sklearn api for HDP model. See gensim.models.HdpModel for parameter details. """ @@ -57,10 +55,12 @@ def fit(self, X, y=None): else: corpus = X - self.gensim_model = models.HdpModel(corpus=corpus, id2word=self.id2word, max_chunks=self.max_chunks, + self.gensim_model = models.HdpModel( + corpus=corpus, id2word=self.id2word, max_chunks=self.max_chunks, max_time=self.max_time, chunksize=self.chunksize, kappa=self.kappa, tau=self.tau, K=self.K, T=self.T, alpha=self.alpha, gamma=self.gamma, eta=self.eta, scale=self.scale, - var_converge=self.var_converge, outputdir=self.outputdir, random_state=self.random_state) + var_converge=self.var_converge, outputdir=self.outputdir, random_state=self.random_state + ) return self def transform(self, docs): @@ -72,7 +72,9 @@ def transform(self, docs): or a single document like : [(4, 1), (7, 1)] """ if self.gensim_model is None: - raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.") + raise NotFittedError( + "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." 
+ ) # The input as array of array check = lambda x: [x] if isinstance(x[0], tuple) else x @@ -82,7 +84,7 @@ def transform(self, docs): max_num_topics = 0 for k, v in enumerate(docs): X[k] = self.gensim_model[v] - max_num_topics = max(max_num_topics, max(list(map(lambda x: x[0], X[k]))) + 1) + max_num_topics = max(max_num_topics, max(x[0] for x in X[k]) + 1) for k, v in enumerate(X): # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future @@ -99,10 +101,12 @@ def partial_fit(self, X): X = matutils.Sparse2Corpus(X) if self.gensim_model is None: - self.gensim_model = models.HdpModel(id2word=self.id2word, max_chunks=self.max_chunks, + self.gensim_model = models.HdpModel( + id2word=self.id2word, max_chunks=self.max_chunks, max_time=self.max_time, chunksize=self.chunksize, kappa=self.kappa, tau=self.tau, K=self.K, T=self.T, alpha=self.alpha, gamma=self.gamma, eta=self.eta, scale=self.scale, - var_converge=self.var_converge, outputdir=self.outputdir, random_state=self.random_state) + var_converge=self.var_converge, outputdir=self.outputdir, random_state=self.random_state + ) self.gensim_model.update(corpus=X) return self diff --git a/gensim/sklearn_api/ldamodel.py b/gensim/sklearn_api/ldamodel.py index 107353e2df..77d539e616 100644 --- a/gensim/sklearn_api/ldamodel.py +++ b/gensim/sklearn_api/ldamodel.py @@ -24,11 +24,9 @@ class LdaTransformer(TransformerMixin, BaseEstimator): Base LDA module """ - def __init__( - self, num_topics=100, id2word=None, chunksize=2000, passes=1, - update_every=1, alpha='symmetric', eta=None, decay=0.5, - offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001, - minimum_probability=0.01, random_state=None, scorer='perplexity'): + def __init__(self, num_topics=100, id2word=None, chunksize=2000, passes=1, update_every=1, alpha='symmetric', + eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001, + minimum_probability=0.01, random_state=None, scorer='perplexity'): """ Sklearn wrapper for LDA model. See gensim.model.LdaModel for parameter details. 
@@ -63,12 +61,14 @@ def fit(self, X, y=None): else: corpus = X - self.gensim_model = models.LdaModel(corpus=corpus, num_topics=self.num_topics, id2word=self.id2word, + self.gensim_model = models.LdaModel( + corpus=corpus, num_topics=self.num_topics, id2word=self.id2word, chunksize=self.chunksize, passes=self.passes, update_every=self.update_every, alpha=self.alpha, eta=self.eta, decay=self.decay, offset=self.offset, eval_every=self.eval_every, iterations=self.iterations, gamma_threshold=self.gamma_threshold, minimum_probability=self.minimum_probability, - random_state=self.random_state) + random_state=self.random_state + ) return self def transform(self, docs): @@ -109,11 +109,13 @@ def partial_fit(self, X): X = matutils.Sparse2Corpus(X) if self.gensim_model is None: - self.gensim_model = models.LdaModel(num_topics=self.num_topics, id2word=self.id2word, + self.gensim_model = models.LdaModel( + num_topics=self.num_topics, id2word=self.id2word, chunksize=self.chunksize, passes=self.passes, update_every=self.update_every, alpha=self.alpha, eta=self.eta, decay=self.decay, offset=self.offset, eval_every=self.eval_every, iterations=self.iterations, gamma_threshold=self.gamma_threshold, - minimum_probability=self.minimum_probability, random_state=self.random_state) + minimum_probability=self.minimum_probability, random_state=self.random_state + ) self.gensim_model.update(corpus=X) return self diff --git a/gensim/sklearn_api/ldaseqmodel.py b/gensim/sklearn_api/ldaseqmodel.py index 25b50bf95e..6b96d8d6fa 100644 --- a/gensim/sklearn_api/ldaseqmodel.py +++ b/gensim/sklearn_api/ldaseqmodel.py @@ -22,9 +22,9 @@ class LdaSeqTransformer(TransformerMixin, BaseEstimator): Base LdaSeq module """ - def __init__(self, time_slice=None, id2word=None, alphas=0.01, num_topics=10, - initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10, - random_state=None, lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100): + def __init__(self, time_slice=None, id2word=None, alphas=0.01, num_topics=10, initialize='gensim', sstats=None, + lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10, random_state=None, + lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100): """ Sklearn wrapper for LdaSeq model. See gensim.models.LdaSeqModel for parameter details. """ @@ -50,11 +50,13 @@ def fit(self, X, y=None): Fit the model according to the given training data. 
Calls gensim.models.LdaSeqModel """ - self.gensim_model = models.LdaSeqModel(corpus=X, time_slice=self.time_slice, id2word=self.id2word, + self.gensim_model = models.LdaSeqModel( + corpus=X, time_slice=self.time_slice, id2word=self.id2word, alphas=self.alphas, num_topics=self.num_topics, initialize=self.initialize, sstats=self.sstats, lda_model=self.lda_model, obs_variance=self.obs_variance, chain_variance=self.chain_variance, passes=self.passes, random_state=self.random_state, lda_inference_max_iter=self.lda_inference_max_iter, - em_min_iter=self.em_min_iter, em_max_iter=self.em_max_iter, chunksize=self.chunksize) + em_min_iter=self.em_min_iter, em_max_iter=self.em_max_iter, chunksize=self.chunksize + ) return self def transform(self, docs): diff --git a/gensim/sklearn_api/lsimodel.py b/gensim/sklearn_api/lsimodel.py index c44240b2ba..776af6f5da 100644 --- a/gensim/sklearn_api/lsimodel.py +++ b/gensim/sklearn_api/lsimodel.py @@ -24,8 +24,7 @@ class LsiTransformer(TransformerMixin, BaseEstimator): Base LSI module """ - def __init__(self, num_topics=200, id2word=None, chunksize=20000, - decay=1.0, onepass=True, power_iters=2, extra_samples=100): + def __init__(self, num_topics=200, id2word=None, chunksize=20000, decay=1.0, onepass=True, power_iters=2, extra_samples=100): """ Sklearn wrapper for LSI model. See gensim.model.LsiModel for parameter details. """ @@ -48,8 +47,10 @@ def fit(self, X, y=None): else: corpus = X - self.gensim_model = models.LsiModel(corpus=corpus, num_topics=self.num_topics, id2word=self.id2word, chunksize=self.chunksize, - decay=self.decay, onepass=self.onepass, power_iters=self.power_iters, extra_samples=self.extra_samples) + self.gensim_model = models.LsiModel( + corpus=corpus, num_topics=self.num_topics, id2word=self.id2word, chunksize=self.chunksize, + decay=self.decay, onepass=self.onepass, power_iters=self.power_iters, extra_samples=self.extra_samples + ) return self def transform(self, docs): @@ -61,7 +62,9 @@ def transform(self, docs): or a single document like : [(4, 1), (7, 1)] """ if self.gensim_model is None: - raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.") + raise NotFittedError( + "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." 
+ ) # The input as array of array check = lambda x: [x] if isinstance(x[0], tuple) else x @@ -82,8 +85,10 @@ def partial_fit(self, X): X = matutils.Sparse2Corpus(X) if self.gensim_model is None: - self.gensim_model = models.LsiModel(num_topics=self.num_topics, id2word=self.id2word, chunksize=self.chunksize, - decay=self.decay, onepass=self.onepass, power_iters=self.power_iters, extra_samples=self.extra_samples) + self.gensim_model = models.LsiModel( + num_topics=self.num_topics, id2word=self.id2word, chunksize=self.chunksize, decay=self.decay, + onepass=self.onepass, power_iters=self.power_iters, extra_samples=self.extra_samples + ) self.gensim_model.add_documents(corpus=X) return self diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py index 8a944f0235..ad00c51c0e 100644 --- a/gensim/sklearn_api/phrases.py +++ b/gensim/sklearn_api/phrases.py @@ -21,8 +21,7 @@ class PhrasesTransformer(TransformerMixin, BaseEstimator): Base Phrases module """ - def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000, - delimiter=b'_', progress_per=10000): + def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000, delimiter=b'_', progress_per=10000): """ Sklearn wrapper for Phrases model. """ @@ -37,8 +36,10 @@ def fit(self, X, y=None): """ Fit the model according to the given training data. """ - self.gensim_model = models.Phrases(sentences=X, min_count=self.min_count, threshold=self.threshold, - max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per) + self.gensim_model = models.Phrases( + sentences=X, min_count=self.min_count, threshold=self.threshold, + max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per + ) return self def transform(self, docs): @@ -61,8 +62,10 @@ def transform(self, docs): def partial_fit(self, X): if self.gensim_model is None: - self.gensim_model = models.Phrases(sentences=X, min_count=self.min_count, threshold=self.threshold, - max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per) + self.gensim_model = models.Phrases( + sentences=X, min_count=self.min_count, threshold=self.threshold, + max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per + ) self.gensim_model.add_vocab(X) return self diff --git a/gensim/sklearn_api/rpmodel.py b/gensim/sklearn_api/rpmodel.py index 8673c7d39e..62395e0bce 100644 --- a/gensim/sklearn_api/rpmodel.py +++ b/gensim/sklearn_api/rpmodel.py @@ -47,7 +47,9 @@ def transform(self, docs): or a single document like : [(0, 1.0), (1, 1.0), (2, 1.0)] """ if self.gensim_model is None: - raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.") + raise NotFittedError( + "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." + ) # The input as array of array check = lambda x: [x] if isinstance(x[0], tuple) else x diff --git a/gensim/sklearn_api/text2bow.py b/gensim/sklearn_api/text2bow.py index e5a96e6551..6beb126d0d 100644 --- a/gensim/sklearn_api/text2bow.py +++ b/gensim/sklearn_api/text2bow.py @@ -34,7 +34,7 @@ def fit(self, X, y=None): """ Fit the model according to the given training data. 
""" - tokenized_docs = list(map(lambda x: list(self.tokenizer(x)), X)) + tokenized_docs = [list(self.tokenizer(x)) for x in X] self.gensim_model = Dictionary(documents=tokenized_docs, prune_at=self.prune_at) return self @@ -43,12 +43,14 @@ def transform(self, docs): Return the BOW format for the input documents. """ if self.gensim_model is None: - raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.") + raise NotFittedError( + "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." + ) # input as python lists check = lambda x: [x] if isinstance(x, string_types) else x docs = check(docs) - tokenized_docs = list(map(lambda x: list(self.tokenizer(x)), docs)) + tokenized_docs = [list(self.tokenizer(x)) for x in docs] X = [[] for _ in range(0, len(tokenized_docs))] for k, v in enumerate(tokenized_docs): @@ -61,6 +63,6 @@ def partial_fit(self, X): if self.gensim_model is None: self.gensim_model = Dictionary(prune_at=self.prune_at) - tokenized_docs = list(map(lambda x: list(self.tokenizer(x)), X)) + tokenized_docs = [list(self.tokenizer(x)) for x in X] self.gensim_model.add_documents(tokenized_docs) return self diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py index ca34af6b40..414c597dc1 100644 --- a/gensim/sklearn_api/tfidf.py +++ b/gensim/sklearn_api/tfidf.py @@ -21,8 +21,8 @@ class TfIdfTransformer(TransformerMixin, BaseEstimator): Base Tf-Idf module """ - def __init__(self, corpus=None, id2word=None, dictionary=None, - wlocal=gensim.utils.identity, wglobal=gensim.models.tfidfmodel.df2idf, normalize=True): + def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, + wglobal=gensim.models.tfidfmodel.df2idf, normalize=True): """ Sklearn wrapper for Tf-Idf model. """ @@ -46,7 +46,9 @@ def transform(self, docs): Return the transformed documents after multiplication with the tf-idf matrix. """ if self.gensim_model is None: - raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.") + raise NotFittedError( + "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." + ) # input as python lists check = lambda x: [x] if isinstance(x[0], tuple) else x diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index 32d3e2ffa7..6ddea2eb90 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -23,10 +23,9 @@ class W2VTransformer(TransformerMixin, BaseEstimator): Base Word2Vec module """ - def __init__(self, size=100, alpha=0.025, window=5, min_count=5, - max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=10000): + def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, + workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, + trim_rule=None, sorted_vocab=1, batch_words=10000): """ Sklearn wrapper for Word2Vec model. See gensim.models.Word2Vec for parameter details. """ @@ -56,12 +55,14 @@ def fit(self, X, y=None): Fit the model according to the given training data. 
Calls gensim.models.Word2Vec """ - self.gensim_model = models.Word2Vec(sentences=X, size=self.size, alpha=self.alpha, + self.gensim_model = models.Word2Vec( + sentences=X, size=self.size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, sg=self.sg, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, hashfxn=self.hashfxn, iter=self.iter, null_word=self.null_word, trim_rule=self.trim_rule, - sorted_vocab=self.sorted_vocab, batch_words=self.batch_words) + sorted_vocab=self.sorted_vocab, batch_words=self.batch_words + ) return self def transform(self, words): @@ -69,7 +70,9 @@ def transform(self, words): Return the word-vectors for the input list of words. """ if self.gensim_model is None: - raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.") + raise NotFittedError( + "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." + ) # The input as array of array check = lambda x: [x] if isinstance(x, six.string_types) else x @@ -83,4 +86,7 @@ def transform(self, words): return np.reshape(np.array(X), (len(words), self.size)) def partial_fit(self, X): - raise NotImplementedError("'partial_fit' has not been implemented for W2VTransformer. However, the model can be updated with a fixed vocabulary using Gensim API call.") + raise NotImplementedError( + "'partial_fit' has not been implemented for W2VTransformer. " + "However, the model can be updated with a fixed vocabulary using Gensim API call." + ) diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py index d634a32b54..1fb11a8d77 100644 --- a/gensim/summarization/bm25.py +++ b/gensim/summarization/bm25.py @@ -18,7 +18,7 @@ class BM25(object): def __init__(self, corpus): self.corpus_size = len(corpus) - self.avgdl = sum(map(lambda x: float(len(x)), corpus)) / self.corpus_size + self.avgdl = sum(float(len(x)) for x in corpus) / self.corpus_size self.corpus = corpus self.f = [] self.df = {} @@ -62,7 +62,7 @@ def get_scores(self, document, average_idf): def get_bm25_weights(corpus): bm25 = BM25(corpus) - average_idf = sum(map(lambda k: float(bm25.idf[k]), bm25.idf.keys())) / len(bm25.idf.keys()) + average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf) weights = [] for doc in corpus: diff --git a/gensim/summarization/graph.py b/gensim/summarization/graph.py index 8424873e35..c35a59a25d 100644 --- a/gensim/summarization/graph.py +++ b/gensim/summarization/graph.py @@ -242,8 +242,7 @@ def del_edge(self, edge): self.del_edge_labeling((v, u)) def del_edge_labeling(self, edge): - keys = [edge] - keys.append(edge[::-1]) + keys = [edge, edge[::-1]] for key in keys: for mapping in [self.edge_properties, self.edge_attr]: diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py index b24e6f1f04..1630c9389d 100644 --- a/gensim/summarization/keywords.py +++ b/gensim/summarization/keywords.py @@ -30,7 +30,7 @@ def _get_pos_filters(): return frozenset(INCLUDING_FILTER), frozenset(EXCLUDING_FILTER) -def _get_words_for_graph(tokens, pos_filter): +def _get_words_for_graph(tokens, pos_filter=None): if pos_filter is None: include_filters, exclude_filters = _get_pos_filters() else: @@ -97,7 +97,7 @@ def _process_text(graph, tokens, split_text): def _queue_iterator(queue): iterations = queue.qsize() - for i in xrange(iterations): + for _ in 
xrange(iterations): var = queue.get() yield var queue.put(var) @@ -197,7 +197,8 @@ def _format_results(_keywords, combined_keywords, split, scores): return "\n".join(combined_keywords) -def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=['NN', 'JJ'], lemmatize=False, deacc=True): +def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'), + lemmatize=False, deacc=True): # Gets a dict of word -> lemma text = to_unicode(text) tokens = _clean_text_by_word(text, deacc=deacc) diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index c067c23faf..3307a2280f 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -151,7 +151,7 @@ def summarize_corpus(corpus, ratio=0.2): # Warns the user if there are too few documents. if len(corpus) < INPUT_MIN_LENGTH: - logger.warning("Input corpus is expected to have at least " + str(INPUT_MIN_LENGTH) + " documents.") + logger.warning("Input corpus is expected to have at least %d documents.", INPUT_MIN_LENGTH) graph = _build_graph(hashable_corpus) _set_graph_edge_weights(graph) @@ -205,7 +205,7 @@ def summarize(text, ratio=0.2, word_count=None, split=False): # Warns if the text is too short. if len(sentences) < INPUT_MIN_LENGTH: - logger.warning("Input text is expected to have at least " + str(INPUT_MIN_LENGTH) + " sentences.") + logger.warning("Input text is expected to have at least %d sentences.", INPUT_MIN_LENGTH) corpus = _build_corpus(sentences) diff --git a/gensim/summarization/textcleaner.py b/gensim/summarization/textcleaner.py index 404c44b18e..fa6a56b887 100644 --- a/gensim/summarization/textcleaner.py +++ b/gensim/summarization/textcleaner.py @@ -97,7 +97,7 @@ def clean_text_by_word(text, deacc=True): else: tags = None units = merge_syntactic_units(original_words, filtered_words, tags) - return dict((unit.text, unit) for unit in units) + return {unit.text: unit for unit in units} def tokenize_by_word(text): diff --git a/gensim/test/basetests.py b/gensim/test/basetmtests.py similarity index 91% rename from gensim/test/basetests.py rename to gensim/test/basetmtests.py index a22bfe2d30..e8cb1d259d 100644 --- a/gensim/test/basetests.py +++ b/gensim/test/basetmtests.py @@ -13,27 +13,27 @@ class TestBaseTopicModel(object): - def testPrintTopic(self): + def test_print_topic(self): topics = self.model.show_topics(formatted=True) for topic_no, topic in topics: self.assertTrue(isinstance(topic_no, int)) self.assertTrue(isinstance(topic, str) or isinstance(topic, unicode)) # noqa:F821 - def testPrintTopics(self): + def test_print_topics(self): topics = self.model.print_topics() for topic_no, topic in topics: self.assertTrue(isinstance(topic_no, int)) self.assertTrue(isinstance(topic, str) or isinstance(topic, unicode)) # noqa:F821 - def testShowTopic(self): + def test_show_topic(self): topic = self.model.show_topic(1) for k, v in topic: self.assertTrue(isinstance(k, six.string_types)) self.assertTrue(isinstance(v, (np.floating, float))) - def testShowTopics(self): + def test_show_topics(self): topics = self.model.show_topics(formatted=False) for topic_no, topic in topics: @@ -43,7 +43,7 @@ def testShowTopics(self): self.assertTrue(isinstance(k, six.string_types)) self.assertTrue(isinstance(v, (np.floating, float))) - def testGetTopics(self): + def test_get_topics(self): topics = self.model.get_topics() vocab_size = len(self.model.id2word) for topic in topics: diff --git a/gensim/test/simspeed.py b/gensim/test/simspeed.py index 
4c1ffcab6f..7ba25fc2ea 100755 --- a/gensim/test/simspeed.py +++ b/gensim/test/simspeed.py @@ -27,7 +27,7 @@ if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) - logging.info("running %s" % " ".join(sys.argv)) + logging.info("running %s", " ".join(sys.argv)) # check and process cmdline input program = os.path.basename(sys.argv[0]) @@ -54,8 +54,10 @@ # because it needs to convert sparse vecs to np arrays and normalize them to # unit length=extra work, which #3 avoids. query = list(itertools.islice(corpus_dense, 1000)) - logging.info("test 1 (dense): dense corpus of %i docs vs. index (%i documents, %i dense features)" % - (len(query), len(index_dense), index_dense.num_features)) + logging.info( + "test 1 (dense): dense corpus of %i docs vs. index (%i documents, %i dense features)", + len(query), len(index_dense), index_dense.num_features + ) for chunksize in [1, 4, 8, 16, 64, 128, 256, 512, 1024]: start = time() if chunksize > 1: @@ -68,13 +70,17 @@ assert len(sims) == len(query) # make sure we have one result for each query document taken = time() - start queries = math.ceil(1.0 * len(query) / chunksize) - logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)" % - (chunksize, taken, len(query) / taken, queries / taken)) + logging.info( + "chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)", + chunksize, taken, len(query) / taken, queries / taken + ) # Same comment as for test #1 but vs. test #4. query = list(itertools.islice(corpus_sparse, 1000)) - logging.info("test 2 (sparse): sparse corpus of %i docs vs. sparse index (%i documents, %i features, %.2f%% density)" % - (len(query), len(corpus_sparse), index_sparse.index.shape[1], density)) + logging.info( + "test 2 (sparse): sparse corpus of %i docs vs. sparse index (%i documents, %i features, %.2f%% density)", + len(query), len(corpus_sparse), index_sparse.index.shape[1], density + ) for chunksize in [1, 5, 10, 100, 500, 1000]: start = time() if chunksize > 1: @@ -87,28 +93,37 @@ assert len(sims) == len(query) # make sure we have one result for each query document taken = time() - start queries = math.ceil(1.0 * len(query) / chunksize) - logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)" % - (chunksize, taken, len(query) / taken, queries / taken)) - - logging.info("test 3 (dense): similarity of all vs. all (%i documents, %i dense features)" % - (len(corpus_dense), index_dense.num_features)) + logging.info( + "chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)", + chunksize, taken, len(query) / taken, queries / taken + ) + + logging.info( + "test 3 (dense): similarity of all vs. all (%i documents, %i dense features)", + len(corpus_dense), index_dense.num_features + ) for chunksize in [0, 1, 4, 8, 16, 64, 128, 256, 512, 1024]: index_dense.chunksize = chunksize start = time() # `sims` stores the entire N x N sim matrix in memory! 
# this is not necessary, but i added it to test the accuracy of the result # (=report mean diff below) - sims = [sim for sim in index_dense] + sims = list(index_dense) taken = time() - start sims = np.asarray(sims) if chunksize == 0: - logging.info("chunksize=%i, time=%.4fs (%.2f docs/s)" % (chunksize, taken, len(corpus_dense) / taken)) + logging.info( + "chunksize=%i, time=%.4fs (%.2f docs/s)", + chunksize, taken, len(corpus_dense) / taken + ) unchunksizeed = sims else: queries = math.ceil(1.0 * len(corpus_dense) / chunksize) diff = np.mean(np.abs(unchunksizeed - sims)) - logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s), meandiff=%.3e" % - (chunksize, taken, len(corpus_dense) / taken, queries / taken, diff)) + logging.info( + "chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s), meandiff=%.3e", + chunksize, taken, len(corpus_dense) / taken, queries / taken, diff + ) del sims index_dense.num_best = 10 @@ -116,32 +131,41 @@ for chunksize in [0, 1, 4, 8, 16, 64, 128, 256, 512, 1024]: index_dense.chunksize = chunksize start = time() - sims = [sim for sim in index_dense] + sims = list(index_dense) taken = time() - start if chunksize == 0: queries = len(corpus_dense) else: queries = math.ceil(1.0 * len(corpus_dense) / chunksize) - logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)" % - (chunksize, taken, len(corpus_dense) / taken, queries / taken)) + logging.info( + "chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)", + chunksize, taken, len(corpus_dense) / taken, queries / taken + ) index_dense.num_best = None - logging.info("test 5 (sparse): similarity of all vs. all (%i documents, %i features, %.2f%% density)" % - (len(corpus_sparse), index_sparse.index.shape[1], density)) + logging.info( + "test 5 (sparse): similarity of all vs. 
all (%i documents, %i features, %.2f%% density)", + len(corpus_sparse), index_sparse.index.shape[1], density + ) for chunksize in [0, 5, 10, 100, 500, 1000, 5000]: index_sparse.chunksize = chunksize start = time() - sims = [sim for sim in index_sparse] + sims = list(index_sparse) taken = time() - start sims = np.asarray(sims) if chunksize == 0: - logging.info("chunksize=%i, time=%.4fs (%.2f docs/s)" % (chunksize, taken, len(corpus_sparse) / taken)) + logging.info( + "chunksize=%i, time=%.4fs (%.2f docs/s)", + chunksize, taken, len(corpus_sparse) / taken + ) unchunksizeed = sims else: queries = math.ceil(1.0 * len(corpus_sparse) / chunksize) diff = np.mean(np.abs(unchunksizeed - sims)) - logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s), meandiff=%.3e" % - (chunksize, taken, len(corpus_sparse) / taken, queries / taken, diff)) + logging.info( + "chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s), meandiff=%.3e", + chunksize, taken, len(corpus_sparse) / taken, queries / taken, diff + ) del sims index_sparse.num_best = 10 @@ -149,14 +173,16 @@ for chunksize in [0, 5, 10, 100, 500, 1000, 5000]: index_sparse.chunksize = chunksize start = time() - sims = [sim for sim in index_sparse] + sims = list(index_sparse) taken = time() - start if chunksize == 0: queries = len(corpus_sparse) else: queries = math.ceil(1.0 * len(corpus_sparse) / chunksize) - logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)" % - (chunksize, taken, len(corpus_sparse) / taken, queries / taken)) + logging.info( + "chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)", + chunksize, taken, len(corpus_sparse) / taken, queries / taken + ) index_sparse.num_best = None - logging.info("finished running %s" % program) + logging.info("finished running %s", program) diff --git a/gensim/test/simspeed2.py b/gensim/test/simspeed2.py index 334730a6f1..931caef950 100755 --- a/gensim/test/simspeed2.py +++ b/gensim/test/simspeed2.py @@ -25,7 +25,7 @@ if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) - logging.info("running %s" % " ".join(sys.argv)) + logging.info("running %s", " ".join(sys.argv)) # check and process cmdline input program = os.path.basename(sys.argv[0]) @@ -47,8 +47,10 @@ density = 100.0 * sum(shard.num_nnz for shard in index_sparse.shards) / (len(index_sparse) * sparse_features) - logging.info("test 1 (dense): similarity of all vs. all (%i documents, %i dense features)" % - (len(corpus_dense), index_dense.num_features)) + logging.info( + "test 1 (dense): similarity of all vs. 
all (%i documents, %i dense features)", + len(corpus_dense), index_dense.num_features + ) for chunksize in [1, 8, 32, 64, 128, 256, 512, 1024, index_dense.shardsize]: index_dense.chunksize = chunksize start = time() @@ -56,8 +58,10 @@ pass taken = time() - start queries = math.ceil(1.0 * len(corpus_dense) / chunksize) - logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)" % - (chunksize, taken, len(corpus_dense) / taken, queries / taken)) + logging.info( + "chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)", + chunksize, taken, len(corpus_dense) / taken, queries / taken + ) index_dense.num_best = 10 logging.info("test 2 (dense): as above, but only ask for the top-10 most similar for each document") @@ -67,12 +71,17 @@ sims = [sim for sim in index_dense] taken = time() - start queries = math.ceil(1.0 * len(corpus_dense) / chunksize) - logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)" % - (chunksize, taken, len(corpus_dense) / taken, queries / taken)) + logging.info( + "chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)", + chunksize, taken, len(corpus_dense) / taken, queries / taken + ) index_dense.num_best = None - logging.info("test 3 (sparse): similarity of all vs. all (%i documents, %i features, %.2f%% density)" % - (len(corpus_sparse), index_sparse.num_features, density)) + logging.info( + "test 3 (sparse): similarity of all vs. all (%i documents, %i features, %.2f%% density)", + len(corpus_sparse), index_sparse.num_features, density + ) + for chunksize in [1, 5, 10, 100, 256, 500, 1000, index_sparse.shardsize]: index_sparse.chunksize = chunksize start = time() @@ -80,8 +89,10 @@ pass taken = time() - start queries = math.ceil(1.0 * len(corpus_sparse) / chunksize) - logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)" % - (chunksize, taken, len(corpus_sparse) / taken, queries / taken)) + logging.info( + "chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)", + chunksize, taken, len(corpus_sparse) / taken, queries / taken + ) index_sparse.num_best = 10 logging.info("test 4 (sparse): as above, but only ask for the top-10 most similar for each document") @@ -92,8 +103,10 @@ pass taken = time() - start queries = math.ceil(1.0 * len(corpus_sparse) / chunksize) - logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)" % - (chunksize, taken, len(corpus_sparse) / taken, queries / taken)) + logging.info( + "chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)", + chunksize, taken, len(corpus_sparse) / taken, queries / taken + ) index_sparse.num_best = None # Difference between test #5 and test #1 is that the query in #5 is a gensim iterable @@ -101,8 +114,10 @@ # because it needs to convert sparse vecs to numpy arrays and normalize them to # unit length=extra work, which #1 avoids. query = list(itertools.islice(corpus_dense, 1000)) - logging.info("test 5 (dense): dense corpus of %i docs vs. index (%i documents, %i dense features)" % - (len(query), len(index_dense), index_dense.num_features)) + logging.info( + "test 5 (dense): dense corpus of %i docs vs. 
index (%i documents, %i dense features)", + len(query), len(index_dense), index_dense.num_features + ) for chunksize in [1, 8, 32, 64, 128, 256, 512, 1024]: start = time() if chunksize > 1: @@ -114,13 +129,17 @@ _ = index_dense[vec] taken = time() - start queries = math.ceil(1.0 * len(query) / chunksize) - logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)" % - (chunksize, taken, len(query) / taken, queries / taken)) + logging.info( + "chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)", + chunksize, taken, len(query) / taken, queries / taken + ) # Same comment as for test #5. query = list(itertools.islice(corpus_dense, 1000)) - logging.info("test 6 (sparse): sparse corpus of %i docs vs. sparse index (%i documents, %i features, %.2f%% density)" % - (len(query), len(corpus_sparse), index_sparse.num_features, density)) + logging.info( + "test 6 (sparse): sparse corpus of %i docs vs. sparse index (%i documents, %i features, %.2f%% density)", + len(query), len(corpus_sparse), index_sparse.num_features, density + ) for chunksize in [1, 5, 10, 100, 500, 1000]: start = time() if chunksize > 1: @@ -132,7 +151,9 @@ _ = index_sparse[vec] taken = time() - start queries = math.ceil(1.0 * len(query) / chunksize) - logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)" % - (chunksize, taken, len(query) / taken, queries / taken)) + logging.info( + "chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)", + chunksize, taken, len(query) / taken, queries / taken + ) - logging.info("finished running %s" % program) + logging.info("finished running %s", program) diff --git a/gensim/test/svd_error.py b/gensim/test/svd_error.py index 4f204c1147..e6ab11bb78 100755 --- a/gensim/test/svd_error.py +++ b/gensim/test/svd_error.py @@ -51,7 +51,7 @@ def norm2(a): """Spectral norm ("norm 2") of a symmetric matrix `a`.""" if COMPUTE_NORM2: - logging.info("computing spectral norm of a %s matrix" % str(a.shape)) + logging.info("computing spectral norm of a %s matrix", str(a.shape)) return scipy.linalg.eigvalsh(a).max() # much faster than np.linalg.norm(2) else: return np.nan @@ -65,8 +65,10 @@ def print_error(name, aat, u, s, ideal_nf, ideal_n2): err = -np.dot(u, np.dot(np.diag(s), u.T)) err += aat nf, n2 = np.linalg.norm(err), norm2(err) - print('%s error: norm_frobenius=%f (/ideal=%g), norm2=%f (/ideal=%g), RMSE=%g' % - (name, nf, nf / ideal_nf, n2, n2 / ideal_n2, rmse(err))) + print( + '%s error: norm_frobenius=%f (/ideal=%g), norm2=%f (/ideal=%g), RMSE=%g' % + (name, nf, nf / ideal_nf, n2, n2 / ideal_n2, rmse(err)) + ) sys.stdout.flush() @@ -82,7 +84,7 @@ def __iter__(self): if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) - logging.info("running %s" % " ".join(sys.argv)) + logging.info("running %s", " ".join(sys.argv)) program = os.path.basename(sys.argv[0]) # do we have enough cmd line arguments? 
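The hunks in these benchmark and test scripts swap eager `%` interpolation inside logging calls for lazy argument passing. A minimal sketch of the difference, using only the standard-library `logging` module (the logger name and values below are illustrative, not taken from gensim):

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

n_docs = 1000  # illustrative value

# Eager: the message is formatted before logger.info() is even called,
# so the interpolation work happens even when INFO records are filtered out.
logger.info("processed %i documents" % n_docs)

# Lazy: logging formats the message only if a handler actually emits the
# record, which is the style these hunks convert to.
logger.info("processed %i documents", n_docs)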
@@ -105,7 +107,7 @@ def __iter__(self): m = int(sys.argv[3]) else: m = mm.num_terms - logging.info("using %i documents and %i features" % (n, m)) + logging.info("using %i documents and %i features", n, m) corpus = ClippedCorpus(mm, n, m) id2word = gensim.utils.FakeDict(m) @@ -137,7 +139,7 @@ def __iter__(self): print_error("baseline", aat, np.zeros((m, factors)), np.zeros((factors)), ideal_fro, ideal_n2) if sparsesvd: - logging.info("computing SVDLIBC SVD for %i factors" % (factors)) + logging.info("computing SVDLIBC SVD for %i factors", factors) taken = time.time() corpus_ram = gensim.matutils.corpus2csc(corpus, num_terms=m) ut, s, vt = sparsesvd(corpus_ram, factors) @@ -152,30 +154,41 @@ def __iter__(self): del u for power_iters in POWER_ITERS: for chunksize in CHUNKSIZE: - logging.info("computing incremental SVD for %i factors, %i power iterations, chunksize %i" % - (factors, power_iters, chunksize)) + logging.info( + "computing incremental SVD for %i factors, %i power iterations, chunksize %i", + factors, power_iters, chunksize + ) taken = time.time() gensim.models.lsimodel.P2_EXTRA_ITERS = power_iters - model = gensim.models.LsiModel(corpus, id2word=id2word, num_topics=factors, - chunksize=chunksize, power_iters=power_iters) + model = gensim.models.LsiModel( + corpus, id2word=id2word, num_topics=factors, + chunksize=chunksize, power_iters=power_iters + ) taken = time.time() - taken u, s = model.projection.u.astype(np.float32), model.projection.s.astype(np.float32)**2 del model - print("incremental SVD for %i factors, %i power iterations, chunksize %i took %s s (spectrum %f .. %f)" % - (factors, power_iters, chunksize, taken, s[0], s[-1])) + print( + "incremental SVD for %i factors, %i power iterations, " + "chunksize %i took %s s (spectrum %f .. %f)" % + (factors, power_iters, chunksize, taken, s[0], s[-1]) + ) print_error('incremental SVD', aat, u, s, ideal_fro, ideal_n2) del u - logging.info("computing multipass SVD for %i factors, %i power iterations" % - (factors, power_iters,)) + logging.info("computing multipass SVD for %i factors, %i power iterations", factors, power_iters) taken = time.time() - model = gensim.models.LsiModel(corpus, id2word=id2word, num_topics=factors, chunksize=2000, - onepass=False, power_iters=power_iters) + model = gensim.models.LsiModel( + corpus, id2word=id2word, num_topics=factors, chunksize=2000, + onepass=False, power_iters=power_iters + ) taken = time.time() - taken u, s = model.projection.u.astype(np.float32), model.projection.s.astype(np.float32)**2 del model - print("multipass SVD for %i factors, %i power iterations took %s s (spectrum %f .. %f)" % - (factors, power_iters, taken, s[0], s[-1])) + print( + "multipass SVD for %i factors, " + "%i power iterations took %s s (spectrum %f .. 
%f)" % + (factors, power_iters, taken, s[0], s[-1]) + ) print_error('multipass SVD', aat, u, s, ideal_fro, ideal_n2) del u - logging.info("finished running %s" % program) + logging.info("finished running %s", program) diff --git a/gensim/test/test_atmodel.py b/gensim/test/test_atmodel.py index 17cf1619f9..97bffde623 100644 --- a/gensim/test/test_atmodel.py +++ b/gensim/test/test_atmodel.py @@ -26,7 +26,7 @@ from gensim.corpora import mmcorpus, Dictionary from gensim.models import atmodel from gensim import matutils -from gensim.test import basetests +from gensim.test import basetmtests # TODO: # Test that computing the bound on new unseen documents works as expected (this is somewhat different @@ -40,23 +40,38 @@ datapath = lambda fname: os.path.join(module_path, 'test_data', fname) # set up vars used in testing ("Deerwester" from the web tutorial) -texts = [['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey']] +texts = [ + ['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'] +] dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] # Assign some authors randomly to the documents above. -author2doc = {'john': [0, 1, 2, 3, 4, 5, 6], 'jane': [2, 3, 4, 5, 6, 7, 8], 'jack': [0, 2, 4, 6, 8], 'jill': [1, 3, 5, 7]} -doc2author = {0: ['john', 'jack'], 1: ['john', 'jill'], 2: ['john', 'jane', 'jack'], 3: ['john', 'jane', 'jill'], - 4: ['john', 'jane', 'jack'], 5: ['john', 'jane', 'jill'], 6: ['john', 'jane', 'jack'], 7: ['jane', 'jill'], - 8: ['jane', 'jack']} +author2doc = { + 'john': [0, 1, 2, 3, 4, 5, 6], + 'jane': [2, 3, 4, 5, 6, 7, 8], + 'jack': [0, 2, 4, 6, 8], + 'jill': [1, 3, 5, 7] +} +doc2author = { + 0: ['john', 'jack'], + 1: ['john', 'jill'], + 2: ['john', 'jane', 'jack'], + 3: ['john', 'jane', 'jill'], + 4: ['john', 'jane', 'jack'], + 5: ['john', 'jane', 'jill'], + 6: ['john', 'jane', 'jack'], + 7: ['jane', 'jill'], + 8: ['jane', 'jack'] +} # More data with new and old authors (to test update method). 
# Although the text is just a subset of the previous, the model @@ -73,7 +88,7 @@ def testfile(test_fname=''): return os.path.join(tempfile.gettempdir(), fname) -class TestAuthorTopicModel(unittest.TestCase, basetests.TestBaseTopicModel): +class TestAuthorTopicModel(unittest.TestCase, basetmtests.TestBaseTopicModel): def setUp(self): self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) self.class_ = atmodel.AuthorTopicModel @@ -101,8 +116,10 @@ def testTransform(self): passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1) # must contain the same values, up to re-ordering if passed: break - logging.warning("Author-topic model failed to converge on attempt %i (got %s, expected %s)" % - (i, sorted(vec), sorted(expected))) + logging.warning( + "Author-topic model failed to converge on attempt %i (got %s, expected %s)", + i, sorted(vec), sorted(expected) + ) self.assertTrue(passed) def testBasic(self): @@ -117,8 +134,14 @@ def testBasic(self): def testAuthor2docMissing(self): # Check that the results are the same if author2doc is constructed automatically from doc2author. - model = self.class_(corpus, author2doc=author2doc, doc2author=doc2author, id2word=dictionary, num_topics=2, random_state=0) - model2 = self.class_(corpus, doc2author=doc2author, id2word=dictionary, num_topics=2, random_state=0) + model = self.class_( + corpus, author2doc=author2doc, doc2author=doc2author, + id2word=dictionary, num_topics=2, random_state=0 + ) + model2 = self.class_( + corpus, doc2author=doc2author, id2word=dictionary, + num_topics=2, random_state=0 + ) # Compare Jill's topics before in both models. jill_topics = model.get_author_topics('jill') @@ -129,8 +152,14 @@ def testAuthor2docMissing(self): def testDoc2authorMissing(self): # Check that the results are the same if doc2author is constructed automatically from author2doc. - model = self.class_(corpus, author2doc=author2doc, doc2author=doc2author, id2word=dictionary, num_topics=2, random_state=0) - model2 = self.class_(corpus, author2doc=author2doc, id2word=dictionary, num_topics=2, random_state=0) + model = self.class_( + corpus, author2doc=author2doc, doc2author=doc2author, + id2word=dictionary, num_topics=2, random_state=0 + ) + model2 = self.class_( + corpus, author2doc=author2doc, id2word=dictionary, + num_topics=2, random_state=0 + ) # Compare Jill's topics before in both models. jill_topics = model.get_author_topics('jill') @@ -185,7 +214,10 @@ def testUpdateNewDataNewAuthor(self): def testSerialized(self): # Test the model using serialized corpora. Basic tests, plus test of update functionality. 
- model = self.class_(self.corpus, author2doc=author2doc, id2word=dictionary, num_topics=2, serialized=True, serialization_path=datapath('testcorpus_serialization.mm')) + model = self.class_( + self.corpus, author2doc=author2doc, id2word=dictionary, num_topics=2, + serialized=True, serialization_path=datapath('testcorpus_serialization.mm') + ) jill_topics = model.get_author_topics('jill') jill_topics = matutils.sparse2full(jill_topics, model.num_topics) @@ -216,7 +248,10 @@ def testTransformSerialized(self): # better random initialization for i in range(25): # restart at most 5 times # create the transformation model - model = self.class_(id2word=dictionary, num_topics=2, passes=100, random_state=0, serialized=True, serialization_path=datapath('testcorpus_serialization.mm')) + model = self.class_( + id2word=dictionary, num_topics=2, passes=100, random_state=0, + serialized=True, serialization_path=datapath('testcorpus_serialization.mm') + ) model.update(self.corpus, author2doc) jill_topics = model.get_author_topics('jill') @@ -234,13 +269,21 @@ def testTransformSerialized(self): remove(datapath('testcorpus_serialization.mm')) if passed: break - logging.warning("Author-topic model failed to converge on attempt %i (got %s, expected %s)" % - (i, sorted(vec), sorted(expected))) + logging.warning( + "Author-topic model failed to converge on attempt %i (got %s, expected %s)", + i, sorted(vec), sorted(expected) + ) self.assertTrue(passed) def testAlphaAuto(self): - model1 = self.class_(corpus, author2doc=author2doc, id2word=dictionary, alpha='symmetric', passes=10, num_topics=2) - modelauto = self.class_(corpus, author2doc=author2doc, id2word=dictionary, alpha='auto', passes=10, num_topics=2) + model1 = self.class_( + corpus, author2doc=author2doc, id2word=dictionary, + alpha='symmetric', passes=10, num_topics=2 + ) + modelauto = self.class_( + corpus, author2doc=author2doc, id2word=dictionary, + alpha='auto', passes=10, num_topics=2 + ) # did we learn something? self.assertFalse(all(np.equal(model1.alpha, modelauto.alpha))) @@ -301,8 +344,14 @@ def testAlpha(self): self.assertRaises(ValueError, self.class_, **kwargs) def testEtaAuto(self): - model1 = self.class_(corpus, author2doc=author2doc, id2word=dictionary, eta='symmetric', passes=10, num_topics=2) - modelauto = self.class_(corpus, author2doc=author2doc, id2word=dictionary, eta='auto', passes=10, num_topics=2) + model1 = self.class_( + corpus, author2doc=author2doc, id2word=dictionary, + eta='symmetric', passes=10, num_topics=2 + ) + modelauto = self.class_( + corpus, author2doc=author2doc, id2word=dictionary, + eta='auto', passes=10, num_topics=2 + ) # did we learn something? 
self.assertFalse(all(np.equal(model1.eta, modelauto.eta))) @@ -388,7 +437,10 @@ def testGetTopicTerms(self): def testGetAuthorTopics(self): - model = self.class_(corpus, author2doc=author2doc, id2word=dictionary, num_topics=2, passes=100, random_state=np.random.seed(0)) + model = self.class_( + corpus, author2doc=author2doc, id2word=dictionary, num_topics=2, + passes=100, random_state=np.random.seed(0) + ) author_topics = [] for a in model.id2author.values(): @@ -402,7 +454,10 @@ def testGetAuthorTopics(self): def testTermTopics(self): - model = self.class_(corpus, author2doc=author2doc, id2word=dictionary, num_topics=2, passes=100, random_state=np.random.seed(0)) + model = self.class_( + corpus, author2doc=author2doc, id2word=dictionary, num_topics=2, + passes=100, random_state=np.random.seed(0) + ) # check with word_type result = model.get_term_topics(2) @@ -436,7 +491,7 @@ def testPasses(self): for test_rhot in test_rhots: model.update(corpus, author2doc) - msg = ", ".join(map(str, [passes, model.num_updates, model.state.numdocs])) + msg = "{}, {}, {}".format(passes, model.num_updates, model.state.numdocs) self.assertAlmostEqual(final_rhot(), test_rhot, msg=msg) self.assertEqual(model.state.numdocs, len(corpus) * len(test_rhots)) diff --git a/gensim/test/test_big.py b/gensim/test/test_big.py index 5e6972bd1f..abf19c63c7 100644 --- a/gensim/test/test_big.py +++ b/gensim/test/test_big.py @@ -38,7 +38,7 @@ def __iter__(self): doc_len = np.random.poisson(self.doc_len) ids = np.random.randint(0, len(self.dictionary), doc_len) if self.words_only: - yield [str(id) for id in ids] + yield [str(idx) for idx in ids] else: weights = np.random.poisson(3, doc_len) yield sorted(zip(ids, weights)) @@ -53,21 +53,21 @@ def testWord2Vec(self): model = gensim.models.Word2Vec(corpus, size=300, workers=4) model.save(testfile(), ignore=['syn1']) del model - model = gensim.models.Word2Vec.load(testfile()) + gensim.models.Word2Vec.load(testfile()) def testLsiModel(self): corpus = BigCorpus(num_docs=50000) model = gensim.models.LsiModel(corpus, num_topics=500, id2word=corpus.dictionary) model.save(testfile()) del model - model = gensim.models.LsiModel.load(testfile()) + gensim.models.LsiModel.load(testfile()) def testLdaModel(self): corpus = BigCorpus(num_docs=5000) model = gensim.models.LdaModel(corpus, num_topics=500, id2word=corpus.dictionary) model.save(testfile()) del model - model = gensim.models.LdaModel.load(testfile()) + gensim.models.LdaModel.load(testfile()) if __name__ == '__main__': diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py index 039db55a48..229d87d1df 100644 --- a/gensim/test/test_coherencemodel.py +++ b/gensim/test/test_coherencemodel.py @@ -53,32 +53,39 @@ def setUp(self): # `topics1` is clearly better as it has a clear distinction between system-human # interaction and graphs. Hence both the coherence measures for `topics1` should be # greater. 
- self.topics1 = [['human', 'computer', 'system', 'interface'], - ['graph', 'minors', 'trees', 'eps']] - self.topics2 = [['user', 'graph', 'minors', 'system'], - ['time', 'graph', 'survey', 'minors']] + self.topics1 = [ + ['human', 'computer', 'system', 'interface'], + ['graph', 'minors', 'trees', 'eps'] + ] + self.topics2 = [ + ['user', 'graph', 'minors', 'system'], + ['time', 'graph', 'survey', 'minors'] + ] self.ldamodel = LdaModel( corpus=self.corpus, id2word=self.dictionary, num_topics=2, - passes=0, iterations=0) + passes=0, iterations=0 + ) mallet_home = os.environ.get('MALLET_HOME', None) self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet') if mallet_home else None if self.mallet_path: self.malletmodel = LdaMallet( mallet_path=self.mallet_path, corpus=self.corpus, - id2word=self.dictionary, num_topics=2, iterations=0) + id2word=self.dictionary, num_topics=2, iterations=0 + ) vw_path = os.environ.get('VOWPAL_WABBIT_PATH', None) if not vw_path: logging.info( - "Environment variable 'VOWPAL_WABBIT_PATH' not specified," - " skipping sanity checks for LDA Model") + "Environment variable 'VOWPAL_WABBIT_PATH' not specified, skipping sanity checks for LDA Model" + ) self.vw_path = None else: self.vw_path = vw_path self.vwmodel = LdaVowpalWabbit( self.vw_path, corpus=self.corpus, id2word=self.dictionary, - num_topics=2, passes=0) + num_topics=2, passes=0 + ) def check_coherence_measure(self, coherence): """Check provided topic coherence algorithm on given topics""" @@ -179,20 +186,24 @@ def testErrors(self): # not providing dictionary self.assertRaises( ValueError, CoherenceModel, topics=self.topics1, corpus=self.corpus, - coherence='u_mass') + coherence='u_mass' + ) # not providing texts for c_v and instead providing corpus self.assertRaises( ValueError, CoherenceModel, topics=self.topics1, corpus=self.corpus, - dictionary=self.dictionary, coherence='c_v') + dictionary=self.dictionary, coherence='c_v' + ) # not providing corpus or texts for u_mass self.assertRaises( ValueError, CoherenceModel, topics=self.topics1, dictionary=self.dictionary, - coherence='u_mass') + coherence='u_mass' + ) def testPersistence(self): fname = testfile() model = CoherenceModel( - topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass') + topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass' + ) model.save(fname) model2 = CoherenceModel.load(fname) self.assertTrue(model.get_coherence() == model2.get_coherence()) @@ -200,7 +211,8 @@ def testPersistence(self): def testPersistenceCompressed(self): fname = testfile() + '.gz' model = CoherenceModel( - topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass') + topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass' + ) model.save(fname) model2 = CoherenceModel.load(fname) self.assertTrue(model.get_coherence() == model2.get_coherence()) @@ -208,7 +220,8 @@ def testPersistenceCompressed(self): def testPersistenceAfterProbabilityEstimationUsingCorpus(self): fname = testfile() model = CoherenceModel( - topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass') + topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass' + ) model.estimate_probabilities() model.save(fname) model2 = CoherenceModel.load(fname) @@ -218,7 +231,8 @@ def testPersistenceAfterProbabilityEstimationUsingCorpus(self): def testPersistenceAfterProbabilityEstimationUsingTexts(self): fname = testfile() model = 
CoherenceModel( - topics=self.topics1, texts=self.texts, dictionary=self.dictionary, coherence='c_v') + topics=self.topics1, texts=self.texts, dictionary=self.dictionary, coherence='c_v' + ) model.estimate_probabilities() model.save(fname) model2 = CoherenceModel.load(fname) diff --git a/gensim/test/test_corpora_dictionary.py b/gensim/test/test_corpora_dictionary.py index e5a5786613..f6c7d8b43c 100644 --- a/gensim/test/test_corpora_dictionary.py +++ b/gensim/test/test_corpora_dictionary.py @@ -43,7 +43,8 @@ def setUp(self): ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey']] + ['graph', 'minors', 'survey'] + ] def testDocFreqOneDoc(self): texts = [['human', 'interface', 'computer']] @@ -102,9 +103,10 @@ def testBuild(self): self.assertEqual(sorted(d.dfs.keys()), expected_keys) self.assertEqual(sorted(d.dfs.values()), expected_values) - expected_keys = sorted(['computer', 'eps', 'graph', 'human', - 'interface', 'minors', 'response', 'survey', - 'system', 'time', 'trees', 'user']) + expected_keys = sorted([ + 'computer', 'eps', 'graph', 'human', 'interface', + 'minors', 'response', 'survey', 'system', 'time', 'trees', 'user' + ]) expected_values = list(range(12)) self.assertEqual(sorted(d.token2id.keys()), expected_keys) self.assertEqual(sorted(d.token2id.values()), expected_values) @@ -127,21 +129,21 @@ def testFilterKeepTokens_keepTokens(self): # provide keep_tokens argument, keep the tokens given d = Dictionary(self.texts) d.filter_extremes(no_below=3, no_above=1.0, keep_tokens=['human', 'survey']) - expected = set(['graph', 'trees', 'human', 'system', 'user', 'survey']) + expected = {'graph', 'trees', 'human', 'system', 'user', 'survey'} self.assertEqual(set(d.token2id.keys()), expected) def testFilterKeepTokens_unchangedFunctionality(self): # do not provide keep_tokens argument, filter_extremes functionality is unchanged d = Dictionary(self.texts) d.filter_extremes(no_below=3, no_above=1.0) - expected = set(['graph', 'trees', 'system', 'user']) + expected = {'graph', 'trees', 'system', 'user'} self.assertEqual(set(d.token2id.keys()), expected) def testFilterKeepTokens_unseenToken(self): # do provide keep_tokens argument with unseen tokens, filter_extremes functionality is unchanged d = Dictionary(self.texts) d.filter_extremes(no_below=3, no_above=1.0, keep_tokens=['unknown_token']) - expected = set(['graph', 'trees', 'system', 'user']) + expected = {'graph', 'trees', 'system', 'user'} self.assertEqual(set(d.token2id.keys()), expected) def testFilterMostFrequent(self): @@ -160,7 +162,8 @@ def testFilterTokens(self): expected = { 'computer': 0, 'eps': 8, 'graph': 10, 'human': 1, 'interface': 2, 'minors': 11, 'response': 3, 'survey': 4, - 'system': 5, 'time': 6, 'trees': 9, 'user': 7} + 'system': 5, 'time': 6, 'trees': 9, 'user': 7 + } del expected[removed_word] self.assertEqual(sorted(d.token2id.keys()), sorted(expected.keys())) @@ -186,7 +189,8 @@ def test_saveAsText(self): small_text = [ ["prvé", "slovo"], ["slovo", "druhé"], - ["druhé", "slovo"]] + ["druhé", "slovo"] + ] d = Dictionary(small_text) @@ -264,7 +268,8 @@ def test_from_corpus(self): "The generation of random binary unordered trees", "The intersection graph of paths in trees", "Graph minors IV Widths of trees and well quasi ordering", - "Graph minors A survey"] + "Graph minors A survey" + ] stoplist = set('for a of the and to in'.split()) texts = [ [word for word in document.lower().split() if word not in stoplist] diff --git a/gensim/test/test_corpora_hashdictionary.py 
b/gensim/test/test_corpora_hashdictionary.py index 6f314d65ee..808246dc59 100644 --- a/gensim/test/test_corpora_hashdictionary.py +++ b/gensim/test/test_corpora_hashdictionary.py @@ -36,7 +36,8 @@ def setUp(self): ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey']] + ['graph', 'minors', 'survey'] + ] def testDocFreqOneDoc(self): texts = [['human', 'interface', 'computer']] @@ -92,7 +93,7 @@ def testDebugMode(self): # two words texts = [['human', 'cat']] d = HashDictionary(texts, debug=True, myhash=zlib.adler32) - expected = {9273: set(['cat']), 31002: set(['human'])} + expected = {9273: {'cat'}, 31002: {'human'}} self.assertEqual(d.id2token, expected) # now the same thing, with debug off @@ -105,8 +106,17 @@ def testRange(self): # all words map to the same id d = HashDictionary(self.texts, id_range=1, debug=True) dfs = {0: 9} - id2token = {0: set(['minors', 'graph', 'system', 'trees', 'eps', 'computer', 'survey', 'user', 'human', 'time', 'interface', 'response'])} - token2id = {'minors': 0, 'graph': 0, 'system': 0, 'trees': 0, 'eps': 0, 'computer': 0, 'survey': 0, 'user': 0, 'human': 0, 'time': 0, 'interface': 0, 'response': 0} + id2token = { + 0: { + 'minors', 'graph', 'system', 'trees', 'eps', 'computer', + 'survey', 'user', 'human', 'time', 'interface', 'response' + } + } + token2id = { + 'minors': 0, 'graph': 0, 'system': 0, 'trees': 0, + 'eps': 0, 'computer': 0, 'survey': 0, 'user': 0, + 'human': 0, 'time': 0, 'interface': 0, 'response': 0 + } self.assertEqual(d.dfs, dfs) self.assertEqual(d.id2token, id2token) self.assertEqual(d.token2id, token2id) @@ -114,29 +124,31 @@ def testRange(self): # 2 ids: 0/1 for even/odd number of bytes in the word d = HashDictionary(self.texts, id_range=2, myhash=lambda key: len(key)) dfs = {0: 7, 1: 7} - id2token = {0: set(['minors', 'system', 'computer', 'survey', 'user', 'time', 'response']), 1: set(['interface', 'graph', 'trees', 'eps', 'human'])} - token2id = {'minors': 0, 'graph': 1, 'system': 0, 'trees': 1, 'eps': 1, 'computer': 0, 'survey': 0, 'user': 0, 'human': 1, 'time': 0, 'interface': 1, 'response': 0} + id2token = { + 0: {'minors', 'system', 'computer', 'survey', 'user', 'time', 'response'}, + 1: {'interface', 'graph', 'trees', 'eps', 'human'} + } + token2id = { + 'minors': 0, 'graph': 1, 'system': 0, 'trees': 1, 'eps': 1, 'computer': 0, + 'survey': 0, 'user': 0, 'human': 1, 'time': 0, 'interface': 1, 'response': 0 + } self.assertEqual(d.dfs, dfs) self.assertEqual(d.id2token, id2token) self.assertEqual(d.token2id, token2id) def testBuild(self): d = HashDictionary(self.texts, myhash=zlib.adler32) - expected = {5232: 2, - 5798: 3, - 10608: 2, - 12466: 2, - 12736: 3, - 15001: 2, - 18451: 3, - 23844: 3, - 28591: 2, - 29104: 2, - 31002: 2, - 31049: 2} + expected = { + 5232: 2, 5798: 3, 10608: 2, 12466: 2, 12736: 3, 15001: 2, + 18451: 3, 23844: 3, 28591: 2, 29104: 2, 31002: 2, 31049: 2 + } self.assertEqual(d.dfs, expected) - expected = {'minors': 15001, 'graph': 18451, 'system': 5798, 'trees': 23844, 'eps': 31049, 'computer': 10608, 'survey': 28591, 'user': 12736, 'human': 31002, 'time': 29104, 'interface': 12466, 'response': 5232} + expected = { + 'minors': 15001, 'graph': 18451, 'system': 5798, 'trees': 23844, + 'eps': 31049, 'computer': 10608, 'survey': 28591, 'user': 12736, + 'human': 31002, 'time': 29104, 'interface': 12466, 'response': 5232 + } for ex in expected: self.assertEqual(d.token2id[ex], expected[ex]) @@ -149,7 +161,10 @@ def testFilter(self): d = HashDictionary(self.texts, 
myhash=zlib.adler32) d.filter_extremes(no_below=0, no_above=0.3) - expected = {29104: 2, 31049: 2, 28591: 2, 5232: 2, 10608: 2, 12466: 2, 15001: 2, 31002: 2} + expected = { + 29104: 2, 31049: 2, 28591: 2, 5232: 2, + 10608: 2, 12466: 2, 15001: 2, 31002: 2 + } self.assertEqual(d.dfs, expected) d = HashDictionary(self.texts, myhash=zlib.adler32) diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index ad4eb4c976..9a66c50b0a 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -229,20 +229,26 @@ def test_dbow_hs(self): def test_dmm_hs(self): """Test DM/mean doc2vec training.""" - model = doc2vec.Doc2Vec(list_corpus, dm=1, dm_mean=1, size=24, window=4, hs=1, negative=0, - alpha=0.05, min_count=2, iter=20) + model = doc2vec.Doc2Vec( + list_corpus, dm=1, dm_mean=1, size=24, window=4, + hs=1, negative=0, alpha=0.05, min_count=2, iter=20 + ) self.model_sanity(model) def test_dms_hs(self): """Test DM/sum doc2vec training.""" - model = doc2vec.Doc2Vec(list_corpus, dm=1, dm_mean=0, size=24, window=4, hs=1, negative=0, - alpha=0.05, min_count=2, iter=20) + model = doc2vec.Doc2Vec( + list_corpus, dm=1, dm_mean=0, size=24, window=4, hs=1, + negative=0, alpha=0.05, min_count=2, iter=20 + ) self.model_sanity(model) def test_dmc_hs(self): """Test DM/concatenate doc2vec training.""" - model = doc2vec.Doc2Vec(list_corpus, dm=1, dm_concat=1, size=24, window=4, hs=1, negative=0, - alpha=0.05, min_count=2, iter=20) + model = doc2vec.Doc2Vec( + list_corpus, dm=1, dm_concat=1, size=24, window=4, + hs=1, negative=0, alpha=0.05, min_count=2, iter=20 + ) self.model_sanity(model) def test_dbow_neg(self): @@ -252,20 +258,26 @@ def test_dbow_neg(self): def test_dmm_neg(self): """Test DM/mean doc2vec training.""" - model = doc2vec.Doc2Vec(list_corpus, dm=1, dm_mean=1, size=24, window=4, hs=0, negative=10, - alpha=0.05, min_count=2, iter=20) + model = doc2vec.Doc2Vec( + list_corpus, dm=1, dm_mean=1, size=24, window=4, hs=0, + negative=10, alpha=0.05, min_count=2, iter=20 + ) self.model_sanity(model) def test_dms_neg(self): """Test DM/sum doc2vec training.""" - model = doc2vec.Doc2Vec(list_corpus, dm=1, dm_mean=0, size=24, window=4, hs=0, negative=10, - alpha=0.05, min_count=2, iter=20) + model = doc2vec.Doc2Vec( + list_corpus, dm=1, dm_mean=0, size=24, window=4, hs=0, + negative=10, alpha=0.05, min_count=2, iter=20 + ) self.model_sanity(model) def test_dmc_neg(self): """Test DM/concatenate doc2vec training.""" - model = doc2vec.Doc2Vec(list_corpus, dm=1, dm_concat=1, size=24, window=4, hs=0, negative=10, - alpha=0.05, min_count=2, iter=20) + model = doc2vec.Doc2Vec( + list_corpus, dm=1, dm_concat=1, size=24, window=4, hs=0, + negative=10, alpha=0.05, min_count=2, iter=20 + ) self.model_sanity(model) def test_parallel(self): @@ -296,10 +308,14 @@ def test_deterministic_neg(self): def test_deterministic_dmc(self): """Test doc2vec results identical with identical RNG seed.""" # bigger, dmc - model = doc2vec.Doc2Vec(DocsLeeCorpus(), dm=1, dm_concat=1, size=24, window=4, hs=1, negative=3, - seed=42, workers=1) - model2 = doc2vec.Doc2Vec(DocsLeeCorpus(), dm=1, dm_concat=1, size=24, window=4, hs=1, negative=3, - seed=42, workers=1) + model = doc2vec.Doc2Vec( + DocsLeeCorpus(), dm=1, dm_concat=1, size=24, + window=4, hs=1, negative=3, seed=42, workers=1 + ) + model2 = doc2vec.Doc2Vec( + DocsLeeCorpus(), dm=1, dm_concat=1, size=24, + window=4, hs=1, negative=3, seed=42, workers=1 + ) self.models_equal(model, model2) def test_mixed_tag_types(self): @@ -341,12 +357,18 @@ def 
test_delete_temporary_training_data(self): self.assertTrue(not hasattr(model, 'syn0_lockf')) self.assertTrue(model.docvecs and not hasattr(model.docvecs, 'doctag_syn0')) self.assertTrue(model.docvecs and not hasattr(model.docvecs, 'doctag_syn0_lockf')) - model = doc2vec.Doc2Vec(list_corpus, dm=1, dm_mean=1, size=24, window=4, hs=1, negative=0, alpha=0.05, min_count=2, iter=20) + model = doc2vec.Doc2Vec( + list_corpus, dm=1, dm_mean=1, size=24, window=4, hs=1, + negative=0, alpha=0.05, min_count=2, iter=20 + ) model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True) self.assertTrue(model.docvecs and hasattr(model.docvecs, 'doctag_syn0')) self.assertTrue(hasattr(model, 'syn1')) self.model_sanity(model, keep_training=False) - model = doc2vec.Doc2Vec(list_corpus, dm=1, dm_mean=1, size=24, window=4, hs=0, negative=1, alpha=0.05, min_count=2, iter=20) + model = doc2vec.Doc2Vec( + list_corpus, dm=1, dm_mean=1, size=24, window=4, hs=0, + negative=1, alpha=0.05, min_count=2, iter=20 + ) model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True) self.model_sanity(model, keep_training=False) self.assertTrue(hasattr(model, 'syn1neg')) @@ -436,11 +458,13 @@ def read_su_sentiment_rotten_tomatoes(dirname, lowercase=True): http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip has been expanded. It's not too big, so compose entirely into memory. """ - logging.info("loading corpus from %s" % dirname) + logging.info("loading corpus from %s", dirname) # many mangled chars in sentences (datasetSentences.txt) - chars_sst_mangled = ['à', 'á', 'â', 'ã', 'æ', 'ç', 'è', 'é', 'í', - 'í', 'ï', 'ñ', 'ó', 'ô', 'ö', 'û', 'ü'] + chars_sst_mangled = [ + 'à', 'á', 'â', 'ã', 'æ', 'ç', 'è', 'é', 'í', + 'í', 'ï', 'ñ', 'ó', 'ô', 'ö', 'û', 'ü' + ] sentence_fixups = [(char.encode('utf-8').decode('latin1'), char) for char in chars_sst_mangled] # more junk, and the replace necessary for sentence-phrase consistency sentence_fixups.extend([ @@ -502,8 +526,10 @@ def read_su_sentiment_rotten_tomatoes(dirname, lowercase=True): assert len([phrase for phrase in phrases if phrase.split == 'test']) == 2210 # 'test' assert len([phrase for phrase in phrases if phrase.split == 'dev']) == 1100 # 'dev' - logging.info("loaded corpus with %i sentences and %i phrases from %s", - len(info_by_sentence), len(phrases), dirname) + logging.info( + "loaded corpus with %i sentences and %i phrases from %s", + len(info_by_sentence), len(phrases), dirname + ) return phrases diff --git a/gensim/test/test_dtm.py b/gensim/test/test_dtm.py index 81b48374ab..231bbb1932 100644 --- a/gensim/test/test_dtm.py +++ b/gensim/test/test_dtm.py @@ -39,7 +39,8 @@ def testDtm(self): model = gensim.models.wrappers.DtmModel( self.dtm_path, self.corpus, self.time_slices, num_topics=2, id2word=self.id2word, model='dtm', initialize_lda=True, - rng_seed=1) + rng_seed=1 + ) topics = model.show_topics(num_topics=2, times=2, num_words=10) self.assertEqual(len(topics), 4) @@ -52,7 +53,8 @@ def testDim(self): model = gensim.models.wrappers.DtmModel( self.dtm_path, self.corpus, self.time_slices, num_topics=2, id2word=self.id2word, model='fixed', initialize_lda=True, - rng_seed=1) + rng_seed=1 + ) topics = model.show_topics(num_topics=2, times=2, num_words=10) self.assertEqual(len(topics), 4) @@ -67,7 +69,8 @@ def testCalledProcessError(self): gensim.models.wrappers.DtmModel( self.dtm_path, self.corpus, self.time_slices, num_topics=2, id2word=self.id2word, model='dtm', initialize_lda=False, - rng_seed=1) + rng_seed=1 + ) if 
__name__ == '__main__': diff --git a/gensim/test/test_fasttext_wrapper.py b/gensim/test/test_fasttext_wrapper.py index 6a7a7b09ed..3c00a8f35a 100644 --- a/gensim/test/test_fasttext_wrapper.py +++ b/gensim/test/test_fasttext_wrapper.py @@ -56,7 +56,8 @@ def testTraining(self): return # Use self.skipTest once python < 2.7 is no longer supported vocab_size, model_size = 1763, 10 trained_model = fasttext.FastText.train( - self.ft_path, self.corpus_file, size=model_size, output_file=testfile()) + self.ft_path, self.corpus_file, size=model_size, output_file=testfile() + ) self.assertEqual(trained_model.wv.syn0.shape, (vocab_size, model_size)) self.assertEqual(len(trained_model.wv.vocab), vocab_size) @@ -72,11 +73,13 @@ def testMinCount(self): logger.info("FT_HOME env variable not set, skipping test") return # Use self.skipTest once python < 2.7 is no longer supported test_model_min_count_5 = fasttext.FastText.train( - self.ft_path, self.corpus_file, output_file=testfile(), size=10, min_count=5) + self.ft_path, self.corpus_file, output_file=testfile(), size=10, min_count=5 + ) self.assertTrue('forests' not in test_model_min_count_5.wv.vocab) test_model_min_count_1 = fasttext.FastText.train( - self.ft_path, self.corpus_file, output_file=testfile(), size=10, min_count=1) + self.ft_path, self.corpus_file, output_file=testfile(), size=10, min_count=1 + ) self.assertTrue('forests' in test_model_min_count_1.wv.vocab) def testModelSize(self): @@ -85,7 +88,8 @@ def testModelSize(self): logger.info("FT_HOME env variable not set, skipping test") return # Use self.skipTest once python < 2.7 is no longer supported test_model_size_20 = fasttext.FastText.train( - self.ft_path, self.corpus_file, output_file=testfile(), size=20) + self.ft_path, self.corpus_file, output_file=testfile(), size=20 + ) self.assertEqual(test_model_size_20.vector_size, 20) self.assertEqual(test_model_size_20.wv.syn0.shape[1], 20) self.assertEqual(test_model_size_20.wv.syn0_all.shape[1], 20) @@ -245,8 +249,10 @@ def testNSimilarity(self): self.assertEqual(self.test_model.n_similarity(['the'], ['and']), self.test_model.n_similarity(['and'], ['the'])) # Out of vocab check self.assertTrue(numpy.allclose(self.test_model.n_similarity(['night', 'nights'], ['nights', 'night']), 1.0)) - self.assertEqual(self.test_model.n_similarity(['night'], ['nights']), - self.test_model.n_similarity(['nights'], ['night'])) + self.assertEqual( + self.test_model.n_similarity(['night'], ['nights']), + self.test_model.n_similarity(['nights'], ['night']) + ) def testSimilarity(self): """Test similarity for in-vocab and out-of-vocab words""" diff --git a/gensim/test/test_glove2word2vec.py b/gensim/test/test_glove2word2vec.py index b638ca927d..2226bf9fd2 100644 --- a/gensim/test/test_glove2word2vec.py +++ b/gensim/test/test_glove2word2vec.py @@ -31,7 +31,10 @@ def setUp(self): self.output_file = testfile() def testConversion(self): - output = check_output(args=['python', '-m', 'gensim.scripts.glove2word2vec', '--input', self.datapath, '--output', self.output_file]) # noqa:F841 + check_output(args=[ + 'python', '-m', 'gensim.scripts.glove2word2vec', + '--input', self.datapath, '--output', self.output_file + ]) # test that the converted model loads successfully try: self.test_model = gensim.models.KeyedVectors.load_word2vec_format(self.output_file) @@ -40,7 +43,9 @@ def testConversion(self): if os.path.isfile(os.path.join(self.output_file)): self.fail('model file %s was created but could not be loaded.' 
% self.output_file) else: - self.fail('model file %s creation failed, check the parameters and input file format.' % self.output_file) + self.fail( + 'model file %s creation failed, check the parameters and input file format.' % self.output_file + ) if __name__ == '__main__': diff --git a/gensim/test/test_hdpmodel.py b/gensim/test/test_hdpmodel.py index 647b31ad7e..b3cf8bdde1 100644 --- a/gensim/test/test_hdpmodel.py +++ b/gensim/test/test_hdpmodel.py @@ -17,7 +17,7 @@ from gensim.corpora import mmcorpus, Dictionary from gensim.models import hdpmodel -from gensim.test import basetests +from gensim.test import basetmtests import numpy as np @@ -26,15 +26,17 @@ # set up vars used in testing ("Deerwester" from the web tutorial) -texts = [['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey']] +texts = [ + ['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'] +] dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] @@ -44,7 +46,7 @@ def testfile(): return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') -class TestHdpModel(unittest.TestCase, basetests.TestBaseTopicModel): +class TestHdpModel(unittest.TestCase, basetmtests.TestBaseTopicModel): def setUp(self): self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) self.class_ = hdpmodel.HdpModel diff --git a/gensim/test/test_keras_integration.py b/gensim/test/test_keras_integration.py index 7f321c6565..996f1b7f7b 100644 --- a/gensim/test/test_keras_integration.py +++ b/gensim/test/test_keras_integration.py @@ -79,7 +79,10 @@ def testEmbeddingLayerCosineSim(self): word_a = 'graph' word_b = 'trees' - output = model.predict([np.asarray([keras_w2v_model.wv.vocab[word_a].index]), np.asarray([keras_w2v_model.wv.vocab[word_b].index])]) + output = model.predict([ + np.asarray([keras_w2v_model.wv.vocab[word_a].index]), + np.asarray([keras_w2v_model.wv.vocab[word_b].index]) + ]) # output is the cosine distance between the two words (as a similarity measure) self.assertTrue(type(output[0][0][0]) == np.float32) # verify that a float is returned @@ -113,7 +116,7 @@ def testEmbeddingLayer20NewsGroup(self): texts_w2v.append(sentence.split(' ')) labels.append(label_id) except Exception: - None + pass # Vectorize the text samples into a 2D integer tensor tokenizer = Tokenizer() @@ -128,11 +131,11 @@ def testEmbeddingLayer20NewsGroup(self): y_train = labels # prepare the embedding layer using the wrapper - Keras_w2v = self.model_twenty_ng - Keras_w2v.build_vocab(texts_w2v) - Keras_w2v.train(texts, total_examples=Keras_w2v.corpus_count, epochs=Keras_w2v.iter) - Keras_w2v_wv = Keras_w2v.wv - embedding_layer = Keras_w2v_wv.get_embedding_layer() + keras_w2v = self.model_twenty_ng + keras_w2v.build_vocab(texts_w2v) + keras_w2v.train(texts, total_examples=keras_w2v.corpus_count, epochs=keras_w2v.iter) + keras_w2v_wv = keras_w2v.wv + embedding_layer = keras_w2v_wv.get_embedding_layer() # create a 1D convnet to solve our classification task sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32') diff --git 
a/gensim/test/test_keywords.py b/gensim/test/test_keywords.py index 8a3be3af5c..76bd448d5c 100644 --- a/gensim/test/test_keywords.py +++ b/gensim/test/test_keywords.py @@ -35,7 +35,7 @@ def test_text_keywords(self): with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kw.txt"), mode="r") as f: kw = f.read().strip().split("\n") - self.assertEqual(set(map(str, generated_keywords)), set(map(str, kw))) + self.assertEqual({str(x) for x in generated_keywords}, {str(x) for x in kw}) def test_text_keywords_words(self): pre_path = os.path.join(os.path.dirname(__file__), 'test_data') @@ -55,13 +55,13 @@ def test_text_keywords_pos(self): text = f.read() # calculate keywords using only certain parts of speech - generated_keywords_NNVBJJ = keywords(text, pos_filter=['NN', 'VB', 'JJ'], ratio=0.3, split=True) + generated_keywords_nnvbjj = keywords(text, pos_filter=['NN', 'VB', 'JJ'], ratio=0.3, split=True) # To be compared to the reference. with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kwpos.txt"), mode="r") as f: kw = f.read().strip().split("\n") - self.assertEqual(set(map(str, generated_keywords_NNVBJJ)), set(map(str, kw))) + self.assertEqual({str(x) for x in generated_keywords_nnvbjj}, {str(x) for x in kw}) def test_text_summarization_raises_exception_on_short_input_text(self): pre_path = os.path.join(os.path.dirname(__file__), 'test_data') diff --git a/gensim/test/test_ldamallet_wrapper.py b/gensim/test/test_ldamallet_wrapper.py index a7d64839bb..5ed4486e16 100644 --- a/gensim/test/test_ldamallet_wrapper.py +++ b/gensim/test/test_ldamallet_wrapper.py @@ -21,21 +21,23 @@ from gensim.models.wrappers import ldamallet from gensim import matutils from gensim.models import ldamodel -from gensim.test import basetests +from gensim.test import basetmtests module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) # set up vars used in testing ("Deerwester" from the web tutorial) -texts = [['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey']] +texts = [ + ['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'] +] dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] @@ -46,7 +48,7 @@ def testfile(): return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') -class TestLdaMallet(unittest.TestCase, basetests.TestBaseTopicModel): +class TestLdaMallet(unittest.TestCase, basetmtests.TestBaseTopicModel): def setUp(self): mallet_home = os.environ.get('MALLET_HOME', None) self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet') if mallet_home else None @@ -72,8 +74,10 @@ def testTransform(self): passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1) # must contain the same values, up to re-ordering if passed: break - logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)" % - (i, sorted(vec), sorted(expected))) + logging.warning( + "LDA failed to converge on attempt %i (got %s, expected %s)", + i, 
sorted(vec), sorted(expected) + ) self.assertTrue(passed) def testSparseTransform(self): @@ -82,7 +86,9 @@ def testSparseTransform(self): passed = False for i in range(5): # restart at most 5 times # create the sparse transformation model with the appropriate topic_threshold - model = ldamallet.LdaMallet(self.mallet_path, corpus, id2word=dictionary, num_topics=2, iterations=200, topic_threshold=0.5) + model = ldamallet.LdaMallet( + self.mallet_path, corpus, id2word=dictionary, num_topics=2, iterations=200, topic_threshold=0.5 + ) # transform one document doc = list(corpus)[0] transformed = model[doc] @@ -91,8 +97,10 @@ def testSparseTransform(self): passed = np.allclose(sorted(vec), sorted(expected), atol=1e-2) # must contain the same values, up to re-ordering if passed: break - logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)" % - (i, sorted(vec), sorted(expected))) + logging.warning( + "LDA failed to converge on attempt %i (got %s, expected %s)", + i, sorted(vec), sorted(expected) + ) self.assertTrue(passed) def testMallet2Model(self): diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py index 71cc10f70c..c1d35c2661 100644 --- a/gensim/test/test_ldamodel.py +++ b/gensim/test/test_ldamodel.py @@ -22,22 +22,24 @@ from gensim.corpora import mmcorpus, Dictionary from gensim.models import ldamodel, ldamulticore from gensim import matutils, utils -from gensim.test import basetests +from gensim.test import basetmtests module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) # set up vars used in testing ("Deerwester" from the web tutorial) -texts = [['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey']] +texts = [ + ['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'] +] dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] @@ -54,7 +56,7 @@ def testRandomState(): assert(isinstance(utils.get_random_state(testcase), np.random.RandomState)) -class TestLdaModel(unittest.TestCase, basetests.TestBaseTopicModel): +class TestLdaModel(unittest.TestCase, basetmtests.TestBaseTopicModel): def setUp(self): self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) self.class_ = ldamodel.LdaModel @@ -79,8 +81,7 @@ def testTransform(self): passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1) # must contain the same values, up to re-ordering if passed: break - logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)" % - (i, sorted(vec), sorted(expected))) + logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)", i, sorted(vec), sorted(expected)) self.assertTrue(passed) def testAlphaAuto(self): @@ -231,7 +232,9 @@ def testGetTopicTerms(self): def testGetDocumentTopics(self): - model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes=100, random_state=np.random.seed(0)) + model = self.class_( + self.corpus, id2word=dictionary, num_topics=2, 
passes=100, random_state=np.random.seed(0) + ) doc_topics = model.get_document_topics(self.corpus) @@ -264,7 +267,9 @@ def testGetDocumentTopics(self): doc_topic_count_na = 0 word_phi_count_na = 0 - all_topics = model.get_document_topics(self.corpus, minimum_probability=0.8, minimum_phi_value=1.0, per_word_topics=True) + all_topics = model.get_document_topics( + self.corpus, minimum_probability=0.8, minimum_phi_value=1.0, per_word_topics=True + ) self.assertEqual(model.state.numdocs, len(corpus)) @@ -313,7 +318,9 @@ def testGetDocumentTopics(self): def testTermTopics(self): - model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes=100, random_state=np.random.seed(0)) + model = self.class_( + self.corpus, id2word=dictionary, num_topics=2, passes=100, random_state=np.random.seed(0) + ) # check with word_type result = model.get_term_topics(2) @@ -355,7 +362,7 @@ def testPasses(self): for test_rhot in test_rhots: model.update(self.corpus) - msg = ", ".join(map(str, [passes, model.num_updates, model.state.numdocs])) + msg = ", ".join(str(x) for x in [passes, model.num_updates, model.state.numdocs]) self.assertAlmostEqual(final_rhot(), test_rhot, msg=msg) self.assertEqual(model.state.numdocs, len(corpus) * len(test_rhots)) @@ -379,7 +386,7 @@ def testPasses(self): # model = self.class_(id2word=dictionary, num_topics=2, passes=200, eta=eta) # model.update(self.corpus) - # topics = [dict((word, p) for p, word in model.show_topic(j, topn=None)) for j in range(2)] + # topics = [{word: p for p, word in model.show_topic(j, topn=None)} for j in range(2)] # # check that the word 'system' in the topic we seeded got a high weight, # # and the word 'trees' (the main word in the other topic) a low weight -- @@ -413,8 +420,8 @@ def testModelCompatibilityWithPythonVersions(self): self.assertTrue(np.allclose(model_2_7.expElogbeta, model_3_5.expElogbeta)) tstvec = [] self.assertTrue(np.allclose(model_2_7[tstvec], model_3_5[tstvec])) # try projecting an empty vector - id2word_2_7 = dict((k, v) for k, v in model_2_7.id2word.iteritems()) - id2word_3_5 = dict((k, v) for k, v in model_3_5.id2word.iteritems()) + id2word_2_7 = dict(model_2_7.id2word.iteritems()) + id2word_3_5 = dict(model_3_5.id2word.iteritems()) self.assertEqual(set(id2word_2_7.keys()), set(id2word_3_5.keys())) def testPersistenceIgnore(self): diff --git a/gensim/test/test_ldaseqmodel.py b/gensim/test/test_ldaseqmodel.py index 4ca7b104fc..d38c01868c 100644 --- a/gensim/test/test_ldaseqmodel.py +++ b/gensim/test/test_ldaseqmodel.py @@ -20,25 +20,175 @@ class TestLdaSeq(unittest.TestCase): # we are setting up a DTM model and fitting it, and checking topic-word and doc-topic results. 
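Several of these test modules now import the renamed `basetmtests` module and mix `TestBaseTopicModel` into their `unittest.TestCase` subclasses. A minimal sketch of that mixin pattern, with purely illustrative class names rather than gensim's own:

import unittest

class SharedTopicModelChecks(object):
    """Reusable checks; the concrete TestCase is expected to set self.model in setUp()."""

    def test_show_topics(self):
        topics = self.model.show_topics(formatted=False)
        self.assertTrue(len(topics) > 0)

class FakeTopicModel(object):
    def show_topics(self, formatted=False):
        # Return one (topic_id, [(word, weight), ...]) pair, enough for the shared check.
        return [(0, [("graph", 0.5), ("trees", 0.5)])]

class TestFakeTopicModel(unittest.TestCase, SharedTopicModelChecks):
    def setUp(self):
        self.model = FakeTopicModel()

if __name__ == '__main__':
    unittest.main()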
def setUp(self): texts = [ - [u'senior', u'studios', u'studios', u'studios', u'creators', u'award', u'mobile', u'currently', u'challenges', u'senior', u'summary', u'senior', u'motivated', u'creative', u'senior'], - [u'performs', u'engineering', u'tasks', u'infrastructure', u'focusing', u'primarily', u'programming', u'interaction', u'designers', u'engineers', u'leadership', u'teams', u'teams', u'crews', u'responsibilities', u'engineering', u'quality', u'functional', u'functional', u'teams', u'organizing', u'prioritizing', u'technical', u'decisions', u'engineering', u'participates', u'participates', u'reviews', u'participates', u'hiring', u'conducting', u'interviews'], - [u'feedback', u'departments', u'define', u'focusing', u'engineering', u'teams', u'crews', u'facilitate', u'engineering', u'departments', u'deadlines', u'milestones', u'typically', u'spends', u'designing', u'developing', u'updating', u'bugs', u'mentoring', u'engineers', u'define', u'schedules', u'milestones', u'participating'], - [u'reviews', u'interviews', u'sized', u'teams', u'interacts', u'disciplines', u'knowledge', u'skills', u'knowledge', u'knowledge', u'xcode', u'scripting', u'debugging', u'skills', u'skills', u'knowledge', u'disciplines', u'animation', u'networking', u'expertise', u'competencies', u'oral', u'skills', u'management', u'skills', u'proven', u'effectively', u'teams', u'deadline', u'environment', u'bachelor', u'minimum', u'shipped', u'leadership', u'teams', u'location', u'resumes', u'jobs', u'candidates', u'openings', u'jobs'], - [u'maryland', u'client', u'producers', u'electricity', u'operates', u'storage', u'utility', u'retail', u'customers', u'engineering', u'consultant', u'maryland', u'summary', u'technical', u'technology', u'departments', u'expertise', u'maximizing', u'output', u'reduces', u'operating', u'participates', u'areas', u'engineering', u'conducts', u'testing', u'solve', u'supports', u'environmental', u'understands', u'objectives', u'operates', u'responsibilities', u'handles', u'complex', u'engineering', u'aspects', u'monitors', u'quality', u'proficiency', u'optimization', u'recommendations', u'supports', u'personnel', u'troubleshooting', u'commissioning', u'startup', u'shutdown', u'supports', u'procedure', u'operating', u'units', u'develops', u'simulations', u'troubleshooting', u'tests', u'enhancing', u'solving', u'develops', u'estimates', u'schedules', u'scopes', u'understands', u'technical', u'management', u'utilize', u'routine', u'conducts', u'hazards', u'utilizing', u'hazard', u'operability', u'methodologies', u'participates', u'startup', u'reviews', u'pssr', u'participate', u'teams', u'participate', u'regulatory', u'audits', u'define', u'scopes', u'budgets', u'schedules', u'technical', u'management', u'environmental', u'awareness', u'interfacing', u'personnel', u'interacts', u'regulatory', u'departments', u'input', u'objectives', u'identifying', u'introducing', u'concepts', u'solutions', u'peers', u'customers', u'coworkers', u'knowledge', u'skills', u'engineering', u'quality', u'engineering'], - [u'commissioning', u'startup', u'knowledge', u'simulators', u'technologies', u'knowledge', u'engineering', u'techniques', u'disciplines', u'leadership', u'skills', u'proven', u'engineers', u'oral', u'skills', u'technical', u'skills', u'analytically', u'solve', u'complex', u'interpret', u'proficiency', u'simulation', u'knowledge', u'applications', u'manipulate', u'applications', u'engineering'], - [u'calculations', u'programs', u'matlab', u'excel', u'independently', u'environment', u'proven', 
u'skills', u'effectively', u'multiple', u'tasks', u'planning', u'organizational', u'management', u'skills', u'rigzone', u'jobs', u'developer', u'exceptional', u'strategies', u'junction', u'exceptional', u'strategies', u'solutions', u'solutions', u'biggest', u'insurers', u'operates', u'investment'], - [u'vegas', u'tasks', u'electrical', u'contracting', u'expertise', u'virtually', u'electrical', u'developments', u'institutional', u'utilities', u'technical', u'experts', u'relationships', u'credibility', u'contractors', u'utility', u'customers', u'customer', u'relationships', u'consistently', u'innovations', u'profile', u'construct', u'envision', u'dynamic', u'complex', u'electrical', u'management', u'grad', u'internship', u'electrical', u'engineering', u'infrastructures', u'engineers', u'documented', u'management', u'engineering', u'quality', u'engineering', u'electrical', u'engineers', u'complex', u'distribution', u'grounding', u'estimation', u'testing', u'procedures', u'voltage', u'engineering'], - [u'troubleshooting', u'installation', u'documentation', u'bsee', u'certification', u'electrical', u'voltage', u'cabling', u'electrical', u'engineering', u'candidates', u'electrical', u'internships', u'oral', u'skills', u'organizational', u'prioritization', u'skills', u'skills', u'excel', u'cadd', u'calculation', u'autocad', u'mathcad', u'skills', u'skills', u'customer', u'relationships', u'solving', u'ethic', u'motivation', u'tasks', u'budget', u'affirmative', u'diversity', u'workforce', u'gender', u'orientation', u'disability', u'disabled', u'veteran', u'vietnam', u'veteran', u'qualifying', u'veteran', u'diverse', u'candidates', u'respond', u'developing', u'workplace', u'reflects', u'diversity', u'communities', u'reviews', u'electrical', u'contracting', u'southwest', u'electrical', u'contractors'], - [u'intern', u'electrical', u'engineering', u'idexx', u'laboratories', u'validating', u'idexx', u'integrated', u'hardware', u'entails', u'planning', u'debug', u'validation', u'engineers', u'validation', u'methodologies', u'healthcare', u'platforms', u'brightest', u'solve', u'challenges', u'innovation', u'technology', u'idexx', u'intern', u'idexx', u'interns', u'supplement', u'interns', u'teams', u'roles', u'competitive', u'interns', u'idexx', u'interns', u'participate', u'internships', u'mentors', u'seminars', u'topics', u'leadership', u'workshops', u'relevant', u'planning', u'topics', u'intern', u'presentations', u'mixers', u'applicants', u'ineligible', u'laboratory', u'compliant', u'idexx', u'laboratories', u'healthcare', u'innovation', u'practicing', u'veterinarians', u'diagnostic', u'technology', u'idexx', u'enhance', u'veterinarians', u'efficiency', u'economically', u'idexx', u'worldwide', u'diagnostic', u'tests', u'tests', u'quality', u'headquartered', u'idexx', u'laboratories', u'employs', u'customers', u'qualifications', u'applicants', u'idexx', u'interns', u'potential', u'demonstrated', u'portfolio', u'recommendation', u'resumes', u'marketing', u'location', u'americas', u'verification', u'validation', u'schedule', u'overtime', u'idexx', u'laboratories', u'reviews', u'idexx', u'laboratories', u'nasdaq', u'healthcare', u'innovation', u'practicing', u'veterinarians'], - [u'location', u'duration', u'temp', u'verification', u'validation', u'tester', u'verification', u'validation', u'middleware', u'specifically', u'testing', u'applications', u'clinical', u'laboratory', u'regulated', u'environment', u'responsibilities', u'complex', u'hardware', u'testing', u'clinical', u'analyzers', u'laboratory', 
u'graphical', u'interfaces', u'complex', u'sample', u'sequencing', u'protocols', u'developers', u'correction', u'tracking', u'tool', u'timely', u'troubleshoot', u'testing', u'functional', u'manual', u'automated', u'participate', u'ongoing'], - [u'testing', u'coverage', u'planning', u'documentation', u'testing', u'validation', u'corrections', u'monitor', u'implementation', u'recurrence', u'operating', u'statistical', u'quality', u'testing', u'global', u'multi', u'teams', u'travel', u'skills', u'concepts', u'waterfall', u'agile', u'methodologies', u'debugging', u'skills', u'complex', u'automated', u'instrumentation', u'environment', u'hardware', u'mechanical', u'components', u'tracking', u'lifecycle', u'management', u'quality', u'organize', u'define', u'priorities', u'organize', u'supervision', u'aggressive', u'deadlines', u'ambiguity', u'analyze', u'complex', u'situations', u'concepts', u'technologies', u'verbal', u'skills', u'effectively', u'technical', u'clinical', u'diverse', u'strategy', u'clinical', u'chemistry', u'analyzer', u'laboratory', u'middleware', u'basic', u'automated', u'testing', u'biomedical', u'engineering', u'technologists', u'laboratory', u'technology', u'availability', u'click', u'attach'], - [u'scientist', u'linux', u'asrc', u'scientist', u'linux', u'asrc', u'technology', u'solutions', u'subsidiary', u'asrc', u'engineering', u'technology', u'contracts'], - [u'multiple', u'agencies', u'scientists', u'engineers', u'management', u'personnel', u'allows', u'solutions', u'complex', u'aeronautics', u'aviation', u'management', u'aviation', u'engineering', u'hughes', u'technical', u'technical', u'aviation', u'evaluation', u'engineering', u'management', u'technical', u'terminal', u'surveillance', u'programs', u'currently', u'scientist', u'travel', u'responsibilities', u'develops', u'technology', u'modifies', u'technical', u'complex', u'reviews', u'draft', u'conformity', u'completeness', u'testing', u'interface', u'hardware', u'regression', u'impact', u'reliability', u'maintainability', u'factors', u'standardization', u'skills', u'travel', u'programming', u'linux', u'environment', u'cisco', u'knowledge', u'terminal', u'environment', u'clearance', u'clearance', u'input', u'output', u'digital', u'automatic', u'terminal', u'management', u'controller', u'termination', u'testing', u'evaluating', u'policies', u'procedure', u'interface', u'installation', u'verification', u'certification', u'core', u'avionic', u'programs', u'knowledge', u'procedural', u'testing', u'interfacing', u'hardware', u'regression', u'impact', u'reliability', u'maintainability', u'factors', u'standardization', u'missions', u'asrc', u'subsidiaries', u'affirmative', u'employers', u'applicants', u'disability', u'veteran', u'technology', u'location', u'airport', u'bachelor', u'schedule', u'travel', u'contributor', u'management', u'asrc', u'reviews'], - [u'technical', u'solarcity', u'niche', u'vegas', u'overview', u'resolving', u'customer', u'clients', u'expanding', u'engineers', u'developers', u'responsibilities', u'knowledge', u'planning', u'adapt', u'dynamic', u'environment', u'inventive', u'creative', u'solarcity', u'lifecycle', u'responsibilities', u'technical', u'analyzing', u'diagnosing', u'troubleshooting', u'customers', u'ticketing', u'console', u'escalate', u'knowledge', u'engineering', u'timely', u'basic', u'phone', u'functionality', u'customer', u'tracking', u'knowledgebase', u'rotation', u'configure', u'deployment', u'sccm', u'technical', u'deployment', u'deploy', u'hardware', u'solarcity', u'bachelor', 
u'knowledge', u'dell', u'laptops', u'analytical', u'troubleshooting', u'solving', u'skills', u'knowledge', u'databases', u'preferably', u'server', u'preferably', u'monitoring', u'suites', u'documentation', u'procedures', u'knowledge', u'entries', u'verbal', u'skills', u'customer', u'skills', u'competitive', u'solar', u'package', u'insurance', u'vacation', u'savings', u'referral', u'eligibility', u'equity', u'performers', u'solarcity', u'affirmative', u'diversity', u'workplace', u'applicants', u'orientation', u'disability', u'veteran', u'careerrookie'], - [u'embedded', u'exelis', u'junction', u'exelis', u'embedded', u'acquisition', u'networking', u'capabilities', u'classified', u'customer', u'motivated', u'develops', u'tests', u'innovative', u'solutions', u'minimal', u'supervision', u'paced', u'environment', u'enjoys', u'assignments', u'interact', u'multi', u'disciplined', u'challenging', u'focused', u'embedded', u'developments', u'spanning', u'engineering', u'lifecycle', u'specification', u'enhancement', u'applications', u'embedded', u'freescale', u'applications', u'android', u'platforms', u'interface', u'customers', u'developers', u'refine', u'specifications', u'architectures'], - [u'java', u'programming', u'scripts', u'python', u'debug', u'debugging', u'emulators', u'regression', u'revisions', u'specialized', u'setups', u'capabilities', u'subversion', u'technical', u'documentation', u'multiple', u'engineering', u'techexpousa', u'reviews'], - [u'modeler', u'semantic', u'modeling', u'models', u'skills', u'ontology', u'resource', u'framework', u'schema', u'technologies', u'hadoop', u'warehouse', u'oracle', u'relational', u'artifacts', u'models', u'dictionaries', u'models', u'interface', u'specifications', u'documentation', u'harmonization', u'mappings', u'aligned', u'coordinate', u'technical', u'peer', u'reviews', u'stakeholder', u'communities', u'impact', u'domains', u'relationships', u'interdependencies', u'models', u'define', u'analyze', u'legacy', u'models', u'corporate', u'databases', u'architectural', u'alignment', u'customer', u'expertise', u'harmonization', u'modeling', u'modeling', u'consulting', u'stakeholders', u'quality', u'models', u'storage', u'agile', u'specifically', u'focus', u'modeling', u'qualifications', u'bachelors', u'accredited', u'modeler', u'encompass', u'evaluation', u'skills', u'knowledge', u'modeling', u'techniques', u'resource', u'framework', u'schema', u'technologies', u'unified', u'modeling', u'technologies', u'schemas', u'ontologies', u'sybase', u'knowledge', u'skills', u'interpersonal', u'skills', u'customers', u'clearance', u'applicants', u'eligibility', u'classified', u'clearance', u'polygraph', u'techexpousa', u'solutions', u'partnership', u'solutions', u'integration'], - [u'technologies', u'junction', u'develops', u'maintains', u'enhances', u'complex', u'diverse', u'intensive', u'analytics', u'algorithm', u'manipulation', u'management', u'documented', u'individually', u'reviews', u'tests', u'components', u'adherence', u'resolves', u'utilizes', u'methodologies', u'environment', u'input', u'components', u'hardware', u'offs', u'reuse', u'cots', u'gots', u'synthesis', u'components', u'tasks', u'individually', u'analyzes', u'modifies', u'debugs', u'corrects', u'integrates', u'operating', u'environments', u'develops', u'queries', u'databases', u'repositories', u'recommendations', u'improving', u'documentation', u'develops', u'implements', u'algorithms', u'functional', u'assists', u'developing', u'executing', u'procedures', u'components', u'reviews', 
u'documentation', u'solutions', u'analyzing', u'conferring', u'users', u'engineers', u'analyzing', u'investigating', u'areas', u'adapt', u'hardware', u'mathematical', u'models', u'predict', u'outcome', u'implement', u'complex', u'database', u'repository', u'interfaces', u'queries', u'bachelors', u'accredited', u'substituted', u'bachelors', u'firewalls', u'ipsec', u'vpns', u'technology', u'administering', u'servers', u'apache', u'jboss', u'tomcat', u'developing', u'interfaces', u'firefox', u'internet', u'explorer', u'operating', u'mainframe', u'linux', u'solaris', u'virtual', u'scripting', u'programming', u'oriented', u'programming', u'ajax', u'script', u'procedures', u'cobol', u'cognos', u'fusion', u'focus', u'html', u'java', u'java', u'script', u'jquery', u'perl', u'visual', u'basic', u'powershell', u'cots', u'cots', u'oracle', u'apex', u'integration', u'competitive', u'package', u'bonus', u'corporate', u'equity', u'tuition', u'reimbursement', u'referral', u'bonus', u'holidays', u'insurance', u'flexible', u'disability', u'insurance'], + [u'senior', u'studios', u'studios', u'studios', u'creators', u'award', u'mobile', u'currently', + u'challenges', u'senior', u'summary', u'senior', u'motivated', u'creative', u'senior'], + [u'performs', u'engineering', u'tasks', u'infrastructure', u'focusing', u'primarily', + u'programming', u'interaction', u'designers', u'engineers', u'leadership', u'teams', + u'teams', u'crews', u'responsibilities', u'engineering', u'quality', u'functional', + u'functional', u'teams', u'organizing', u'prioritizing', u'technical', u'decisions', + u'engineering', u'participates', u'participates', u'reviews', u'participates', + u'hiring', u'conducting', u'interviews'], + [u'feedback', u'departments', u'define', u'focusing', u'engineering', u'teams', u'crews', + u'facilitate', u'engineering', u'departments', u'deadlines', u'milestones', u'typically', + u'spends', u'designing', u'developing', u'updating', u'bugs', u'mentoring', u'engineers', + u'define', u'schedules', u'milestones', u'participating'], + [u'reviews', u'interviews', u'sized', u'teams', u'interacts', u'disciplines', u'knowledge', + u'skills', u'knowledge', u'knowledge', u'xcode', u'scripting', u'debugging', u'skills', + u'skills', u'knowledge', u'disciplines', u'animation', u'networking', u'expertise', + u'competencies', u'oral', u'skills', u'management', u'skills', u'proven', u'effectively', + u'teams', u'deadline', u'environment', u'bachelor', u'minimum', u'shipped', u'leadership', + u'teams', u'location', u'resumes', u'jobs', u'candidates', u'openings', u'jobs'], + [u'maryland', u'client', u'producers', u'electricity', u'operates', u'storage', u'utility', + u'retail', u'customers', u'engineering', u'consultant', u'maryland', u'summary', u'technical', + u'technology', u'departments', u'expertise', u'maximizing', u'output', u'reduces', u'operating', + u'participates', u'areas', u'engineering', u'conducts', u'testing', u'solve', u'supports', + u'environmental', u'understands', u'objectives', u'operates', u'responsibilities', u'handles', + u'complex', u'engineering', u'aspects', u'monitors', u'quality', u'proficiency', u'optimization', + u'recommendations', u'supports', u'personnel', u'troubleshooting', u'commissioning', u'startup', + u'shutdown', u'supports', u'procedure', u'operating', u'units', u'develops', u'simulations', + u'troubleshooting', u'tests', u'enhancing', u'solving', u'develops', u'estimates', u'schedules', + u'scopes', u'understands', u'technical', u'management', u'utilize', u'routine', 
u'conducts', + u'hazards', u'utilizing', u'hazard', u'operability', u'methodologies', u'participates', u'startup', + u'reviews', u'pssr', u'participate', u'teams', u'participate', u'regulatory', u'audits', u'define', + u'scopes', u'budgets', u'schedules', u'technical', u'management', u'environmental', u'awareness', + u'interfacing', u'personnel', u'interacts', u'regulatory', u'departments', u'input', u'objectives', + u'identifying', u'introducing', u'concepts', u'solutions', u'peers', u'customers', u'coworkers', + u'knowledge', u'skills', u'engineering', u'quality', u'engineering'], + [u'commissioning', u'startup', u'knowledge', u'simulators', u'technologies', u'knowledge', + u'engineering', u'techniques', u'disciplines', u'leadership', u'skills', u'proven', + u'engineers', u'oral', u'skills', u'technical', u'skills', u'analytically', u'solve', + u'complex', u'interpret', u'proficiency', u'simulation', u'knowledge', u'applications', + u'manipulate', u'applications', u'engineering'], + [u'calculations', u'programs', u'matlab', u'excel', u'independently', u'environment', + u'proven', u'skills', u'effectively', u'multiple', u'tasks', u'planning', u'organizational', + u'management', u'skills', u'rigzone', u'jobs', u'developer', u'exceptional', u'strategies', + u'junction', u'exceptional', u'strategies', u'solutions', u'solutions', u'biggest', + u'insurers', u'operates', u'investment'], + [u'vegas', u'tasks', u'electrical', u'contracting', u'expertise', u'virtually', u'electrical', + u'developments', u'institutional', u'utilities', u'technical', u'experts', u'relationships', + u'credibility', u'contractors', u'utility', u'customers', u'customer', u'relationships', + u'consistently', u'innovations', u'profile', u'construct', u'envision', u'dynamic', u'complex', + u'electrical', u'management', u'grad', u'internship', u'electrical', u'engineering', + u'infrastructures', u'engineers', u'documented', u'management', u'engineering', + u'quality', u'engineering', u'electrical', u'engineers', u'complex', u'distribution', + u'grounding', u'estimation', u'testing', u'procedures', u'voltage', u'engineering'], + [u'troubleshooting', u'installation', u'documentation', u'bsee', u'certification', + u'electrical', u'voltage', u'cabling', u'electrical', u'engineering', u'candidates', + u'electrical', u'internships', u'oral', u'skills', u'organizational', u'prioritization', + u'skills', u'skills', u'excel', u'cadd', u'calculation', u'autocad', u'mathcad', + u'skills', u'skills', u'customer', u'relationships', u'solving', u'ethic', u'motivation', + u'tasks', u'budget', u'affirmative', u'diversity', u'workforce', u'gender', u'orientation', + u'disability', u'disabled', u'veteran', u'vietnam', u'veteran', u'qualifying', u'veteran', + u'diverse', u'candidates', u'respond', u'developing', u'workplace', u'reflects', u'diversity', + u'communities', u'reviews', u'electrical', u'contracting', u'southwest', u'electrical', u'contractors'], + [u'intern', u'electrical', u'engineering', u'idexx', u'laboratories', u'validating', u'idexx', + u'integrated', u'hardware', u'entails', u'planning', u'debug', u'validation', u'engineers', + u'validation', u'methodologies', u'healthcare', u'platforms', u'brightest', u'solve', + u'challenges', u'innovation', u'technology', u'idexx', u'intern', u'idexx', u'interns', + u'supplement', u'interns', u'teams', u'roles', u'competitive', u'interns', u'idexx', + u'interns', u'participate', u'internships', u'mentors', u'seminars', u'topics', u'leadership', + u'workshops', u'relevant', u'planning', 
u'topics', u'intern', u'presentations', u'mixers', + u'applicants', u'ineligible', u'laboratory', u'compliant', u'idexx', u'laboratories', u'healthcare', + u'innovation', u'practicing', u'veterinarians', u'diagnostic', u'technology', u'idexx', u'enhance', + u'veterinarians', u'efficiency', u'economically', u'idexx', u'worldwide', u'diagnostic', u'tests', + u'tests', u'quality', u'headquartered', u'idexx', u'laboratories', u'employs', u'customers', + u'qualifications', u'applicants', u'idexx', u'interns', u'potential', u'demonstrated', u'portfolio', + u'recommendation', u'resumes', u'marketing', u'location', u'americas', u'verification', u'validation', + u'schedule', u'overtime', u'idexx', u'laboratories', u'reviews', u'idexx', u'laboratories', + u'nasdaq', u'healthcare', u'innovation', u'practicing', u'veterinarians'], + [u'location', u'duration', u'temp', u'verification', u'validation', u'tester', u'verification', + u'validation', u'middleware', u'specifically', u'testing', u'applications', u'clinical', + u'laboratory', u'regulated', u'environment', u'responsibilities', u'complex', u'hardware', + u'testing', u'clinical', u'analyzers', u'laboratory', u'graphical', u'interfaces', u'complex', + u'sample', u'sequencing', u'protocols', u'developers', u'correction', u'tracking', + u'tool', u'timely', u'troubleshoot', u'testing', u'functional', u'manual', + u'automated', u'participate', u'ongoing'], + [u'testing', u'coverage', u'planning', u'documentation', u'testing', u'validation', + u'corrections', u'monitor', u'implementation', u'recurrence', u'operating', u'statistical', + u'quality', u'testing', u'global', u'multi', u'teams', u'travel', u'skills', u'concepts', + u'waterfall', u'agile', u'methodologies', u'debugging', u'skills', u'complex', u'automated', + u'instrumentation', u'environment', u'hardware', u'mechanical', u'components', u'tracking', + u'lifecycle', u'management', u'quality', u'organize', u'define', u'priorities', u'organize', + u'supervision', u'aggressive', u'deadlines', u'ambiguity', u'analyze', u'complex', u'situations', + u'concepts', u'technologies', u'verbal', u'skills', u'effectively', u'technical', u'clinical', + u'diverse', u'strategy', u'clinical', u'chemistry', u'analyzer', u'laboratory', u'middleware', + u'basic', u'automated', u'testing', u'biomedical', u'engineering', u'technologists', + u'laboratory', u'technology', u'availability', u'click', u'attach'], + [u'scientist', u'linux', u'asrc', u'scientist', u'linux', u'asrc', u'technology', + u'solutions', u'subsidiary', u'asrc', u'engineering', u'technology', u'contracts'], + [u'multiple', u'agencies', u'scientists', u'engineers', u'management', u'personnel', + u'allows', u'solutions', u'complex', u'aeronautics', u'aviation', u'management', u'aviation', + u'engineering', u'hughes', u'technical', u'technical', u'aviation', u'evaluation', + u'engineering', u'management', u'technical', u'terminal', u'surveillance', u'programs', + u'currently', u'scientist', u'travel', u'responsibilities', u'develops', u'technology', + u'modifies', u'technical', u'complex', u'reviews', u'draft', u'conformity', u'completeness', + u'testing', u'interface', u'hardware', u'regression', u'impact', u'reliability', + u'maintainability', u'factors', u'standardization', u'skills', u'travel', u'programming', + u'linux', u'environment', u'cisco', u'knowledge', u'terminal', u'environment', u'clearance', + u'clearance', u'input', u'output', u'digital', u'automatic', u'terminal', u'management', + u'controller', u'termination', u'testing', 
u'evaluating', u'policies', u'procedure', u'interface', + u'installation', u'verification', u'certification', u'core', u'avionic', u'programs', u'knowledge', + u'procedural', u'testing', u'interfacing', u'hardware', u'regression', u'impact', + u'reliability', u'maintainability', u'factors', u'standardization', u'missions', u'asrc', u'subsidiaries', + u'affirmative', u'employers', u'applicants', u'disability', u'veteran', u'technology', u'location', + u'airport', u'bachelor', u'schedule', u'travel', u'contributor', u'management', u'asrc', u'reviews'], + [u'technical', u'solarcity', u'niche', u'vegas', u'overview', u'resolving', u'customer', + u'clients', u'expanding', u'engineers', u'developers', u'responsibilities', u'knowledge', + u'planning', u'adapt', u'dynamic', u'environment', u'inventive', u'creative', u'solarcity', + u'lifecycle', u'responsibilities', u'technical', u'analyzing', u'diagnosing', u'troubleshooting', + u'customers', u'ticketing', u'console', u'escalate', u'knowledge', u'engineering', u'timely', + u'basic', u'phone', u'functionality', u'customer', u'tracking', u'knowledgebase', u'rotation', + u'configure', u'deployment', u'sccm', u'technical', u'deployment', u'deploy', u'hardware', + u'solarcity', u'bachelor', u'knowledge', u'dell', u'laptops', u'analytical', u'troubleshooting', + u'solving', u'skills', u'knowledge', u'databases', u'preferably', u'server', u'preferably', + u'monitoring', u'suites', u'documentation', u'procedures', u'knowledge', u'entries', u'verbal', + u'skills', u'customer', u'skills', u'competitive', u'solar', u'package', u'insurance', u'vacation', + u'savings', u'referral', u'eligibility', u'equity', u'performers', u'solarcity', u'affirmative', + u'diversity', u'workplace', u'applicants', u'orientation', u'disability', u'veteran', u'careerrookie'], + [u'embedded', u'exelis', u'junction', u'exelis', u'embedded', u'acquisition', u'networking', + u'capabilities', u'classified', u'customer', u'motivated', u'develops', u'tests', + u'innovative', u'solutions', u'minimal', u'supervision', u'paced', u'environment', u'enjoys', + u'assignments', u'interact', u'multi', u'disciplined', u'challenging', u'focused', u'embedded', + u'developments', u'spanning', u'engineering', u'lifecycle', u'specification', u'enhancement', + u'applications', u'embedded', u'freescale', u'applications', u'android', u'platforms', + u'interface', u'customers', u'developers', u'refine', u'specifications', u'architectures'], + [u'java', u'programming', u'scripts', u'python', u'debug', u'debugging', u'emulators', + u'regression', u'revisions', u'specialized', u'setups', u'capabilities', u'subversion', + u'technical', u'documentation', u'multiple', u'engineering', u'techexpousa', u'reviews'], + [u'modeler', u'semantic', u'modeling', u'models', u'skills', u'ontology', u'resource', + u'framework', u'schema', u'technologies', u'hadoop', u'warehouse', u'oracle', u'relational', + u'artifacts', u'models', u'dictionaries', u'models', u'interface', u'specifications', + u'documentation', u'harmonization', u'mappings', u'aligned', u'coordinate', u'technical', + u'peer', u'reviews', u'stakeholder', u'communities', u'impact', u'domains', u'relationships', + u'interdependencies', u'models', u'define', u'analyze', u'legacy', u'models', u'corporate', + u'databases', u'architectural', u'alignment', u'customer', u'expertise', u'harmonization', + u'modeling', u'modeling', u'consulting', u'stakeholders', u'quality', u'models', u'storage', + u'agile', u'specifically', u'focus', u'modeling', u'qualifications', 
u'bachelors', u'accredited', + u'modeler', u'encompass', u'evaluation', u'skills', u'knowledge', u'modeling', u'techniques', + u'resource', u'framework', u'schema', u'technologies', u'unified', u'modeling', u'technologies', + u'schemas', u'ontologies', u'sybase', u'knowledge', u'skills', u'interpersonal', u'skills', + u'customers', u'clearance', u'applicants', u'eligibility', u'classified', u'clearance', + u'polygraph', u'techexpousa', u'solutions', u'partnership', u'solutions', u'integration'], + [u'technologies', u'junction', u'develops', u'maintains', u'enhances', u'complex', u'diverse', + u'intensive', u'analytics', u'algorithm', u'manipulation', u'management', u'documented', + u'individually', u'reviews', u'tests', u'components', u'adherence', u'resolves', u'utilizes', + u'methodologies', u'environment', u'input', u'components', u'hardware', u'offs', u'reuse', u'cots', + u'gots', u'synthesis', u'components', u'tasks', u'individually', u'analyzes', u'modifies', + u'debugs', u'corrects', u'integrates', u'operating', u'environments', u'develops', u'queries', + u'databases', u'repositories', u'recommendations', u'improving', u'documentation', u'develops', + u'implements', u'algorithms', u'functional', u'assists', u'developing', u'executing', u'procedures', + u'components', u'reviews', u'documentation', u'solutions', u'analyzing', u'conferring', + u'users', u'engineers', u'analyzing', u'investigating', u'areas', u'adapt', u'hardware', + u'mathematical', u'models', u'predict', u'outcome', u'implement', u'complex', u'database', + u'repository', u'interfaces', u'queries', u'bachelors', u'accredited', u'substituted', + u'bachelors', u'firewalls', u'ipsec', u'vpns', u'technology', u'administering', u'servers', + u'apache', u'jboss', u'tomcat', u'developing', u'interfaces', u'firefox', u'internet', + u'explorer', u'operating', u'mainframe', u'linux', u'solaris', u'virtual', u'scripting', + u'programming', u'oriented', u'programming', u'ajax', u'script', u'procedures', u'cobol', + u'cognos', u'fusion', u'focus', u'html', u'java', u'java', u'script', u'jquery', u'perl', + u'visual', u'basic', u'powershell', u'cots', u'cots', u'oracle', u'apex', u'integration', + u'competitive', u'package', u'bonus', u'corporate', u'equity', u'tuition', u'reimbursement', + u'referral', u'bonus', u'holidays', u'insurance', u'flexible', u'disability', u'insurance'], [u'technologies', u'disability', u'accommodation', u'recruiter', u'techexpousa'], ['bank', 'river', 'shore', 'water'], ['river', 'water', 'flow', 'fast', 'tree'], @@ -56,7 +206,10 @@ def setUp(self): sstats = np.loadtxt(datapath('sstats_test.txt')) dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] - self.ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus, id2word=dictionary, num_topics=2, time_slice=[10, 10, 11], initialize='own', sstats=sstats) + self.ldaseq = ldaseqmodel.LdaSeqModel( + corpus=corpus, id2word=dictionary, num_topics=2, + time_slice=[10, 10, 11], initialize='own', sstats=sstats + ) # testing topic word proportions def testTopicWord(self): diff --git a/gensim/test/test_ldavowpalwabbit_wrapper.py b/gensim/test/test_ldavowpalwabbit_wrapper.py index 3cf6f9f6bb..d14723de59 100644 --- a/gensim/test/test_ldavowpalwabbit_wrapper.py +++ b/gensim/test/test_ldavowpalwabbit_wrapper.py @@ -33,11 +33,11 @@ # set up vars used in testing ("Deerwester" from the web tutorial) TOPIC_WORDS = [ -'cat lion leopard mouse jaguar lynx cheetah tiger kitten puppy'.split(), -'engine car wheel brakes tyre motor suspension cylinder 
exhaust clutch'.split(), -'alice bob robert tim sue rachel dave harry alex jim'.split(), -'c cplusplus go python haskell scala java ruby csharp erlang'.split(), -'eggs ham mushrooms cereal coffee beans tea juice sausages bacon'.split() + 'cat lion leopard mouse jaguar lynx cheetah tiger kitten puppy'.split(), + 'engine car wheel brakes tyre motor suspension cylinder exhaust clutch'.split(), + 'alice bob robert tim sue rachel dave harry alex jim'.split(), + 'c cplusplus go python haskell scala java ruby csharp erlang'.split(), + 'eggs ham mushrooms cereal coffee beans tea juice sausages bacon'.split() ] @@ -71,30 +71,29 @@ def test_save_load(self): """Test loading/saving LdaVowpalWabbit model.""" if not self.vw_path: # for python 2.6 return - lda = LdaVowpalWabbit(self.vw_path, - corpus=self.corpus, - passes=10, - chunksize=256, - id2word=self.dictionary, - cleanup_files=True, - alpha=0.1, - eta=0.1, - num_topics=len(TOPIC_WORDS), - random_seed=1) + lda = LdaVowpalWabbit( + self.vw_path, corpus=self.corpus, passes=10, chunksize=256, + id2word=self.dictionary, cleanup_files=True, alpha=0.1, + eta=0.1, num_topics=len(TOPIC_WORDS), random_seed=1 + ) with tempfile.NamedTemporaryFile() as fhandle: lda.save(fhandle.name) lda2 = LdaVowpalWabbit.load(fhandle.name) # ensure public fields are saved/loaded correctly - saved_fields = [lda.alpha, lda.chunksize, lda.cleanup_files, - lda.decay, lda.eta, lda.gamma_threshold, - lda.id2word, lda.num_terms, lda.num_topics, - lda.passes, lda.random_seed, lda.vw_path] - loaded_fields = [lda2.alpha, lda2.chunksize, lda2.cleanup_files, - lda2.decay, lda2.eta, lda2.gamma_threshold, - lda2.id2word, lda2.num_terms, lda2.num_topics, - lda2.passes, lda2.random_seed, lda2.vw_path] + saved_fields = [ + lda.alpha, lda.chunksize, lda.cleanup_files, + lda.decay, lda.eta, lda.gamma_threshold, + lda.id2word, lda.num_terms, lda.num_topics, + lda.passes, lda.random_seed, lda.vw_path + ] + loaded_fields = [ + lda2.alpha, lda2.chunksize, lda2.cleanup_files, + lda2.decay, lda2.eta, lda2.gamma_threshold, + lda2.id2word, lda2.num_terms, lda2.num_topics, + lda2.passes, lda2.random_seed, lda2.vw_path + ] self.assertEqual(saved_fields, loaded_fields) # ensure topic matrices are saved/loaded correctly @@ -106,16 +105,11 @@ def test_model_update(self): """Test updating existing LdaVowpalWabbit model.""" if not self.vw_path: # for python 2.6 return - lda = LdaVowpalWabbit(self.vw_path, - corpus=[self.corpus[0]], - passes=10, - chunksize=256, - id2word=self.dictionary, - cleanup_files=True, - alpha=0.1, - eta=0.1, - num_topics=len(TOPIC_WORDS), - random_seed=1) + lda = LdaVowpalWabbit( + self.vw_path, corpus=[self.corpus[0]], passes=10, chunksize=256, + id2word=self.dictionary, cleanup_files=True, alpha=0.1, + eta=0.1, num_topics=len(TOPIC_WORDS), random_seed=1 + ) lda.update(self.corpus[1:]) result = lda.log_perplexity(self.corpus) @@ -126,16 +120,10 @@ def test_perplexity(self): """Test LdaVowpalWabbit perplexity is within expected range.""" if not self.vw_path: # for python 2.6 return - lda = LdaVowpalWabbit(self.vw_path, - corpus=self.corpus, - passes=10, - chunksize=256, - id2word=self.dictionary, - cleanup_files=True, - alpha=0.1, - eta=0.1, - num_topics=len(TOPIC_WORDS), - random_seed=1) + lda = LdaVowpalWabbit( + self.vw_path, corpus=self.corpus, passes=10, chunksize=256, + id2word=self.dictionary, cleanup_files=True, alpha=0.1, + eta=0.1, num_topics=len(TOPIC_WORDS), random_seed=1) # varies, but should be between -1 and -5 result = lda.log_perplexity(self.corpus) @@ -147,16 
+135,11 @@ def test_topic_coherence(self): if not self.vw_path: # for python 2.6 return corpus, dictionary = get_corpus() - lda = LdaVowpalWabbit(self.vw_path, - corpus=corpus, - passes=10, - chunksize=256, - id2word=dictionary, - cleanup_files=True, - alpha=0.1, - eta=0.1, - num_topics=len(TOPIC_WORDS), - random_seed=1) + lda = LdaVowpalWabbit( + self.vw_path, corpus=corpus, passes=10, chunksize=256, + id2word=dictionary, cleanup_files=True, alpha=0.1, + eta=0.1, num_topics=len(TOPIC_WORDS), random_seed=1 + ) lda.print_topics(5, 10) # map words in known topic to an ID @@ -198,11 +181,13 @@ def test_corpus_to_vw(self): """Test corpus to Vowpal Wabbit format conversion.""" if not self.vw_path: # for python 2.6 return - corpus = [[(0, 5), (7, 1), (5, 3), (0, 2)], - [(7, 2), (2, 1), (3, 11)], - [(1, 1)], - [], - [(5, 2), (0, 1)]] + corpus = [ + [(0, 5), (7, 1), (5, 3), (0, 2)], + [(7, 2), (2, 1), (3, 11)], + [(1, 1)], + [], + [(5, 2), (0, 1)] + ] expected = """ | 0:5 7:1 5:3 0:2 | 7:2 2:1 3:11 diff --git a/gensim/test/test_lee.py b/gensim/test/test_lee.py index b51101c8b1..33cce71e52 100644 --- a/gensim/test/test_lee.py +++ b/gensim/test/test_lee.py @@ -63,7 +63,7 @@ def setUp(self): # read the human similarity data sim_matrix = np.loadtxt(os.path.join(pre_path, sim_file)) sim_m_size = np.shape(sim_matrix)[0] - human_sim_vector = sim_matrix[matutils.triu_indices(sim_m_size, 1)] + human_sim_vector = sim_matrix[np.triu_indices(sim_m_size, 1)] def test_corpus(self): """availability and integrity of corpus""" @@ -100,36 +100,12 @@ def test_lee(self): for i, par1 in enumerate(corpus_lsi): for j, par2 in enumerate(corpus_lsi): res[i, j] = matutils.cossim(par1, par2) - flat = res[matutils.triu_indices(len(corpus), 1)] + flat = res[np.triu_indices(len(corpus), 1)] cor = np.corrcoef(flat, human_sim_vector)[0, 1] - logging.info("LSI correlation coefficient is %s" % cor) + logging.info("LSI correlation coefficient is %s", cor) self.assertTrue(cor > 0.6) - # def test_lee_mallet(self): - # global bg_corpus, corpus, bg_corpus2, corpus2 - - # # create a dictionary and corpus (bag of words) - # dictionary = corpora.Dictionary(bg_corpus2) - # bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus2] - # corpus = [dictionary.doc2bow(text) for text in corpus2] - - # # initialize an LDA transformation from background corpus - # lda = models.wrappers.LdaMallet('/Users/kofola/Downloads/mallet-2.0.7/bin/mallet', - # corpus=bg_corpus, id2word=dictionary, num_topics=200, optimize_interval=10) - # corpus_lda = lda[corpus] - - # # compute pairwise similarity matrix and extract upper triangular - # res = np.zeros((len(corpus), len(corpus))) - # for i, par1 in enumerate(corpus_lda): - # for j, par2 in enumerate(corpus_lda): - # res[i, j] = matutils.cossim(par1, par2) - # flat = res[matutils.triu_indices(len(corpus), 1)] - - # cor = np.corrcoef(flat, human_sim_vector)[0, 1] - # logging.info("LDA correlation coefficient is %s" % cor) - # self.assertTrue(cor > 0.35) - if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) diff --git a/gensim/test/test_logentropy_model.py b/gensim/test/test_logentropy_model.py index 07f982dad9..22ca09be0d 100644 --- a/gensim/test/test_logentropy_model.py +++ b/gensim/test/test_logentropy_model.py @@ -25,15 +25,17 @@ # set up vars used in testing ("Deerwester" from the web tutorial) -texts = [['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 
'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey']] +texts = [ + ['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'] +] dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] @@ -56,9 +58,11 @@ def testTransform(self): doc = list(self.corpus_ok)[0] transformed = model[doc] - expected = [(0, 0.3748900964125389), - (1, 0.30730215324230725), - (3, 1.20941755462856)] + expected = [ + (0, 0.3748900964125389), + (1, 0.30730215324230725), + (3, 1.20941755462856) + ] self.assertTrue(np.allclose(transformed, expected)) def testPersistence(self): @@ -78,7 +82,6 @@ def testPersistenceCompressed(self): self.assertTrue(model.entr == model2.entr) tstvec = [] self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) -# endclass TestLogEntropyModel if __name__ == '__main__': diff --git a/gensim/test/test_lsimodel.py b/gensim/test/test_lsimodel.py index e2c32bda66..6a1d2ef995 100644 --- a/gensim/test/test_lsimodel.py +++ b/gensim/test/test_lsimodel.py @@ -21,7 +21,7 @@ from gensim import matutils from gensim.corpora import mmcorpus, Dictionary from gensim.models import lsimodel -from gensim.test import basetests +from gensim.test import basetmtests module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder @@ -31,15 +31,17 @@ def datapath(fname): # set up vars used in testing ("Deerwester" from the web tutorial) -texts = [['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey']] +texts = [ + ['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'] +] dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] @@ -49,7 +51,7 @@ def testfile(): return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') -class TestLsiModel(unittest.TestCase, basetests.TestBaseTopicModel): +class TestLsiModel(unittest.TestCase, basetmtests.TestBaseTopicModel): def setUp(self): self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) self.model = lsimodel.LsiModel(self.corpus, num_topics=2) @@ -84,7 +86,8 @@ def testCorpusTransform(self): [0.01274618, -0.49016181], [0.04888203, -1.11294699], [0.08063836, -1.56345594], - [0.27381003, -1.34694159]]) + [0.27381003, -1.34694159] + ]) self.assertTrue(np.allclose(abs(got), abs(expected))) # must equal up to sign def testOnlineTransform(self): @@ -178,7 +181,7 @@ def testDocsProcessed(self): self.assertEqual(self.model.docs_processed, 9) self.assertEqual(self.model.docs_processed, self.corpus.num_docs) - def testGetTopics(self): + def test_get_topics(self): topics = self.model.get_topics() vocab_size = len(self.model.id2word) for topic in topics: @@ -188,8 +191,6 @@ def 
testGetTopics(self): # LSI topics are not probability distributions # self.assertAlmostEqual(np.sum(topic), 1.0, 5) -# endclass TestLsiModel - if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) diff --git a/gensim/test/test_normmodel.py b/gensim/test/test_normmodel.py index 77221b4a4d..339680d085 100644 --- a/gensim/test/test_normmodel.py +++ b/gensim/test/test_normmodel.py @@ -65,18 +65,22 @@ def test_sparseCSRInput_l1(self): def test_numpyndarrayInput_l1(self): """Test for np ndarray input for l1 transformation""" - ndarray_matrix = np.array([[1, 0, 2], - [0, 0, 3], - [4, 5, 6]]) + ndarray_matrix = np.array([ + [1, 0, 2], + [0, 0, 3], + [4, 5, 6] + ]) normalized = self.model_l1.normalize(ndarray_matrix) # Check if output is of same type self.assertTrue(isinstance(normalized, np.ndarray)) # Check if output is correct - expected = np.array([[0.04761905, 0., 0.0952381], - [0., 0., 0.14285714], - [0.19047619, 0.23809524, 0.28571429]]) + expected = np.array([ + [0.04761905, 0., 0.0952381], + [0., 0., 0.14285714], + [0.19047619, 0.23809524, 0.28571429] + ]) self.assertTrue(np.allclose(normalized, expected)) # Test if error is raised on unsupported input type @@ -101,25 +105,31 @@ def test_sparseCSRInput_l2(self): self.assertTrue(issparse(normalized)) # Check if output is correct - expected = np.array([[0.10482848, 0., 0.20965697], - [0., 0., 0.31448545], - [0.41931393, 0.52414242, 0.6289709]]) + expected = np.array([ + [0.10482848, 0., 0.20965697], + [0., 0., 0.31448545], + [0.41931393, 0.52414242, 0.6289709] + ]) self.assertTrue(np.allclose(normalized.toarray(), expected)) def test_numpyndarrayInput_l2(self): """Test for np ndarray input for l2 transformation""" - ndarray_matrix = np.array([[1, 0, 2], - [0, 0, 3], - [4, 5, 6]]) + ndarray_matrix = np.array([ + [1, 0, 2], + [0, 0, 3], + [4, 5, 6] + ]) normalized = self.model_l2.normalize(ndarray_matrix) # Check if output is of same type self.assertTrue(isinstance(normalized, np.ndarray)) # Check if output is correct - expected = np.array([[0.10482848, 0., 0.20965697], - [0., 0., 0.31448545], - [0.41931393, 0.52414242, 0.6289709]]) + expected = np.array([ + [0.10482848, 0., 0.20965697], + [0., 0., 0.31448545], + [0.41931393, 0.52414242, 0.6289709] + ]) self.assertTrue(np.allclose(normalized, expected)) # Test if error is raised on unsupported input type diff --git a/gensim/test/test_parsing.py b/gensim/test/test_parsing.py index fd0429e9f6..02ca13fb6b 100644 --- a/gensim/test/test_parsing.py +++ b/gensim/test/test_parsing.py @@ -36,8 +36,7 @@ for many searching purposes, a little fuzziness would help. 
""" -dataset = map(lambda x: strip_punctuation2(x.lower()), - [doc1, doc2, doc3, doc4]) +dataset = [strip_punctuation2(x.lower()) for x in [doc1, doc2, doc3, doc4]] # doc1 and doc2 have class 0, doc3 and doc4 avec class 1 classes = np.array([[1, 0], [1, 0], [0, 1], [0, 1]]) @@ -45,40 +44,33 @@ class TestPreprocessing(unittest.TestCase): def testStripNumeric(self): - self.assertEqual(strip_numeric("salut les amis du 59"), - "salut les amis du ") + self.assertEqual(strip_numeric("salut les amis du 59"), "salut les amis du ") def testStripShort(self): - self.assertEqual(strip_short("salut les amis du 59", 3), - "salut les amis") + self.assertEqual(strip_short("salut les amis du 59", 3), "salut les amis") def testStripTags(self): - self.assertEqual(strip_tags("Hello World!"), - "Hello World!") + self.assertEqual(strip_tags("Hello World!"), "Hello World!") def testStripMultipleWhitespaces(self): - self.assertEqual(strip_multiple_whitespaces("salut les\r\nloulous!"), - "salut les loulous!") + self.assertEqual(strip_multiple_whitespaces("salut les\r\nloulous!"), "salut les loulous!") def testStripNonAlphanum(self): - self.assertEqual(strip_non_alphanum("toto nf-kappa titi"), - "toto nf kappa titi") + self.assertEqual(strip_non_alphanum("toto nf-kappa titi"), "toto nf kappa titi") def testSplitAlphanum(self): - self.assertEqual(split_alphanum("toto diet1 titi"), - "toto diet 1 titi") - self.assertEqual(split_alphanum("toto 1diet titi"), - "toto 1 diet titi") + self.assertEqual(split_alphanum("toto diet1 titi"), "toto diet 1 titi") + self.assertEqual(split_alphanum("toto 1diet titi"), "toto 1 diet titi") def testStripStopwords(self): - self.assertEqual(remove_stopwords("the world is square"), - "world square") + self.assertEqual(remove_stopwords("the world is square"), "world square") def testStemText(self): - target = "while it is quit us to be abl to search a larg " + \ - "collect of document almost instantli for a joint occurr " + \ - "of a collect of exact words, for mani search purposes, " + \ - "a littl fuzzi would help." + target = \ + "while it is quit us to be abl to search a larg " + \ + "collect of document almost instantli for a joint occurr " + \ + "of a collect of exact words, for mani search purposes, " + \ + "a littl fuzzi would help." self.assertEqual(stem_text(doc5), target) diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index 5397d6e4c3..868947defb 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -133,11 +133,7 @@ def testExportPhrases(self): for phrase, score in bigram.export_phrases(sentences): seen_bigrams.add(phrase) - assert seen_bigrams == set([ - b'response time', - b'graph minors', - b'human interface' - ]) + assert seen_bigrams == {b'response time', b'graph minors', b'human interface'} def testMultipleBigramsSingleEntry(self): """ a single entry should produce multiple bigrams. 
""" @@ -149,10 +145,7 @@ def testMultipleBigramsSingleEntry(self): for phrase, score in bigram.export_phrases(test_sentences): seen_bigrams.add(phrase) - assert seen_bigrams == set([ - b'graph minors', - b'human interface' - ]) + assert seen_bigrams == {b'graph minors', b'human interface'} def testScoringDefault(self): """ test the default scoring, from the mikolov word2vec paper """ @@ -164,10 +157,10 @@ def testScoringDefault(self): for phrase, score in bigram.export_phrases(test_sentences): seen_scores.add(round(score, 3)) - assert seen_scores == set([ + assert seen_scores == { 5.167, # score for graph minors 3.444 # score for human interface - ]) + } def testScoringNpmi(self): """ test normalized pointwise mutual information scoring """ @@ -179,10 +172,10 @@ def testScoringNpmi(self): for phrase, score in bigram.export_phrases(test_sentences): seen_scores.add(round(score, 3)) - assert seen_scores == set([ + assert seen_scores == { .882, # score for graph minors .714 # score for human interface - ]) + } def testBadParameters(self): """Test the phrases module with bad parameters.""" diff --git a/gensim/test/test_rpmodel.py b/gensim/test/test_rpmodel.py index 2de5dd6546..94c1abce84 100644 --- a/gensim/test/test_rpmodel.py +++ b/gensim/test/test_rpmodel.py @@ -26,15 +26,17 @@ # set up vars used in testing ("Deerwester" from the web tutorial) -texts = [['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey']] +texts = [ + ['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'] +] dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] @@ -80,7 +82,6 @@ def testPersistenceCompressed(self): self.assertTrue(np.allclose(model.projection, model2.projection)) tstvec = [] self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector -# endclass TestRpModel if __name__ == '__main__': diff --git a/gensim/test/test_segmentation.py b/gensim/test/test_segmentation.py index a4c9356a26..512121a055 100644 --- a/gensim/test/test_segmentation.py +++ b/gensim/test/test_segmentation.py @@ -20,30 +20,40 @@ class TestSegmentation(unittest.TestCase): def setUp(self): - self.topics = [array([9, 4, 6]), array([9, 10, 7]), array([5, 2, 7])] + self.topics = [ + array([9, 4, 6]), + array([9, 10, 7]), + array([5, 2, 7]) + ] def testSOnePre(self): """Test s_one_pre segmentation.""" actual = segmentation.s_one_pre(self.topics) - expected = [[(4, 9), (6, 9), (6, 4)], - [(10, 9), (7, 9), (7, 10)], - [(2, 5), (7, 5), (7, 2)]] + expected = [ + [(4, 9), (6, 9), (6, 4)], + [(10, 9), (7, 9), (7, 10)], + [(2, 5), (7, 5), (7, 2)] + ] self.assertTrue(np.allclose(actual, expected)) def testSOneOne(self): """Test s_one_one segmentation.""" actual = segmentation.s_one_one(self.topics) - expected = [[(9, 4), (9, 6), (4, 9), (4, 6), (6, 9), (6, 4)], - [(9, 10), (9, 7), (10, 9), (10, 7), (7, 9), (7, 10)], - [(5, 2), (5, 7), (2, 5), (2, 7), (7, 5), (7, 2)]] + expected = [ + [(9, 4), (9, 6), (4, 9), (4, 6), (6, 9), (6, 4)], + [(9, 10), (9, 7), (10, 9), (10, 7), (7, 9), (7, 
10)], + [(5, 2), (5, 7), (2, 5), (2, 7), (7, 5), (7, 2)] + ] self.assertTrue(np.allclose(actual, expected)) def testSOneSet(self): """Test s_one_set segmentation.""" actual = segmentation.s_one_set(self.topics) - expected = [[(9, array([9, 4, 6])), (4, array([9, 4, 6])), (6, array([9, 4, 6]))], - [(9, array([9, 10, 7])), (10, array([9, 10, 7])), (7, array([9, 10, 7]))], - [(5, array([5, 2, 7])), (2, array([5, 2, 7])), (7, array([5, 2, 7]))]] + expected = [ + [(9, array([9, 4, 6])), (4, array([9, 4, 6])), (6, array([9, 4, 6]))], + [(9, array([9, 10, 7])), (10, array([9, 10, 7])), (7, array([9, 10, 7]))], + [(5, array([5, 2, 7])), (2, array([5, 2, 7])), (7, array([5, 2, 7]))] + ] for s_i in range(len(actual)): for j in range(len(actual[s_i])): self.assertEqual(actual[s_i][j][0], expected[s_i][j][0]) diff --git a/gensim/test/test_sharded_corpus.py b/gensim/test/test_sharded_corpus.py index 871048ea4e..cc70ee8f49 100644 --- a/gensim/test/test_sharded_corpus.py +++ b/gensim/test/test_sharded_corpus.py @@ -87,8 +87,7 @@ def test_sparse_serialization(self): no_exception = True try: - dataset = ShardedCorpus(self.tmp_fname, self.data, shardsize=100, # noqa:F841 - dim=self.dim, sparse_serialization=True) + ShardedCorpus(self.tmp_fname, self.data, shardsize=100, dim=self.dim, sparse_serialization=True) except Exception: no_exception = False raise @@ -97,9 +96,10 @@ def test_sparse_serialization(self): def test_getitem_dense2dense(self): - corpus = ShardedCorpus(self.tmp_fname, self.data, shardsize=100, - dim=self.dim, sparse_serialization=False, - sparse_retrieval=False) + corpus = ShardedCorpus( + self.tmp_fname, self.data, shardsize=100, dim=self.dim, + sparse_serialization=False, sparse_retrieval=False + ) item = corpus[3] self.assertTrue(isinstance(item, np.ndarray)) @@ -117,9 +117,10 @@ def test_getitem_dense2dense(self): def test_getitem_dense2sparse(self): - corpus = ShardedCorpus(self.tmp_fname, self.data, shardsize=100, - dim=self.dim, sparse_serialization=False, - sparse_retrieval=True) + corpus = ShardedCorpus( + self.tmp_fname, self.data, shardsize=100, dim=self.dim, + sparse_serialization=False, sparse_retrieval=True + ) item = corpus[3] self.assertTrue(isinstance(item, sparse.csr_matrix)) @@ -138,13 +139,15 @@ def test_getitem_dense2sparse(self): def test_getitem_sparse2sparse(self): sp_tmp_fname = self.tmp_fname + '.sparse' - corpus = ShardedCorpus(sp_tmp_fname, self.data, shardsize=100, - dim=self.dim, sparse_serialization=True, - sparse_retrieval=True) + corpus = ShardedCorpus( + sp_tmp_fname, self.data, shardsize=100, dim=self.dim, + sparse_serialization=True, sparse_retrieval=True + ) - dense_corpus = ShardedCorpus(self.tmp_fname, self.data, shardsize=100, - dim=self.dim, sparse_serialization=False, - sparse_retrieval=True) + dense_corpus = ShardedCorpus( + self.tmp_fname, self.data, shardsize=100, dim=self.dim, + sparse_serialization=False, sparse_retrieval=True + ) item = corpus[3] self.assertTrue(isinstance(item, sparse.csr_matrix)) @@ -168,13 +171,15 @@ def test_getitem_sparse2sparse(self): def test_getitem_sparse2dense(self): sp_tmp_fname = self.tmp_fname + '.sparse' - corpus = ShardedCorpus(sp_tmp_fname, self.data, shardsize=100, - dim=self.dim, sparse_serialization=True, - sparse_retrieval=False) + corpus = ShardedCorpus( + sp_tmp_fname, self.data, shardsize=100, dim=self.dim, + sparse_serialization=True, sparse_retrieval=False + ) - dense_corpus = ShardedCorpus(self.tmp_fname, self.data, shardsize=100, - dim=self.dim, sparse_serialization=False, - sparse_retrieval=False) + 
dense_corpus = ShardedCorpus( + self.tmp_fname, self.data, shardsize=100, dim=self.dim, + sparse_serialization=False, sparse_retrieval=False + ) item = corpus[3] self.assertTrue(isinstance(item, np.ndarray)) @@ -195,9 +200,10 @@ def test_getitem_sparse2dense(self): def test_getitem_dense2gensim(self): - corpus = ShardedCorpus(self.tmp_fname, self.data, shardsize=100, - dim=self.dim, sparse_serialization=False, - gensim=True) + corpus = ShardedCorpus( + self.tmp_fname, self.data, shardsize=100, dim=self.dim, + sparse_serialization=False, gensim=True + ) item = corpus[3] self.assertTrue(isinstance(item, list)) @@ -211,8 +217,7 @@ def test_getitem_dense2gensim(self): self.assertTrue(isinstance(dslice[0][0], tuple)) iscorp, _ = is_corpus(dslice) - self.assertTrue(iscorp, "Is the object returned by slice notation " - "a gensim corpus?") + self.assertTrue(iscorp, "Is the object returned by slice notation a gensim corpus?") ilist = corpus[[2, 3, 4, 5]] self.assertTrue(next(ilist) == corpus[2]) @@ -235,8 +240,7 @@ def test_getitem_dense2gensim(self): str(dslice[i][j]))) iscorp, _ = is_corpus(ilist) - self.assertTrue(iscorp, "Is the object returned by list notation " - "a gensim corpus?") + self.assertTrue(iscorp, "Is the object returned by list notation a gensim corpus?") def test_resize(self): @@ -252,8 +256,6 @@ def test_resize(self): fname = dataset._shard_name(n) self.assertTrue(os.path.isfile(fname)) -############################################################################## - if __name__ == '__main__': suite = unittest.TestSuite() diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py index 9ce5263df7..93c0f8a3f7 100644 --- a/gensim/test/test_similarities.py +++ b/gensim/test/test_similarities.py @@ -36,20 +36,21 @@ # set up vars used in testing ("Deerwester" from the web tutorial) -texts = [['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey']] +texts = [ + ['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'] +] dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] -sentences = [doc2vec.TaggedDocument(words, [i]) - for i, words in enumerate(texts)] +sentences = [doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(texts)] def testfile(): @@ -78,7 +79,7 @@ def testFull(self, num_best=None, shardsize=100): [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.70710677, 0.70710677, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.57735026, 0.57735026, 0.57735026], [0.0, 0.0, 0.0, 0.0, 0.0, 0.57735026, 0.0, 0.0, 0.0, 0.0, 0.57735026, 0.57735026], - ], dtype=numpy.float32) + ], dtype=numpy.float32) # HACK: dictionary can be in different order, so compare in sorted order self.assertTrue(numpy.allclose(sorted(expected.flat), sorted(index.index.flat))) index.num_best = num_best @@ -137,15 +138,17 @@ def testChunking(self): [0.99999994, 0.23570226, 0.28867513, 0.23570226, 0.0, 0.0, 0.0, 0.0, 0.0], [0.23570226, 1.0, 0.40824831, 0.33333334, 0.70710677, 0.0, 0.0, 0.0, 0.23570226], [0.28867513, 0.40824831, 1.0, 0.61237246, 
0.28867513, 0.0, 0.0, 0.0, 0.0] - ], dtype=numpy.float32) + ], dtype=numpy.float32) self.assertTrue(numpy.allclose(expected, sims)) # test the same thing but with num_best index.num_best = 3 sims = index[query] - expected = [[(0, 0.99999994), (2, 0.28867513), (1, 0.23570226)], - [(1, 1.0), (4, 0.70710677), (2, 0.40824831)], - [(2, 1.0), (3, 0.61237246), (1, 0.40824831)]] + expected = [ + [(0, 0.99999994), (2, 0.28867513), (1, 0.23570226)], + [(1, 1.0), (4, 0.70710677), (2, 0.40824831)], + [(2, 1.0), (3, 0.61237246), (1, 0.40824831)] + ] self.assertTrue(numpy.allclose(expected, sims)) if self.cls == similarities.Similarity: index.destroy() @@ -166,7 +169,7 @@ def testIter(self): [0.0, 0.0, 0.0, 0.0, 0.0, 0.70710677, 0.99999994, 0.81649655, 0.40824828], [0.0, 0.0, 0.0, 0.0, 0.0, 0.57735026, 0.81649655, 0.99999994, 0.66666663], [0.0, 0.23570226, 0.0, 0.0, 0.0, 0.0, 0.40824828, 0.66666663, 0.99999994] - ], dtype=numpy.float32) + ], dtype=numpy.float32) self.assertTrue(numpy.allclose(expected, sims)) if self.cls == similarities.Similarity: index.destroy() diff --git a/gensim/test/test_similarity_metrics.py b/gensim/test/test_similarity_metrics.py index 23bf0e60f7..27066ff09d 100644 --- a/gensim/test/test_similarity_metrics.py +++ b/gensim/test/test_similarity_metrics.py @@ -24,15 +24,17 @@ datapath = lambda fname: os.path.join(module_path, 'test_data', fname) # set up vars used in testing ("Deerwester" from the web tutorial) -texts = [['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey']] +texts = [ + ['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'] +] dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index 0e17905c2b..07411aa9b9 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -43,10 +43,19 @@ ] dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] -author2doc = {'john': [0, 1, 2, 3, 4, 5, 6], 'jane': [2, 3, 4, 5, 6, 7, 8], 'jack': [0, 2, 4, 6, 8], 'jill': [1, 3, 5, 7]} +author2doc = { + 'john': [0, 1, 2, 3, 4, 5, 6], + 'jane': [2, 3, 4, 5, 6, 7, 8], + 'jack': [0, 2, 4, 6, 8], + 'jill': [1, 3, 5, 7] +} texts_new = texts[0:3] -author2doc_new = {'jill': [0], 'bob': [0, 1], 'sally': [1, 2]} +author2doc_new = { + 'jill': [0], + 'bob': [0, 1], + 'sally': [1, 2] +} dictionary_new = Dictionary(texts_new) corpus_new = [dictionary_new.doc2bow(text) for text in texts_new] @@ -91,11 +100,16 @@ ['geometry', 'is', 'the', 'study', 'of', 'shape'], ['algebra', 'is', 'the', 'study', 'of', 'generalizations', 'of', 'arithmetic', 'operations'], ['differential', 'calculus', 'is', 'related', 'to', 'rates', 'of', 'change', 'and', 'slopes', 'of', 'curves'], - ['integral', 'calculus', 'is', 'realted', 'to', 'accumulation', 'of', 'quantities', 'and', 'the', 'areas', 'under', 'and', 'between', 'curves'], - ['physics', 'is', 'the', 'natural', 'science', 'that', 'involves', 'the', 'study', 'of', 'matter', 'and', 'its', 'motion', 
'and', 'behavior', 'through', 'space', 'and', 'time'], + ['integral', 'calculus', 'is', 'realted', 'to', 'accumulation', 'of', 'quantities', 'and', + 'the', 'areas', 'under', 'and', 'between', 'curves'], + ['physics', 'is', 'the', 'natural', 'science', 'that', 'involves', 'the', 'study', 'of', 'matter', + 'and', 'its', 'motion', 'and', 'behavior', 'through', 'space', 'and', 'time'], ['the', 'main', 'goal', 'of', 'physics', 'is', 'to', 'understand', 'how', 'the', 'universe', 'behaves'], - ['physics', 'also', 'makes', 'significant', 'contributions', 'through', 'advances', 'in', 'new', 'technologies', 'that', 'arise', 'from', 'theoretical', 'breakthroughs'], - ['advances', 'in', 'the', 'understanding', 'of', 'electromagnetism', 'or', 'nuclear', 'physics', 'led', 'directly', 'to', 'the', 'development', 'of', 'new', 'products', 'that', 'have', 'dramatically', 'transformed', 'modern', 'day', 'society'] + ['physics', 'also', 'makes', 'significant', 'contributions', 'through', 'advances', 'in', 'new', + 'technologies', 'that', 'arise', 'from', 'theoretical', 'breakthroughs'], + ['advances', 'in', 'the', 'understanding', 'of', 'electromagnetism', 'or', 'nuclear', 'physics', + 'led', 'directly', 'to', 'the', 'development', 'of', 'new', 'products', 'that', 'have', 'dramatically', + 'transformed', 'modern', 'day', 'society'] ] d2v_sentences = [models.doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(w2v_texts)] @@ -129,7 +143,9 @@ class TestLdaWrapper(unittest.TestCase): def setUp(self): numpy.random.seed(0) # set fixed seed to get similar values everytime - self.model = LdaTransformer(id2word=dictionary, num_topics=2, passes=100, minimum_probability=0, random_state=numpy.random.seed(0)) + self.model = LdaTransformer( + id2word=dictionary, num_topics=2, passes=100, minimum_probability=0, random_state=numpy.random.seed(0) + ) self.model.fit(corpus) def testTransform(self): @@ -157,11 +173,16 @@ def testPartialFit(self): def testConsistencyWithGensimModel(self): # training an LdaTransformer with `num_topics`=10 - self.model = LdaTransformer(id2word=dictionary, num_topics=10, passes=100, minimum_probability=0, random_state=numpy.random.seed(0)) + self.model = LdaTransformer( + id2word=dictionary, num_topics=10, passes=100, minimum_probability=0, random_state=numpy.random.seed(0) + ) self.model.fit(corpus) # training a Gensim LdaModel with the same params - gensim_ldamodel = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, passes=100, minimum_probability=0, random_state=numpy.random.seed(0)) + gensim_ldamodel = models.LdaModel( + corpus=corpus, id2word=dictionary, num_topics=10, passes=100, + minimum_probability=0, random_state=numpy.random.seed(0) + ) texts_new = ['graph', 'eulerian'] bow = self.model.id2word.doc2bow(texts_new) @@ -190,7 +211,7 @@ def testPipeline(self): uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') cache = pickle.loads(uncompressed_content) data = cache - id2word = Dictionary(map(lambda x: x.split(), data.data)) + id2word = Dictionary([x.split() for x in data.data]) corpus = [id2word.doc2bow(i.split()) for i in data.data] numpy.random.mtrand.RandomState(1) # set seed for getting same result clf = linear_model.LogisticRegression(penalty='l2', C=0.1) @@ -238,7 +259,10 @@ def testPersistence(self): self.assertTrue(passed) def testModelNotFitted(self): - lda_wrapper = LdaTransformer(id2word=dictionary, num_topics=2, passes=100, minimum_probability=0, random_state=numpy.random.seed(0)) + lda_wrapper = LdaTransformer( + id2word=dictionary, 
num_topics=2, passes=100, + minimum_probability=0, random_state=numpy.random.seed(0) + ) texts_new = ['graph', 'eulerian'] bow = lda_wrapper.id2word.doc2bow(texts_new) self.assertRaises(NotFittedError, lda_wrapper.transform, bow) @@ -280,7 +304,7 @@ def testPipeline(self): uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') cache = pickle.loads(uncompressed_content) data = cache - id2word = Dictionary(map(lambda x: x.split(), data.data)) + id2word = Dictionary([x.split() for x in data.data]) corpus = [id2word.doc2bow(i.split()) for i in data.data] numpy.random.mtrand.RandomState(1) # set seed for getting same result clf = linear_model.LogisticRegression(penalty='l2', C=0.1) @@ -336,14 +360,14 @@ def testModelNotFitted(self): class TestLdaSeqWrapper(unittest.TestCase): def setUp(self): - self.model = LdaSeqTransformer(id2word=dictionary_ldaseq, num_topics=2, time_slice=[10, 10, 11], initialize='gensim') + self.model = LdaSeqTransformer( + id2word=dictionary_ldaseq, num_topics=2, time_slice=[10, 10, 11], initialize='gensim' + ) self.model.fit(corpus_ldaseq) def testTransform(self): # transforming two documents - docs = [] - docs.append(list(corpus_ldaseq)[0]) - docs.append(list(corpus_ldaseq)[1]) + docs = [list(corpus_ldaseq)[0], list(corpus_ldaseq)[1]] transformed_vecs = self.model.transform(docs) self.assertEqual(transformed_vecs.shape[0], 2) self.assertEqual(transformed_vecs.shape[1], self.model.num_topics) @@ -363,7 +387,7 @@ def testPipeline(self): data = cache test_data = data.data[0:2] test_target = data.target[0:2] - id2word = Dictionary(map(lambda x: x.split(), test_data)) + id2word = Dictionary([x.split() for x in test_data]) corpus = [id2word.doc2bow(i.split()) for i in test_data] model = LdaSeqTransformer(id2word=id2word, num_topics=2, time_slice=[1, 1, 1], initialize='gensim') clf = linear_model.LogisticRegression(penalty='l2', C=0.1) @@ -412,9 +436,7 @@ def setUp(self): def testTransform(self): # tranform two documents - docs = [] - docs.append(list(self.corpus)[0]) - docs.append(list(self.corpus)[1]) + docs = [list(self.corpus)[0], list(self.corpus)[1]] matrix = self.model.transform(docs) self.assertEqual(matrix.shape[0], 2) self.assertEqual(matrix.shape[1], self.model.num_topics) @@ -433,7 +455,7 @@ def testPipeline(self): uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') cache = pickle.loads(uncompressed_content) data = cache - id2word = Dictionary(map(lambda x: x.split(), data.data)) + id2word = Dictionary([x.split() for x in data.data]) corpus = [id2word.doc2bow(i.split()) for i in data.data] numpy.random.mtrand.RandomState(1) # set seed for getting same result clf = linear_model.LogisticRegression(penalty='l2', C=0.1) @@ -514,11 +536,13 @@ def testPipeline(self): class_dict = {'mathematics': 1, 'physics': 0} train_data = [ - ('calculus', 'mathematics'), ('mathematical', 'mathematics'), ('geometry', 'mathematics'), ('operations', 'mathematics'), ('curves', 'mathematics'), - ('natural', 'physics'), ('nuclear', 'physics'), ('science', 'physics'), ('electromagnetism', 'physics'), ('natural', 'physics') + ('calculus', 'mathematics'), ('mathematical', 'mathematics'), + ('geometry', 'mathematics'), ('operations', 'mathematics'), + ('curves', 'mathematics'), ('natural', 'physics'), ('nuclear', 'physics'), + ('science', 'physics'), ('electromagnetism', 'physics'), ('natural', 'physics') ] - train_input = list(map(lambda x: x[0], train_data)) - train_target = list(map(lambda x: class_dict[x[1]], train_data)) + train_input = [x[0] for x in 
train_data] + train_target = [class_dict[x[1]] for x in train_data] clf = linear_model.LogisticRegression(penalty='l2', C=0.1) clf.fit(model.transform(train_input), train_target) @@ -648,10 +672,7 @@ def setUp(self): def testTransform(self): # tranform multiple documents - docs = [] - docs.append(w2v_texts[0]) - docs.append(w2v_texts[1]) - docs.append(w2v_texts[2]) + docs = [w2v_texts[0], w2v_texts[1], w2v_texts[2]] matrix = self.model.transform(docs) self.assertEqual(matrix.shape[0], 3) self.assertEqual(matrix.shape[1], self.model.size) @@ -682,8 +703,8 @@ def testPipeline(self): (['calculus', 'mathematical'], 'mathematics'), (['geometry', 'operations', 'curves'], 'mathematics'), (['natural', 'nuclear'], 'physics'), (['science', 'electromagnetism', 'natural'], 'physics') ] - train_input = list(map(lambda x: x[0], train_data)) - train_target = list(map(lambda x: class_dict[x[1]], train_data)) + train_input = [x[0] for x in train_data] + train_target = [class_dict[x[1]] for x in train_data] clf = linear_model.LogisticRegression(penalty='l2', C=0.1) clf.fit(model.transform(train_input), train_target) @@ -737,7 +758,7 @@ def testTransform(self): doc = ['computer system interface time computer system'] bow_vec = self.model.transform(doc)[0] expected_values = [1, 1, 2, 2] # comparing only the word-counts - values = list(map(lambda x: x[1], bow_vec)) + values = [x[1] for x in bow_vec] self.assertEqual(sorted(expected_values), sorted(values)) def testSetGetParams(self): @@ -794,8 +815,11 @@ def testTransform(self): # tranform multiple documents docs = [corpus[0], corpus[1]] transformed_docs = self.model.transform(docs) - expected_docs = [[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)], - [(3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555), (8, 0.44424552527467476)]] + expected_docs = [ + [(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)], + [(3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), + (6, 0.44424552527467476), (7, 0.3244870206138555), (8, 0.44424552527467476)] + ] self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) @@ -815,7 +839,7 @@ def testPipeline(self): uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') cache = pickle.loads(uncompressed_content) data = cache - id2word = Dictionary(map(lambda x: x.split(), data.data)) + id2word = Dictionary([x.split() for x in data.data]) corpus = [id2word.doc2bow(i.split()) for i in data.data] tfidf_model = TfIdfTransformer() tfidf_model.fit(corpus) @@ -854,14 +878,20 @@ def testTransform(self): # tranform one document doc = self.corpus[0] transformed_doc = self.model.transform(doc) - expected_doc = [[0.81043386270128193, 0.049357139518070477, 0.035840906753517532, 0.026542006926698079, 0.019925705902962578, 0.014776690981729117, 0.011068909979528148]] + expected_doc = [ + [0.81043386270128193, 0.049357139518070477, 0.035840906753517532, + 0.026542006926698079, 0.019925705902962578, 0.014776690981729117, 0.011068909979528148] + ] self.assertTrue(numpy.allclose(transformed_doc, expected_doc, atol=1e-2)) # tranform multiple documents docs = [self.corpus[0], self.corpus[1]] transformed_docs = self.model.transform(docs) - expected_docs = [[0.81043386270128193, 0.049357139518070477, 0.035840906753517532, 0.026542006926698079, 0.019925705902962578, 0.014776690981729117, 0.011068909979528148], - 
[0.03795908, 0.39542609, 0.50650585, 0.0151082, 0.01132749, 0., 0.]] + expected_docs = [ + [0.81043386270128193, 0.049357139518070477, 0.035840906753517532, + 0.026542006926698079, 0.019925705902962578, 0.014776690981729117, 0.011068909979528148], + [0.03795908, 0.39542609, 0.50650585, 0.0151082, 0.01132749, 0., 0.] + ] self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0], atol=1e-2)) self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1], atol=1e-2)) @@ -881,7 +911,7 @@ def testPipeline(self): uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') cache = pickle.loads(uncompressed_content) data = cache - id2word = Dictionary(map(lambda x: x.split(), data.data)) + id2word = Dictionary([x.split() for x in data.data]) corpus = [id2word.doc2bow(i.split()) for i in data.data] model = HdpTransformer(id2word=id2word) clf = linear_model.LogisticRegression(penalty='l2', C=0.1) diff --git a/gensim/test/test_tfidfmodel.py b/gensim/test/test_tfidfmodel.py index bb00b5482d..65e2939857 100644 --- a/gensim/test/test_tfidfmodel.py +++ b/gensim/test/test_tfidfmodel.py @@ -25,15 +25,17 @@ # set up vars used in testing ("Deerwester" from the web tutorial) -texts = [['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey']] +texts = [ + ['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'] +] dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] diff --git a/gensim/test/test_wikicorpus.py b/gensim/test/test_wikicorpus.py index 9bdbcbdb8d..ca81d6e51a 100644 --- a/gensim/test/test_wikicorpus.py +++ b/gensim/test/test_wikicorpus.py @@ -28,7 +28,7 @@ class TestWikiCorpus(unittest.TestCase): # #TODO: sporadic failure to be investigated # def test_get_texts_returns_generator_of_lists(self): - # logger.debug("Current Python Version is " + str(sys.version_info)) + # logger.debug("Current Python Version is %s", str(sys.version_info)) # if sys.version_info < (2, 7, 0): # return # diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index 29ae713b90..81123ccd7a 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -140,14 +140,18 @@ def test_sg_neg_online(self): def test_cbow_hs_online(self): """Test CBOW w/ hierarchical softmax""" - model = word2vec.Word2Vec(sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0, - min_count=3, iter=10, seed=42, workers=2) + model = word2vec.Word2Vec( + sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0, + min_count=3, iter=10, seed=42, workers=2 + ) self.onlineSanity(model) def test_cbow_neg_online(self): """Test CBOW w/ negative sampling""" - model = word2vec.Word2Vec(sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15, - min_count=5, iter=10, seed=42, workers=2, sample=0) + model = word2vec.Word2Vec( + sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15, + min_count=5, iter=10, seed=42, workers=2, sample=0 + ) self.onlineSanity(model) def testPersistence(self): @@ -241,7 +245,9 @@ def testPersistenceWord2VecFormat(self): 
self.assertTrue(np.allclose(model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human'])) limited_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True, limit=3) self.assertEquals(len(limited_model_kv.syn0), 3) - half_precision_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True, datatype=np.float16) + half_precision_model_kv = keyedvectors.KeyedVectors.load_word2vec_format( + testfile(), binary=True, datatype=np.float16 + ) self.assertEquals(binary_model_kv.syn0.nbytes, half_precision_model_kv.syn0.nbytes * 2) def testNoTrainingCFormat(self): @@ -284,7 +290,9 @@ def testPersistenceWord2VecFormatNonBinary(self): norm_only_model = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=False) norm_only_model.init_sims(True) self.assertFalse(np.allclose(model['human'], norm_only_model['human'], atol=1e-6)) - self.assertTrue(np.allclose(model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human'], atol=1e-4)) + self.assertTrue(np.allclose( + model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human'], atol=1e-4 + )) def testPersistenceWord2VecFormatWithVocab(self): """Test storing/loading the entire model and vocabulary in word2vec format.""" @@ -450,14 +458,18 @@ def test_sg_neg(self): def test_cbow_hs(self): """Test CBOW w/ hierarchical softmax""" - model = word2vec.Word2Vec(sg=0, cbow_mean=1, alpha=0.05, window=8, hs=1, negative=0, - min_count=5, iter=10, workers=2, batch_words=1000) + model = word2vec.Word2Vec( + sg=0, cbow_mean=1, alpha=0.05, window=8, hs=1, negative=0, + min_count=5, iter=10, workers=2, batch_words=1000 + ) self.model_sanity(model) def test_cbow_neg(self): """Test CBOW w/ negative sampling""" - model = word2vec.Word2Vec(sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15, - min_count=5, iter=10, workers=2, sample=0) + model = word2vec.Word2Vec( + sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15, + min_count=5, iter=10, workers=2, sample=0 + ) self.model_sanity(model) def test_cosmul(self): @@ -654,8 +666,10 @@ def testBuildVocabWarning(self, l): @log_capture() def testTrainWarning(self, l): """Test if warning is raised if alpha rises during subsequent calls to train()""" - sentences = [['human'], - ['graph', 'trees']] + sentences = [ + ['human'], + ['graph', 'trees'] + ] model = word2vec.Word2Vec(min_count=1) model.build_vocab(sentences) for epoch in range(10): @@ -814,6 +828,7 @@ def assertLess(self, a, b, msg=None): if __name__ == '__main__': logging.basicConfig( format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', - level=logging.DEBUG) + level=logging.DEBUG + ) logging.info("using optimization %s", word2vec.FAST_VERSION) unittest.main() diff --git a/gensim/test/test_wordrank_wrapper.py b/gensim/test/test_wordrank_wrapper.py index 8f8d5b5f9d..4ecb9f7c70 100644 --- a/gensim/test/test_wordrank_wrapper.py +++ b/gensim/test/test_wordrank_wrapper.py @@ -36,7 +36,10 @@ def setUp(self): self.wr_file = datapath('test_glove.txt') if not self.wr_path: return - self.test_model = wordrank.Wordrank.train(self.wr_path, self.corpus_file, self.out_name, iter=6, dump_period=5, period=5, np=2, cleanup_files=True) + self.test_model = wordrank.Wordrank.train( + self.wr_path, self.corpus_file, self.out_name, iter=6, + dump_period=5, period=5, np=2, cleanup_files=True + ) def testLoadWordrankFormat(self): """Test model successfully loaded from Wordrank format file""" diff --git a/gensim/topic_coherence/direct_confirmation_measure.py 
b/gensim/topic_coherence/direct_confirmation_measure.py index 247d8c146d..a3b3463391 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -9,7 +9,6 @@ """ import logging - import numpy as np logger = logging.getLogger(__name__) @@ -38,8 +37,10 @@ def log_conditional_probability(segmented_topics, accumulator): for w_prime, w_star in s_i: w_star_count = accumulator[w_star] if w_star_count == 0: - raise ValueError("Topic with id %d not found in corpus used to compute coherence. " - "Try using a larger corpus with a smaller vocobulary and/or setting a smaller value of `topn` for `CoherenceModel`." % (w_star)) + raise ValueError( + "Topic with id %d not found in corpus used to compute coherence. " + "Try using a larger corpus with a smaller vocabulary and/or setting a smaller value of `topn` for `CoherenceModel`." % w_star + ) co_occur_count = accumulator[w_prime, w_star] m_lc_i = np.log(((co_occur_count / num_docs) + EPSILON) / (w_star_count / num_docs)) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index 7832494a5c..1ddd70cbb0 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -11,8 +11,7 @@ import itertools import logging -from gensim.topic_coherence.text_analysis import \ - CorpusAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator +from gensim.topic_coherence.text_analysis import CorpusAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator logger = logging.getLogger(__name__) diff --git a/gensim/topic_coherence/segmentation.py b/gensim/topic_coherence/segmentation.py index 9097036914..2db0d695d2 100644 --- a/gensim/topic_coherence/segmentation.py +++ b/gensim/topic_coherence/segmentation.py @@ -27,18 +27,18 @@ def s_one_pre(topics): topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] Returns: - s_one_pre : list of list of (W', W*) tuples for all unique topic ids + s_one_pre_res : list of list of (W', W*) tuples for all unique topic ids """ - s_one_pre = [] + s_one_pre_res = [] for top_words in topics: s_one_pre_t = [] for w_prime_index, w_prime in enumerate(top_words[1:]): for w_star in top_words[:w_prime_index + 1]: s_one_pre_t.append((w_prime, w_star)) - s_one_pre.append(s_one_pre_t) + s_one_pre_res.append(s_one_pre_t) - return s_one_pre + return s_one_pre_res def s_one_one(topics): @@ -55,9 +55,9 @@ def s_one_one(topics): topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] Returns: - s_one_one : list of list of (W', W*) tuples for all unique topic ids + s_one_one_res : list of list of (W', W*) tuples for all unique topic ids """ - s_one_one = [] + s_one_one_res = [] for top_words in topics: s_one_one_t = [] @@ -67,9 +67,9 @@ def s_one_one(topics): continue else: s_one_one_t.append((w_prime, w_star)) - s_one_one.append(s_one_one_t) + s_one_one_res.append(s_one_one_t) - return s_one_one + return s_one_one_res def s_one_set(topics): @@ -87,14 +87,14 @@ def s_one_set(topics): topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] Returns: - s_one_set : list of list of (W', W*) tuples for all unique topic ids. + s_one_set_res : list of list of (W', W*) tuples for all unique topic ids. 
""" - s_one_set = [] + s_one_set_res = [] for top_words in topics: s_one_set_t = [] for w_prime in top_words: s_one_set_t.append((w_prime, top_words)) - s_one_set.append(s_one_set_t) + s_one_set_res.append(s_one_set_t) - return s_one_set + return s_one_set_res diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 7305fe9792..3254a34885 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -64,9 +64,7 @@ def num_docs(self): def num_docs(self, num): self._num_docs = num if self._num_docs % self.log_every == 0: - logger.info( - "%s accumulated stats from %d documents", - self.__class__.__name__, self._num_docs) + logger.info("%s accumulated stats from %d documents", self.__class__.__name__, self._num_docs) def analyze_text(self, text, doc_num=None): raise NotImplementedError("Base classes should implement analyze_text.") @@ -369,9 +367,7 @@ def queue_all_texts(self, q, texts, window_size): before = self._num_docs / self.log_every self._num_docs += sum(len(doc) - window_size + 1 for doc in batch) if before < (self._num_docs / self.log_every): - logger.info( - "%d batches submitted to accumulate stats from %d documents (%d virtual)", - (batch_num + 1), (batch_num + 1) * self.batch_size, self._num_docs) + logger.info("%d batches submitted to accumulate stats from %d documents (%d virtual)", (batch_num + 1), (batch_num + 1) * self.batch_size, self._num_docs) def terminate_workers(self, input_q, output_q, workers, interrupted=False): """Wait until all workers have transmitted their WordOccurrenceAccumulator instances, @@ -414,9 +410,7 @@ def merge_accumulators(self, accumulators): # Workers do partial accumulation, so none of the co-occurrence matrices are symmetrized. # This is by design, to avoid unnecessary matrix additions/conversions during accumulation. 
accumulator._symmetrize() - logger.info( - "accumulated word occurrence stats for %d virtual documents", - accumulator.num_docs) + logger.info("accumulated word occurrence stats for %d virtual documents", accumulator.num_docs) return accumulator @@ -435,9 +429,7 @@ def run(self): try: self._run() except KeyboardInterrupt: - logger.info( - "%s interrupted after processing %d documents", - self.__class__.__name__, self.accumulator.num_docs) + logger.info("%s interrupted after processing %d documents", self.__class__.__name__, self.accumulator.num_docs) except Exception: logger.exception("worker encountered unexpected exception") finally: @@ -455,13 +447,9 @@ def _run(self): self.accumulator.partial_accumulate(docs, self.window_size) n_docs += len(docs) - logger.debug( - "completed batch %d; %d documents processed (%d virtual)", - batch_num, n_docs, self.accumulator.num_docs) + logger.debug("completed batch %d; %d documents processed (%d virtual)", batch_num, n_docs, self.accumulator.num_docs) - logger.debug( - "finished all batches; %d documents processed (%d virtual)", - n_docs, self.accumulator.num_docs) + logger.debug("finished all batches; %d documents processed (%d virtual)", n_docs, self.accumulator.num_docs) def reply_to_master(self): logger.info("serializing accumulator to return to master...") diff --git a/gensim/utils.py b/gensim/utils.py index 47d7bc98cd..10555d2b51 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -106,12 +106,12 @@ def _synched(func): @wraps(func) def _synchronizer(self, *args, **kwargs): tlock = getattr(self, tlockname) - logger.debug("acquiring lock %r for %s" % (tlockname, func.__name__)) + logger.debug("acquiring lock %r for %s", tlockname, func.__name__) with tlock: # use lock as a context manager to perform safe acquire/release pairs - logger.debug("acquired lock %r for %s" % (tlockname, func.__name__)) + logger.debug("acquired lock %r for %s", tlockname, func.__name__) result = func(self, *args, **kwargs) - logger.debug("releasing lock %r for %s" % (tlockname, func.__name__)) + logger.debug("releasing lock %r for %s", tlockname, func.__name__) return result return _synchronizer return _synched @@ -181,8 +181,7 @@ def copytree_hardlink(source, dest): shutil.copy2 = copy2 -def tokenize(text, lowercase=False, deacc=False, encoding='utf8', errors="strict", to_lower=False, - lower=False): +def tokenize(text, lowercase=False, deacc=False, encoding='utf8', errors="strict", to_lower=False, lower=False): """ Iteratively yield tokens as unicode strings, removing accent marks and optionally lowercasing the unidoce string by assigning True @@ -275,7 +274,7 @@ def load(cls, fname, mmap=None): is encountered. """ - logger.info("loading %s object from %s" % (cls.__name__, fname)) + logger.info("loading %s object from %s", cls.__name__, fname) compress, subname = SaveLoad._adapt_by_suffix(fname) @@ -292,17 +291,16 @@ def _load_specials(self, fname, mmap, compress, subname): """ mmap_error = lambda x, y: IOError( 'Cannot mmap compressed object %s in file %s. ' % (x, y) + - 'Use `load(fname, mmap=None)` or uncompress files manually.') + 'Use `load(fname, mmap=None)` or uncompress files manually.' 
+ ) for attrib in getattr(self, '__recursive_saveloads', []): cfname = '.'.join((fname, attrib)) - logger.info("loading %s recursively from %s.* with mmap=%s" % ( - attrib, cfname, mmap)) + logger.info("loading %s recursively from %s.* with mmap=%s", attrib, cfname, mmap) getattr(self, attrib)._load_specials(cfname, mmap, compress, subname) for attrib in getattr(self, '__numpys', []): - logger.info("loading %s from %s with mmap=%s" % ( - attrib, subname(fname, attrib), mmap)) + logger.info("loading %s from %s with mmap=%s", attrib, subname(fname, attrib), mmap) if compress: if mmap: @@ -315,8 +313,7 @@ def _load_specials(self, fname, mmap, compress, subname): setattr(self, attrib, val) for attrib in getattr(self, '__scipys', []): - logger.info("loading %s from %s with mmap=%s" % ( - attrib, subname(fname, attrib), mmap)) + logger.info("loading %s from %s with mmap=%s", attrib, subname(fname, attrib), mmap) sparse = unpickle(subname(fname, attrib)) if compress: if mmap: @@ -334,7 +331,7 @@ def _load_specials(self, fname, mmap, compress, subname): setattr(self, attrib, sparse) for attrib in getattr(self, '__ignoreds', []): - logger.info("setting ignored attribute %s to None" % (attrib)) + logger.info("setting ignored attribute %s to None", attrib) setattr(self, attrib, None) @staticmethod @@ -346,10 +343,9 @@ def _adapt_by_suffix(fname): else: compress = False subname = lambda *args: '.'.join(list(args) + ['npy']) - return (compress, subname) + return compress, subname - def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, - ignore=frozenset(), pickle_protocol=2): + def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2): """ Save the object to file (also see `load`). @@ -370,9 +366,7 @@ def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, in both Python 2 and 3. 
""" - logger.info( - "saving %s object under %s, separately %s" % ( - self.__class__.__name__, fname, separately)) + logger.info("saving %s object under %s, separately %s", self.__class__.__name__, fname, separately) compress, subname = SaveLoad._adapt_by_suffix(fname) @@ -419,17 +413,14 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, if hasattr(val, '_save_specials'): # better than 'isinstance(val, SaveLoad)' if IPython reloading recursive_saveloads.append(attrib) cfname = '.'.join((fname, attrib)) - restores.extend(val._save_specials( - cfname, None, sep_limit, ignore, - pickle_protocol, compress, subname)) + restores.extend(val._save_specials(cfname, None, sep_limit, ignore, pickle_protocol, compress, subname)) try: numpys, scipys, ignoreds = [], [], [] for attrib, val in iteritems(asides): if isinstance(val, np.ndarray) and attrib not in ignore: numpys.append(attrib) - logger.info("storing np array '%s' to %s" % ( - attrib, subname(fname, attrib))) + logger.info("storing np array '%s' to %s", attrib, subname(fname, attrib)) if compress: np.savez_compressed(subname(fname, attrib), val=np.ascontiguousarray(val)) @@ -438,15 +429,15 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and attrib not in ignore: scipys.append(attrib) - logger.info("storing scipy.sparse array '%s' under %s" % ( - attrib, subname(fname, attrib))) + logger.info("storing scipy.sparse array '%s' under %s", attrib, subname(fname, attrib)) if compress: np.savez_compressed( subname(fname, attrib, 'sparse'), data=val.data, indptr=val.indptr, - indices=val.indices) + indices=val.indices + ) else: np.save(subname(fname, attrib, 'data'), val.data) np.save(subname(fname, attrib, 'indptr'), val.indptr) @@ -461,7 +452,7 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, finally: val.data, val.indptr, val.indices = data, indptr, indices else: - logger.info("not storing attribute %s" % (attrib)) + logger.info("not storing attribute %s", attrib) ignoreds.append(attrib) self.__dict__['__numpys'] = numpys @@ -475,8 +466,7 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, raise return restores + [(self, asides)] - def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, - ignore=frozenset(), pickle_protocol=2): + def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2): """ Save the object to file (also see `load`). @@ -504,11 +494,10 @@ def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, """ try: _pickle.dump(self, fname_or_handle, protocol=pickle_protocol) - logger.info("saved %s object" % self.__class__.__name__) + logger.info("saved %s object", self.__class__.__name__) except TypeError: # `fname_or_handle` does not have write attribute self._smart_save(fname_or_handle, separately, sep_limit, ignore, pickle_protocol=pickle_protocol) -# endclass SaveLoad def identity(p): @@ -864,10 +853,8 @@ def run(self): qsize = self.q.qsize() except NotImplementedError: qsize = '?' - logger.debug("prepared another chunk of %i documents (qsize=%s)" % - (len(wrapped_chunk[0]), qsize)) + logger.debug("prepared another chunk of %i documents (qsize=%s)", len(wrapped_chunk[0]), qsize) self.q.put(wrapped_chunk.pop(), block=True) -# endclass InputQueue if os.name == 'nt': @@ -957,7 +944,7 @@ def revdict(d): result (which one is kept is arbitrary). 
""" - return dict((v, k) for (k, v) in iteritems(dict(d))) + return {v: k for (k, v) in iteritems(dict(d))} def toptexts(query, texts, index, n=10): @@ -974,10 +961,7 @@ def toptexts(query, texts, index, n=10): sims = index[query] # perform a similarity query against the corpus sims = sorted(enumerate(sims), key=lambda item: -item[1]) - result = [] - for topid, topcosine in sims[:n]: # only consider top-n most similar docs - result.append((topid, topcosine, texts[topid])) - return result + return [(topid, topcosine, texts[topid]) for topid, topcosine in sims[:n]] # only consider top-n most similar docs def randfname(prefix='gensim'): @@ -997,7 +981,7 @@ def upload_chunked(server, docs, chunksize=1000, preprocess=None): start = 0 for chunk in grouper(docs, chunksize): end = start + len(chunk) - logger.info("uploading documents %i-%i" % (start, end - 1)) + logger.info("uploading documents %i-%i", start, end - 1) if preprocess is not None: pchunk = [] for doc in chunk: @@ -1039,7 +1023,7 @@ def pyro_daemon(name, obj, random_suffix=False, ip=None, port=None, ns_conf=None uri = daemon.register(obj, name) ns.remove(name) ns.register(name, uri) - logger.info("%s registered with nameserver (URI '%s')" % (name, uri)) + logger.info("%s registered with nameserver (URI '%s')", name, uri) daemon.requestLoop() @@ -1054,8 +1038,7 @@ def has_pattern(): return False -def lemmatize( - content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False, +def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False, stopwords=frozenset(), min_length=2, max_length=15): """ This function is only available when the optional 'pattern' package is installed. @@ -1109,9 +1092,7 @@ def mock_data_row(dim=1000, prob_nnz=0.5, lam=1.0): """ nnz = np.random.uniform(size=(dim,)) - data = [(i, float(np.random.poisson(lam=lam) + 1.0)) - for i in xrange(dim) if nnz[i] < prob_nnz] - return data + return [(i, float(np.random.poisson(lam=lam) + 1.0)) for i in xrange(dim) if nnz[i] < prob_nnz] def mock_data(n_items=1000, dim=1000, prob_nnz=0.5, lam=1.0): @@ -1120,9 +1101,7 @@ def mock_data(n_items=1000, dim=1000, prob_nnz=0.5, lam=1.0): to be used as a mock corpus. """ - data = [mock_data_row(dim=dim, prob_nnz=prob_nnz, lam=lam) - for _ in xrange(n_items)] - return data + return [mock_data_row(dim=dim, prob_nnz=prob_nnz, lam=lam) for _ in xrange(n_items)] def prune_vocab(vocab, min_reduce, trim_rule=None): @@ -1138,8 +1117,7 @@ def prune_vocab(vocab, min_reduce, trim_rule=None): if not keep_vocab_item(w, vocab[w], min_reduce, trim_rule): # vocab[w] <= min_reduce: result += vocab[w] del vocab[w] - logger.info("pruned out %i tokens with count <=%i (before %i, after %i)", - old_len - len(vocab), min_reduce, old_len, len(vocab)) + logger.info("pruned out %i tokens with count <=%i (before %i, after %i)", old_len - len(vocab), min_reduce, old_len, len(vocab)) return result diff --git a/setup.py b/setup.py index afeff174cd..90fd484c13 100644 --- a/setup.py +++ b/setup.py @@ -107,7 +107,7 @@ def finalize_options(self): cmdclass = {'build_ext': custom_build_ext} -WHEELHOUSE_UPLOADER_COMMANDS = set(['fetch_artifacts', 'upload_all']) +WHEELHOUSE_UPLOADER_COMMANDS = {'fetch_artifacts', 'upload_all'} if WHEELHOUSE_UPLOADER_COMMANDS.intersection(sys.argv): import wheelhouse_uploader.cmd cmdclass.update(vars(wheelhouse_uploader.cmd))