diff --git a/continuous_integration/travis/flake8_diff.sh b/continuous_integration/travis/flake8_diff.sh
index 2f04ab3e31..37d12de61d 100755
--- a/continuous_integration/travis/flake8_diff.sh
+++ b/continuous_integration/travis/flake8_diff.sh
@@ -19,18 +19,18 @@ set -e
set -o pipefail
PROJECT=RaRe-Technologies/gensim
-PROJECT_URL=https://github.com/$PROJECT.git
+PROJECT_URL=https://github.com/${PROJECT}.git
# Find the remote with the project name (upstream in most cases)
-REMOTE=$(git remote -v | grep $PROJECT | cut -f1 | head -1 || echo '')
+REMOTE=$(git remote -v | grep ${PROJECT} | cut -f1 | head -1 || echo '')
# Add a temporary remote if needed. For example this is necessary when
# Travis is configured to run in a fork. In this case 'origin' is the
# fork and not the reference repo we want to diff against.
if [[ -z "$REMOTE" ]]; then
TMP_REMOTE=tmp_reference_upstream
- REMOTE=$TMP_REMOTE
- git remote add $REMOTE $PROJECT_URL
+ REMOTE=${TMP_REMOTE}
+ git remote add ${REMOTE} ${PROJECT_URL}
fi
echo "Remotes:"
@@ -56,15 +56,15 @@ if [[ "$TRAVIS" == "true" ]]; then
echo "New branch, no commit range from Travis so passing this test by convention"
exit 0
fi
- COMMIT_RANGE=$TRAVIS_COMMIT_RANGE
+ COMMIT_RANGE=${TRAVIS_COMMIT_RANGE}
fi
else
# We want to fetch the code as it is in the PR branch and not
# the result of the merge into develop. This way line numbers
# reported by Travis will match with the local code.
- LOCAL_BRANCH_REF=travis_pr_$TRAVIS_PULL_REQUEST
+ LOCAL_BRANCH_REF=travis_pr_${TRAVIS_PULL_REQUEST}
# In Travis the PR target is always origin
- git fetch origin pull/$TRAVIS_PULL_REQUEST/head:refs/$LOCAL_BRANCH_REF
+ git fetch origin pull/${TRAVIS_PULL_REQUEST}/head:refs/${LOCAL_BRANCH_REF}
fi
fi
@@ -76,34 +76,34 @@ if [[ -z "$COMMIT_RANGE" ]]; then
fi
echo -e "\nLast 2 commits in $LOCAL_BRANCH_REF:"
echo '--------------------------------------------------------------------------------'
- git log -2 $LOCAL_BRANCH_REF
+ git log -2 ${LOCAL_BRANCH_REF}
REMOTE_MASTER_REF="$REMOTE/develop"
# Make sure that $REMOTE_MASTER_REF is a valid reference
echo -e "\nFetching $REMOTE_MASTER_REF"
echo '--------------------------------------------------------------------------------'
- git fetch $REMOTE develop:refs/remotes/$REMOTE_MASTER_REF
- LOCAL_BRANCH_SHORT_HASH=$(git rev-parse --short $LOCAL_BRANCH_REF)
- REMOTE_MASTER_SHORT_HASH=$(git rev-parse --short $REMOTE_MASTER_REF)
+ git fetch ${REMOTE} develop:refs/remotes/${REMOTE_MASTER_REF}
+ LOCAL_BRANCH_SHORT_HASH=$(git rev-parse --short ${LOCAL_BRANCH_REF})
+ REMOTE_MASTER_SHORT_HASH=$(git rev-parse --short ${REMOTE_MASTER_REF})
- COMMIT=$(git merge-base $LOCAL_BRANCH_REF $REMOTE_MASTER_REF) || \
- echo "No common ancestor found for $(git show $LOCAL_BRANCH_REF -q) and $(git show $REMOTE_MASTER_REF -q)"
+ COMMIT=$(git merge-base ${LOCAL_BRANCH_REF} ${REMOTE_MASTER_REF}) || \
+ echo "No common ancestor found for $(git show ${LOCAL_BRANCH_REF} -q) and $(git show ${REMOTE_MASTER_REF} -q)"
if [ -z "$COMMIT" ]; then
exit 1
fi
- COMMIT_SHORT_HASH=$(git rev-parse --short $COMMIT)
+ COMMIT_SHORT_HASH=$(git rev-parse --short ${COMMIT})
echo -e "\nCommon ancestor between $LOCAL_BRANCH_REF ($LOCAL_BRANCH_SHORT_HASH)"\
"and $REMOTE_MASTER_REF ($REMOTE_MASTER_SHORT_HASH) is $COMMIT_SHORT_HASH:"
echo '--------------------------------------------------------------------------------'
- git show --no-patch $COMMIT_SHORT_HASH
+ git show --no-patch ${COMMIT_SHORT_HASH}
COMMIT_RANGE="$COMMIT_SHORT_HASH..$LOCAL_BRANCH_SHORT_HASH"
if [[ -n "$TMP_REMOTE" ]]; then
- git remote remove $TMP_REMOTE
+ git remote remove ${TMP_REMOTE}
fi
else
@@ -111,19 +111,19 @@ else
fi
echo -e '\nRunning flake8 on the diff in the range' "$COMMIT_RANGE" \
- "($(git rev-list $COMMIT_RANGE | wc -l) commit(s)):"
+ "($(git rev-list ${COMMIT_RANGE} | wc -l) commit(s)):"
echo '--------------------------------------------------------------------------------'
# We ignore files from sklearn/externals.
# Excluding vec files since they contain non-utf8 content and flake8 raises exception for non-utf8 input
# We need the following command to exit with 0 hence the echo in case
# there is no match
-MODIFIED_PY_FILES="$(git diff --name-only $COMMIT_RANGE | grep '[a-zA-Z0-9]*.py$' || echo "no_match")"
-MODIFIED_IPYNB_FILES="$(git diff --name-only $COMMIT_RANGE | grep '[a-zA-Z0-9]*.ipynb$' || echo "no_match")"
+MODIFIED_PY_FILES="$(git diff --name-only ${COMMIT_RANGE} | grep '[a-zA-Z0-9]*.py$' || echo "no_match")"
+MODIFIED_IPYNB_FILES="$(git diff --name-only ${COMMIT_RANGE} | grep '[a-zA-Z0-9]*.ipynb$' || echo "no_match")"
-echo "*.py files: " $MODIFIED_PY_FILES
-echo "*.ipynb files: " $MODIFIED_IPYNB_FILES
+echo "*.py files: " ${MODIFIED_PY_FILES}
+echo "*.ipynb files: " ${MODIFIED_IPYNB_FILES}
check_files() {
@@ -133,7 +133,7 @@ check_files() {
if [ -n "$files" ]; then
# Conservative approach: diff without context (--unified=0) so that code
# that was not changed does not create failures
- git diff --unified=0 $COMMIT_RANGE -- $files | flake8 --diff --show-source $options
+ git diff --unified=0 ${COMMIT_RANGE} -- ${files} | flake8 --diff --show-source ${options}
fi
}
@@ -150,6 +150,6 @@ else
for fname in ${MODIFIED_IPYNB_FILES}
do
echo "File: $fname"
- jupyter nbconvert --to script --stdout $fname | flake8 - --show-source --ignore=E501,E731,E12,W503,E402 --builtins=get_ipython || true
+ jupyter nbconvert --to script --stdout ${fname} | flake8 - --show-source --ignore=E501,E731,E12,W503,E402 --builtins=get_ipython || true
done
fi
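The script above boils down to: find the merge base between the PR branch and upstream `develop`, take a zero-context diff of the changed `*.py` files, and pipe it into `flake8 --diff` so that only touched lines are reported. A rough Python sketch of that flow, for illustration only (the ref names `HEAD` and `upstream/develop` are assumptions, not the exact CI configuration):

```python
import subprocess

def flake8_on_diff(local_ref="HEAD", upstream_ref="upstream/develop"):
    # Common ancestor between the PR branch and the reference branch.
    merge_base = subprocess.check_output(
        ["git", "merge-base", local_ref, upstream_ref], text=True).strip()
    commit_range = "%s..%s" % (merge_base, local_ref)

    # Same filtering as MODIFIED_PY_FILES in the script above.
    names = subprocess.check_output(
        ["git", "diff", "--name-only", commit_range], text=True).splitlines()
    py_files = [name for name in names if name.endswith(".py")]
    if not py_files:
        return 0

    # Zero-context diff so unchanged code cannot create failures; flake8 --diff
    # reads the patch from stdin and only reports problems on touched lines.
    diff = subprocess.check_output(
        ["git", "diff", "--unified=0", commit_range, "--"] + py_files, text=True)
    return subprocess.run(
        ["flake8", "--diff", "--show-source"], input=diff, text=True).returncode
```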
diff --git a/continuous_integration/travis/install.sh b/continuous_integration/travis/install.sh
index 1ba1796a4b..c14ac86925 100755
--- a/continuous_integration/travis/install.sh
+++ b/continuous_integration/travis/install.sh
@@ -9,5 +9,5 @@ export PATH=/home/travis/miniconda2/bin:$PATH
conda update --yes conda
-conda create --yes -n gensim-test python=$PYTHON_VERSION pip atlas flake8 jupyter numpy==$NUMPY_VERSION scipy==$SCIPY_VERSION && source activate gensim-test
+conda create --yes -n gensim-test python=${PYTHON_VERSION} pip atlas flake8 jupyter numpy==${NUMPY_VERSION} scipy==${SCIPY_VERSION} && source activate gensim-test
pip install . && pip install .[test]
diff --git a/docker/start_jupyter_notebook.sh b/docker/start_jupyter_notebook.sh
index 4c5946d056..7893536dd6 100644
--- a/docker/start_jupyter_notebook.sh
+++ b/docker/start_jupyter_notebook.sh
@@ -4,4 +4,4 @@ PORT=$1
NOTEBOOK_DIR=/gensim/docs/notebooks
DEFAULT_URL=/notebooks/gensim%20Quick%20Start.ipynb
-jupyter notebook --no-browser --ip=* --port=$PORT --allow-root --notebook-dir=$NOTEBOOK_DIR --NotebookApp.token=\"\" --NotebookApp.default_url=$DEFAULT_URL
\ No newline at end of file
+jupyter notebook --no-browser --ip=* --port=${PORT} --allow-root --notebook-dir=${NOTEBOOK_DIR} --NotebookApp.token=\"\" --NotebookApp.default_url=${DEFAULT_URL}
diff --git a/docs/notebooks/test_notebooks.py b/docs/notebooks/test_notebooks.py
index 05dd7082f5..77633b7037 100644
--- a/docs/notebooks/test_notebooks.py
+++ b/docs/notebooks/test_notebooks.py
@@ -28,16 +28,11 @@ def _notebook_run(path):
print(str(e.traceback).split("\n")[-2])
else:
raise e
- except TimeoutError as e:
+ except RuntimeError as e:
print(e)
finally:
nbformat.write(nb, fout)
- #nb = nbformat.read(fout, nbformat.current_nbformat)
-
- #errors = errors.extend(
- #[output for cell in nb.cells if "outputs" in cell
- # for output in cell["outputs"] if output.output_type == "error"])
return nb, errors
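For reference, the error-harvesting idea hinted at by the deleted comments can be sketched with `nbformat` plus `nbconvert`'s `ExecutePreprocessor`; the timeout, kernel name and `allow_errors` setting below are assumptions, not the exact test harness:

```python
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor

def run_notebook(path):
    # Execute a notebook and collect any error outputs left in its cells.
    with open(path) as f:
        nb = nbformat.read(f, as_version=4)
    ep = ExecutePreprocessor(timeout=600, kernel_name="python3", allow_errors=True)
    ep.preprocess(nb, {"metadata": {"path": "."}})
    errors = [
        output
        for cell in nb.cells if "outputs" in cell
        for output in cell["outputs"] if output.output_type == "error"
    ]
    return nb, errors
```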
diff --git a/docs/src/conf.py b/docs/src/conf.py
index 622db5fec7..b1557c906c 100644
--- a/docs/src/conf.py
+++ b/docs/src/conf.py
@@ -11,7 +11,8 @@
# All configuration values have a default; values that are commented out
# serve to show the default.
-import sys, os
+import os
+import sys
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
@@ -34,7 +35,7 @@
source_suffix = '.rst'
# The encoding of source files.
-#source_encoding = 'utf-8'
+# source_encoding = 'utf-8'
# The master toctree document.
master_doc = 'indextoc'
@@ -58,67 +59,67 @@
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
-#language = None
+# language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
-#today = ''
+# today = ''
# Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
+# today_fmt = '%B %d, %Y'
# List of documents that shouldn't be included in the build.
-#unused_docs = []
+# unused_docs = []
# List of directories, relative to source directory, that shouldn't be searched
# for source files.
exclude_trees = ['_build']
# The reST default role (used for this markup: `text`) to use for all documents.
-#default_role = None
+# default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
-#add_function_parentheses = True
+# add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
-#add_module_names = True
+# add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
-#show_authors = False
+# show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting.
-#modindex_common_prefix = []
+# modindex_common_prefix = []
# -- Options for HTML output ---------------------------------------------------
# The theme to use for HTML and HTML Help pages. Major themes that come with
# Sphinx are currently 'default' and 'sphinxdoc'.
-#html_theme = 'default'
+# html_theme = 'default'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
-#main_colour = "#ffbbbb"
+# main_colour = "#ffbbbb"
html_theme_options = {
-#"rightsidebar": "false",
-#"stickysidebar": "true",
-#"bodyfont": "'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', 'Verdana', 'sans-serif'",
-#"headfont": "'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', 'Verdana', 'sans-serif'",
-#"sidebarbgcolor": "fuckyou",
-#"footerbgcolor": "#771111",
-#"relbarbgcolor": "#993333",
-#"sidebartextcolor": "#000000",
-#"sidebarlinkcolor": "#330000",
-#"codebgcolor": "#fffff0",
-#"headtextcolor": "#000080",
-#"headbgcolor": "#f0f0ff",
-#"bgcolor": "#ffffff",
+# "rightsidebar": "false",
+# "stickysidebar": "true",
+# "bodyfont": "'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', 'Verdana', 'sans-serif'",
+# "headfont": "'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', 'Verdana', 'sans-serif'",
+# "sidebarbgcolor": "fuckyou",
+# "footerbgcolor": "#771111",
+# "relbarbgcolor": "#993333",
+# "sidebartextcolor": "#000000",
+# "sidebarlinkcolor": "#330000",
+# "codebgcolor": "#fffff0",
+# "headtextcolor": "#000080",
+# "headbgcolor": "#f0f0ff",
+# "bgcolor": "#ffffff",
}
@@ -130,11 +131,11 @@
html_title = "gensim"
# A shorter title for the navigation bar. Default is the same as html_title.
-#html_short_title = ''
+# html_short_title = ''
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
-#html_logo = None
+# html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
@@ -152,17 +153,17 @@
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
-#html_use_smartypants = True
+# html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
-html_sidebars = {} #{'index': ['download.html', 'globaltoc.html', 'searchbox.html', 'indexsidebar.html']}
-#html_sidebars = {'index': ['globaltoc.html', 'searchbox.html']}
+html_sidebars = {} # {'index': ['download.html', 'globaltoc.html', 'searchbox.html', 'indexsidebar.html']}
+# html_sidebars = {'index': ['globaltoc.html', 'searchbox.html']}
# If false, no module index is generated.
-#html_use_modindex = True
+# html_use_modindex = True
# If false, no index is generated.
-#html_use_index = True
+# html_use_index = True
# If true, the index is split into individual pages for each letter.
html_split_index = False
@@ -175,10 +176,10 @@
# If true, an OpenSearch description file will be output, and all pages will
# contain a tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
-#html_use_opensearch = ''
+# html_use_opensearch = ''
# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
-#html_file_suffix = ''
+# html_file_suffix = ''
# Output file base name for HTML help builder.
htmlhelp_basename = 'gensimdoc'
@@ -188,32 +189,30 @@
# -- Options for LaTeX output --------------------------------------------------
# The paper size ('letter' or 'a4').
-#latex_paper_size = 'letter'
+# latex_paper_size = 'letter'
# The font size ('10pt', '11pt' or '12pt').
-#latex_font_size = '10pt'
+# latex_font_size = '10pt'
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass [howto/manual]).
-latex_documents = [
- ('index', 'gensim.tex', u'gensim Documentation', u'Radim Řehůřek', 'manual'),
-]
+latex_documents = [('index', 'gensim.tex', u'gensim Documentation', u'Radim Řehůřek', 'manual')]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
-#latex_logo = None
+# latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
latex_use_parts = False
# Additional stuff for the LaTeX preamble.
-#latex_preamble = ''
+# latex_preamble = ''
# Documents to append as an appendix to all manuals.
-#latex_appendices = []
+# latex_appendices = []
# If false, no module index is generated.
-#latex_use_modindex = True
+# latex_use_modindex = True
suppress_warnings = ['image.nonlocal_uri', 'ref.citation', 'ref.footnote']
diff --git a/gensim/__init__.py b/gensim/__init__.py
index c267afe4de..3923460991 100644
--- a/gensim/__init__.py
+++ b/gensim/__init__.py
@@ -3,7 +3,7 @@
similarities within a corpus of documents.
"""
-from gensim import parsing, matutils, interfaces, corpora, models, similarities, summarization # noqa:F401
+from gensim import parsing, matutils, interfaces, corpora, models, similarities, summarization, utils # noqa:F401
import logging
__version__ = '2.3.0'
diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py
index 327a36fc14..6bd96da716 100644
--- a/gensim/corpora/bleicorpus.py
+++ b/gensim/corpora/bleicorpus.py
@@ -45,7 +45,7 @@ def __init__(self, fname, fname_vocab=None):
`fname.vocab`.
"""
IndexedCorpus.__init__(self, fname)
- logger.info("loading corpus from %s" % fname)
+ logger.info("loading corpus from %s", fname)
if fname_vocab is None:
fname_base, _ = path.splitext(fname)
@@ -102,7 +102,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
else:
num_terms = 1 + max([-1] + id2word.keys())
- logger.info("storing corpus in Blei's LDA-C format into %s" % fname)
+ logger.info("storing corpus in Blei's LDA-C format into %s", fname)
with utils.smart_open(fname, 'wb') as fout:
offsets = []
for doc in corpus:
@@ -113,7 +113,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
# write out vocabulary, in a format compatible with Blei's topics.py script
fname_vocab = utils.smart_extension(fname, '.vocab')
- logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab))
+ logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
with utils.smart_open(fname_vocab, 'wb') as fout:
for featureid in xrange(num_terms):
fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))
@@ -127,5 +127,3 @@ def docbyoffset(self, offset):
with utils.smart_open(self.fname) as f:
f.seek(offset)
return self.line2doc(f.readline())
-
-# endclass BleiCorpus
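Most of the logging changes in the corpora modules follow the same pattern: pass the arguments to the logger instead of %-formatting the message eagerly, so the string is only built if the record is actually emitted. A tiny before/after sketch:

```python
import logging
logger = logging.getLogger(__name__)

fname = "corpus.lda-c"

# Eager: the message is formatted even when INFO is disabled.
logger.info("loading corpus from %s" % fname)

# Lazy: logging formats the message only if the record is emitted,
# and linters can check that the placeholders match the arguments.
logger.info("loading corpus from %s", fname)
```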
diff --git a/gensim/corpora/csvcorpus.py b/gensim/corpora/csvcorpus.py
index 6d3288bc0f..969437e571 100644
--- a/gensim/corpora/csvcorpus.py
+++ b/gensim/corpora/csvcorpus.py
@@ -36,7 +36,7 @@ def __init__(self, fname, labels):
`labels` = are class labels present in the input file? => skip the first column
"""
- logger.info("loading corpus from %s" % fname)
+ logger.info("loading corpus from %s", fname)
self.fname = fname
self.length = None
self.labels = labels
@@ -45,7 +45,7 @@ def __init__(self, fname, labels):
head = ''.join(itertools.islice(utils.smart_open(self.fname), 5))
self.headers = csv.Sniffer().has_header(head)
self.dialect = csv.Sniffer().sniff(head)
- logger.info("sniffed CSV delimiter=%r, headers=%s" % (self.dialect.delimiter, self.headers))
+ logger.info("sniffed CSV delimiter=%r, headers=%s", self.dialect.delimiter, self.headers)
def __iter__(self):
"""
@@ -60,8 +60,6 @@ def __iter__(self):
for line_no, line in enumerate(reader):
if self.labels:
line.pop(0) # ignore the first column = class label
- yield list(enumerate(map(float, line)))
+ yield list(enumerate(float(x) for x in line))
self.length = line_no + 1 # store the total number of CSV rows = documents
-
-# endclass CsvCorpus
diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py
index d32276688b..08c4097f03 100644
--- a/gensim/corpora/dictionary.py
+++ b/gensim/corpora/dictionary.py
@@ -61,7 +61,7 @@ def __getitem__(self, tokenid):
if len(self.id2token) != len(self.token2id):
# the word->id mapping has changed (presumably via add_documents);
# recompute id->word accordingly
- self.id2token = dict((v, k) for k, v in iteritems(self.token2id))
+ self.id2token = utils.revdict(self.token2id)
return self.id2token[tokenid] # will throw for non-existent ids
def __iter__(self):
@@ -120,7 +120,8 @@ def add_documents(self, documents, prune_at=2000000):
logger.info(
"built %s from %i documents (total %i corpus positions)",
- self, self.num_docs, self.num_pos)
+ self, self.num_docs, self.num_pos
+ )
def doc2bow(self, document, allow_update=False, return_missing=False):
"""
@@ -147,14 +148,14 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
token2id = self.token2id
if allow_update or return_missing:
- missing = dict((w, freq) for w, freq in iteritems(counter) if w not in token2id)
+ missing = {w: freq for w, freq in iteritems(counter) if w not in token2id}
if allow_update:
for w in missing:
# new id = number of ids made so far;
# NOTE this assumes there are no gaps in the id sequence!
token2id[w] = len(token2id)
- result = dict((token2id[w], freq) for w, freq in iteritems(counter) if w in token2id)
+ result = {token2id[w]: freq for w, freq in iteritems(counter) if w in token2id}
if allow_update:
self.num_docs += 1
@@ -201,15 +202,17 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=N
else:
good_ids = (
v for v in itervalues(self.token2id)
- if no_below <= self.dfs.get(v, 0) <= no_above_abs)
+ if no_below <= self.dfs.get(v, 0) <= no_above_abs
+ )
good_ids = sorted(good_ids, key=self.dfs.get, reverse=True)
if keep_n is not None:
good_ids = good_ids[:keep_n]
- bad_words = [(self[id], self.dfs.get(id, 0)) for id in set(self).difference(good_ids)]
+ bad_words = [(self[idx], self.dfs.get(idx, 0)) for idx in set(self).difference(good_ids)]
logger.info("discarding %i tokens: %s...", len(self) - len(good_ids), bad_words[:10])
logger.info(
"keeping %i tokens which were in no less than %i and no more than %i (=%.1f%%) documents",
- len(good_ids), no_below, no_above_abs, 100.0 * no_above)
+ len(good_ids), no_below, no_above_abs, 100.0 * no_above
+ )
# do the actual filtering, then rebuild dictionary to remove gaps in ids
self.filter_tokens(good_ids=good_ids)
@@ -229,11 +232,11 @@ def filter_n_most_frequent(self, remove_n):
most_frequent_ids = sorted(most_frequent_ids, key=self.dfs.get, reverse=True)
most_frequent_ids = most_frequent_ids[:remove_n]
# do the actual filtering, then rebuild dictionary to remove gaps in ids
- most_frequent_words = [(self[id], self.dfs.get(id, 0)) for id in most_frequent_ids]
+ most_frequent_words = [(self[idx], self.dfs.get(idx, 0)) for idx in most_frequent_ids]
logger.info("discarding %i tokens: %s...", len(most_frequent_ids), most_frequent_words[:10])
self.filter_tokens(bad_ids=most_frequent_ids)
- logger.info("resulting dictionary: %s" % self)
+ logger.info("resulting dictionary: %s", self)
def filter_tokens(self, bad_ids=None, good_ids=None):
"""
@@ -244,20 +247,12 @@ def filter_tokens(self, bad_ids=None, good_ids=None):
"""
if bad_ids is not None:
bad_ids = set(bad_ids)
- self.token2id = dict((token, tokenid)
- for token, tokenid in iteritems(self.token2id)
- if tokenid not in bad_ids)
- self.dfs = dict((tokenid, freq)
- for tokenid, freq in iteritems(self.dfs)
- if tokenid not in bad_ids)
+ self.token2id = {token: tokenid for token, tokenid in iteritems(self.token2id) if tokenid not in bad_ids}
+ self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if tokenid not in bad_ids}
if good_ids is not None:
good_ids = set(good_ids)
- self.token2id = dict((token, tokenid)
- for token, tokenid in iteritems(self.token2id)
- if tokenid in good_ids)
- self.dfs = dict((tokenid, freq)
- for tokenid, freq in iteritems(self.dfs)
- if tokenid in good_ids)
+ self.token2id = {token: tokenid for token, tokenid in iteritems(self.token2id) if tokenid in good_ids}
+ self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if tokenid in good_ids}
self.compactify()
def compactify(self):
@@ -274,9 +269,9 @@ def compactify(self):
idmap = dict(izip(itervalues(self.token2id), xrange(len(self.token2id))))
# reassign mappings to new ids
- self.token2id = dict((token, idmap[tokenid]) for token, tokenid in iteritems(self.token2id))
+ self.token2id = {token: idmap[tokenid] for token, tokenid in iteritems(self.token2id)}
self.id2token = {}
- self.dfs = dict((idmap[tokenid], freq) for tokenid, freq in iteritems(self.dfs))
+ self.dfs = {idmap[tokenid]: freq for tokenid, freq in iteritems(self.dfs)}
def save_as_text(self, fname, sort_by_word=True):
"""
@@ -405,15 +400,16 @@ def from_corpus(corpus, id2word=None):
if id2word is None:
# make sure length(result) == get_max_id(corpus) + 1
- result.token2id = dict((unicode(i), i) for i in xrange(max_id + 1))
+ result.token2id = {unicode(i): i for i in xrange(max_id + 1)}
else:
# id=>word mapping given: simply copy it
- result.token2id = dict((utils.to_unicode(token), id) for id, token in iteritems(id2word))
- for id in itervalues(result.token2id):
+ result.token2id = {utils.to_unicode(token): idx for idx, token in iteritems(id2word)}
+ for idx in itervalues(result.token2id):
# make sure all token ids have a valid `dfs` entry
- result.dfs[id] = result.dfs.get(id, 0)
+ result.dfs[idx] = result.dfs.get(idx, 0)
logger.info(
"built %s from %i documents (total %i corpus positions)",
- result, result.num_docs, result.num_pos)
+ result, result.num_docs, result.num_pos
+ )
return result
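The dictionary changes above replace `dict((k, v) for ...)` generator calls with dict comprehensions and reuse `utils.revdict` for inverting the token mapping. A small sketch of the resulting idioms (the example texts are made up):

```python
from gensim import utils
from gensim.corpora import Dictionary

texts = [["human", "interface", "computer"], ["survey", "user", "computer"]]
dictionary = Dictionary(texts)

# Dict comprehension instead of dict((k, v) for k, v in ...):
dfs_copy = {tokenid: freq for tokenid, freq in dictionary.dfs.items()}

# utils.revdict inverts a mapping, here token2id -> id2token:
id2token = utils.revdict(dictionary.token2id)

# Bag-of-words for a new document: a list of (token_id, count) pairs.
print(dictionary.doc2bow(["human", "computer", "computer"]))
```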
diff --git a/gensim/corpora/hashdictionary.py b/gensim/corpora/hashdictionary.py
index 63f966b3cd..687ec241ac 100644
--- a/gensim/corpora/hashdictionary.py
+++ b/gensim/corpora/hashdictionary.py
@@ -101,7 +101,7 @@ def keys(self):
return range(len(self))
def __str__(self):
- return ("HashDictionary(%i id range)" % len(self))
+ return "HashDictionary(%i id range)" % len(self)
@staticmethod
def from_documents(*args, **kwargs):
@@ -117,11 +117,12 @@ def add_documents(self, documents):
"""
for docno, document in enumerate(documents):
if docno % 10000 == 0:
- logger.info("adding document #%i to %s" % (docno, self))
+ logger.info("adding document #%i to %s", docno, self)
self.doc2bow(document, allow_update=True) # ignore the result, here we only care about updating token ids
logger.info(
"built %s from %i documents (total %i corpus positions)",
- self, self.num_docs, self.num_pos)
+ self, self.num_docs, self.num_pos
+ )
def doc2bow(self, document, allow_update=False, return_missing=False):
"""
@@ -182,24 +183,21 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
"""
no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold
ok = [item for item in iteritems(self.dfs_debug) if no_below <= item[1] <= no_above_abs]
- ok = frozenset(word for word, freq in sorted(ok, key=lambda item: -item[1])[:keep_n])
-
- self.dfs_debug = dict((word, freq)
- for word, freq in iteritems(self.dfs_debug)
- if word in ok)
- self.token2id = dict((token, tokenid)
- for token, tokenid in iteritems(self.token2id)
- if token in self.dfs_debug)
- self.id2token = dict((tokenid, set(token for token in tokens if token in self.dfs_debug))
- for tokenid, tokens in iteritems(self.id2token))
- self.dfs = dict((tokenid, freq)
- for tokenid, freq in iteritems(self.dfs)
- if self.id2token.get(tokenid, set()))
+ ok = frozenset(word for word, freq in sorted(ok, key=lambda x: -x[1])[:keep_n])
+
+ self.dfs_debug = {word: freq for word, freq in iteritems(self.dfs_debug) if word in ok}
+ self.token2id = {token: tokenid for token, tokenid in iteritems(self.token2id) if token in self.dfs_debug}
+ self.id2token = {
+ tokenid: {token for token in tokens if token in self.dfs_debug}
+ for tokenid, tokens in iteritems(self.id2token)
+ }
+ self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if self.id2token.get(tokenid, set())}
# for word->document frequency
logger.info(
"kept statistics for which were in no less than %i and no more than %i (=%.1f%%) documents",
- no_below, no_above_abs, 100.0 * no_above)
+ no_below, no_above_abs, 100.0 * no_above
+ )
def save_as_text(self, fname):
"""
@@ -216,6 +214,6 @@ def save_as_text(self, fname):
words = sorted(self[tokenid])
if words:
words_df = [(word, self.dfs_debug.get(word, 0)) for word in words]
- words_df = ["%s(%i)" % item for item in sorted(words_df, key=lambda item: -item[1])]
+ words_df = ["%s(%i)" % item for item in sorted(words_df, key=lambda x: -x[1])]
words_df = '\t'.join(words_df)
fout.write(utils.to_utf8("%i\t%i\t%s\n" % (tokenid, self.dfs.get(tokenid, 0), words_df)))
diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py
index 62f29b25ed..af79a2fd5f 100644
--- a/gensim/corpora/indexedcorpus.py
+++ b/gensim/corpora/indexedcorpus.py
@@ -50,7 +50,7 @@ def __init__(self, fname, index_fname=None):
self.index = utils.unpickle(index_fname)
# change self.index into a numpy.ndarray to support fancy indexing
self.index = numpy.asarray(self.index)
- logger.info("loaded corpus index from %s" % index_fname)
+ logger.info("loaded corpus index from %s", index_fname)
except Exception:
self.index = None
self.length = None
@@ -95,15 +95,14 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres
offsets = serializer.save_corpus(fname, corpus, id2word, metadata=metadata)
if offsets is None:
- raise NotImplementedError("called serialize on class %s which doesn't support indexing!" %
- serializer.__name__)
+ raise NotImplementedError("called serialize on class %s which doesn't support indexing!" % serializer.__name__)
# store offsets persistently, using pickle
# we shouldn't have to worry about self.index being a numpy.ndarray as the serializer will return
# the offsets that are actually stored on disk - we're not storing self.index in any case, the
# load just needs to turn whatever is loaded from disk back into a ndarray - this should also ensure
# backwards compatibility
- logger.info("saving %s index to %s" % (serializer.__name__, index_fname))
+ logger.info("saving %s index to %s", serializer.__name__, index_fname)
utils.pickle(offsets, index_fname)
def __len__(self):
@@ -115,7 +114,7 @@ def __len__(self):
return len(self.index)
if self.length is None:
logger.info("caching corpus length")
- self.length = sum(1 for doc in self)
+ self.length = sum(1 for _ in self)
return self.length
def __getitem__(self, docno):
@@ -128,6 +127,3 @@ def __getitem__(self, docno):
return self.docbyoffset(self.index[docno])
else:
raise ValueError('Unrecognised value for docno, use either a single integer, a slice or a numpy.ndarray')
-
-
-# endclass IndexedCorpus
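`IndexedCorpus.serialize` stores document byte offsets next to the serialized corpus, which is what makes `corpus[docno]` work later. A minimal usage sketch with `MmCorpus` (file names are arbitrary):

```python
from gensim.corpora import MmCorpus

# Any iterable of bag-of-words documents will do.
corpus = [[(0, 1.0), (1, 2.0)], [(1, 1.0)], [(0, 3.0), (2, 1.0)]]

# serialize() writes corpus.mm plus a corpus.mm.index file holding the offsets.
MmCorpus.serialize("corpus.mm", corpus)

mm = MmCorpus("corpus.mm")
print(len(mm))  # 3, read from the index rather than by re-scanning the file
print(mm[1])    # random access through the stored offset, e.g. [(1, 1.0)]
```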
diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py
index 315490cdcc..d5265f6571 100644
--- a/gensim/corpora/lowcorpus.py
+++ b/gensim/corpora/lowcorpus.py
@@ -15,7 +15,7 @@
from gensim import utils
from gensim.corpora import IndexedCorpus
-from six import iteritems, iterkeys
+from six import iterkeys
from six.moves import xrange, zip as izip
@@ -63,7 +63,7 @@ def __init__(self, fname, id2word=None, line2words=split_on_space):
simple splitting on spaces.
"""
IndexedCorpus.__init__(self, fname)
- logger.info("loading corpus from %s" % fname)
+ logger.info("loading corpus from %s", fname)
self.fname = fname # input file, see class doc for format
self.line2words = line2words # how to translate lines into words (simply split on space by default)
@@ -79,13 +79,15 @@ def __init__(self, fname, id2word=None, line2words=split_on_space):
all_terms = sorted(all_terms) # sort the list of all words; rank in that list = word's integer id
self.id2word = dict(izip(xrange(len(all_terms)), all_terms)) # build a mapping of word id(int) -> word (string)
else:
- logger.info("using provided word mapping (%i ids)" % len(id2word))
+ logger.info("using provided word mapping (%i ids)", len(id2word))
self.id2word = id2word
self.num_terms = len(self.word2id)
self.use_wordids = True # return documents as (wordIndex, wordCount) 2-tuples
- logger.info("loaded corpus with %i documents and %i terms from %s" %
- (self.num_docs, self.num_terms, fname))
+ logger.info(
+ "loaded corpus with %i documents and %i terms from %s",
+ self.num_docs, self.num_terms, fname
+ )
def _calculate_num_docs(self):
# the first line in input data is the number of documents (integer). throws exception on bad input.
@@ -118,12 +120,11 @@ def line2doc(self, line):
use_words.append(word)
marker.add(word)
# construct a list of (wordIndex, wordFrequency) 2-tuples
- doc = list(zip(map(self.word2id.get, use_words),
- map(words.count, use_words)))
+ doc = [(self.word2id.get(w), words.count(w)) for w in use_words]
else:
uniq_words = set(words)
# construct a list of (word, wordFrequency) 2-tuples
- doc = list(zip(uniq_words, map(words.count, uniq_words)))
+ doc = [(w, words.count(w)) for w in uniq_words]
# return the document, then forget it and move on to the next one
# note that this way, only one doc is stored in memory at a time, not the whole corpus
@@ -165,9 +166,10 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
fout.write(utils.to_utf8('%s\n' % ' '.join(words)))
if truncated:
- logger.warning("List-of-words format can only save vectors with "
- "integer elements; %i float entries were truncated to integer value" %
- truncated)
+ logger.warning(
+ "List-of-words format can only save vectors with integer elements; "
+ "%i float entries were truncated to integer value", truncated
+ )
return offsets
def docbyoffset(self, offset):
@@ -185,6 +187,4 @@ def id2word(self):
@id2word.setter
def id2word(self, val):
self._id2word = val
- self.word2id = dict((v, k) for k, v in iteritems(val))
-
-# endclass LowCorpus
+ self.word2id = utils.revdict(val)
diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py
index 00333e9358..cacf0074bd 100644
--- a/gensim/corpora/malletcorpus.py
+++ b/gensim/corpora/malletcorpus.py
@@ -42,7 +42,7 @@ def __init__(self, fname, id2word=None, metadata=False):
def _calculate_num_docs(self):
with utils.smart_open(self.fname) as fin:
- result = sum([1 for x in fin])
+ result = sum(1 for _ in fin)
return result
def __iter__(self):
@@ -85,7 +85,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
logger.info("no word id mapping provided; initializing from corpus")
id2word = utils.dict_from_corpus(corpus)
- logger.info("storing corpus in Mallet format into %s" % fname)
+ logger.info("storing corpus in Mallet format into %s", fname)
truncated = 0
offsets = []
@@ -106,9 +106,10 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
fout.write(utils.to_utf8('%s %s %s\n' % (doc_id, doc_lang, ' '.join(words))))
if truncated:
- logger.warning("Mallet format can only save vectors with "
- "integer elements; %i float entries were truncated to integer value" %
- truncated)
+ logger.warning(
+ "Mallet format can only save vectors with integer elements; "
+ "%i float entries were truncated to integer value", truncated
+ )
return offsets
@@ -119,5 +120,3 @@ def docbyoffset(self, offset):
with utils.smart_open(self.fname) as f:
f.seek(offset)
return self.line2doc(f.readline())
-
-# endclass MalletCorpus
diff --git a/gensim/corpora/mmcorpus.py b/gensim/corpora/mmcorpus.py
index 08e809443b..2158f0a526 100644
--- a/gensim/corpora/mmcorpus.py
+++ b/gensim/corpora/mmcorpus.py
@@ -45,8 +45,8 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False):
This function is automatically called by `MmCorpus.serialize`; don't
call it directly, call `serialize` instead.
"""
- logger.info("storing corpus in Matrix Market format to %s" % fname)
+ logger.info("storing corpus in Matrix Market format to %s", fname)
num_terms = len(id2word) if id2word is not None else None
- return matutils.MmWriter.write_corpus(fname, corpus, num_terms=num_terms, index=True, progress_cnt=progress_cnt, metadata=metadata)
-
-# endclass MmCorpus
+ return matutils.MmWriter.write_corpus(
+ fname, corpus, num_terms=num_terms, index=True, progress_cnt=progress_cnt, metadata=metadata
+ )
diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py
index 255fc2b7fe..e5904f2773 100644
--- a/gensim/corpora/sharded_corpus.py
+++ b/gensim/corpora/sharded_corpus.py
@@ -234,8 +234,7 @@ def __init__(self, output_prefix, corpus, dim=None,
self.current_offset = None # The index into the dataset which
# corresponds to index 0 of current shard
- logger.info('Initializing sharded corpus with prefix '
- '{0}'.format(output_prefix))
+ logger.info('Initializing sharded corpus with prefix %s', output_prefix)
if (not os.path.isfile(output_prefix)) or overwrite:
logger.info('Building from corpus...')
self.init_shards(output_prefix, corpus, shardsize)
@@ -243,8 +242,7 @@ def __init__(self, output_prefix, corpus, dim=None,
# Save automatically, to facilitate re-loading
# and retain information about how the corpus
# was serialized.
- logger.info('Saving ShardedCorpus object to '
- '{0}'.format(self.output_prefix))
+ logger.info('Saving ShardedCorpus object to %s', self.output_prefix)
self.save()
else:
logger.info('Cloning existing...')
@@ -254,19 +252,18 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtyp
"""Initialize shards from the corpus."""
if not gensim.utils.is_corpus(corpus):
- raise ValueError('Cannot initialize shards without a corpus to read'
- ' from! (Got corpus type: {0})'.format(type(corpus)))
+ raise ValueError(
+ "Cannot initialize shards without a corpus to read from! (Got corpus type: {0})".format(type(corpus))
+ )
proposed_dim = self._guess_n_features(corpus)
if proposed_dim != self.dim:
if self.dim is None:
- logger.info('Deriving dataset dimension from corpus: '
- '{0}'.format(proposed_dim))
+ logger.info('Deriving dataset dimension from corpus: %d', proposed_dim)
else:
logger.warning(
- 'Dataset dimension derived from input corpus diffe'
- 'rs from initialization argument, using corpus.'
- '(corpus {0}, init arg {1})'.format(proposed_dim, self.dim)
+ "Dataset dimension derived from input corpus differs from initialization argument, "
+ "using corpus. (corpus %d, init arg %d)", proposed_dim, self.dim
)
self.dim = proposed_dim
@@ -277,11 +274,10 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtyp
logger.info('Running init from corpus.')
for n, doc_chunk in enumerate(gensim.utils.grouper(corpus, chunksize=shardsize)):
- logger.info('Chunk no. {0} at {1} s'.format(n, time.clock() - start_time))
+ logger.info('Chunk no. %d at %f s', n, time.clock() - start_time)
current_shard = numpy.zeros((len(doc_chunk), self.dim), dtype=dtype)
- logger.debug('Current chunk dimension: '
- '{0} x {1}'.format(len(doc_chunk), self.dim))
+ logger.debug('Current chunk dimension: %d x %d', len(doc_chunk), self.dim)
for i, doc in enumerate(doc_chunk):
doc = dict(doc)
@@ -294,7 +290,7 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtyp
self.save_shard(current_shard)
end_time = time.clock()
- logger.info('Built {0} shards in {1} s.'.format(self.n_shards, end_time - start_time))
+ logger.info('Built %d shards in %f s.', self.n_shards, end_time - start_time)
def init_by_clone(self):
"""
@@ -309,12 +305,12 @@ def init_by_clone(self):
if temp.dim != self.dim:
if self.dim is None:
- logger.info('Loaded dataset dimension: {0}'.format(temp.dim))
+ logger.info('Loaded dataset dimension: %d', temp.dim)
else:
logger.warning(
- 'Loaded dataset dimension differs from init arg '
- 'dimension, using loaded dim. '
- '(loaded {0}, init {1})'.format(temp.dim, self.dim)
+ "Loaded dataset dimension differs from init arg dimension, "
+ "using loaded dim. (loaded %d, init %d)",
+ temp.dim, self.dim
)
self.dim = temp.dim # To be consistent with the loaded data!
@@ -346,8 +342,6 @@ def load_shard(self, n):
"""
Load (unpickle) the n-th shard as the "live" part of the dataset
into the Dataset object."""
- # logger.debug('ShardedCorpus loading shard {0}, '
- # 'current shard: {1}'.format(n, self.current_shard_n))
# No-op if the shard is already open.
if self.current_shard_n == n:
@@ -389,22 +383,12 @@ def shard_by_offset(self, offset):
' supported.'.format(offset))
return k
- k = -1
- for i, o in enumerate(self.offsets):
- if o > offset: # Condition should fire for every valid offset,
- # since the last offset is n_docs (one-past-end).
- k = i - 1 # First offset is always 0, so i is at least 1.
- break
-
- return k
-
def in_current(self, offset):
"""
Determine whether the given offset falls within the current shard.
"""
- return (self.current_offset <= offset) \
- and (offset < self.offsets[self.current_shard_n + 1])
+ return (self.current_offset <= offset) and (offset < self.offsets[self.current_shard_n + 1])
def in_next(self, offset):
"""
@@ -416,8 +400,7 @@ def in_next(self, offset):
"""
if self.current_shard_n == self.n_shards:
return False # There's no next shard.
- return (self.offsets[self.current_shard_n + 1] <= offset) \
- and (offset < self.offsets[self.current_shard_n + 2])
+ return (self.offsets[self.current_shard_n + 1] <= offset) and (offset < self.offsets[self.current_shard_n + 2])
def resize_shards(self, shardsize):
"""
@@ -472,9 +455,7 @@ def resize_shards(self, shardsize):
for old_shard_n, old_shard_name in enumerate(old_shard_names):
os.remove(old_shard_name)
except Exception as e:
- logger.error('Exception occurred during old shard no. {0} '
- 'removal: {1}.\nAttempting to at least move '
- 'new shards in.'.format(old_shard_n, str(e)))
+ logger.error('Exception occurred during old shard no. %d removal: %s.\nAttempting to at least move new shards in.', old_shard_n, str(e))
finally:
# If something happens with cleaning up - try to at least get the
# new guys in.
@@ -483,9 +464,8 @@ def resize_shards(self, shardsize):
os.rename(new_shard_name, self._shard_name(shard_n))
# If something happens when we're in this stage, we're screwed.
except Exception as e:
- print(e)
- raise RuntimeError('Resizing completely failed for some reason.'
- ' Sorry, dataset is probably ruined...')
+ logger.exception(e)
+ raise RuntimeError('Resizing completely failed for some reason. Sorry, dataset is probably ruined...')
finally:
# Sets the new shard stats.
self.n_shards = n_new_shards
@@ -529,21 +509,20 @@ def _guess_n_features(self, corpus):
return self._guess_n_features(corpus.corpus)
else:
if not self.dim:
- raise TypeError('Couldn\'t find number of features, '
- 'refusing to guess (dimension set to {0},'
- 'type of corpus: {1}).'.format(self.dim, type(corpus)))
- else:
- logger.warning(
- 'Couldn\'t find number of features, trusting '
- 'supplied dimension ({0})'.format(self.dim)
+ raise TypeError(
+ "Couldn't find number of features, refusing to guess "
+ "(dimension set to {0}, type of corpus: {1})."
+ .format(self.dim, type(corpus))
)
+ else:
+ logger.warning("Couldn't find number of features, trusting supplied dimension (%d)", self.dim)
n_features = self.dim
if self.dim and n_features != self.dim:
logger.warning(
- 'Discovered inconsistent dataset dim ({0}) and '
- 'feature count from corpus ({1}). Coercing to dimension'
- ' given by argument.'.format(self.dim, n_features)
+ "Discovered inconsistent dataset dim (%d) and feature count from corpus (%d). "
+ "Coercing to dimension given by argument.",
+ self.dim, n_features
)
return n_features
@@ -598,8 +577,7 @@ def __getitem__(self, offset):
start = offset.start
stop = offset.stop
if stop > self.n_docs:
- raise IndexError('Requested slice offset {0} out of range'
- ' ({1} docs)'.format(stop, self.n_docs))
+ raise IndexError('Requested slice offset {0} out of range ({1} docs)'.format(stop, self.n_docs))
# - get range of shards over which to iterate
first_shard = self.shard_by_offset(start)
@@ -610,16 +588,11 @@ def __getitem__(self, offset):
# This fails on one-past
# slice indexing; that's why there's a code branch here.
- # logger.debug('ShardedCorpus: Retrieving slice {0}: '
- # 'shard {1}'.format((offset.start, offset.stop),
- # (first_shard, last_shard)))
-
self.load_shard(first_shard)
# The easy case: both in one shard.
if first_shard == last_shard:
- s_result = self.current_shard[start - self.current_offset:
- stop - self.current_offset]
+ s_result = self.current_shard[start - self.current_offset: stop - self.current_offset]
# Handle different sparsity settings:
s_result = self._getitem_format(s_result)
@@ -627,11 +600,9 @@ def __getitem__(self, offset):
# The hard case: the slice is distributed across multiple shards
# - initialize numpy.zeros()
- s_result = numpy.zeros((stop - start, self.dim),
- dtype=self.current_shard.dtype)
+ s_result = numpy.zeros((stop - start, self.dim), dtype=self.current_shard.dtype)
if self.sparse_serialization:
- s_result = sparse.csr_matrix((0, self.dim),
- dtype=self.current_shard.dtype)
+ s_result = sparse.csr_matrix((0, self.dim), dtype=self.current_shard.dtype)
# - gradually build it up. We will be using three sets of start:stop
# indexes:
@@ -652,13 +623,11 @@ def __getitem__(self, offset):
# - if in ending shard, these are from 0
# to (stop - current_offset)
shard_start = start - self.current_offset
- shard_stop = self.offsets[self.current_shard_n + 1] - \
- self.current_offset
+ shard_stop = self.offsets[self.current_shard_n + 1] - self.current_offset
# s_result[result_start:result_stop] = self.current_shard[
# shard_start:shard_stop]
- s_result = self.__add_to_slice(s_result, result_start, result_stop,
- shard_start, shard_stop)
+ s_result = self.__add_to_slice(s_result, result_start, result_stop, shard_start, shard_stop)
# First and last get special treatment, these are in between
for shard_n in xrange(first_shard + 1, last_shard):
@@ -669,9 +638,7 @@ def __getitem__(self, offset):
shard_start = 0
shard_stop = self.shardsize
- s_result = self.__add_to_slice(s_result, result_start,
- result_stop, shard_start,
- shard_stop)
+ s_result = self.__add_to_slice(s_result, result_start, result_stop, shard_start, shard_stop)
# Last shard
self.load_shard(last_shard)
@@ -680,9 +647,7 @@ def __getitem__(self, offset):
shard_start = 0
shard_stop = stop - self.current_offset
- s_result = self.__add_to_slice(s_result, result_start, result_stop,
- shard_start, shard_stop)
-
+ s_result = self.__add_to_slice(s_result, result_start, result_stop, shard_start, shard_stop)
s_result = self._getitem_format(s_result)
return s_result
@@ -707,10 +672,7 @@ def __add_to_slice(self, s_result, result_start, result_stop, start, stop):
Returns the resulting s_result.
"""
if (result_stop - result_start) != (stop - start):
- raise ValueError('Result start/stop range different than stop/start'
- 'range (%d - %d vs. %d - %d)'.format(result_start,
- result_stop,
- start, stop))
+ raise ValueError('Result start/stop range different than stop/start range ({0} - {1} vs. {2} - {3})'.format(result_start, result_stop, start, stop))
# Dense data: just copy using numpy's slice notation
if not self.sparse_serialization:
@@ -722,10 +684,7 @@ def __add_to_slice(self, s_result, result_start, result_stop, start, stop):
# result.
else:
if s_result.shape != (result_start, self.dim):
- raise ValueError('Assuption about sparse s_result shape '
- 'invalid: {0} expected rows, {1} real '
- 'rows.'.format(result_start,
- s_result.shape[0]))
+ raise ValueError('Assumption about sparse s_result shape invalid: {0} expected rows, {1} real rows.'.format(result_start, s_result.shape[0]))
tmp_matrix = self.current_shard[start:stop]
s_result = sparse.vstack([s_result, tmp_matrix])
@@ -789,19 +748,12 @@ def save(self, *args, **kwargs):
if len(args) == 0:
args = tuple([self.output_prefix])
- attrs_to_ignore = ['current_shard',
- 'current_shard_n',
- 'current_offset']
+ attrs_to_ignore = ['current_shard', 'current_shard_n', 'current_offset']
if 'ignore' not in kwargs:
kwargs['ignore'] = frozenset(attrs_to_ignore)
else:
- kwargs['ignore'] = frozenset([v for v in kwargs['ignore']]
- + attrs_to_ignore)
+ kwargs['ignore'] = frozenset([v for v in kwargs['ignore']] + attrs_to_ignore)
super(ShardedCorpus, self).save(*args, **kwargs)
- #
- # self.reset()
- # with smart_open(self.output_prefix, 'wb') as pickle_handle:
- # cPickle.dump(self, pickle_handle)
@classmethod
def load(cls, fname, mmap=None):
@@ -811,8 +763,7 @@ def load(cls, fname, mmap=None):
return super(ShardedCorpus, cls).load(fname, mmap)
@staticmethod
- def save_corpus(fname, corpus, id2word=None, progress_cnt=1000,
- metadata=False, **kwargs):
+ def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs):
"""
Implement a serialization interface. Do not call directly;
use the `serialize` method instead.
@@ -834,9 +785,7 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=1000,
ShardedCorpus(fname, corpus, **kwargs)
@classmethod
- def serialize(serializer, fname, corpus, id2word=None,
- index_fname=None, progress_cnt=None, labels=None,
- metadata=False, **kwargs):
+ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False, **kwargs):
"""
Iterate through the document stream `corpus`, saving the documents
as a ShardedCorpus to `fname`.
@@ -849,6 +798,4 @@ def serialize(serializer, fname, corpus, id2word=None,
Ignore the parameters id2word, index_fname, progress_cnt, labels
and metadata. They currently do nothing and are here only to
provide a compatible method signature with superclass."""
- serializer.save_corpus(fname, corpus, id2word=id2word,
- progress_cnt=progress_cnt, metadata=metadata,
- **kwargs)
+ serializer.save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs)
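The deleted fallback in `shard_by_offset` was a linear scan over `self.offsets`; the surviving code computes the shard directly. For intuition only, here is a stand-alone sketch of mapping a document offset to a shard with the same kind of offsets table (not the class's actual implementation; the numbers are made up):

```python
import bisect

# offsets[i] is the index of the first document stored in shard i;
# the final entry is one past the end of the corpus (n_docs).
offsets = [0, 4096, 8192, 10000]

def shard_by_offset(offset):
    if not 0 <= offset < offsets[-1]:
        raise ValueError("Offset %d out of range (%d docs)" % (offset, offsets[-1]))
    # bisect_right returns the index of the first offset greater than `offset`;
    # the shard containing `offset` is the one just before it.
    return bisect.bisect_right(offsets, offset) - 1

def in_shard(offset, k):
    # Mirrors the in_current()/in_next() checks above.
    return offsets[k] <= offset < offsets[k + 1]

print(shard_by_offset(5000))  # 1
print(in_shard(5000, 1))      # True
```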
diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py
index 5e24419421..290414836e 100644
--- a/gensim/corpora/svmlightcorpus.py
+++ b/gensim/corpora/svmlightcorpus.py
@@ -56,7 +56,7 @@ def __init__(self, fname, store_labels=True):
"""
IndexedCorpus.__init__(self, fname)
- logger.info("loading corpus from %s" % fname)
+ logger.info("loading corpus from %s", fname)
self.fname = fname # input file, see class doc for format
self.length = None
@@ -89,7 +89,7 @@ def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
This function is automatically called by `SvmLightCorpus.serialize`; don't
call it directly, call `serialize` instead.
"""
- logger.info("converting corpus to SVMlight format: %s" % fname)
+ logger.info("converting corpus to SVMlight format: %s", fname)
offsets = []
with utils.smart_open(fname, 'wb') as fout:
@@ -129,5 +129,3 @@ def doc2line(doc, label=0):
"""
pairs = ' '.join("%i:%s" % (termid + 1, termval) for termid, termval in doc) # +1 to convert 0-base to 1-base
return "%s %s\n" % (label, pairs)
-
-# endclass SvmLightCorpus
diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py
index 4be904e613..7265d20d0c 100644
--- a/gensim/corpora/textcorpus.py
+++ b/gensim/corpora/textcorpus.py
@@ -112,8 +112,7 @@ class TextCorpus(interfaces.CorpusABC):
6. remove stopwords; see `gensim.parsing.preprocessing` for the list of stopwords
"""
- def __init__(self, input=None, dictionary=None, metadata=False, character_filters=None,
- tokenizer=None, token_filters=None):
+ def __init__(self, input=None, dictionary=None, metadata=False, character_filters=None, tokenizer=None, token_filters=None):
"""
Args:
input (str): path to top-level directory to traverse for corpus documents.
@@ -171,9 +170,7 @@ def init_dictionary(self, dictionary):
else:
logger.info("Input stream provided but dictionary already initialized")
else:
- logger.warning(
- "No input document stream provided; assuming "
- "dictionary will be initialized some other way.")
+ logger.warning("No input document stream provided; assuming dictionary will be initialized some other way.")
def __iter__(self):
"""The function that defines a corpus.
@@ -231,8 +228,7 @@ def step_through_preprocess(self, text):
yield (self.tokenizer, tokens)
for token_filter in self.token_filters:
- tokens = token_filter(tokens)
- yield (token_filter, tokens)
+ yield (token_filter, token_filter(tokens))
def get_texts(self):
"""Iterate over the collection, yielding one document at a time. A document
@@ -308,7 +304,6 @@ def __len__(self):
# cache the corpus length
self.length = sum(1 for _ in self.getstream())
return self.length
-# endclass TextCorpus
class TextDirectoryCorpus(TextCorpus):
@@ -433,7 +428,6 @@ def _cache_corpus_length(self):
self.length = sum(1 for _ in self.iter_filepaths())
else:
self.length = sum(1 for _ in self.getstream())
-# endclass TextDirectoryCorpus
def walk(top, topdown=True, onerror=None, followlinks=False, depth=0):
diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py
index 0c09cc7e34..a8911ee07f 100644
--- a/gensim/corpora/ucicorpus.py
+++ b/gensim/corpora/ucicorpus.py
@@ -21,7 +21,6 @@
from gensim.corpora import IndexedCorpus
from gensim.matutils import MmReader
from gensim.matutils import MmWriter
-from six import iteritems
from six.moves import xrange
@@ -37,7 +36,7 @@ def __init__(self, input):
which is expected to be in the UCI Bag-of-Words format.
"""
- logger.info('Initializing corpus reader from %s' % input)
+ logger.info('Initializing corpus reader from %s', input)
self.input = input
@@ -50,16 +49,16 @@ def __init__(self, input):
except StopIteration:
pass
- logger.info('accepted corpus with %i documents, %i features, %i non-zero entries' %
- (self.num_docs, self.num_terms, self.num_nnz))
+ logger.info(
+ "accepted corpus with %i documents, %i features, %i non-zero entries",
+ self.num_docs, self.num_terms, self.num_nnz
+ )
def skip_headers(self, input_file):
for lineno, _ in enumerate(input_file):
if lineno == 2:
break
-# endclass UciReader
-
class UciWriter(MmWriter):
"""
@@ -110,7 +109,7 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False):
offsets = []
for docno, bow in enumerate(corpus):
if docno % progress_cnt == 0:
- logger.info("PROGRESS: saving document #%i" % docno)
+ logger.info("PROGRESS: saving document #%i", docno)
if index:
posnow = writer.fout.tell()
if posnow == poslast:
@@ -125,11 +124,11 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False):
num_docs = docno + 1
if num_docs * num_terms != 0:
- logger.info("saved %ix%i matrix, density=%.3f%% (%i/%i)" %
- (num_docs, num_terms,
- 100.0 * num_nnz / (num_docs * num_terms),
- num_nnz,
- num_docs * num_terms))
+ logger.info(
+ "saved %ix%i matrix, density=%.3f%% (%i/%i)",
+ num_docs, num_terms, 100.0 * num_nnz / (num_docs * num_terms),
+ num_nnz, num_docs * num_terms
+ )
# now write proper headers, by seeking and overwriting the spaces written earlier
writer.update_headers(num_docs, num_terms, num_nnz)
@@ -138,8 +137,6 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False):
if index:
return offsets
-# endclass UciWriter
-
class UciCorpus(UciReader, IndexedCorpus):
"""
@@ -179,14 +176,14 @@ def create_dictionary(self):
dictionary.dfs = defaultdict(int)
dictionary.id2token = self.id2word
- dictionary.token2id = dict((v, k) for k, v in iteritems(self.id2word))
+ dictionary.token2id = utils.revdict(self.id2word)
dictionary.num_docs = self.num_docs
dictionary.num_nnz = self.num_nnz
for docno, doc in enumerate(self):
if docno % 10000 == 0:
- logger.info('PROGRESS: processing document %i of %i' % (docno, self.num_docs))
+ logger.info('PROGRESS: processing document %i of %i', docno, self.num_docs)
for word, count in doc:
dictionary.dfs[word] += 1
@@ -214,13 +211,11 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False)
# write out vocabulary
fname_vocab = utils.smart_extension(fname, '.vocab')
- logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab))
+ logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
with utils.smart_open(fname_vocab, 'wb') as fout:
for featureid in xrange(num_terms):
fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))
- logger.info("storing corpus in UCI Bag-of-Words format: %s" % fname)
+ logger.info("storing corpus in UCI Bag-of-Words format: %s", fname)
return UciWriter.write_corpus(fname, corpus, index=True, progress_cnt=progress_cnt)
-
-# endclass UciCorpus
diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
index ea87cce4a2..4a70f106f3 100755
--- a/gensim/corpora/wikicorpus.py
+++ b/gensim/corpora/wikicorpus.py
@@ -57,9 +57,11 @@
# MediaWiki namespaces (https://www.mediawiki.org/wiki/Manual:Namespace) that
# ought to be ignored
-IGNORED_NAMESPACES = ['Wikipedia', 'Category', 'File', 'Portal', 'Template',
- 'MediaWiki', 'User', 'Help', 'Book', 'Draft',
- 'WikiProject', 'Special', 'Talk']
+IGNORED_NAMESPACES = [
+ 'Wikipedia', 'Category', 'File', 'Portal', 'Template',
+ 'MediaWiki', 'User', 'Help', 'Book', 'Draft', 'WikiProject',
+ 'Special', 'Talk'
+]
def filter_wiki(raw):
@@ -143,10 +145,7 @@ def remove_template(s):
prev_c = c
# Remove all the templates
- s = ''.join([s[end + 1:start] for start, end in
- zip(starts + [None], [-1] + ends)])
-
- return s
+ return ''.join([s[end + 1:start] for start, end in zip(starts + [None], [-1] + ends)])
def remove_file(s):
@@ -184,8 +183,7 @@ def get_namespace(tag):
m = re.match("^{(.*?)}", tag)
namespace = m.group(1) if m else ""
if not namespace.startswith("http://www.mediawiki.org/xml/export-"):
- raise ValueError("%s not recognized as MediaWiki dump namespace"
- % namespace)
+ raise ValueError("%s not recognized as MediaWiki dump namespace" % namespace)
return namespace
@@ -271,8 +269,7 @@ class WikiCorpus(TextCorpus):
"""
- def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
- filter_namespaces=('0',)):
+ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',)):
"""
Initialize the corpus. Unless a dictionary is provided, this scans the
corpus once, to determine its vocabulary.
@@ -311,10 +308,10 @@ def get_texts(self):
"""
articles, articles_all = 0, 0
positions, positions_all = 0, 0
- texts = \
- ((text, self.lemmatize, title, pageid)
- for title, text, pageid
- in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces))
+ texts = (
+ (text, self.lemmatize, title, pageid) for title, text, pageid
+ in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces)
+ )
pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt)
try:
@@ -335,15 +332,16 @@ def get_texts(self):
yield tokens
except KeyboardInterrupt:
logger.warn(
- "user terminated iteration over Wikipedia corpus after %i documents with %i positions"
- " (total %i articles, %i positions before pruning articles shorter than %i words)",
- articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS)
+ "user terminated iteration over Wikipedia corpus after %i documents with %i positions "
+ "(total %i articles, %i positions before pruning articles shorter than %i words)",
+ articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS
+ )
else:
logger.info(
- "finished iterating over Wikipedia corpus of %i documents with %i positions"
- " (total %i articles, %i positions before pruning articles shorter than %i words)",
- articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS)
+ "finished iterating over Wikipedia corpus of %i documents with %i positions "
+ "(total %i articles, %i positions before pruning articles shorter than %i words)",
+ articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS
+ )
self.length = articles # cache corpus length
finally:
pool.terminate()
-# endclass WikiCorpus
diff --git a/gensim/examples/dmlcz/dmlcorpus.py b/gensim/examples/dmlcz/dmlcorpus.py
index d8fb8c4cb5..07fc247f8b 100644
--- a/gensim/examples/dmlcz/dmlcorpus.py
+++ b/gensim/examples/dmlcz/dmlcorpus.py
@@ -40,9 +40,9 @@ def __init__(self, configId, resultDir, acceptLangs=None):
self.sources = {} # all article sources; see sources.DmlSource class for an example of source
if acceptLangs is None: # which languages to accept
- acceptLangs = set(['any']) # if not specified, accept all languages (including unknown/unspecified)
+ acceptLangs = {'any'} # if not specified, accept all languages (including unknown/unspecified)
self.acceptLangs = set(acceptLangs)
- logger.info('initialized %s' % self)
+ logger.info('initialized %s', self)
def resultFile(self, fname):
return os.path.join(self.resultDir, self.configId + '_' + fname)
@@ -105,13 +105,12 @@ def buildDictionary(self):
them into tokens and converting tokens to their ids (creating new ids as
necessary).
"""
- logger.info("creating dictionary from %i articles" % len(self.documents))
+ logger.info("creating dictionary from %i articles", len(self.documents))
self.dictionary = dictionary.Dictionary()
numPositions = 0
for docNo, (sourceId, docUri) in enumerate(self.documents):
if docNo % 1000 == 0:
- logger.info("PROGRESS: at document #%i/%i (%s, %s)" %
- (docNo, len(self.documents), sourceId, docUri))
+ logger.info("PROGRESS: at document #%i/%i (%s, %s)", docNo, len(self.documents), sourceId, docUri)
source = self.config.sources[sourceId]
contents = source.getContent(docUri)
words = [source.normalizeWord(word) for word in source.tokenize(contents)]
@@ -119,8 +118,7 @@ def buildDictionary(self):
# convert to bag-of-words, but ignore the result -- here we only care about updating token ids
_ = self.dictionary.doc2bow(words, allowUpdate=True) # noqa:F841
- logger.info("built %s from %i documents (total %i corpus positions)" %
- (self.dictionary, len(self.documents), numPositions))
+ logger.info("built %s from %i documents (total %i corpus positions)", self.dictionary, len(self.documents), numPositions)
def processConfig(self, config, shuffle=False):
"""
@@ -135,31 +133,29 @@ def processConfig(self, config, shuffle=False):
"""
self.config = config
self.documents = []
- logger.info("processing config %s" % config)
+ logger.info("processing config %s", config)
for sourceId, source in config.sources.iteritems():
- logger.info("processing source '%s'" % sourceId)
+ logger.info("processing source '%s'", sourceId)
accepted = []
for articleUri in source.findArticles():
meta = source.getMeta(articleUri) # retrieve metadata (= dictionary of key->value)
if config.acceptArticle(meta): # do additional filtering on articles, based on the article's metadata
accepted.append((sourceId, articleUri))
- logger.info("accepted %i articles for source '%s'" %
- (len(accepted), sourceId))
+ logger.info("accepted %i articles for source '%s'", len(accepted), sourceId)
self.documents.extend(accepted)
if not self.documents:
logger.warning('no articles at all found from the config; something went wrong!')
if shuffle:
- logger.info("shuffling %i documents for random order" % len(self.documents))
+ logger.info("shuffling %i documents for random order", len(self.documents))
import random
random.shuffle(self.documents)
- logger.info("accepted total of %i articles for %s" %
- (len(self.documents), str(config)))
+ logger.info("accepted total of %i articles for %s", len(self.documents), str(config))
def saveDictionary(self, fname):
- logger.info("saving dictionary mapping to %s" % fname)
+ logger.info("saving dictionary mapping to %s", fname)
fout = open(fname, 'w')
for tokenId, token in self.dictionary.id2token.iteritems():
fout.write("%i\t%s\n" % (tokenId, token))
@@ -177,7 +173,7 @@ def loadDictionary(fname):
return result
def saveDocuments(self, fname):
- logger.info("saving documents mapping to %s" % fname)
+ logger.info("saving documents mapping to %s", fname)
fout = open(fname, 'w')
for docNo, docId in enumerate(self.documents):
sourceId, docUri = docId
diff --git a/gensim/examples/dmlcz/gensim_build.py b/gensim/examples/dmlcz/gensim_build.py
index 9695241fb3..bb62103109 100755
--- a/gensim/examples/dmlcz/gensim_build.py
+++ b/gensim/examples/dmlcz/gensim_build.py
@@ -24,24 +24,20 @@
if AT_HOME:
SOURCE_LIST = [
- sources.DmlCzSource('dmlcz', '/Users/kofola/workspace/dml/data/dmlcz/'),
- sources.DmlSource('numdam', '/Users/kofola/workspace/dml/data/numdam/'),
- sources.ArxmlivSource('arxmliv', '/Users/kofola/workspace/dml/data/arxmliv/'),
- ]
-
-# SOURCE_LIST = [
-# sources.DmlCzSource('dmlcz', '/Users/kofola/workspace/dml/data/dmlcz/CzechMathJ'),
-# ]
+ sources.DmlCzSource('dmlcz', '/Users/kofola/workspace/dml/data/dmlcz/'),
+ sources.DmlSource('numdam', '/Users/kofola/workspace/dml/data/numdam/'),
+ sources.ArxmlivSource('arxmliv', '/Users/kofola/workspace/dml/data/arxmliv/'),
+ ]
RESULT_DIR = '/Users/kofola/workspace/dml/data/results'
else:
SOURCE_LIST = [
- sources.DmlCzSource('dmlcz', '/data/dmlcz/data/share'),
- sources.DmlSource('numdam', '/data/dmlcz/data/numdam'),
- sources.ArxmlivSource('arxmliv', '/data/dmlcz/data/arxmliv'),
- ]
+ sources.DmlCzSource('dmlcz', '/data/dmlcz/data/share'),
+ sources.DmlSource('numdam', '/data/dmlcz/data/numdam'),
+ sources.ArxmlivSource('arxmliv', '/data/dmlcz/data/arxmliv'),
+ ]
RESULT_DIR = '/data/dmlcz/xrehurek/results'
@@ -60,7 +56,7 @@ def buildDmlCorpus(config):
if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)
- logging.info("running %s" % ' '.join(sys.argv))
+ logging.info("running %s", ' '.join(sys.argv))
program = os.path.basename(sys.argv[0])
@@ -76,4 +72,4 @@ def buildDmlCorpus(config):
config.addSource(source)
buildDmlCorpus(config)
- logging.info("finished running %s" % program)
+ logging.info("finished running %s", program)
diff --git a/gensim/examples/dmlcz/gensim_genmodel.py b/gensim/examples/dmlcz/gensim_genmodel.py
index df11f9696c..a2f2b792e7 100755
--- a/gensim/examples/dmlcz/gensim_genmodel.py
+++ b/gensim/examples/dmlcz/gensim_genmodel.py
@@ -31,7 +31,7 @@
if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)
- logging.info("running %s" % ' '.join(sys.argv))
+ logging.info("running %s", ' '.join(sys.argv))
program = os.path.basename(sys.argv[0])
@@ -46,9 +46,9 @@
config = dmlcorpus.DmlConfig('%s_%s' % (gensim_build.PREFIX, language),
resultDir=gensim_build.RESULT_DIR, acceptLangs=[language])
- logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt'))
+ logging.info("loading word id mapping from %s", config.resultFile('wordids.txt'))
id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
- logging.info("loaded %i word ids" % len(id2word))
+ logging.info("loaded %i word ids", len(id2word))
corpus = MmCorpus(config.resultFile('bow.mm'))
@@ -56,23 +56,23 @@
model = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
model.save(config.resultFile('model_tfidf.pkl'))
elif method == 'lda':
- model = ldamodel.LdaModel(corpus, id2word=id2word, numTopics=DIM_LDA)
+ model = ldamodel.LdaModel(corpus, id2word=id2word, num_topics=DIM_LDA)
model.save(config.resultFile('model_lda.pkl'))
elif method == 'lsi':
# first, transform word counts to tf-idf weights
tfidf = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
# then find the transformation from tf-idf to latent space
- model = lsimodel.LsiModel(tfidf[corpus], id2word=id2word, numTopics=DIM_LSI)
+ model = lsimodel.LsiModel(tfidf[corpus], id2word=id2word, num_topics=DIM_LSI)
model.save(config.resultFile('model_lsi.pkl'))
elif method == 'rp':
# first, transform word counts to tf-idf weights
tfidf = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
# then find the transformation from tf-idf to latent space
- model = rpmodel.RpModel(tfidf[corpus], id2word=id2word, numTopics=DIM_RP)
+ model = rpmodel.RpModel(tfidf[corpus], id2word=id2word, num_topics=DIM_RP)
model.save(config.resultFile('model_rp.pkl'))
else:
raise ValueError('unknown topic extraction method: %s' % repr(method))
MmCorpus.saveCorpus(config.resultFile('%s.mm' % method), model[corpus])
- logging.info("finished running %s" % program)
+ logging.info("finished running %s", program)
diff --git a/gensim/examples/dmlcz/gensim_xml.py b/gensim/examples/dmlcz/gensim_xml.py
index f810d045d4..0b8661ac77 100755
--- a/gensim/examples/dmlcz/gensim_xml.py
+++ b/gensim/examples/dmlcz/gensim_xml.py
@@ -72,20 +72,20 @@ def generateSimilar(corpus, index, method):
if SAVE_EMPTY or articles:
output = ''.join(articles) # concat all similars to one string
if not DRY_RUN: # only open output files for writing if DRY_RUN is false
- logging.info("generating %s (%i similars)" % (outfile, len(articles)))
+ logging.info("generating %s (%i similars)", outfile, len(articles))
outfile = open(outfile, 'w')
outfile.write(SIMILAR % output) # add xml headers and print to file
outfile.close()
else:
- logging.info("would be generating %s (%i similars):%s\n" % (outfile, len(articles), output))
+ logging.info("would be generating %s (%i similars):%s\n", outfile, len(articles), output)
else:
- logging.debug("skipping %s (no similar found)" % outfile)
+ logging.debug("skipping %s (no similar found)", outfile)
if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)
- logging.info("running %s" % ' '.join(sys.argv))
+ logging.info("running %s", ' '.join(sys.argv))
program = os.path.basename(sys.argv[0])
@@ -100,9 +100,9 @@ def generateSimilar(corpus, index, method):
config = dmlcorpus.DmlConfig('%s_%s' % (gensim_build.PREFIX, language),
resultDir=gensim_build.RESULT_DIR, acceptLangs=[language])
- logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt'))
+ logging.info("loading word id mapping from %s", config.resultFile('wordids.txt'))
id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
- logging.info("loaded %i word ids" % len(id2word))
+ logging.info("loaded %i word ids", len(id2word))
corpus = dmlcorpus.DmlCorpus.load(config.resultFile('.pkl'))
input = MmCorpus(config.resultFile('_%s.mm' % method))
@@ -110,11 +110,11 @@ def generateSimilar(corpus, index, method):
# initialize structure for similarity queries
if method == 'lsi' or method == 'rp': # for these methods, use dense vectors
- index = MatrixSimilarity(input, numBest=MAX_SIMILAR + 1, numFeatures=input.numTerms)
+ index = MatrixSimilarity(input, num_best=MAX_SIMILAR + 1, num_features=input.numTerms)
else:
- index = SparseMatrixSimilarity(input, numBest=MAX_SIMILAR + 1)
+ index = SparseMatrixSimilarity(input, num_best=MAX_SIMILAR + 1)
index.normalize = False # do not normalize query vectors during similarity queries (the index is already built normalized, so it would be a no-op)
generateSimilar(corpus, index, method) # for each document, print MAX_SIMILAR nearest documents to a xml file, in dml-cz specific format
- logging.info("finished running %s" % program)
+ logging.info("finished running %s", program)
diff --git a/gensim/examples/dmlcz/runall.sh b/gensim/examples/dmlcz/runall.sh
index 5d3f5819f2..236c1dce80 100644
--- a/gensim/examples/dmlcz/runall.sh
+++ b/gensim/examples/dmlcz/runall.sh
@@ -7,7 +7,7 @@ BIN_PATH=~/xrehurek/gensim/dmlcz
RESULT_PATH=~/xrehurek/results
# set python path, so that python can find and import gensim modules
-export PYTHONPATH=~/xrehurek:$PYTHONPATH
+export PYTHONPATH=~/xrehurek:${PYTHONPATH}
# Language is set to 'any', meaning all articles are processed for similarity in
# one go, regardless of their language.
@@ -17,18 +17,18 @@ language=any
# ========== parse all article sources, build article co-occurence matrix ======
-${BIN_PATH}/gensim_build.py $language 2>&1 | tee ${RESULT_PATH}/gensim_build.log
+${BIN_PATH}/gensim_build.py ${language} 2>&1 | tee ${RESULT_PATH}/gensim_build.log
# ========== build transformation models =======================================
for method in tfidf rp;
do
- ( ${BIN_PATH}/gensim_genmodel.py $language $method 2>&1 | tee ${RESULT_PATH}/gensim_genmodel_${method}.log ) &
+ ( ${BIN_PATH}/gensim_genmodel.py ${language} ${method} 2>&1 | tee ${RESULT_PATH}/gensim_genmodel_${method}.log ) &
done
wait
method=lsi
-${BIN_PATH}/gensim_genmodel.py $language $method 2>&1 | tee ${RESULT_PATH}/gensim_genmodel_${method}.log
+${BIN_PATH}/gensim_genmodel.py ${language} ${method} 2>&1 | tee ${RESULT_PATH}/gensim_genmodel_${method}.log
# =========== generate output xml files ========================================
@@ -36,6 +36,6 @@ ${BIN_PATH}/gensim_genmodel.py $language $method 2>&1 | tee ${RESULT_PATH}/gensi
# NOTE if out of memory, move tfidf out of the loop (tfidf uses a lot of memory here)
for method in tfidf lsi rp;
do
- ( ${BIN_PATH}/gensim_xml.py $language $method 2>&1 | tee ${RESULT_PATH}/gensim_xml_${method}.log ) &
+ ( ${BIN_PATH}/gensim_xml.py ${language} ${method} 2>&1 | tee ${RESULT_PATH}/gensim_xml_${method}.log ) &
done
wait
diff --git a/gensim/examples/dmlcz/sources.py b/gensim/examples/dmlcz/sources.py
index da4e0ac0b0..4193da0820 100644
--- a/gensim/examples/dmlcz/sources.py
+++ b/gensim/examples/dmlcz/sources.py
@@ -110,7 +110,7 @@ def parseDmlMeta(cls, xmlfile):
name, cont = name.strip(), cont.strip()
if name == 'msc':
if len(cont) != 5:
- logger.warning('invalid MSC=%s in %s' % (cont, xmlfile))
+ logger.warning('invalid MSC=%s in %s', cont, xmlfile)
result.setdefault('msc', []).append(cont)
continue
if name == 'idMR':
@@ -132,25 +132,24 @@ def isArticle(self, path):
return False
# and contain the fulltext.txt file
if not os.path.exists(os.path.join(path, 'fulltext.txt')):
- logger.info('missing fulltext in %s' % path)
+ logger.info('missing fulltext in %s', path)
return False
# and also the meta.xml file
if not os.path.exists(os.path.join(path, 'meta.xml')):
- logger.info('missing meta.xml in %s' % path)
+ logger.info('missing meta.xml in %s', path)
return False
return True
def findArticles(self):
dirTotal = artAccepted = 0
- logger.info("looking for '%s' articles inside %s" % (self.sourceId, self.baseDir))
+ logger.info("looking for '%s' articles inside %s", self.sourceId, self.baseDir)
for root, dirs, files in os.walk(self.baseDir):
dirTotal += 1
root = os.path.normpath(root)
if self.isArticle(root):
artAccepted += 1
yield self.idFromDir(root)
- logger.info('%i directories processed, found %i articles' %
- (dirTotal, artAccepted))
+ logger.info('%i directories processed, found %i articles', dirTotal, artAccepted)
def getContent(self, uri):
"""
@@ -200,15 +199,15 @@ def isArticle(self, path):
return False
# and contain a dspace_id file
if not (os.path.exists(os.path.join(path, 'dspace_id'))):
- logger.info('missing dspace_id in %s' % path)
+ logger.info('missing dspace_id in %s', path)
return False
# and contain either fulltext.txt or fulltext_dspace.txt file
if not (os.path.exists(os.path.join(path, 'fulltext.txt')) or os.path.exists(os.path.join(path, 'fulltext-dspace.txt'))):
- logger.info('missing fulltext in %s' % path)
+ logger.info('missing fulltext in %s', path)
return False
# and contain the meta.xml file
if not os.path.exists(os.path.join(path, 'meta.xml')):
- logger.info('missing meta.xml in %s' % path)
+ logger.info('missing meta.xml in %s', path)
return False
return True
@@ -278,7 +277,6 @@ class ArxmlivErrorHandler(xml.sax.handler.ErrorHandler):
# these errors silently.
def error(self, exception):
pass
-# logger.debug("SAX error parsing xml: %s" % exception)
warning = fatalError = error
# endclass ArxmlivErrorHandler
@@ -302,21 +300,20 @@ def isArticle(self, path):
return False
# and contain the tex.xml file
if not os.path.exists(os.path.join(path, 'tex.xml')):
- logger.warning('missing tex.xml in %s' % path)
+ logger.warning('missing tex.xml in %s', path)
return False
return True
def findArticles(self):
dirTotal = artAccepted = 0
- logger.info("looking for '%s' articles inside %s" % (self.sourceId, self.baseDir))
+ logger.info("looking for '%s' articles inside %s", self.sourceId, self.baseDir)
for root, dirs, files in os.walk(self.baseDir):
dirTotal += 1
root = os.path.normpath(root)
if self.isArticle(root):
artAccepted += 1
yield self.idFromDir(root)
- logger.info('%i directories processed, found %i articles' %
- (dirTotal, artAccepted))
+ logger.info('%i directories processed, found %i articles', dirTotal, artAccepted)
def getContent(self, uri):
"""
diff --git a/gensim/interfaces.py b/gensim/interfaces.py
index 4087fd8893..81f85a8527 100644
--- a/gensim/interfaces.py
+++ b/gensim/interfaces.py
@@ -56,8 +56,7 @@ def __iter__(self):
def save(self, *args, **kwargs):
import warnings
- warnings.warn("corpus.save() stores only the (tiny) iteration object; "
- "to serialize the actual corpus content, use e.g. MmCorpus.serialize(corpus)")
+ warnings.warn("corpus.save() stores only the (tiny) iteration object; to serialize the actual corpus content, use e.g. MmCorpus.serialize(corpus)")
super(CorpusABC, self).save(*args, **kwargs)
def __len__(self):
@@ -95,12 +94,11 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
raise NotImplementedError('cannot instantiate abstract base class')
# example code:
- logger.info("converting corpus to ??? format: %s" % fname)
+ logger.info("converting corpus to ??? format: %s", fname)
with utils.smart_open(fname, 'wb') as fout:
for doc in corpus: # iterate over the document stream
fmt = str(doc) # format the document appropriately...
fout.write(utils.to_utf8("%s\n" % fmt)) # serialize the formatted document to disk
-# endclass CorpusABC
class TransformedCorpus(CorpusABC):
@@ -127,7 +125,6 @@ def __getitem__(self, docno):
return self.obj[self.corpus[docno]]
else:
raise RuntimeError('Type {} does not support slicing.'.format(type(self.corpus)))
-# endclass TransformedCorpus
class TransformationABC(utils.SaveLoad):
@@ -162,7 +159,6 @@ def _apply(self, corpus, chunksize=None, **kwargs):
and return the result as another corpus.
"""
return TransformedCorpus(self, corpus, chunksize, **kwargs)
-# endclass TransformationABC
class SimilarityABC(utils.SaveLoad):
@@ -263,7 +259,7 @@ def __iter__(self):
# (unlike numpy). so, clip the end of the chunk explicitly to make
# scipy.sparse happy
chunk_end = min(self.index.shape[0], chunk_start + self.chunksize)
- chunk = self.index[chunk_start : chunk_end]
+ chunk = self.index[chunk_start: chunk_end]
for sim in self[chunk]:
yield sim
else:
@@ -272,4 +268,3 @@ def __iter__(self):
# restore old normalization value
self.normalize = norm
-# endclass SimilarityABC
diff --git a/gensim/matutils.py b/gensim/matutils.py
index 58904e1657..4b072bfd4a 100644
--- a/gensim/matutils.py
+++ b/gensim/matutils.py
@@ -21,26 +21,12 @@
from scipy.stats import entropy
import scipy.linalg
from scipy.linalg.lapack import get_lapack_funcs
+from scipy.linalg.special_matrices import triu
from scipy.special import psi # gamma function utils
from six import iteritems, itervalues, string_types
from six.moves import xrange, zip as izip
-# scipy is not a stable package yet, locations change, so try to work
-# around differences (currently only concerns location of 'triu' in scipy 0.7 vs. 0.8)
-try:
- from scipy.linalg.basic import triu
-except ImportError:
- from scipy.linalg.special_matrices import triu
-
-try:
- from np import triu_indices
-except ImportError:
- # np < 1.4
- def triu_indices(n, k=0):
- m = np.ones((n, n), int)
- a = triu(m, k)
- return np.where(a != 0)
blas = lambda name, ndarray: scipy.linalg.get_blas_funcs((name,), (ndarray,))[0]
@@ -101,7 +87,7 @@ def corpus2csc(corpus, num_terms=None, dtype=np.float64, num_docs=None, num_nnz=
data = np.empty((num_nnz,), dtype=dtype)
for docno, doc in enumerate(corpus):
if printprogress and docno % printprogress == 0:
- logger.info("PROGRESS: at document #%i/%i" % (docno, num_docs))
+ logger.info("PROGRESS: at document #%i/%i", docno, num_docs)
posnext = posnow + len(doc)
indices[posnow: posnext] = [feature_id for feature_id, _ in doc]
data[posnow: posnext] = [feature_weight for _, feature_weight in doc]
@@ -114,7 +100,7 @@ def corpus2csc(corpus, num_terms=None, dtype=np.float64, num_docs=None, num_nnz=
num_nnz, data, indices, indptr = 0, [], [], [0]
for docno, doc in enumerate(corpus):
if printprogress and docno % printprogress == 0:
- logger.info("PROGRESS: at document #%i" % (docno))
+ logger.info("PROGRESS: at document #%i", docno)
indices.extend([feature_id for feature_id, _ in doc])
data.extend([feature_weight for _, feature_weight in doc])
num_nnz += len(doc)
@@ -150,7 +136,7 @@ def zeros_aligned(shape, dtype, order='C', align=128):
nbytes = np.prod(shape, dtype=np.int64) * np.dtype(dtype).itemsize
buffer = np.zeros(nbytes + align, dtype=np.uint8) # problematic on win64 ("maximum allowed dimension exceeded")
start_index = -buffer.ctypes.data % align
- return buffer[start_index : start_index + nbytes].view(dtype).reshape(shape, order=order)
+ return buffer[start_index: start_index + nbytes].view(dtype).reshape(shape, order=order)
def ismatrix(m):
@@ -333,7 +319,6 @@ def __iter__(self):
def __len__(self):
return len(self.dense)
-# endclass DenseCorpus
class Sparse2Corpus(object):
@@ -356,7 +341,6 @@ def __iter__(self):
def __len__(self):
return self.sparse.shape[1]
-# endclass Sparse2Corpus
def veclen(vec):
@@ -381,7 +365,7 @@ def ret_log_normalize_vec(vec, axis=1):
log_shift = log_max - np.log(len(vec) + 1.0) - max_val
tot = np.sum(np.exp(vec + log_shift))
log_norm = np.log(tot) - log_shift
        vec = vec - log_norm
else:
if axis == 1: # independently normalize each sample
max_val = np.max(vec, 1)
@@ -391,10 +375,10 @@ def ret_log_normalize_vec(vec, axis=1):
vec = vec - log_norm[:, np.newaxis]
elif axis == 0: # normalize each feature
k = ret_log_normalize_vec(vec.T)
- return (k[0].T, k[1])
+ return k[0].T, k[1]
else:
raise ValueError("'%s' is not a supported axis" % axis)
- return (vec, log_norm)
+ return vec, log_norm
blas_nrm2 = blas('nrm2', np.array([], dtype=float))
@@ -435,7 +419,7 @@ def unitvec(vec, norm='l2'):
try:
first = next(iter(vec)) # is there at least one element?
- except Exception:
+ except StopIteration:
return vec
if isinstance(first, (tuple, list)) and len(first) == 2: # gensim sparse format
@@ -476,10 +460,10 @@ def isbow(vec):
vec = vec.todense().tolist()
try:
id_, val_ = vec[0] # checking first value to see if it is in bag of words format by unpacking
- id_, val_ = int(id_), float(val_)
+ int(id_), float(val_)
except IndexError:
return True # this is to handle the empty input case
- except Exception:
+ except (ValueError, TypeError):
return False
return True
@@ -610,7 +594,7 @@ def dirichlet_expectation(alpha):
For a vector `theta~Dir(alpha)`, compute `E[log(theta)]`.
"""
- if (len(alpha.shape) == 1):
+ if len(alpha.shape) == 1:
result = psi(alpha) - psi(np.sum(alpha))
else:
result = psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis]
@@ -628,7 +612,7 @@ def qr_destroy(la):
del la[0], la # now `a` is the only reference to the input matrix
m, n = a.shape
# perform q, r = QR(a); code hacked out of scipy.linalg.qr
- logger.debug("computing QR of %s dense matrix" % str(a.shape))
+ logger.debug("computing QR of %s dense matrix", str(a.shape))
geqrf, = get_lapack_funcs(('geqrf',), (a,))
qr, tau, work, info = geqrf(a, lwork=-1, overwrite_a=True)
qr, tau, work, info = geqrf(a, lwork=work[0], overwrite_a=True)
@@ -675,12 +659,10 @@ def write_headers(self, num_docs, num_terms, num_nnz):
if num_nnz < 0:
# we don't know the matrix shape/density yet, so only log a general line
- logger.info("saving sparse matrix to %s" % self.fname)
+ logger.info("saving sparse matrix to %s", self.fname)
self.fout.write(utils.to_utf8(' ' * 50 + '\n')) # 48 digits must be enough for everybody
else:
- logger.info(
- "saving sparse %sx%s matrix with %i non-zero entries to %s",
- num_docs, num_terms, num_nnz, self.fname)
+ logger.info("saving sparse %sx%s matrix with %i non-zero entries to %s", num_docs, num_terms, num_nnz, self.fname)
self.fout.write(utils.to_utf8('%s %s %s\n' % (num_docs, num_terms, num_nnz)))
self.last_docno = -1
self.headers_written = True
@@ -737,7 +719,7 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None,
else:
bow = doc
if docno % progress_cnt == 0:
- logger.info("PROGRESS: saving document #%i" % docno)
+ logger.info("PROGRESS: saving document #%i", docno)
if index:
posnow = mw.fout.tell()
if posnow == poslast:
@@ -755,11 +737,7 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None,
num_terms = num_terms or _num_terms
if num_docs * num_terms != 0:
- logger.info("saved %ix%i matrix, density=%.3f%% (%i/%i)" % (
- num_docs, num_terms,
- 100.0 * num_nnz / (num_docs * num_terms),
- num_nnz,
- num_docs * num_terms))
+ logger.info("saved %ix%i matrix, density=%.3f%% (%i/%i)", num_docs, num_terms, 100.0 * num_nnz / (num_docs * num_terms), num_nnz, num_docs * num_terms)
# now write proper headers, by seeking and overwriting the spaces written earlier
mw.fake_headers(num_docs, num_terms, num_nnz)
@@ -779,10 +757,9 @@ def __del__(self):
self.close() # does nothing if called twice (on an already closed file), so no worries
def close(self):
- logger.debug("closing %s" % self.fname)
+ logger.debug("closing %s", self.fname)
if hasattr(self, 'fout'):
self.fout.close()
-# endclass MmWriter
class MmReader(object):
@@ -806,7 +783,7 @@ def __init__(self, input, transposed=True):
`input` is either a string (file path) or a file-like object that supports
`seek()` (e.g. gzip.GzipFile, bz2.BZ2File).
"""
- logger.info("initializing corpus reader from %s" % input)
+ logger.info("initializing corpus reader from %s", input)
self.input, self.transposed = input, transposed
with utils.file_or_filename(self.input) as lines:
try:
@@ -821,14 +798,12 @@ def __init__(self, input, transposed=True):
for lineno, line in enumerate(lines):
line = utils.to_unicode(line)
if not line.startswith('%'):
- self.num_docs, self.num_terms, self.num_nnz = map(int, line.split())
+ self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split())
if not self.transposed:
self.num_docs, self.num_terms = self.num_terms, self.num_docs
break
- logger.info(
- "accepted corpus with %i documents, %i features, %i non-zero entries",
- self.num_docs, self.num_terms, self.num_nnz)
+ logger.info("accepted corpus with %i documents, %i features, %i non-zero entries", self.num_docs, self.num_terms, self.num_nnz)
def __len__(self):
return self.num_docs
@@ -917,4 +892,3 @@ def docbyoffset(self, offset):
document.append((termid, val,)) # add another field to the current document
return document
-# endclass MmReader
diff --git a/gensim/models/__init__.py b/gensim/models/__init__.py
index 5c25a86fd5..530f7c4980 100644
--- a/gensim/models/__init__.py
+++ b/gensim/models/__init__.py
@@ -36,7 +36,7 @@ class VocabTransform(interfaces.TransformationABC):
Old features that have no counterpart in the new ids are discarded. This
can be used to filter vocabulary of a corpus "online"::
- >>> old2new = dict((oldid, newid) for newid, oldid in enumerate(ids_you_want_to_keep))
+ >>> old2new = {oldid: newid for newid, oldid in enumerate(ids_you_want_to_keep)}
>>> vt = VocabTransform(old2new)
>>> for vec_with_new_ids in vt[corpus_with_old_ids]:
>>> ...
@@ -44,7 +44,6 @@ class VocabTransform(interfaces.TransformationABC):
"""
def __init__(self, old2new, id2token=None):
- # id2word = dict((newid, oldid2word[oldid]) for oldid, newid in old2new.iteritems())
self.old2new = old2new
self.id2token = id2token
@@ -58,4 +57,3 @@ def __getitem__(self, bow):
return self._apply(bow)
return sorted((self.old2new[oldid], weight) for oldid, weight in bow if oldid in self.old2new)
-# endclass VocabTransform
diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py
index 7e1ee02d62..bcc63e8f8c 100755
--- a/gensim/models/atmodel.py
+++ b/gensim/models/atmodel.py
@@ -84,7 +84,7 @@ def construct_doc2author(corpus, author2doc):
return doc2author
-def construct_author2doc(corpus, doc2author):
+def construct_author2doc(doc2author):
"""Make a mapping from author IDs to document IDs."""
# First get a set of all authors.
@@ -118,10 +118,10 @@ class AuthorTopicModel(LdaModel):
"""
def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, doc2author=None,
- chunksize=2000, passes=1, iterations=50, decay=0.5, offset=1.0,
- alpha='symmetric', eta='symmetric', update_every=1, eval_every=10,
- gamma_threshold=0.001, serialized=False, serialization_path=None,
- minimum_probability=0.01, random_state=None):
+ chunksize=2000, passes=1, iterations=50, decay=0.5, offset=1.0,
+ alpha='symmetric', eta='symmetric', update_every=1, eval_every=10,
+ gamma_threshold=0.001, serialized=False, serialization_path=None,
+ minimum_probability=0.01, random_state=None):
"""
If the iterable corpus and one of author2doc/doc2author dictionaries are given,
start training straight away. If not given, the model is left untrained
@@ -212,7 +212,9 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d
self.id2word = id2word
if corpus is None and self.id2word is None:
- raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')
+ raise ValueError(
+ "at least one of corpus/id2word must be specified, to establish input space dimensionality"
+ )
if self.id2word is None:
logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
@@ -252,7 +254,9 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d
if serialized and not serialization_path:
raise ValueError("If serialized corpora are used, a the path to a folder where the corpus should be saved must be provided (serialized_path).")
if serialized and serialization_path:
- assert not isfile(serialization_path), "A file already exists at the serialization_path path; choose a different serialization_path, or delete the file."
+ assert not isfile(serialization_path), \
+ "A file already exists at the serialization_path path; " \
+ "choose a different serialization_path, or delete the file."
self.serialization_path = serialization_path
# Initialize an empty self.corpus.
@@ -260,7 +264,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d
self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')
- assert self.alpha.shape == (self.num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)
+ assert self.alpha.shape == (self.num_topics,), \
+ "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)
if isinstance(eta, six.string_types):
if eta == 'asymmetric':
@@ -272,7 +277,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d
assert (self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms)), (
"Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
- (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms))
+ (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms)
+ )
# VB constants
self.iterations = iterations
@@ -300,7 +306,7 @@ def init_empty_corpus(self):
"""
if self.serialized:
- # Tnitialize the corpus as a serialized empty list.
+ # Initialize the corpus as a serialized empty list.
# This corpus will be extended in self.update.
MmCorpus.serialize(self.serialization_path, []) # Serialize empty corpus.
self.corpus = MmCorpus(self.serialization_path) # Store serialized corpus object in self.corpus.
@@ -333,9 +339,8 @@ def extend_corpus(self, corpus):
assert isinstance(corpus, list), "If serialized == False, all input corpora must be lists."
self.corpus.extend(corpus)
- def compute_phinorm(self, ids, authors_d, expElogthetad, expElogbetad):
+ def compute_phinorm(self, expElogthetad, expElogbetad):
"""Efficiently computes the normalizing factor in phi."""
- phinorm = np.zeros(len(ids))
expElogtheta_sum = expElogthetad.sum(axis=0)
phinorm = expElogtheta_sum.dot(expElogbetad) + 1e-100
@@ -362,8 +367,8 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c
"""
try:
- _ = len(chunk) # noqa:F841
- except Exception:
+ len(chunk)
+ except TypeError:
# convert iterators/generators to plain list, so we have len() etc.
chunk = list(chunk)
if len(chunk) > 1:
@@ -389,9 +394,9 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c
# TODO: this is duplication of code in LdaModel. Refactor.
if doc and not isinstance(doc[0][0], six.integer_types + (np.integer,)):
# make sure the term IDs are ints, otherwise np will get upset
- ids = [int(id) for id, _ in doc]
+ ids = [int(idx) for idx, _ in doc]
else:
- ids = [id for id, _ in doc]
+ ids = [idx for idx, _ in doc]
cts = np.array([cnt for _, cnt in doc])
# Get all authors in current document, and convert the author names to integer IDs.
@@ -406,11 +411,10 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c
expElogbetad = self.expElogbeta[:, ids]
# Compute the normalizing constant of phi for the current document.
- phinorm = self.compute_phinorm(ids, authors_d, expElogthetad, expElogbetad)
+ phinorm = self.compute_phinorm(expElogthetad, expElogbetad)
# Iterate between gamma and phi until convergence
- for iteration in xrange(self.iterations):
-
+ for _ in xrange(self.iterations):
lastgamma = tilde_gamma.copy()
# Update gamma.
@@ -428,7 +432,7 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c
expElogthetad = np.exp(Elogthetad)
# Update the normalizing constant in phi.
- phinorm = self.compute_phinorm(ids, authors_d, expElogthetad, expElogbetad)
+ phinorm = self.compute_phinorm(expElogthetad, expElogbetad)
# Check for convergence.
# Criterion is mean change in "local" gamma.
@@ -452,8 +456,10 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c
sstats[:, ids] += np.outer(expElogtheta_sum_a.T, cts / phinorm)
if len(chunk) > 1:
- logger.debug("%i/%i documents converged within %i iterations",
- converged, len(chunk), self.iterations)
+ logger.debug(
+ "%i/%i documents converged within %i iterations",
+ converged, len(chunk), self.iterations
+ )
if collect_sstats:
# This step finishes computing the sufficient statistics for the
@@ -473,7 +479,10 @@ def do_estep(self, chunk, author2doc, doc2author, rhot, state=None, chunk_doc_id
# TODO: this method is somewhat similar to the one in LdaModel. Refactor if possible.
if state is None:
state = self.state
- gamma, sstats = self.inference(chunk, author2doc, doc2author, rhot, collect_sstats=True, chunk_doc_idx=chunk_doc_idx)
+ gamma, sstats = self.inference(
+ chunk, author2doc, doc2author, rhot,
+ collect_sstats=True, chunk_doc_idx=chunk_doc_idx
+ )
state.sstats += sstats
state.numdocs += len(chunk)
return gamma
@@ -492,13 +501,14 @@ def log_perplexity(self, chunk, chunk_doc_idx=None, total_docs=None):
corpus_words = sum(cnt for document in chunk for _, cnt in document)
subsample_ratio = 1.0 * total_docs / len(chunk)
perwordbound = self.bound(chunk, chunk_doc_idx, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)
- logger.info("%.3f per-word bound, %.1f perplexity estimate based on a corpus of %i documents with %i words" %
- (perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words))
+ logger.info(
+ "%.3f per-word bound, %.1f perplexity estimate based on a corpus of %i documents with %i words",
+ perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words
+ )
return perwordbound
- def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, decay=None, offset=None,
- passes=None, update_every=None, eval_every=None, iterations=None,
- gamma_threshold=None, chunks_as_numpy=False):
+ def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, decay=None, offset=None, passes=None,
+ update_every=None, eval_every=None, iterations=None, gamma_threshold=None, chunks_as_numpy=False):
"""
Train the model with new documents, by EM-iterating over `corpus` until
the topics converge (or until the maximum number of allowed iterations
@@ -590,14 +600,14 @@ def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None,
if doc2author is None:
doc2author = construct_doc2author(corpus, author2doc)
elif author2doc is None:
- author2doc = construct_author2doc(corpus, doc2author)
+ author2doc = construct_author2doc(doc2author)
# Number of authors that need to be updated.
num_input_authors = len(author2doc)
try:
len_input_corpus = len(corpus)
- except Exception:
+ except TypeError:
logger.warning("input corpus stream has no len(); counting documents")
len_input_corpus = sum(1 for _ in corpus)
if len_input_corpus == 0:
@@ -650,7 +660,7 @@ def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None,
# Train on all documents of authors in input_corpus.
train_corpus_idx = []
- for a in author2doc.keys(): # For all authors in input corpus.
+ for _ in author2doc.keys(): # For all authors in input corpus.
for doc_ids in self.author2doc.values(): # For all documents in total corpus.
train_corpus_idx.extend(doc_ids)
@@ -674,17 +684,15 @@ def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None,
evalafter = min(lencorpus, (eval_every or 0) * self.numworkers * chunksize)
updates_per_pass = max(1, lencorpus / updateafter)
- logger.info("running %s author-topic training, %s topics, %s authors, %i passes over "
- "the supplied corpus of %i documents, updating model once "
- "every %i documents, evaluating perplexity every %i documents, "
- "iterating %ix with a convergence threshold of %f",
- updatetype, self.num_topics, num_input_authors, passes, lencorpus,
- updateafter, evalafter, iterations,
- gamma_threshold)
+ logger.info(
+ "running %s author-topic training, %s topics, %s authors, %i passes over the supplied corpus of %i documents, updating model once "
+ "every %i documents, evaluating perplexity every %i documents, iterating %ix with a convergence threshold of %f",
+ updatetype, self.num_topics, num_input_authors, passes, lencorpus, updateafter,
+ evalafter, iterations, gamma_threshold
+ )
if updates_per_pass * passes < 10:
- logger.warning("too few updates, training might not converge; consider "
- "increasing the number of passes or iterations to improve accuracy")
+ logger.warning("too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy")
# rho is the "speed" of updating; TODO try other fncs
# pass_ + num_updates handles increasing the starting t for each pass,
@@ -694,7 +702,7 @@ def rho():
for pass_ in xrange(passes):
if self.dispatcher:
- logger.info('initializing %s workers' % self.numworkers)
+ logger.info('initializing %s workers', self.numworkers)
self.dispatcher.reset(self.state)
else:
# gamma is not needed in "other", thus its shape is (0, 0).
@@ -713,13 +721,17 @@ def rho():
if self.dispatcher:
# add the chunk to dispatcher's job queue, so workers can munch on it
- logger.info('PROGRESS: pass %i, dispatching documents up to #%i/%i',
- pass_, chunk_no * chunksize + len(chunk), lencorpus)
+ logger.info(
+ "PROGRESS: pass %i, dispatching documents up to #%i/%i",
+ pass_, chunk_no * chunksize + len(chunk), lencorpus
+ )
# this will eventually block until some jobs finish, because the queue has a small finite length
self.dispatcher.putjob(chunk)
else:
- logger.info('PROGRESS: pass %i, at document #%i/%i',
- pass_, chunk_no * chunksize + len(chunk), lencorpus)
+ logger.info(
+ "PROGRESS: pass %i, at document #%i/%i",
+ pass_, chunk_no * chunksize + len(chunk), lencorpus
+ )
# do_estep requires the indexes of the documents being trained on, to know what authors
# correspond to the documents.
gammat = self.do_estep(chunk, self.author2doc, self.doc2author, rho(), other, chunk_doc_idx)
@@ -757,8 +769,6 @@ def rho():
other = self.dispatcher.getstate()
self.do_mstep(rho(), other, pass_ > 0)
del other
- dirty = False
- # endfor entire corpus update
def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, doc2author=None):
"""
@@ -833,11 +843,11 @@ def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None,
# Computing the bound requires summing over expElogtheta[a, k] * expElogbeta[k, v], which
# is the same computation as in normalizing phi.
- phinorm = self.compute_phinorm(ids, authors_d, expElogtheta[authors_d, :], expElogbeta[:, ids])
+ phinorm = self.compute_phinorm(expElogtheta[authors_d, :], expElogbeta[:, ids])
word_score += np.log(1.0 / len(authors_d)) * sum(cts) + cts.dot(np.log(phinorm))
# Compensate likelihood for when `chunk` above is only a sample of the whole corpus. This ensures
- # that the likelihood is always rougly on the same scale.
+ # that the likelihood is always roughly on the same scale.
word_score *= subsample_ratio
# E[log p(theta | alpha) - log q(theta | gamma)]
@@ -863,12 +873,12 @@ def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None,
return total_score
def get_document_topics(self, word_id, minimum_probability=None):
- '''
+ """
This method overwrites `LdaModel.get_document_topics` and simply raises an
exception. `get_document_topics` is not valid for the author-topic model,
use `get_author_topics` instead.
- '''
+ """
raise NotImplementedError('Method "get_document_topics" is not valid for the author-topic model. Use the "get_author_topics" method.')
@@ -891,13 +901,12 @@ def get_author_topics(self, author_name, minimum_probability=None):
topic_dist = self.state.gamma[author_id, :] / sum(self.state.gamma[author_id, :])
- author_topics = [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist)
- if topicvalue >= minimum_probability]
+        author_topics = [
+            (topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist)
+            if topicvalue >= minimum_probability
+        ]
return author_topics
def __getitem__(self, author_names, eps=None):
- '''
+ """
Return topic distribution for input author as a list of
(topic_id, topic_probabiity) 2-tuples.
@@ -905,7 +914,7 @@ def __getitem__(self, author_names, eps=None):
Do not call this method directly, instead use `model[author_names]`.
- '''
+ """
if isinstance(author_names, list):
items = []
for a in author_names:
@@ -914,4 +923,3 @@ def __getitem__(self, author_names, eps=None):
items = self.get_author_topics(author_names, minimum_probability=eps)
return items
-# endclass AuthorTopicModel
diff --git a/gensim/models/callbacks.py b/gensim/models/callbacks.py
index d4598cca1d..3ef7fbe978 100644
--- a/gensim/models/callbacks.py
+++ b/gensim/models/callbacks.py
@@ -45,7 +45,8 @@ class CoherenceMetric(Metric):
"""
Metric class for coherence evaluation
"""
- def __init__(self, corpus=None, texts=None, dictionary=None, coherence=None, window_size=None, topn=10, logger=None, viz_env=None, title=None):
+ def __init__(self, corpus=None, texts=None, dictionary=None, coherence=None,
+ window_size=None, topn=10, logger=None, viz_env=None, title=None):
"""
Args:
corpus : Gensim document corpus.
@@ -108,7 +109,10 @@ def get_value(self, **kwargs):
self.model = None
self.topics = None
super(CoherenceMetric, self).set_parameters(**kwargs)
- cm = gensim.models.CoherenceModel(self.model, self.topics, self.texts, self.corpus, self.dictionary, self.window_size, self.coherence, self.topn)
+ cm = gensim.models.CoherenceModel(
+ self.model, self.topics, self.texts, self.corpus, self.dictionary,
+ self.window_size, self.coherence, self.topn
+ )
return cm.get_coherence()
@@ -146,7 +150,8 @@ class DiffMetric(Metric):
"""
Metric class for topic difference evaluation
"""
- def __init__(self, distance="jaccard", num_words=100, n_ann_terms=10, diagonal=True, annotation=False, normed=True, logger=None, viz_env=None, title=None):
+ def __init__(self, distance="jaccard", num_words=100, n_ann_terms=10, diagonal=True,
+ annotation=False, normed=True, logger=None, viz_env=None, title=None):
"""
Args:
distance : measure used to calculate difference between any topic pair. Available values:
@@ -181,7 +186,10 @@ def get_value(self, **kwargs):
other_model : second topic model instance to calculate the difference from
"""
super(DiffMetric, self).set_parameters(**kwargs)
- diff_diagonal, _ = self.model.diff(self.other_model, self.distance, self.num_words, self.n_ann_terms, self.diagonal, self.annotation, self.normed)
+ diff_diagonal, _ = self.model.diff(
+ self.other_model, self.distance, self.num_words, self.n_ann_terms,
+ self.diagonal, self.annotation, self.normed
+ )
return diff_diagonal
@@ -189,7 +197,8 @@ class ConvergenceMetric(Metric):
"""
Metric class for convergence evaluation
"""
- def __init__(self, distance="jaccard", num_words=100, n_ann_terms=10, diagonal=True, annotation=False, normed=True, logger=None, viz_env=None, title=None):
+ def __init__(self, distance="jaccard", num_words=100, n_ann_terms=10, diagonal=True,
+ annotation=False, normed=True, logger=None, viz_env=None, title=None):
"""
Args:
distance : measure used to calculate difference between any topic pair. Available values:
@@ -224,7 +233,10 @@ def get_value(self, **kwargs):
other_model : second topic model instance to calculate the difference from
"""
super(ConvergenceMetric, self).set_parameters(**kwargs)
- diff_diagonal, _ = self.model.diff(self.other_model, self.distance, self.num_words, self.n_ann_terms, self.diagonal, self.annotation, self.normed)
+ diff_diagonal, _ = self.model.diff(
+ self.other_model, self.distance, self.num_words, self.n_ann_terms,
+ self.diagonal, self.annotation, self.normed
+ )
return np.sum(diff_diagonal)
@@ -287,23 +299,33 @@ def on_epoch_end(self, epoch, topics=None):
if epoch == 0:
if value.ndim > 0:
diff_mat = np.array([value])
- viz_metric = self.viz.heatmap(X=diff_mat.T, env=metric.viz_env, opts=dict(xlabel='Epochs', ylabel=label, title=label))
+ viz_metric = self.viz.heatmap(
+ X=diff_mat.T, env=metric.viz_env, opts=dict(xlabel='Epochs', ylabel=label, title=label)
+ )
# store current epoch's diff diagonal
self.diff_mat.put(diff_mat)
# saving initial plot window
self.windows.append(copy.deepcopy(viz_metric))
else:
- viz_metric = self.viz.line(Y=np.array([value]), X=np.array([epoch]), env=metric.viz_env, opts=dict(xlabel='Epochs', ylabel=label, title=label))
+ viz_metric = self.viz.line(
+ Y=np.array([value]), X=np.array([epoch]), env=metric.viz_env,
+ opts=dict(xlabel='Epochs', ylabel=label, title=label)
+ )
# saving initial plot window
self.windows.append(copy.deepcopy(viz_metric))
else:
if value.ndim > 0:
# concatenate with previous epoch's diff diagonals
diff_mat = np.concatenate((self.diff_mat.get(), np.array([value])))
- self.viz.heatmap(X=diff_mat.T, env=metric.viz_env, win=self.windows[i], opts=dict(xlabel='Epochs', ylabel=label, title=label))
+ self.viz.heatmap(
+ X=diff_mat.T, env=metric.viz_env, win=self.windows[i],
+ opts=dict(xlabel='Epochs', ylabel=label, title=label)
+ )
self.diff_mat.put(diff_mat)
else:
- self.viz.updateTrace(Y=np.array([value]), X=np.array([epoch]), env=metric.viz_env, win=self.windows[i])
+ self.viz.updateTrace(
+ Y=np.array([value]), X=np.array([epoch]), env=metric.viz_env, win=self.windows[i]
+ )
if metric.logger == "shell":
statement = "".join(("Epoch ", str(epoch), ": ", label, " estimate: ", str(value)))
diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py
index 4fa80ea15e..f6781903a8 100644
--- a/gensim/models/coherencemodel.py
+++ b/gensim/models/coherencemodel.py
@@ -168,7 +168,8 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=
if isinstance(model.id2word, FakeDict):
raise ValueError(
"The associated dictionary should be provided with the corpus or 'id2word'"
- " for topic model should be set as the associated dictionary.")
+ " for topic model should be set as the associated dictionary."
+ )
else:
self.dictionary = model.id2word
else:
@@ -236,9 +237,7 @@ def topics(self, topics):
if self.model is not None:
new_topics = self._get_topics()
if topics is not None:
- logger.warning(
- "Ignoring topics you are attempting to set in favor of model's topics: %s",
- self.model)
+ logger.warning("Ignoring topics you are attempting to set in favor of model's topics: %s", self.model)
elif topics is not None:
new_topics = []
for topic in topics:
@@ -275,7 +274,8 @@ def _get_topics(self):
except AttributeError:
raise ValueError(
"This topic model is not currently supported. Supported topic models"
- " should implement the `get_topics` method.")
+ " should implement the `get_topics` method."
+ )
def segment_topics(self):
return self.measure.seg(self.topics)
@@ -294,7 +294,8 @@ def estimate_probabilities(self, segmented_topics=None):
self._accumulator = self.measure.prob(
texts=self.texts, segmented_topics=segmented_topics,
dictionary=self.dictionary, window_size=self.window_size,
- processes=self.processes)
+ processes=self.processes
+ )
return self._accumulator
diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index 5e0dc20418..04bf923c65 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -70,9 +70,9 @@
try:
from gensim.models.doc2vec_inner import train_document_dbow, train_document_dm, train_document_dm_concat
from gensim.models.word2vec_inner import FAST_VERSION # blas-adaptation shared from word2vec
- logger.debug('Fast version of {0} is being used'.format(__name__))
+ logger.debug('Fast version of %s is being used', __name__)
except ImportError:
- logger.warning('Slow version of {0} is being used'.format(__name__))
+ logger.warning('Slow version of %s is being used', __name__)
# failed... fall back to plain numpy (20-80x slower training than the above)
FAST_VERSION = -1
@@ -109,9 +109,10 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None,
train_batch_sg(model, [doc_words], alpha, work)
for doctag_index in doctag_indexes:
for word in doc_words:
- train_sg_pair(model, word, doctag_index, alpha, learn_vectors=learn_doctags,
- learn_hidden=learn_hidden, context_vectors=doctag_vectors,
- context_locks=doctag_locks)
+ train_sg_pair(
+ model, word, doctag_index, alpha, learn_vectors=learn_doctags, learn_hidden=learn_hidden,
+ context_vectors=doctag_vectors, context_locks=doctag_locks
+ )
return len(doc_words)
@@ -173,9 +174,9 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N
return len(word_vocabs)
- def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None,
- learn_doctags=True, learn_words=True, learn_hidden=True,
- word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
+ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, learn_doctags=True,
+ learn_words=True, learn_hidden=True, word_vectors=None, word_locks=None,
+ doctag_vectors=None, doctag_locks=None):
"""
Update distributed memory model ("PV-DM") by training on a single document, using a
concatenation of the context window word vectors (rather than a sum or average).
@@ -382,10 +383,8 @@ def estimated_lookup_memory(self):
def reset_weights(self, model):
length = max(len(self.doctags), self.count)
if self.mapfile_path:
- self.doctag_syn0 = np_memmap(self.mapfile_path + '.doctag_syn0', dtype=REAL,
- mode='w+', shape=(length, model.vector_size))
- self.doctag_syn0_lockf = np_memmap(self.mapfile_path + '.doctag_syn0_lockf', dtype=REAL,
- mode='w+', shape=(length,))
+            self.doctag_syn0 = np_memmap(
+                self.mapfile_path + '.doctag_syn0', dtype=REAL, mode='w+', shape=(length, model.vector_size)
+            )
+            self.doctag_syn0_lockf = np_memmap(
+                self.mapfile_path + '.doctag_syn0_lockf', dtype=REAL, mode='w+', shape=(length,)
+            )
self.doctag_syn0_lockf.fill(1.0)
else:
self.doctag_syn0 = empty((length, model.vector_size), dtype=REAL)
@@ -481,7 +480,11 @@ def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip
return dists
best = matutils.argsort(dists, topn=topn + len(all_docs), reverse=True)
# ignore (don't return) docs from the input
- result = [(self.index_to_doctag(sim + clip_start), float(dists[sim])) for sim in best if (sim + clip_start) not in all_docs]
+ result = [
+ (self.index_to_doctag(sim + clip_start), float(dists[sim]))
+ for sim in best
+ if (sim + clip_start) not in all_docs
+ ]
return result[:topn]
def doesnt_match(self, docs):
@@ -494,7 +497,7 @@ def doesnt_match(self, docs):
self.init_sims()
docs = [doc for doc in docs if doc in self.doctags or 0 <= doc < self.count] # filter out unknowns
- logger.debug("using docs %s" % docs)
+ logger.debug("using docs %s", docs)
if not docs:
raise ValueError("cannot select a doc from an empty list")
vectors = vstack(self.doctag_syn0norm[self._int_index(doc)] for doc in docs).astype(REAL)
@@ -551,8 +554,7 @@ def repeat(self, word_count):
class Doc2Vec(Word2Vec):
"""Class for training, using and evaluating neural networks described in http://arxiv.org/pdf/1405.4053v2.pdf"""
- def __init__(self, documents=None, dm_mean=None,
- dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1,
+ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1,
docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, **kwargs):
"""
Initialize the model from an iterable of `documents`. Each document is a
@@ -664,7 +666,7 @@ def reset_weights(self):
if self.dm and self.dm_concat:
# expand l1 size to match concatenated tags+words length
self.layer1_size = (self.dm_tag_count + (2 * self.window)) * self.vector_size
- logger.info("using concatenative %d-dimensional layer1" % (self.layer1_size))
+ logger.info("using concatenative %d-dimensional layer1", self.layer1_size)
super(Doc2Vec, self).reset_weights()
self.docvecs.reset_weights(self)
@@ -686,14 +688,16 @@ def scan_vocab(self, documents, progress_per=10000, trim_rule=None, update=False
if not checked_string_types:
if isinstance(document.words, string_types):
logger.warning(
- "Each 'words' should be a list of words (usually unicode strings)."
- "First 'words' here is instead plain %s." % type(document.words)
+ "Each 'words' should be a list of words (usually unicode strings). First 'words' here is instead plain %s.",
+ type(document.words)
)
checked_string_types += 1
if document_no % progress_per == 0:
interval_rate = (total_words - interval_count) / (default_timer() - interval_start)
- logger.info("PROGRESS: at example #%i, processed %i words (%i/s), %i word types, %i tags",
- document_no, total_words, interval_rate, len(vocab), len(self.docvecs))
+ logger.info(
+ "PROGRESS: at example #%i, processed %i words (%i/s), %i word types, %i tags",
+ document_no, total_words, interval_rate, len(vocab), len(self.docvecs)
+ )
interval_start = default_timer()
interval_count = total_words
document_length = len(document.words)
@@ -709,8 +713,10 @@ def scan_vocab(self, documents, progress_per=10000, trim_rule=None, update=False
utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
min_reduce += 1
- logger.info("collected %i word types and %i unique tags from a corpus of %i examples and %i words",
- len(vocab), len(self.docvecs), document_no + 1, total_words)
+ logger.info(
+ "collected %i word types and %i unique tags from a corpus of %i examples and %i words",
+ len(vocab), len(self.docvecs), document_no + 1, total_words
+ )
self.corpus_count = document_no + 1
self.raw_vocab = vocab
@@ -721,15 +727,20 @@ def _do_train_job(self, job, alpha, inits):
indexed_doctags = self.docvecs.indexed_doctags(doc.tags)
doctag_indexes, doctag_vectors, doctag_locks, ignored = indexed_doctags
if self.sg:
- tally += train_document_dbow(self, doc.words, doctag_indexes, alpha, work,
- train_words=self.dbow_words,
- doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
+ tally += train_document_dbow(
+ self, doc.words, doctag_indexes, alpha, work, train_words=self.dbow_words,
+ doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
+ )
elif self.dm_concat:
- tally += train_document_dm_concat(self, doc.words, doctag_indexes, alpha, work, neu1,
- doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
+ tally += train_document_dm_concat(
+ self, doc.words, doctag_indexes, alpha, work, neu1,
+ doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
+ )
else:
- tally += train_document_dm(self, doc.words, doctag_indexes, alpha, work, neu1,
- doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
+ tally += train_document_dm(
+ self, doc.words, doctag_indexes, alpha, work, neu1,
+ doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
+ )
self.docvecs.trained_item(indexed_doctags)
return tally, self._raw_word_count(job)
@@ -754,17 +765,20 @@ def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5):
for i in range(steps):
if self.sg:
- train_document_dbow(self, doc_words, doctag_indexes, alpha, work,
- learn_words=False, learn_hidden=False,
- doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
+ train_document_dbow(
+ self, doc_words, doctag_indexes, alpha, work,
+ learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
+ )
elif self.dm_concat:
- train_document_dm_concat(self, doc_words, doctag_indexes, alpha, work, neu1,
- learn_words=False, learn_hidden=False,
- doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
+ train_document_dm_concat(
+ self, doc_words, doctag_indexes, alpha, work, neu1,
+ learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
+ )
else:
- train_document_dm(self, doc_words, doctag_indexes, alpha, work, neu1,
- learn_words=False, learn_hidden=False,
- doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
+ train_document_dm(
+ self, doc_words, doctag_indexes, alpha, work, neu1,
+ learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
+ )
alpha = ((alpha - min_alpha) / (steps - i)) + min_alpha
return doctag_vectors[0]
@@ -850,7 +864,7 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*
with utils.smart_open(fname, 'ab') as fout:
if not word_vec:
total_vec = len(self.docvecs)
- logger.info("storing %sx%s projection weights into %s" % (total_vec, self.vector_size, fname))
+ logger.info("storing %sx%s projection weights into %s", total_vec, self.vector_size, fname)
fout.write(utils.to_utf8("%s %s\n" % (total_vec, self.vector_size)))
# store as in input order
for i in range(len(self.docvecs)):
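
Note: the doc2vec.py hunks above mostly swap eager "%"-interpolation inside logger calls for logging's lazy argument style. A minimal, self-contained sketch of the difference (illustrative only, not part of the patch; the logger name and values are made up):

    import logging

    logging.basicConfig(level=logging.WARNING)
    logger = logging.getLogger("example")

    total_vec, vector_size, fname = 100, 300, "/tmp/vectors.bin"

    # Eager: the message string is built even though INFO is below the WARNING threshold.
    logger.info("storing %sx%s projection weights into %s" % (total_vec, vector_size, fname))

    # Lazy: the arguments are stored on the log record and only interpolated
    # if a handler actually emits it, so filtered-out messages cost almost nothing.
    logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname)
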
diff --git a/gensim/models/doc2vec_inner.pyx b/gensim/models/doc2vec_inner.pyx
index 2c0b2c5655..c4fee9ed0d 100644
--- a/gensim/models/doc2vec_inner.pyx
+++ b/gensim/models/doc2vec_inner.pyx
@@ -640,7 +640,7 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None,
window_indexes[n] = null_word_index
else:
window_indexes[n] = indexes[m]
- n = n + 1
+ n += 1
for m in range(2 * window):
memcpy(&_neu1[(doctag_len + m) * vector_size], &_word_vectors[window_indexes[m] * vector_size],
vector_size * cython.sizeof(REAL_t))
diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py
index b26c8fa639..ebae837b57 100755
--- a/gensim/models/hdpmodel.py
+++ b/gensim/models/hdpmodel.py
@@ -196,7 +196,7 @@ def inference(self, chunk):
raise RuntimeError("model must be trained to perform inference")
chunk = list(chunk)
if len(chunk) > 1:
- logger.debug("performing inference on a chunk of %i documents" % len(chunk))
+ logger.debug("performing inference on a chunk of %i documents", len(chunk))
gamma = np.zeros((len(chunk), self.lda_beta.shape[0]))
for d, doc in enumerate(chunk):
@@ -214,8 +214,7 @@ def __getitem__(self, bow, eps=0.01):
gamma = self.inference([bow])[0]
topic_dist = gamma / sum(gamma) if sum(gamma) != 0 else []
- return [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist)
- if topicvalue >= eps]
+ return [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist) if topicvalue >= eps]
def update(self, corpus):
save_freq = max(1, int(10000 / self.chunksize)) # save every 10k docs, roughly
@@ -265,7 +264,7 @@ def update_chunk(self, chunk, update=True, opt_o=True):
unique_words[word_id] = len(unique_words)
word_list.append(word_id)
- Wt = len(word_list) # length of words in these documents
+ wt = len(word_list) # length of words in these documents
# ...and do the lazy updates on the necessary columns of lambda
rw = np.array([self.m_r[t] for t in self.m_timestamp[word_list]])
@@ -274,7 +273,7 @@ def update_chunk(self, chunk, update=True, opt_o=True):
psi(self.m_eta + self.m_lambda[:, word_list]) - \
psi(self.m_W * self.m_eta + self.m_lambda_sum[:, np.newaxis])
- ss = SuffStats(self.m_T, Wt, len(chunk))
+ ss = SuffStats(self.m_T, wt, len(chunk))
Elogsticks_1st = expect_log_sticks(self.m_var_sticks) # global sticks
@@ -285,19 +284,19 @@ def update_chunk(self, chunk, update=True, opt_o=True):
if len(doc) > 0:
doc_word_ids, doc_word_counts = zip(*doc)
doc_score = self.doc_e_step(
- doc, ss, Elogsticks_1st,
- word_list, unique_words, doc_word_ids,
- doc_word_counts, self.m_var_converge)
+ ss, Elogsticks_1st,
+ unique_words, doc_word_ids,
+ doc_word_counts, self.m_var_converge
+ )
count += sum(doc_word_counts)
score += doc_score
if update:
self.update_lambda(ss, word_list, opt_o)
- return (score, count)
+ return score, count
- def doc_e_step(self, doc, ss, Elogsticks_1st, word_list,
- unique_words, doc_word_ids, doc_word_counts, var_converge):
+ def doc_e_step(self, ss, Elogsticks_1st, unique_words, doc_word_ids, doc_word_counts, var_converge):
"""
e step for a single doc
"""
@@ -392,8 +391,7 @@ def update_lambda(self, sstats, word_list, opt_o):
self.m_rhot = rhot
# Update appropriate columns of lambda based on documents.
- self.m_lambda[:, word_list] = self.m_lambda[:, word_list] * (1 - rhot) + \
- rhot * self.m_D * sstats.m_var_beta_ss / sstats.m_chunksize
+ self.m_lambda[:, word_list] = self.m_lambda[:, word_list] * (1 - rhot) + rhot * self.m_D * sstats.m_var_beta_ss / sstats.m_chunksize
self.m_lambda_sum = (1 - rhot) * self.m_lambda_sum + \
rhot * self.m_D * np.sum(sstats.m_var_beta_ss, axis=1) / sstats.m_chunksize
@@ -401,8 +399,7 @@ def update_lambda(self, sstats, word_list, opt_o):
self.m_timestamp[word_list] = self.m_updatect
self.m_r.append(self.m_r[-1] + np.log(1 - rhot))
- self.m_varphi_ss = (1.0 - rhot) * self.m_varphi_ss + rhot * \
- sstats.m_var_sticks_ss * self.m_D / sstats.m_chunksize
+ self.m_varphi_ss = (1.0 - rhot) * self.m_varphi_ss + rhot * sstats.m_var_sticks_ss * self.m_D / sstats.m_chunksize
if opt_o:
self.optimal_ordering()
@@ -431,10 +428,8 @@ def update_expectations(self):
topics we've learned we'll get the correct behavior.
"""
for w in xrange(self.m_W):
- self.m_lambda[:, w] *= np.exp(self.m_r[-1] -
- self.m_r[self.m_timestamp[w]])
- self.m_Elogbeta = psi(self.m_eta + self.m_lambda) - \
- psi(self.m_W * self.m_eta + self.m_lambda_sum[:, np.newaxis])
+ self.m_lambda[:, w] *= np.exp(self.m_r[-1] - self.m_r[self.m_timestamp[w]])
+ self.m_Elogbeta = psi(self.m_eta + self.m_lambda) - psi(self.m_W * self.m_eta + self.m_lambda_sum[:, np.newaxis])
self.m_timestamp[:] = self.m_updatect
self.m_status_up_to_date = True
@@ -448,8 +443,10 @@ def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words=No
"""
if num_words is not None: # deprecated num_words is used
- logger.warning("The parameter num_words for show_topic() would be deprecated in the updated version.")
- logger.warning("Please use topn instead.")
+ logger.warning(
+ "The parameter num_words for show_topic() would be deprecated in the updated version. "
+ "Please use topn instead."
+ )
topn = num_words
if not self.m_status_up_to_date:
@@ -492,7 +489,7 @@ def save_topics(self, doc_count=None):
else:
fname = 'doc-%i' % doc_count
fname = '%s/%s.topics' % (self.outputdir, fname)
- logger.info("saving topics to %s" % fname)
+ logger.info("saving topics to %s", fname)
betas = self.m_lambda + self.m_eta
np.savetxt(fname, betas)
@@ -527,13 +524,12 @@ def hdp_to_lda(self):
alpha[i] = sticks[i] * left
left = left - alpha[i]
alpha[self.m_T - 1] = left
- alpha = alpha * self.m_alpha
+ alpha *= self.m_alpha
# beta
- beta = (self.m_lambda + self.m_eta) / (self.m_W * self.m_eta +
- self.m_lambda_sum[:, np.newaxis])
+ beta = (self.m_lambda + self.m_eta) / (self.m_W * self.m_eta + self.m_lambda_sum[:, np.newaxis])
- return (alpha, beta)
+ return alpha, beta
def suggested_lda_model(self):
"""
@@ -560,12 +556,14 @@ def evaluate_test_corpus(self, corpus):
lda_betad = self.lda_beta[:, doc_word_ids]
log_predicts = np.log(np.dot(theta, lda_betad))
doc_score = sum(log_predicts) / len(doc)
- logger.info('TEST: %6d %.5f' % (i, doc_score))
+ logger.info('TEST: %6d %.5f', i, doc_score)
score += likelihood
total_words += sum(doc_word_counts)
- logger.info('TEST: average score: %.5f, total score: %.5f, test docs: %d' % (score / total_words, score, len(corpus)))
+ logger.info(
+ "TEST: average score: %.5f, total score: %.5f, test docs: %d",
+ score / total_words, score, len(corpus)
+ )
return score
-# endclass HdpModel
class HdpTopicFormatter(object):
@@ -627,14 +625,20 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
def print_topic(self, topic_id, topn=None, num_words=None):
if num_words is not None: # deprecated num_words is used
- warnings.warn("The parameter num_words for print_topic() would be deprecated in the updated version. Please use topn instead.")
+ warnings.warn(
+ "The parameter num_words for print_topic() would be deprecated in the updated version. "
+ "Please use topn instead."
+ )
topn = num_words
return self.show_topic(topic_id, topn, formatted=True)
def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words=None,):
if num_words is not None: # deprecated num_words is used
- warnings.warn("The parameter num_words for show_topic() would be deprecated in the updated version. Please use topn instead.")
+ warnings.warn(
+ "The parameter num_words for show_topic() would be deprecated in the updated version. "
+ "Please use topn instead."
+ )
topn = num_words
lambdak = list(self.data[topic_id, :])
@@ -668,4 +672,3 @@ def format_topic(self, topic_id, topic_terms):
fmt = (topic_id, fmt)
return fmt
-# endclass HdpTopicFormatter
diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py
index c2dfd419cb..75cc11ee69 100644
--- a/gensim/models/keyedvectors.py
+++ b/gensim/models/keyedvectors.py
@@ -146,11 +146,11 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None)
total_vec = len(self.vocab)
vector_size = self.syn0.shape[1]
if fvocab is not None:
- logger.info("storing vocabulary in %s" % (fvocab))
+ logger.info("storing vocabulary in %s", fvocab)
with utils.smart_open(fvocab, 'wb') as vout:
for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
- logger.info("storing %sx%s projection weights into %s" % (total_vec, vector_size, fname))
+ logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname)
assert (len(self.vocab), vector_size) == self.syn0.shape
with utils.smart_open(fname, 'wb') as fout:
fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
@@ -205,7 +205,7 @@ def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8',
logger.info("loading projection weights from %s", fname)
with utils.smart_open(fname) as fin:
header = utils.to_unicode(fin.readline(), encoding=encoding)
- vocab_size, vector_size = map(int, header.split()) # throws for invalid file format
+ vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format
if limit:
vocab_size = min(vocab_size, limit)
result = cls()
@@ -232,7 +232,7 @@ def add_word(word, weights):
if binary:
binary_len = dtype(REAL).itemsize * vector_size
- for line_no in xrange(vocab_size):
+ for _ in xrange(vocab_size):
# mixed text and binary: read text first, then binary
word = []
while True:
@@ -253,8 +253,8 @@ def add_word(word, weights):
raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
if len(parts) != vector_size + 1:
- raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
- word, weights = parts[0], list(map(REAL, parts[1:]))
+ raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no)
+ word, weights = parts[0], [REAL(x) for x in parts[1:]]
add_word(word, weights)
if result.syn0.shape[0] != len(result.vocab):
logger.info(
@@ -264,7 +264,7 @@ def add_word(word, weights):
result.syn0 = ascontiguousarray(result.syn0[: len(result.vocab)])
assert (len(result.vocab), vector_size) == result.syn0.shape
- logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
+ logger.info("loaded %s matrix from %s", result.syn0.shape, fname)
return result
def word_vec(self, word, use_norm=False):
@@ -400,12 +400,13 @@ def wmdistance(self, document1, document2):
diff1 = len_pre_oov1 - len(document1)
diff2 = len_pre_oov2 - len(document2)
if diff1 > 0 or diff2 > 0:
- logger.info('Removed %d and %d OOV words from document 1 and 2 (respectively).',
- diff1, diff2)
+ logger.info('Removed %d and %d OOV words from document 1 and 2 (respectively).', diff1, diff2)
if len(document1) == 0 or len(document2) == 0:
- logger.info('At least one of the documents had no words that were'
- 'in the vocabulary. Aborting (returning inf).')
+ logger.info(
+                "At least one of the documents had no words that were in the vocabulary. "
+ "Aborting (returning inf)."
+ )
return float('inf')
dictionary = Dictionary(documents=[document1, document2])
@@ -481,8 +482,10 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10):
# allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog'])
positive = [positive]
- all_words = set([self.vocab[word].index for word in positive + negative
- if not isinstance(word, ndarray) and word in self.vocab])
+ all_words = {
+ self.vocab[word].index for word in positive + negative
+ if not isinstance(word, ndarray) and word in self.vocab
+ }
positive = [
self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word
@@ -638,16 +641,16 @@ def n_similarity(self, ws1, ws2):
raise ZeroDivisionError('Atleast one of the passed list is empty.')
v1 = [self[word] for word in ws1]
v2 = [self[word] for word in ws2]
- return dot(matutils.unitvec(array(v1).mean(axis=0)),
- matutils.unitvec(array(v2).mean(axis=0)))
+ return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0)))
@staticmethod
def log_accuracy(section):
correct, incorrect = len(section['correct']), len(section['incorrect'])
if correct + incorrect > 0:
- logger.info("%s: %.1f%% (%i/%i)" %
- (section['section'], 100.0 * correct / (correct + incorrect),
- correct, correct + incorrect))
+ logger.info(
+ "%s: %.1f%% (%i/%i)",
+ section['section'], 100.0 * correct / (correct + incorrect), correct, correct + incorrect
+ )
def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True):
"""
@@ -672,7 +675,7 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c
"""
ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
- ok_vocab = dict((w.upper(), v) for w, v in reversed(ok_vocab)) if case_insensitive else dict(ok_vocab)
+ ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab)
sections, section = [], None
for line_no, line in enumerate(utils.smart_open(questions)):
@@ -692,16 +695,16 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c
a, b, c, expected = [word.upper() for word in line.split()]
else:
a, b, c, expected = [word for word in line.split()]
- except Exception:
- logger.info("skipping invalid line #%i in %s" % (line_no, questions))
+ except ValueError:
+ logger.info("skipping invalid line #%i in %s", line_no, questions)
continue
if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
- logger.debug("skipping line #%i with OOV words: %s" % (line_no, line.strip()))
+ logger.debug("skipping line #%i with OOV words: %s", line_no, line.strip())
continue
original_vocab = self.vocab
self.vocab = ok_vocab
- ignore = set([a, b, c]) # input words to be ignored
+ ignore = {a, b, c} # input words to be ignored
predicted = None
# find the most likely prediction, ignoring OOV words and input words
sims = most_similar(self, positive=[b, c], negative=[a], topn=False, restrict_vocab=restrict_vocab)
@@ -736,8 +739,7 @@ def log_evaluate_word_pairs(pearson, spearman, oov, pairs):
logger.info('Spearman rank-order correlation coefficient against %s: %.4f', pairs, spearman[0])
logger.info('Pairs with unknown words ratio: %.1f%%', oov)
- def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True,
- dummy4unknown=False):
+ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False):
"""
Compute correlation of the model with human similarity judgments. `pairs` is a filename of a dataset where
lines are 3-tuples, each consisting of a word pair and a similarity value, separated by `delimiter`.
@@ -762,7 +764,7 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case
Otherwise (default False), these pairs are skipped entirely.
"""
ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
- ok_vocab = dict((w.upper(), v) for w, v in reversed(ok_vocab)) if case_insensitive else dict(ok_vocab)
+ ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab)
similarity_gold = []
similarity_model = []
@@ -783,7 +785,7 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case
else:
a, b, sim = [word for word in line.split(delimiter)]
sim = float(sim)
- except Exception:
+ except (ValueError, TypeError):
logger.info('skipping invalid line #%d in %s', line_no, pairs)
continue
if a not in ok_vocab or b not in ok_vocab:
@@ -802,15 +804,12 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case
pearson = stats.pearsonr(similarity_gold, similarity_model)
oov_ratio = float(oov) / (len(similarity_gold) + oov) * 100
- logger.debug(
- 'Pearson correlation coefficient against %s: %f with p-value %f',
- pairs, pearson[0], pearson[1]
- )
+ logger.debug('Pearson correlation coefficient against %s: %f with p-value %f', pairs, pearson[0], pearson[1])
logger.debug(
'Spearman rank-order correlation coefficient against %s: %f with p-value %f',
pairs, spearman[0], spearman[1]
)
- logger.debug('Pairs with unknown words: %d' % oov)
+ logger.debug('Pairs with unknown words: %d', oov)
self.log_evaluate_word_pairs(pearson, spearman, oov_ratio, pairs)
return pearson, spearman, oov_ratio
@@ -844,5 +843,8 @@ def get_embedding_layer(self, train_embeddings=False):
# set `trainable` as `False` to use the pretrained word embedding
# No extra mem usage here as `Embedding` layer doesn't create any new matrix for weights
- layer = Embedding(input_dim=weights.shape[0], output_dim=weights.shape[1], weights=[weights], trainable=train_embeddings)
+ layer = Embedding(
+ input_dim=weights.shape[0], output_dim=weights.shape[1],
+ weights=[weights], trainable=train_embeddings
+ )
return layer
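
Note: several keyedvectors.py hunks above replace dict()/set() constructor calls with comprehension and set literals. A small standalone sketch showing the two forms are equivalent (example data only, not part of the patch):

    vocab = {"dog": 0, "cat": 1, "fish": 2}

    # Constructor style: a generator expression fed into dict(), a list fed into set().
    ok_vocab_old = dict((w.upper(), idx) for w, idx in vocab.items())
    ignore_old = set(["dog", "cat"])

    # Literal style, as used in the hunks above: same result, read directly as
    # "build a dict" / "build a set", without the intermediate generator or list.
    ok_vocab_new = {w.upper(): idx for w, idx in vocab.items()}
    ignore_new = {"dog", "cat"}

    assert ok_vocab_old == ok_vocab_new
    assert ignore_old == ignore_new
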
diff --git a/gensim/models/lda_dispatcher.py b/gensim/models/lda_dispatcher.py
index 91e7f237c7..6b3bd53c44 100755
--- a/gensim/models/lda_dispatcher.py
+++ b/gensim/models/lda_dispatcher.py
@@ -85,11 +85,11 @@ def initialize(self, **model_params):
worker = Pyro4.Proxy(uri)
workerid = len(self.workers)
# make time consuming methods work asynchronously
- logger.info("registering worker #%i at %s" % (workerid, uri))
+ logger.info("registering worker #%i at %s", workerid, uri)
worker.initialize(workerid, dispatcher=self.callback, **model_params)
self.workers[workerid] = worker
except Pyro4.errors.PyroError:
- logger.warning("unresponsive worker at %s, deleting it from the name server" % uri)
+ logger.warning("unresponsive worker at %s, deleting it from the name server", uri)
ns.remove(name)
if not self.workers:
@@ -104,16 +104,16 @@ def getworkers(self):
@Pyro4.expose
def getjob(self, worker_id):
- logger.info("worker #%i requesting a new job" % worker_id)
+ logger.info("worker #%i requesting a new job", worker_id)
job = self.jobs.get(block=True, timeout=1)
- logger.info("worker #%i got a new job (%i left)" % (worker_id, self.jobs.qsize()))
+ logger.info("worker #%i got a new job (%i left)", worker_id, self.jobs.qsize())
return job
@Pyro4.expose
def putjob(self, job):
self._jobsreceived += 1
self.jobs.put(job, block=True, timeout=HUGE_TIMEOUT)
- logger.info("added a new job (len(queue)=%i items)" % self.jobs.qsize())
+ logger.info("added a new job (len(queue)=%i items)", self.jobs.qsize())
@Pyro4.expose
def getstate(self):
@@ -121,11 +121,11 @@ def getstate(self):
Merge states from across all workers and return the result.
"""
logger.info("end of input, assigning all remaining jobs")
- logger.debug("jobs done: %s, jobs received: %s" % (self._jobsdone, self._jobsreceived))
+ logger.debug("jobs done: %s, jobs received: %s", self._jobsdone, self._jobsreceived)
while self._jobsdone < self._jobsreceived:
time.sleep(0.5) # check every half a second
- logger.info("merging states from %i workers" % len(self.workers))
+ logger.info("merging states from %i workers", len(self.workers))
workers = list(self.workers.values())
result = workers[0].getstate()
for worker in workers[1:]:
@@ -140,7 +140,7 @@ def reset(self, state):
Initialize all workers for a new EM iterations.
"""
for workerid, worker in iteritems(self.workers):
- logger.info("resetting worker %s" % workerid)
+ logger.info("resetting worker %s", workerid)
worker.reset(state)
worker.requestjob()
self._jobsdone = 0
@@ -158,7 +158,7 @@ def jobdone(self, workerid):
and `worker.requestjob()`.
"""
self._jobsdone += 1
- logger.info("worker #%s finished job #%i" % (workerid, self._jobsdone))
+ logger.info("worker #%s finished job #%i", workerid, self._jobsdone)
self.workers[workerid].requestjob() # tell the worker to ask for another job, asynchronously (one-way)
def jobsdone(self):
@@ -171,7 +171,7 @@ def exit(self):
Terminate all registered workers and then the dispatcher.
"""
for workerid, worker in iteritems(self.workers):
- logger.info("terminating worker %s" % workerid)
+ logger.info("terminating worker %s", workerid)
worker.exit()
logger.info("terminating dispatcher")
os._exit(0) # exit the whole process (not just this thread ala sys.exit())
@@ -180,27 +180,25 @@ def exit(self):
def main():
parser = argparse.ArgumentParser(description=__doc__)
- parser.add_argument("--maxsize", help="How many jobs (=chunks of N documents) "
- "to keep 'pre-fetched' in a queue (default: %(default)s)",
- type=int, default=MAX_JOBS_QUEUE)
+ parser.add_argument("--maxsize", help="How many jobs (=chunks of N documents) to keep 'pre-fetched' in a queue (default: %(default)s)", type=int, default=MAX_JOBS_QUEUE)
parser.add_argument("--host", help="Nameserver hostname (default: %(default)s)", default=None)
parser.add_argument("--port", help="Nameserver port (default: %(default)s)", default=None, type=int)
parser.add_argument("--no-broadcast", help="Disable broadcast (default: %(default)s)",
action='store_const', default=True, const=False)
parser.add_argument("--hmac", help="Nameserver hmac key (default: %(default)s)", default=None)
- parser.add_argument('-v', '--verbose', help='Verbose flag', action='store_const', dest="loglevel",
- const=logging.INFO, default=logging.WARNING)
+ parser.add_argument('-v', '--verbose', help='Verbose flag', action='store_const', dest="loglevel", const=logging.INFO, default=logging.WARNING)
args = parser.parse_args()
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=args.loglevel)
logger.info("running %s", " ".join(sys.argv))
- ns_conf = {"broadcast": args.no_broadcast,
- "host": args.host,
- "port": args.port,
- "hmac_key": args.hmac}
+ ns_conf = {
+ "broadcast": args.no_broadcast,
+ "host": args.host,
+ "port": args.port,
+ "hmac_key": args.hmac
+ }
utils.pyro_daemon(LDA_DISPATCHER_PREFIX, Dispatcher(maxsize=args.maxsize, ns_conf=ns_conf), ns_conf=ns_conf)
-
logger.info("finished running %s", " ".join(sys.argv))
diff --git a/gensim/models/lda_worker.py b/gensim/models/lda_worker.py
index ec87c29148..8656672db6 100755
--- a/gensim/models/lda_worker.py
+++ b/gensim/models/lda_worker.py
@@ -50,7 +50,7 @@ def initialize(self, myid, dispatcher, **model_params):
self.myid = myid # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove?
self.dispatcher = dispatcher
self.finished = False
- logger.info("initializing worker #%s" % myid)
+ logger.info("initializing worker #%s", myid)
self.model = ldamodel.LdaModel(**model_params)
@Pyro4.expose
@@ -70,27 +70,26 @@ def requestjob(self):
# no new job: try again, unless we're finished with all work
continue
if job is not None:
- logger.info("worker #%s received job #%i" % (self.myid, self.jobsdone))
+ logger.info("worker #%s received job #%i", self.myid, self.jobsdone)
self.processjob(job)
self.dispatcher.jobdone(self.myid)
else:
- logger.info("worker #%i stopping asking for jobs" % self.myid)
+ logger.info("worker #%i stopping asking for jobs", self.myid)
@utils.synchronous('lock_update')
def processjob(self, job):
- logger.debug("starting to process job #%i" % self.jobsdone)
+ logger.debug("starting to process job #%i", self.jobsdone)
self.model.do_estep(job)
self.jobsdone += 1
if SAVE_DEBUG and self.jobsdone % SAVE_DEBUG == 0:
fname = os.path.join(tempfile.gettempdir(), 'lda_worker.pkl')
self.model.save(fname)
- logger.info("finished processing job #%i" % (self.jobsdone - 1))
+ logger.info("finished processing job #%i", self.jobsdone - 1)
@Pyro4.expose
@utils.synchronous('lock_update')
def getstate(self):
- logger.info("worker #%i returning its state after %s jobs" %
- (self.myid, self.jobsdone))
+ logger.info("worker #%i returning its state after %s jobs", self.myid, self.jobsdone)
result = self.model.state
assert isinstance(result, ldamodel.LdaState)
self.model.clear() # free up mem in-between two EM cycles
@@ -101,7 +100,7 @@ def getstate(self):
@utils.synchronous('lock_update')
def reset(self, state):
assert state is not None
- logger.info("resetting worker #%i" % self.myid)
+ logger.info("resetting worker #%i", self.myid)
self.model.state = state
self.model.sync_state()
self.model.state.reset()
@@ -109,32 +108,29 @@ def reset(self, state):
@Pyro4.oneway
def exit(self):
- logger.info("terminating worker #%i" % self.myid)
+ logger.info("terminating worker #%i", self.myid)
os._exit(0)
-# endclass Worker
def main():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--host", help="Nameserver hostname (default: %(default)s)", default=None)
parser.add_argument("--port", help="Nameserver port (default: %(default)s)", default=None, type=int)
- parser.add_argument("--no-broadcast", help="Disable broadcast (default: %(default)s)",
- action='store_const', default=True, const=False)
+ parser.add_argument("--no-broadcast", help="Disable broadcast (default: %(default)s)", action='store_const', default=True, const=False)
parser.add_argument("--hmac", help="Nameserver hmac key (default: %(default)s)", default=None)
- parser.add_argument('-v', '--verbose', help='Verbose flag', action='store_const', dest="loglevel",
- const=logging.INFO, default=logging.WARNING)
+ parser.add_argument('-v', '--verbose', help='Verbose flag', action='store_const', dest="loglevel", const=logging.INFO, default=logging.WARNING)
args = parser.parse_args()
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=args.loglevel)
logger.info("running %s", " ".join(sys.argv))
- ns_conf = {"broadcast": args.no_broadcast,
- "host": args.host,
- "port": args.port,
- "hmac_key": args.hmac}
-
+ ns_conf = {
+ "broadcast": args.no_broadcast,
+ "host": args.host,
+ "port": args.port,
+ "hmac_key": args.hmac
+ }
utils.pyro_daemon(LDA_WORKER_PREFIX, Worker(), random_suffix=True, ns_conf=ns_conf)
-
logger.info("finished running %s", " ".join(sys.argv))
diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 9ed01d84c8..91c4c450a6 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -48,12 +48,7 @@
from gensim.models.callbacks import Callback
# log(sum(exp(x))) that tries to avoid overflow
-try:
- # try importing from here if older scipy is installed
- from scipy.maxentropy import logsumexp
-except ImportError:
- # maxentropy has been removed in recent releases, logsumexp now in misc
- from scipy.misc import logsumexp
+from scipy.misc import logsumexp
logger = logging.getLogger('gensim.models.ldamodel')
@@ -147,8 +142,7 @@ def blend(self, rhot, other, targetsize=None):
if other.numdocs == 0 or targetsize == other.numdocs:
scale = 1.0
else:
- logger.info("merging changes from %i documents into a model of %i documents",
- other.numdocs, targetsize)
+ logger.info("merging changes from %i documents into a model of %i documents", other.numdocs, targetsize)
scale = 1.0 * targetsize / other.numdocs
self.sstats += rhot * scale * other.sstats
@@ -288,7 +282,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')
- assert self.alpha.shape == (self.num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)
+ assert self.alpha.shape == (self.num_topics,), \
+ "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)
if isinstance(eta, six.string_types):
if eta == 'asymmetric':
@@ -324,10 +319,12 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
from gensim.models.lda_dispatcher import LDA_DISPATCHER_PREFIX
self.dispatcher = Pyro4.Proxy(ns.list(prefix=LDA_DISPATCHER_PREFIX)[LDA_DISPATCHER_PREFIX])
logger.debug("looking for dispatcher at %s" % str(self.dispatcher._pyroUri))
- self.dispatcher.initialize(id2word=self.id2word, num_topics=self.num_topics,
- chunksize=chunksize, alpha=alpha, eta=eta, distributed=False)
+ self.dispatcher.initialize(
+ id2word=self.id2word, num_topics=self.num_topics, chunksize=chunksize,
+ alpha=alpha, eta=eta, distributed=False
+ )
self.numworkers = len(self.dispatcher.getworkers())
- logger.info("using distributed version with %i workers" % self.numworkers)
+ logger.info("using distributed version with %i workers", self.numworkers)
except Exception as err:
logger.error("failed to initialize distributed LDA (%s)", err)
raise RuntimeError("failed to initialize distributed LDA (%s)" % err)
@@ -382,8 +379,9 @@ def init_dir_prior(self, prior, name):
return init_prior, is_auto
def __str__(self):
- return "LdaModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % \
- (self.num_terms, self.num_topics, self.decay, self.chunksize)
+ return "LdaModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % (
+ self.num_terms, self.num_topics, self.decay, self.chunksize
+ )
def sync_state(self):
self.expElogbeta = np.exp(self.state.get_Elogbeta())
@@ -412,8 +410,8 @@ def inference(self, chunk, collect_sstats=False):
"""
try:
- _ = len(chunk)
- except Exception:
+ len(chunk)
+ except TypeError:
# convert iterators/generators to plain list, so we have len() etc.
chunk = list(chunk)
if len(chunk) > 1:
@@ -436,9 +434,9 @@ def inference(self, chunk, collect_sstats=False):
for d, doc in enumerate(chunk):
if len(doc) > 0 and not isinstance(doc[0][0], six.integer_types + (np.integer,)):
# make sure the term IDs are ints, otherwise np will get upset
- ids = [int(id) for id, _ in doc]
+ ids = [int(idx) for idx, _ in doc]
else:
- ids = [id for id, _ in doc]
+ ids = [idx for idx, _ in doc]
cts = np.array([cnt for _, cnt in doc])
gammad = gamma[d, :]
Elogthetad = Elogtheta[d, :]
@@ -462,7 +460,7 @@ def inference(self, chunk, collect_sstats=False):
phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
# If gamma hasn't changed much, we're done.
meanchange = np.mean(abs(gammad - lastgamma))
- if (meanchange < self.gamma_threshold):
+ if meanchange < self.gamma_threshold:
converged += 1
break
gamma[d, :] = gammad
@@ -472,8 +470,7 @@ def inference(self, chunk, collect_sstats=False):
sstats[:, ids] += np.outer(expElogthetad.T, cts / phinorm)
if len(chunk) > 1:
- logger.debug("%i/%i documents converged within %i iterations",
- converged, len(chunk), self.iterations)
+ logger.debug("%i/%i documents converged within %i iterations", converged, len(chunk), self.iterations)
if collect_sstats:
# This step finishes computing the sufficient statistics for the
@@ -533,8 +530,10 @@ def log_perplexity(self, chunk, total_docs=None):
corpus_words = sum(cnt for document in chunk for _, cnt in document)
subsample_ratio = 1.0 * total_docs / len(chunk)
perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)
- logger.info("%.3f per-word bound, %.1f perplexity estimate based on a held-out corpus of %i documents with %i words" %
- (perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words))
+ logger.info(
+ "%.3f per-word bound, %.1f perplexity estimate based on a held-out corpus of %i documents with %i words",
+ perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words
+ )
return perwordbound
def update(self, corpus, chunksize=None, decay=None, offset=None,
@@ -621,12 +620,14 @@ def update(self, corpus, chunksize=None, decay=None, offset=None,
"iterating %ix with a convergence threshold of %f",
updatetype, self.num_topics, passes, lencorpus,
updateafter, evalafter, iterations,
- gamma_threshold)
+ gamma_threshold
+ )
if updates_per_pass * passes < 10:
logger.warning(
- "too few updates, training might not converge; consider "
- "increasing the number of passes or iterations to improve accuracy")
+ "too few updates, training might not converge; "
+ "consider increasing the number of passes or iterations to improve accuracy"
+ )
# rho is the "speed" of updating; TODO try other fncs
# pass_ + num_updates handles increasing the starting t for each pass,
@@ -643,7 +644,7 @@ def rho():
for pass_ in xrange(passes):
if self.dispatcher:
- logger.info('initializing %s workers' % self.numworkers)
+ logger.info('initializing %s workers', self.numworkers)
self.dispatcher.reset(self.state)
else:
other = LdaState(self.eta, self.state.sstats.shape)
@@ -658,13 +659,17 @@ def rho():
if self.dispatcher:
# add the chunk to dispatcher's job queue, so workers can munch on it
- logger.info('PROGRESS: pass %i, dispatching documents up to #%i/%i',
- pass_, chunk_no * chunksize + len(chunk), lencorpus)
+ logger.info(
+ "PROGRESS: pass %i, dispatching documents up to #%i/%i",
+ pass_, chunk_no * chunksize + len(chunk), lencorpus
+ )
# this will eventually block until some jobs finish, because the queue has a small finite length
self.dispatcher.putjob(chunk)
else:
- logger.info('PROGRESS: pass %i, at document #%i/%i',
- pass_, chunk_no * chunksize + len(chunk), lencorpus)
+ logger.info(
+ "PROGRESS: pass %i, at document #%i/%i",
+ pass_, chunk_no * chunksize + len(chunk), lencorpus
+ )
gammat = self.do_estep(chunk, other)
if self.optimize_alpha:
@@ -870,7 +875,7 @@ def get_topic_terms(self, topicid, topn=10):
topic = self.get_topics()[topicid]
topic = topic / topic.sum() # normalize to probability distribution
bestn = matutils.argsort(topic, topn, reverse=True)
- return [(id, topic[id]) for id in bestn]
+ return [(idx, topic[idx]) for idx in bestn]
def top_topics(self, corpus=None, texts=None, dictionary=None, window_size=None,
coherence='u_mass', topn=20, processes=-1):
@@ -888,7 +893,8 @@ def top_topics(self, corpus=None, texts=None, dictionary=None, window_size=None,
cm = CoherenceModel(
model=self, corpus=corpus, texts=texts, dictionary=dictionary,
window_size=window_size, coherence=coherence, topn=topn,
- processes=processes)
+ processes=processes
+ )
coherence_scores = cm.get_coherence_per_topic()
str_topics = []
@@ -995,7 +1001,8 @@ def get_term_topics(self, word_id, minimum_probability=None):
return values
- def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10, diagonal=False, annotation=True, normed=True):
+ def diff(self, other, distance="kullback_leibler", num_words=100,
+ n_ann_terms=10, diagonal=False, annotation=True, normed=True):
"""
Calculate difference topic2topic between two Lda models
`other` instances of `LdaMulticore` or `LdaModel`
@@ -1049,7 +1056,8 @@ def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10
d1, d2 = fst_topics, snd_topics
if diagonal:
- assert t1_size == t2_size, "Both input models should have same no. of topics, as the diagonal will only be valid in a square matrix"
+ assert t1_size == t2_size, \
+ "Both input models should have same no. of topics, as the diagonal will only be valid in a square matrix"
# initialize z and annotation array
z = np.zeros(t1_size)
if annotation:
@@ -1096,7 +1104,7 @@ def __getitem__(self, bow, eps=None):
"""
return self.get_document_topics(bow, eps, self.minimum_phi_value, self.per_word_topics)
- def save(self, fname, ignore=['state', 'dispatcher'], separately=None, *args, **kwargs):
+ def save(self, fname, ignore=('state', 'dispatcher'), separately=None, *args, **kwargs):
"""
Save the model to file.
@@ -1135,7 +1143,7 @@ def save(self, fname, ignore=['state', 'dispatcher'], separately=None, *args, **
if isinstance(ignore, six.string_types):
ignore = [ignore]
ignore = [e for e in ignore if e] # make sure None and '' are not in the list
- ignore = list(set(['state', 'dispatcher', 'id2word']) | set(ignore))
+ ignore = list({'state', 'dispatcher', 'id2word'} | set(ignore))
else:
ignore = ['state', 'dispatcher', 'id2word']
@@ -1188,10 +1196,9 @@ def load(cls, fname, *args, **kwargs):
# check if `id2word_fname` file is present on disk
# if present -> the model to be loaded was saved using a >= 0.13.2 version of Gensim, so set `result.id2word` using the `id2word_fname` file
# if not present -> the model to be loaded was saved using a < 0.13.2 version of Gensim, so `result.id2word` already set after the main pickle load
- if (os.path.isfile(id2word_fname)):
+ if os.path.isfile(id2word_fname):
try:
result.id2word = utils.unpickle(id2word_fname)
except Exception as e:
logging.warning("failed to load id2word dictionary from %s: %s", id2word_fname, e)
return result
-# endclass LdaModel
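
Note: the ldamodel.py hunks above also narrow bare "except Exception" to "except TypeError" around the len() probe, so only the expected "object has no len()" failure is swallowed. A tiny standalone sketch of the pattern (hypothetical helper name, not part of the patch):

    def ensure_sized(corpus):
        """Return a corpus that supports len(); materialize iterators, pass lists through."""
        try:
            len(corpus)
        except TypeError:
            # generators/iterators have no __len__; any other error still propagates
            corpus = list(corpus)
        return corpus

    print(len(ensure_sized([1, 2, 3])))                 # 3, list passed through unchanged
    print(len(ensure_sized(x * x for x in range(5))))   # 5, generator materialized
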
diff --git a/gensim/models/ldamulticore.py b/gensim/models/ldamulticore.py
index 39c3c40666..0c22c64f7c 100644
--- a/gensim/models/ldamulticore.py
+++ b/gensim/models/ldamulticore.py
@@ -143,11 +143,13 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
if isinstance(alpha, six.string_types) and alpha == 'auto':
raise NotImplementedError("auto-tuning alpha not implemented in multicore LDA; use plain LdaModel.")
- super(LdaMulticore, self).__init__(corpus=corpus, num_topics=num_topics,
+ super(LdaMulticore, self).__init__(
+ corpus=corpus, num_topics=num_topics,
id2word=id2word, chunksize=chunksize, passes=passes, alpha=alpha, eta=eta,
decay=decay, offset=offset, eval_every=eval_every, iterations=iterations,
gamma_threshold=gamma_threshold, random_state=random_state, minimum_probability=minimum_probability,
- minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics)
+ minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics
+ )
def update(self, corpus, chunks_as_numpy=False):
"""
@@ -169,7 +171,7 @@ def update(self, corpus, chunks_as_numpy=False):
"""
try:
lencorpus = len(corpus)
- except Exception:
+ except TypeError:
logger.warning("input corpus stream has no len(); counting documents")
lencorpus = sum(1 for _ in corpus)
if lencorpus == 0:
@@ -187,15 +189,17 @@ def update(self, corpus, chunks_as_numpy=False):
evalafter = min(lencorpus, (self.eval_every or 0) * updateafter)
updates_per_pass = max(1, lencorpus / updateafter)
- logger.info("running %s LDA training, %s topics, %i passes over the"
- " supplied corpus of %i documents, updating every %i documents,"
- " evaluating every ~%i documents, iterating %ix with a convergence threshold of %f",
- updatetype, self.num_topics, self.passes, lencorpus, updateafter, evalafter,
- self.iterations, self.gamma_threshold)
+ logger.info(
+ "running %s LDA training, %s topics, %i passes over the supplied corpus of %i documents, "
+ "updating every %i documents, evaluating every ~%i documents, iterating %ix with a convergence threshold of %f",
+ updatetype, self.num_topics, self.passes, lencorpus, updateafter, evalafter, self.iterations, self.gamma_threshold
+ )
if updates_per_pass * self.passes < 10:
- logger.warning("too few updates, training might not converge; consider "
- "increasing the number of passes or iterations to improve accuracy")
+ logger.warning(
+ "too few updates, training might not converge; "
+ "consider increasing the number of passes or iterations to improve accuracy"
+ )
job_queue = Queue(maxsize=2 * self.workers)
result_queue = Queue()
@@ -240,9 +244,10 @@ def process_result_queue(force=False):
job_queue.put((chunk_no, chunk, self), block=False, timeout=0.1)
chunk_put = True
queue_size[0] += 1
- logger.info('PROGRESS: pass %i, dispatched chunk #%i = '
- 'documents up to #%i/%i, outstanding queue size %i',
- pass_, chunk_no, chunk_no * self.chunksize + len(chunk), lencorpus, queue_size[0])
+ logger.info(
+ "PROGRESS: pass %i, dispatched chunk #%i = documents up to #%i/%i, outstanding queue size %i",
+ pass_, chunk_no, chunk_no * self.chunksize + len(chunk), lencorpus, queue_size[0]
+ )
except queue.Full:
# in case the input job queue is full, keep clearing the
# result queue, to make sure we don't deadlock
diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py
index 6399e17aae..26709c04d6 100644
--- a/gensim/models/ldaseqmodel.py
+++ b/gensim/models/ldaseqmodel.py
@@ -50,8 +50,8 @@ class LdaSeqModel(utils.SaveLoad):
"""
def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10,
- initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10,
- random_state=None, lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100):
+ initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10,
+ random_state=None, lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100):
"""
`corpus` is any iterable gensim corpus
@@ -91,7 +91,7 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_
if corpus is not None:
try:
self.corpus_len = len(corpus)
- except Exception:
+ except TypeError:
logger.warning("input corpus stream has no len(); counting documents")
self.corpus_len = sum(1 for _ in corpus)
@@ -113,7 +113,10 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_
# the sslm class is described below and contains information on topic-word probabilities and doc-topic probabilities.
self.topic_chains = []
for topic in range(0, num_topics):
- sslm_ = sslm(num_time_slices=self.num_time_slices, vocab_len=self.vocab_len, num_topics=self.num_topics, chain_variance=chain_variance, obs_variance=obs_variance)
+ sslm_ = sslm(
+ num_time_slices=self.num_time_slices, vocab_len=self.vocab_len, num_topics=self.num_topics,
+ chain_variance=chain_variance, obs_variance=obs_variance
+ )
self.topic_chains.append(sslm_)
# the following are class variables which are to be integrated during Document Influence Model
@@ -125,7 +128,10 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_
# if a corpus and time_slice is provided, depending on the user choice of initializing LDA, we start DTM.
if corpus is not None and time_slice is not None:
if initialize == 'gensim':
- lda_model = ldamodel.LdaModel(corpus, id2word=self.id2word, num_topics=self.num_topics, passes=passes, alpha=self.alphas, random_state=random_state)
+ lda_model = ldamodel.LdaModel(
+ corpus, id2word=self.id2word, num_topics=self.num_topics,
+ passes=passes, alpha=self.alphas, random_state=random_state
+ )
self.sstats = np.transpose(lda_model.state.sstats)
if initialize == 'ldamodel':
self.sstats = np.transpose(lda_model.state.sstats)
@@ -206,7 +212,7 @@ def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter,
topic_bound = self.fit_lda_seq_topics(topic_suffstats)
bound += topic_bound
- if ((bound - old_bound) < 0):
+ if (bound - old_bound) < 0:
# if max_iter is too low, increase iterations.
if lda_inference_max_iter < LOWER_ITER:
lda_inference_max_iter *= ITER_MULT_LOW
@@ -227,7 +233,8 @@ def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter,
return bound
- def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods, iter_, lda_inference_max_iter, chunksize):
+ def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods,
+ iter_, lda_inference_max_iter, chunksize):
"""
Inference or E- Step.
This is used to set up the gensim LdaModel to be used for each time-slice.
@@ -243,14 +250,21 @@ def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods, iter_, lda_infe
model = "DTM"
if model == "DTM":
- bound, gammas = self.inferDTMseq(corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound, lda_inference_max_iter, chunksize)
+ bound, gammas = self.inferDTMseq(
+ corpus, topic_suffstats, gammas, lhoods, lda,
+ ldapost, iter_, bound, lda_inference_max_iter, chunksize
+ )
elif model == "DIM":
self.InfluenceTotalFixed(corpus)
- bound, gammas = self.inferDIMseq(corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound, lda_inference_max_iter, chunksize)
+ bound, gammas = self.inferDIMseq(
+ corpus, topic_suffstats, gammas, lhoods, lda,
+ ldapost, iter_, bound, lda_inference_max_iter, chunksize
+ )
return bound, gammas
- def inferDTMseq(self, corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound, lda_inference_max_iter, chunksize):
+ def inferDTMseq(self, corpus, topic_suffstats, gammas, lhoods, lda,
+ ldapost, iter_, bound, lda_inference_max_iter, chunksize):
"""
Computes the likelihood of a sequential corpus under an LDA seq model, and return the likelihood bound.
Need to pass the LdaSeq model, corpus, sufficient stats, gammas and lhoods matrices previously created,
@@ -281,9 +295,13 @@ def inferDTMseq(self, corpus, topic_suffstats, gammas, lhoods, lda, ldapost, ite
# TODO: replace fit_lda_post with appropriate ldamodel functions, if possible.
if iter_ == 0:
- doc_lhood = LdaPost.fit_lda_post(ldapost, doc_num, time, None, lda_inference_max_iter=lda_inference_max_iter)
+ doc_lhood = LdaPost.fit_lda_post(
+ ldapost, doc_num, time, None, lda_inference_max_iter=lda_inference_max_iter
+ )
else:
- doc_lhood = LdaPost.fit_lda_post(ldapost, doc_num, time, self, lda_inference_max_iter=lda_inference_max_iter)
+ doc_lhood = LdaPost.fit_lda_post(
+ ldapost, doc_num, time, self, lda_inference_max_iter=lda_inference_max_iter
+ )
if topic_suffstats is not None:
topic_suffstats = LdaPost.update_lda_seq_ss(ldapost, time, doc, topic_suffstats)
@@ -310,7 +328,6 @@ def fit_lda_seq_topics(self, topic_suffstats):
Fit lda sequence topic wise.
"""
lhood = 0
- lhood_term = 0
for k, chain in enumerate(self.topic_chains):
logger.info("Fitting topic number %i", k)
@@ -416,8 +433,6 @@ def __getitem__(self, doc):
# should even the likelihoods be returned?
return doc_topic
-# endclass LdaSeqModel
-
class sslm(utils.SaveLoad):
"""
@@ -593,10 +608,8 @@ def fit_sslm(self, sstats):
sslm_max_iter = 2
converged = sslm_fit_threshold + 1
- totals = np.zeros(sstats.shape[1])
-
# computing variance, fwd_variance
- self.variance, self.fwd_variance = map(np.array, list(zip(*[self.compute_post_variance(w, self.chain_variance) for w in range(0, W)])))
+        self.variance, self.fwd_variance = (np.array(x) for x in zip(*[self.compute_post_variance(w, self.chain_variance) for w in range(0, W)]))
# column sum of sstats
totals = sstats.sum(axis=0)
@@ -631,8 +644,8 @@ def compute_bound(self, sstats, totals):
Compute log probability bound.
Forumula is as described in appendix of DTM by Blei. (formula no. 5)
"""
- W = self.vocab_len
- T = self.num_time_slices
+        vocab_len = self.vocab_len
+        num_time_slices = self.num_time_slices
term_1 = 0
term_2 = 0
@@ -643,19 +656,19 @@ def compute_bound(self, sstats, totals):
chain_variance = self.chain_variance
# computing mean, fwd_mean
- self.mean, self.fwd_mean = map(np.array, (zip(*[self.compute_post_mean(w, self.chain_variance) for w in range(0, W)])))
+        self.mean, self.fwd_mean = (np.array(x) for x in zip(*[self.compute_post_mean(w, self.chain_variance) for w in range(0, vocab_len)]))
self.zeta = self.update_zeta()
- for w in range(0, W):
- val += (self.variance[w][0] - self.variance[w][T]) / 2 * chain_variance
+        for w in range(0, vocab_len):
+            val += (self.variance[w][0] - self.variance[w][num_time_slices]) / 2 * chain_variance
logger.info("Computing bound, all times")
- for t in range(1, T + 1):
+        for t in range(1, num_time_slices + 1):
term_1 = 0.0
term_2 = 0.0
ent = 0.0
- for w in range(0, W):
+            for w in range(0, vocab_len):
m = self.mean[w][t]
prev_m = self.mean[w][t - 1]
@@ -725,7 +738,9 @@ def update_obs(self, sstats, totals):
if model == "DTM":
# slowest part of method
- obs = optimize.fmin_cg(f=f_obs, fprime=df_obs, x0=obs, gtol=TOL, args=args, epsilon=STEP_SIZE, disp=0)
+ obs = optimize.fmin_cg(
+ f=f_obs, fprime=df_obs, x0=obs, gtol=TOL, args=args, epsilon=STEP_SIZE, disp=0
+ )
if model == "DIM":
pass
runs += 1
@@ -803,7 +818,6 @@ def compute_obs_deriv(self, word, word_counts, totals, mean_deriv_mtx, deriv):
for u in range(1, T + 1):
mean_u = mean[u]
- variance_u_prev = variance[u - 1] # noqa:F841
mean_u_prev = mean[u - 1]
dmean_u = mean_deriv[u]
dmean_u_prev = mean_deriv[u - 1]
@@ -1026,7 +1040,6 @@ def update_lda_seq_ss(self, time, doc, topic_suffstats):
topic_suffstats[k] = topic_ss
return topic_suffstats
-# endclass LdaPost
# the following functions are used in update_obs as the function to optimize
@@ -1060,7 +1073,6 @@ def f_obs(x, *args):
for t in range(1, T + 1):
mean_t = mean[t]
mean_t_prev = mean[t - 1]
- var_t_prev = variance[t - 1] # noqa:F841
val = mean_t - mean_t_prev
term1 += val * val
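
Note: in compute_bound above, lowercasing W/T directly onto the names already used as loop variables would let the loops overwrite their own bounds, which is why the hunk keeps the bounds under distinct names. A tiny standalone demonstration of the shadowing problem (not part of the patch):

    w = 5                    # intended bound, e.g. the vocabulary length
    for w in range(0, w):    # range(0, 5) is computed once, then w is rebound each pass
        pass
    print(w)                 # 4 -- the bound is gone

    # A later "for w in range(0, w)" would now run only 4 times; keeping the bound
    # under its own name avoids that silent shrinkage:
    vocab_len = 5
    for w in range(0, vocab_len):
        pass
    print(vocab_len)         # still 5
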
diff --git a/gensim/models/logentropy_model.py b/gensim/models/logentropy_model.py
index d4bfc93479..05f79ae3c2 100644
--- a/gensim/models/logentropy_model.py
+++ b/gensim/models/logentropy_model.py
@@ -45,7 +45,7 @@ class LogEntropyModel(interfaces.TransformationABC):
Model persistency is achieved via its load/save methods.
"""
- def __init__(self, corpus, id2word=None, normalize=True):
+ def __init__(self, corpus, normalize=True):
"""
`normalize` dictates whether the resulting vectors will be
set to unit length.
@@ -58,8 +58,7 @@ def __init__(self, corpus, id2word=None, normalize=True):
self.initialize(corpus)
def __str__(self):
- return "LogEntropyModel(n_docs=%s, n_words=%s)" % (self.n_docs,
- self.n_words)
+ return "LogEntropyModel(n_docs=%s, n_words=%s)" % (self.n_docs, self.n_words)
def initialize(self, corpus):
"""
@@ -71,7 +70,7 @@ def initialize(self, corpus):
glob_num_words, doc_no = 0, -1
for doc_no, bow in enumerate(corpus):
if doc_no % 10000 == 0:
- logger.info("PROGRESS: processing document #%i" % doc_no)
+ logger.info("PROGRESS: processing document #%i", doc_no)
glob_num_words += len(bow)
for term_id, term_count in bow:
glob_freq[term_id] = glob_freq.get(term_id, 0) + term_count
@@ -81,14 +80,14 @@ def initialize(self, corpus):
self.n_words = glob_num_words
# and finally compute the global weights
- logger.info("calculating global log entropy weights for %i "
- "documents and %i features (%i matrix non-zeros)"
- % (self.n_docs, len(glob_freq), self.n_words))
+ logger.info(
+ "calculating global log entropy weights for %i documents and %i features (%i matrix non-zeros)",
+ self.n_docs, len(glob_freq), self.n_words
+ )
logger.debug('iterating over corpus')
for doc_no2, bow in enumerate(corpus):
for key, freq in bow:
- p = (float(freq) / glob_freq[key]) * math.log(float(freq) /
- glob_freq[key])
+ p = (float(freq) / glob_freq[key]) * math.log(float(freq) / glob_freq[key])
self.entr[key] = self.entr.get(key, 0.0) + p
if doc_no2 != doc_no:
raise ValueError("LogEntropyModel doesn't support generators as training data")
@@ -107,8 +106,11 @@ def __getitem__(self, bow):
return self._apply(bow)
# unknown (new) terms will be given zero weight (NOT infinity/huge)
- vector = [(term_id, math.log(tf + 1) * self.entr.get(term_id))
- for term_id, tf in bow if term_id in self.entr]
+ vector = [
+ (term_id, math.log(tf + 1) * self.entr.get(term_id))
+ for term_id, tf in bow
+ if term_id in self.entr
+ ]
if self.normalize:
vector = matutils.unitvec(vector)
return vector
diff --git a/gensim/models/lsi_dispatcher.py b/gensim/models/lsi_dispatcher.py
index 5a69327522..dd18734dfb 100755
--- a/gensim/models/lsi_dispatcher.py
+++ b/gensim/models/lsi_dispatcher.py
@@ -80,7 +80,7 @@ def initialize(self, **model_params):
worker = Pyro4.Proxy(uri)
workerid = len(self.workers)
# make time consuming methods work asynchronously
- logger.info("registering worker #%i from %s" % (workerid, uri))
+ logger.info("registering worker #%i from %s", workerid, uri)
worker.initialize(workerid, dispatcher=self.callback, **model_params)
self.workers[workerid] = worker
except Pyro4.errors.PyroError:
@@ -99,16 +99,16 @@ def getworkers(self):
@Pyro4.expose
def getjob(self, worker_id):
- logger.info("worker #%i requesting a new job" % worker_id)
+ logger.info("worker #%i requesting a new job", worker_id)
job = self.jobs.get(block=True, timeout=1)
- logger.info("worker #%i got a new job (%i left)" % (worker_id, self.jobs.qsize()))
+ logger.info("worker #%i got a new job (%i left)", worker_id, self.jobs.qsize())
return job
@Pyro4.expose
def putjob(self, job):
self._jobsreceived += 1
self.jobs.put(job, block=True, timeout=HUGE_TIMEOUT)
- logger.info("added a new job (len(queue)=%i items)" % self.jobs.qsize())
+ logger.info("added a new job (len(queue)=%i items)", self.jobs.qsize())
@Pyro4.expose
def getstate(self):
@@ -116,7 +116,7 @@ def getstate(self):
Merge projections from across all workers and return the final projection.
"""
logger.info("end of input, assigning all remaining jobs")
- logger.debug("jobs done: %s, jobs received: %s" % (self._jobsdone, self._jobsreceived))
+ logger.debug("jobs done: %s, jobs received: %s", self._jobsdone, self._jobsreceived)
while self._jobsdone < self._jobsreceived:
time.sleep(0.5) # check every half a second
@@ -124,11 +124,11 @@ def getstate(self):
# and not `workers - 1` merges!
# but merging only takes place once, after all input data has been processed,
# so the overall effect would be small... compared to the amount of coding :-)
- logger.info("merging states from %i workers" % len(self.workers))
+        logger.info("merging states from %i workers", len(self.workers))
workers = list(self.workers.items())
result = workers[0][1].getstate()
for workerid, worker in workers[1:]:
- logger.info("pulling state from worker %s" % workerid)
+ logger.info("pulling state from worker %s", workerid)
result.merge(worker.getstate())
logger.info("sending out merged projection")
return result
@@ -139,7 +139,7 @@ def reset(self):
Initialize all workers for a new decomposition.
"""
for workerid, worker in iteritems(self.workers):
- logger.info("resetting worker %s" % workerid)
+ logger.info("resetting worker %s", workerid)
worker.reset()
worker.requestjob()
self._jobsdone = 0
@@ -157,7 +157,7 @@ def jobdone(self, workerid):
worker.requestjob().
"""
self._jobsdone += 1
- logger.info("worker #%s finished job #%i" % (workerid, self._jobsdone))
+ logger.info("worker #%s finished job #%i", workerid, self._jobsdone)
worker = self.workers[workerid]
worker.requestjob() # tell the worker to ask for another job, asynchronously (one-way)
@@ -171,7 +171,7 @@ def exit(self):
Terminate all registered workers and then the dispatcher.
"""
for workerid, worker in iteritems(self.workers):
- logger.info("terminating worker %s" % workerid)
+ logger.info("terminating worker %s", workerid)
worker.exit()
logger.info("terminating dispatcher")
os._exit(0) # exit the whole process (not just this thread ala sys.exit())
@@ -180,7 +180,7 @@ def exit(self):
def main():
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
- logger.info("running %s" % " ".join(sys.argv))
+ logger.info("running %s", " ".join(sys.argv))
program = os.path.basename(sys.argv[0])
# make sure we have enough cmd line parameters
@@ -194,7 +194,7 @@ def main():
maxsize = int(sys.argv[1])
utils.pyro_daemon('gensim.lsi_dispatcher', Dispatcher(maxsize=maxsize))
- logger.info("finished running %s" % program)
+ logger.info("finished running %s", program)
if __name__ == '__main__':
diff --git a/gensim/models/lsi_worker.py b/gensim/models/lsi_worker.py
index 4cae372ffd..ffb31eafb9 100755
--- a/gensim/models/lsi_worker.py
+++ b/gensim/models/lsi_worker.py
@@ -47,7 +47,7 @@ def initialize(self, myid, dispatcher, **model_params):
self.myid = myid # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove?
self.dispatcher = dispatcher
self.finished = False
- logger.info("initializing worker #%s" % myid)
+ logger.info("initializing worker #%s", myid)
self.model = lsimodel.LsiModel(**model_params)
@Pyro4.expose
@@ -67,11 +67,11 @@ def requestjob(self):
# no new job: try again, unless we're finished with all work
continue
if job is not None:
- logger.info("worker #%s received job #%i" % (self.myid, self.jobsdone))
+ logger.info("worker #%s received job #%i", self.myid, self.jobsdone)
self.processjob(job)
self.dispatcher.jobdone(self.myid)
else:
- logger.info("worker #%i stopping asking for jobs" % self.myid)
+ logger.info("worker #%i stopping asking for jobs", self.myid)
@utils.synchronous('lock_update')
def processjob(self, job):
@@ -84,8 +84,7 @@ def processjob(self, job):
@Pyro4.expose
@utils.synchronous('lock_update')
def getstate(self):
- logger.info("worker #%i returning its state after %s jobs" %
- (self.myid, self.jobsdone))
+ logger.info("worker #%i returning its state after %s jobs", self.myid, self.jobsdone)
assert isinstance(self.model.projection, lsimodel.Projection)
self.finished = True
return self.model.projection
@@ -93,20 +92,20 @@ def getstate(self):
@Pyro4.expose
@utils.synchronous('lock_update')
def reset(self):
- logger.info("resetting worker #%i" % self.myid)
+ logger.info("resetting worker #%i", self.myid)
self.model.projection = self.model.projection.empty_like()
self.finished = False
@Pyro4.oneway
def exit(self):
- logger.info("terminating worker #%i" % self.myid)
+ logger.info("terminating worker #%i", self.myid)
os._exit(0)
# endclass Worker
def main():
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
- logger.info("running %s" % " ".join(sys.argv))
+ logger.info("running %s", " ".join(sys.argv))
program = os.path.basename(sys.argv[0])
# make sure we have enough cmd line parameters
@@ -116,7 +115,7 @@ def main():
utils.pyro_daemon('gensim.lsi_worker', Worker(), random_suffix=True)
- logger.info("finished running %s" % program)
+ logger.info("finished running %s", program)
if __name__ == '__main__':
diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py
index f9e16cd23b..e55a009ab8 100644
--- a/gensim/models/lsimodel.py
+++ b/gensim/models/lsimodel.py
@@ -161,8 +161,9 @@ def merge(self, other, decay=1.0):
self.s = other.s.copy()
return
if self.m != other.m:
- raise ValueError("vector space mismatch: update is using %s features, expected %s" %
- (other.m, self.m))
+ raise ValueError(
+ "vector space mismatch: update is using %s features, expected %s" % (other.m, self.m)
+ )
logger.info("merging projections: %s + %s", str(self.u.shape), str(other.u.shape))
m, n1, n2 = self.u.shape[0], self.u.shape[1], other.u.shape[1]
# TODO Maybe keep the bases as elementary reflectors, without
@@ -182,8 +183,10 @@ def merge(self, other, decay=1.0):
assert not other.u
# find the rotation that diagonalizes r
- k = np.bmat([[np.diag(decay * self.s), np.multiply(c, other.s)],
- [matutils.pad(np.array([]).reshape(0, 0), min(m, n2), n1), np.multiply(r, other.s)]])
+ k = np.bmat([
+ [np.diag(decay * self.s), np.multiply(c, other.s)],
+ [matutils.pad(np.array([]).reshape(0, 0), min(m, n2), n1), np.multiply(r, other.s)]
+ ])
logger.debug("computing SVD of %s dense matrix", k.shape)
try:
# in np < 1.1.0, running SVD sometimes results in "LinAlgError: SVD did not converge'.
@@ -216,9 +219,6 @@ def merge(self, other, decay=1.0):
for i in xrange(self.u.shape[1]):
if self.u[0, i] < 0.0:
self.u[:, i] *= -1.0
-# diff = np.dot(self.u.T, self.u) - np.eye(self.u.shape[1])
-# logger.info('orth error after=%f' % np.sum(diff * diff))
-# endclass Projection
class LsiModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
@@ -300,7 +300,9 @@ def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000,
self.num_terms = 1 + (max(self.id2word.keys()) if self.id2word else -1)
self.docs_processed = 0
- self.projection = Projection(self.num_terms, self.num_topics, power_iters=self.power_iters, extra_dims=self.extra_samples)
+ self.projection = Projection(
+ self.num_terms, self.num_topics, power_iters=self.power_iters, extra_dims=self.extra_samples
+ )
self.numworkers = 1
if not distributed:
@@ -308,16 +310,18 @@ def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000,
self.dispatcher = None
else:
if not onepass:
- raise NotImplementedError("distributed stochastic LSA not implemented yet; "
- "run either distributed one-pass, or serial randomized.")
+ raise NotImplementedError(
+ "distributed stochastic LSA not implemented yet; "
+ "run either distributed one-pass, or serial randomized."
+ )
try:
import Pyro4
dispatcher = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher')
logger.debug("looking for dispatcher at %s", str(dispatcher._pyroUri))
- dispatcher.initialize(id2word=self.id2word, num_topics=num_topics,
- chunksize=chunksize, decay=decay,
- power_iters=self.power_iters, extra_samples=self.extra_samples,
- distributed=False, onepass=onepass)
+ dispatcher.initialize(
+ id2word=self.id2word, num_topics=num_topics, chunksize=chunksize, decay=decay,
+ power_iters=self.power_iters, extra_samples=self.extra_samples, distributed=False, onepass=onepass
+ )
self.dispatcher = dispatcher
self.numworkers = len(dispatcher.getworkers())
logger.info("using distributed version with %i workers", self.numworkers)
@@ -359,7 +363,8 @@ def add_documents(self, corpus, chunksize=None, decay=None):
update.u, update.s = stochastic_svd(
corpus, self.num_topics,
num_terms=self.num_terms, chunksize=chunksize,
- extra_dims=self.extra_samples, power_iters=self.power_iters)
+ extra_dims=self.extra_samples, power_iters=self.power_iters
+ )
self.projection.merge(update, decay=decay)
self.docs_processed += len(corpus) if hasattr(corpus, '__len__') else 0
else:
@@ -385,7 +390,10 @@ def add_documents(self, corpus, chunksize=None, decay=None):
logger.info("dispatched documents up to #%s", doc_no)
else:
# serial version, there is only one "worker" (myself) => process the job directly
- update = Projection(self.num_terms, self.num_topics, job, extra_dims=self.extra_samples, power_iters=self.power_iters)
+ update = Projection(
+ self.num_terms, self.num_topics, job, extra_dims=self.extra_samples,
+ power_iters=self.power_iters
+ )
del job
self.projection.merge(update, decay=decay)
del update
@@ -397,19 +405,21 @@ def add_documents(self, corpus, chunksize=None, decay=None):
logger.info("reached the end of input; now waiting for all remaining jobs to finish")
self.projection = self.dispatcher.getstate()
self.docs_processed += doc_no
-# logger.info("top topics after adding %i documents" % doc_no)
-# self.print_debug(10)
else:
assert not self.dispatcher, "must be in serial mode to receive jobs"
assert self.onepass, "distributed two-pass algo not supported yet"
- update = Projection(self.num_terms, self.num_topics, corpus.tocsc(), extra_dims=self.extra_samples, power_iters=self.power_iters)
+ update = Projection(
+ self.num_terms, self.num_topics, corpus.tocsc(), extra_dims=self.extra_samples,
+ power_iters=self.power_iters
+ )
self.projection.merge(update, decay=decay)
logger.info("processed sparse job of %i documents", corpus.shape[1])
self.docs_processed += corpus.shape[1]
def __str__(self):
return "LsiModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % (
- self.num_terms, self.num_topics, self.decay, self.chunksize)
+ self.num_terms, self.num_topics, self.decay, self.chunksize
+ )
def __getitem__(self, bow, scaled=False, chunksize=512):
"""
@@ -579,9 +589,8 @@ def load(cls, fname, *args, **kwargs):
try:
result.projection = super(LsiModel, cls).load(projection_fname, *args, **kwargs)
except Exception as e:
- logging.warning("failed to load projection from %s: %s" % (projection_fname, e))
+ logging.warning("failed to load projection from %s: %s", projection_fname, e)
return result
-# endclass LsiModel
def print_debug(id2token, u, s, topics, num_words=10, num_neg=None):
@@ -679,7 +688,7 @@ def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None,
q, _ = matutils.qr_destroy(y) # orthonormalize the range
logger.debug("running %i power iterations", power_iters)
- for power_iter in xrange(power_iters):
+ for _ in xrange(power_iters):
q = corpus.T * q
q = [corpus * q]
q, _ = matutils.qr_destroy(q) # orthonormalize the range after each power iteration step
@@ -697,8 +706,10 @@ def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None,
num_docs += n
logger.debug("multiplying chunk * gauss")
o = np.random.normal(0.0, 1.0, (n, samples)).astype(dtype) # draw a random gaussian matrix
- sparsetools.csc_matvecs(m, n, samples, chunk.indptr, chunk.indices, # y = y + chunk * o
- chunk.data, o.ravel(), y.ravel())
+ sparsetools.csc_matvecs(
+ m, n, samples, chunk.indptr, chunk.indices, # y = y + chunk * o
+ chunk.data, o.ravel(), y.ravel()
+ )
del chunk, o
y = [y]
q, _ = matutils.qr_destroy(y) # orthonormalize the range
@@ -723,7 +734,7 @@ def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None,
if scipy.sparse.issparse(corpus):
b = qt * corpus
- logger.info("2nd phase: running dense svd on %s matrix" % str(b.shape))
+ logger.info("2nd phase: running dense svd on %s matrix", str(b.shape))
u, s, vt = scipy.linalg.svd(b, full_matrices=False)
del b, vt
else:
diff --git a/gensim/models/normmodel.py b/gensim/models/normmodel.py
index a78dc604dc..31e843beb3 100644
--- a/gensim/models/normmodel.py
+++ b/gensim/models/normmodel.py
@@ -55,7 +55,7 @@ def calc_norm(self, corpus):
"""
Calculates the norm by calling matutils.unitvec with the norm parameter.
"""
- logger.info("Performing %s normalization..." % (self.norm))
+ logger.info("Performing %s normalization...", self.norm)
norms = []
numnnz = 0
docno = 0
@@ -73,4 +73,3 @@ def normalize(self, bow):
def __getitem__(self, bow):
return self.normalize(bow)
-# endclass NormModel
diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index 1f0826258c..263968526f 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -108,9 +108,8 @@ class Phrases(interfaces.TransformationABC):
"""
- def __init__(self, sentences=None, min_count=5, threshold=10.0,
- max_vocab_size=40000000, delimiter=b'_', progress_per=10000,
- scoring='default'):
+ def __init__(self, sentences=None, min_count=5, threshold=10.0, max_vocab_size=40000000,
+ delimiter=b'_', progress_per=10000, scoring='default'):
"""
Initialize the model from an iterable of `sentences`. Each sentence must be
a list of words (unicode strings) that will be used for training.
@@ -179,7 +178,8 @@ def __str__(self):
"""Get short string representation of this phrase detector."""
return "%s<%i vocab, min_count=%s, threshold=%s, max_vocab_size=%s>" % (
self.__class__.__name__, len(self.vocab), self.min_count,
- self.threshold, self.max_vocab_size)
+ self.threshold, self.max_vocab_size
+ )
@staticmethod
def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000):
@@ -191,8 +191,10 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000):
min_reduce = 1
for sentence_no, sentence in enumerate(sentences):
if sentence_no % progress_per == 0:
- logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
- (sentence_no, total_words, len(vocab)))
+ logger.info(
+ "PROGRESS: at sentence #%i, processed %i words and %i word types",
+ sentence_no, total_words, len(vocab)
+ )
sentence = [utils.any2utf8(w) for w in sentence]
for bigram in zip(sentence, sentence[1:]):
vocab[bigram[0]] += 1
@@ -208,8 +210,10 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000):
utils.prune_vocab(vocab, min_reduce)
min_reduce += 1
- logger.info("collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences" %
- (len(vocab), total_words, sentence_no + 1))
+ logger.info(
+ "collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences",
+ len(vocab), total_words, sentence_no + 1
+ )
return min_reduce, vocab, total_words
def add_vocab(self, sentences):
@@ -262,11 +266,9 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):
corpus_word_count = self.corpus_word_count
if scoring == 'default':
- scoring_function = \
- partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count))
+ scoring_function = partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count))
elif scoring == 'npmi':
- scoring_function = \
- partial(self.npmi_scorer, corpus_word_count=corpus_word_count)
+ scoring_function = partial(self.npmi_scorer, corpus_word_count=corpus_word_count)
# no else here to catch unknown scoring function, check is done in Phrases.__init__
for sentence in sentences:
@@ -282,10 +284,6 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):
count_b = float(vocab[word_b])
count_ab = float(vocab[bigram_word])
score = scoring_function(count_a, count_b, count_ab)
- # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s",
- # bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score)
- # added mincount check because if the scorer doesn't contain min_count
- # it would not be enforced otherwise
if score > threshold and count_ab >= min_count:
if as_tuples:
yield ((word_a, word_b), score)
@@ -336,8 +334,6 @@ def __getitem__(self, sentence):
pb = float(vocab[word_b])
pab = float(vocab[bigram_word])
score = (pab - min_count) / pa / pb * len(vocab)
- # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s",
- # bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score)
if score > threshold:
new_s.append(bigram_word)
last_bigram = True
@@ -453,7 +449,7 @@ def __getitem__(self, sentence):
if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
- logging.info("running %s" % " ".join(sys.argv))
+ logging.info("running %s", " ".join(sys.argv))
# check and process cmdline input
program = os.path.basename(sys.argv[0])
diff --git a/gensim/models/rpmodel.py b/gensim/models/rpmodel.py
index 1186a041a0..f5753c75c5 100644
--- a/gensim/models/rpmodel.py
+++ b/gensim/models/rpmodel.py
@@ -61,7 +61,7 @@ def initialize(self, corpus):
self.num_terms = 1 + max([-1] + self.id2word.keys())
shape = self.num_topics, self.num_terms
- logger.info("constructing %s random matrix" % str(shape))
+ logger.info("constructing %s random matrix", str(shape))
# Now construct the projection matrix itself.
# Here i use a particular form, derived in "Achlioptas: Database-friendly random projection",
# and his (1) scenario of Theorem 1.1 in particular (all entries are +1/-1).
@@ -89,10 +89,11 @@ def __getitem__(self, bow):
vec = matutils.sparse2full(bow, self.num_terms).reshape(self.num_terms, 1) / np.sqrt(self.num_topics)
vec = np.asfortranarray(vec, dtype=np.float32)
topic_dist = np.dot(self.projection, vec) # (k, d) * (d, 1) = (k, 1)
- return [(topicid, float(topicvalue)) for topicid, topicvalue in enumerate(topic_dist.flat)
- if np.isfinite(topicvalue) and not np.allclose(topicvalue, 0.0)]
+ return [
+ (topicid, float(topicvalue)) for topicid, topicvalue in enumerate(topic_dist.flat)
+ if np.isfinite(topicvalue) and not np.allclose(topicvalue, 0.0)
+ ]
def __setstate__(self, state):
self.__dict__ = state
self.freshly_loaded = True
-# endclass RpModel
diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py
index 4b5ba02e02..50320ad747 100644
--- a/gensim/models/tfidfmodel.py
+++ b/gensim/models/tfidfmodel.py
@@ -28,7 +28,7 @@ def precompute_idfs(wglobal, dfs, total_docs):
"""Precompute the inverse document frequency mapping for all terms."""
# not strictly necessary and could be computed on the fly in TfidfModel__getitem__.
# this method is here just to speed things up a little.
- return dict((termid, wglobal(df, total_docs)) for termid, df in iteritems(dfs))
+ return {termid: wglobal(df, total_docs) for termid, df in iteritems(dfs)}
class TfidfModel(interfaces.TransformationABC):
@@ -49,9 +49,8 @@ class TfidfModel(interfaces.TransformationABC):
Model persistency is achieved via its load/save methods.
"""
- def __init__(
- self, corpus=None, id2word=None, dictionary=None,
- wlocal=utils.identity, wglobal=df2idf, normalize=True):
+ def __init__(self, corpus=None, id2word=None, dictionary=None,
+ wlocal=utils.identity, wglobal=df2idf, normalize=True):
"""
Compute tf-idf by multiplying a local component (term frequency) with a
global component (inverse document frequency), and normalizing
@@ -89,7 +88,8 @@ def __init__(
# step that goes through the corpus (= an optimization).
if corpus is not None:
logger.warning(
- "constructor received both corpus and explicit inverse document frequencies; ignoring the corpus")
+ "constructor received both corpus and explicit inverse document frequencies; ignoring the corpus"
+ )
self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz
self.dfs = dictionary.dfs.copy()
self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
@@ -129,7 +129,8 @@ def initialize(self, corpus):
n_features = max(dfs) if dfs else 0
logger.info(
"calculating IDF weights for %i documents and %i features (%i matrix non-zeros)",
- self.num_docs, n_features, self.num_nnz)
+ self.num_docs, n_features, self.num_nnz
+ )
self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
def __getitem__(self, bow, eps=1e-12):
@@ -158,4 +159,3 @@ def __getitem__(self, bow, eps=1e-12):
# make sure there are no explicit zeroes in the vector (must be sparse)
vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps]
return vector
-# endclass TfidfModel
diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 255b9c553f..2111478c00 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -159,7 +159,9 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False):
for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
# don't train on the `word` itself
if pos2 != pos:
- train_sg_pair(model, model.wv.index2word[word.index], word2.index, alpha, compute_loss=compute_loss)
+ train_sg_pair(
+ model, model.wv.index2word[word.index], word2.index, alpha, compute_loss=compute_loss
+ )
result += len(word_vocabs)
return result
@@ -382,11 +384,10 @@ class Word2Vec(utils.SaveLoad):
"""
- def __init__(
- self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
- max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
- sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
- trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False):
+ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
+ max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
+ sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
+ trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False):
"""
Initialize the model from an iterable of `sentences`. Each sentence is a
list of words (unicode strings) that will be used for training.
@@ -460,9 +461,9 @@ def __init__(
self.load = call_on_class_only
if FAST_VERSION == -1:
- logger.warning('Slow version of {0} is being used'.format(__name__))
+ logger.warning('Slow version of %s is being used', __name__)
else:
- logger.debug('Fast version of {0} is being used'.format(__name__))
+ logger.debug('Fast version of %s is being used', __name__)
self.initialize_word_vectors()
self.sg = int(sg)
@@ -498,12 +499,14 @@ def __init__(
if isinstance(sentences, GeneratorType):
raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.")
self.build_vocab(sentences, trim_rule=trim_rule)
- self.train(sentences, total_examples=self.corpus_count, epochs=self.iter,
- start_alpha=self.alpha, end_alpha=self.min_alpha)
+ self.train(
+ sentences, total_examples=self.corpus_count, epochs=self.iter,
+ start_alpha=self.alpha, end_alpha=self.min_alpha
+ )
else:
if trim_rule is not None:
- logger.warning("The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. ")
- logger.warning("Model initialized without sentences. trim_rule provided, if any, will be ignored.")
+ logger.warning(
+ "The rule, if given, is only used to prune vocabulary during build_vocab() "
+ "and is not stored as part of the model. Model initialized without sentences. "
+ "trim_rule provided, if any, will be ignored."
+ )
def initialize_word_vectors(self):
self.wv = KeyedVectors()
@@ -546,7 +549,9 @@ def create_binary_tree(self):
heapq.heapify(heap)
for i in xrange(len(self.wv.vocab) - 1):
min1, min2 = heapq.heappop(heap), heapq.heappop(heap)
- heapq.heappush(heap, Vocab(count=min1.count + min2.count, index=i + len(self.wv.vocab), left=min1, right=min2))
+ heapq.heappush(
+ heap, Vocab(count=min1.count + min2.count, index=i + len(self.wv.vocab), left=min1, right=min2)
+ )
# recurse over the tree, assigning a binary code to each vocabulary word
if heap:
@@ -587,13 +592,16 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
if not checked_string_types:
if isinstance(sentence, string_types):
logger.warning(
- "Each 'sentences' item should be a list of words (usually unicode strings)."
- "First item here is instead plain %s.", type(sentence)
+ "Each 'sentences' item should be a list of words (usually unicode strings). "
+ "First item here is instead plain %s.",
+ type(sentence)
)
checked_string_types += 1
if sentence_no % progress_per == 0:
- logger.info("PROGRESS: at sentence #%i, processed %i words, keeping %i word types",
- sentence_no, sum(itervalues(vocab)) + total_words, len(vocab))
+ logger.info(
+ "PROGRESS: at sentence #%i, processed %i words, keeping %i word types",
+ sentence_no, sum(itervalues(vocab)) + total_words, len(vocab)
+ )
for word in sentence:
vocab[word] += 1
@@ -602,12 +610,15 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
min_reduce += 1
total_words += sum(itervalues(vocab))
- logger.info("collected %i word types from a corpus of %i raw words and %i sentences",
- len(vocab), total_words, sentence_no + 1)
+ logger.info(
+ "collected %i word types from a corpus of %i raw words and %i sentences",
+ len(vocab), total_words, sentence_no + 1
+ )
self.corpus_count = sentence_no + 1
self.raw_vocab = vocab
- def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab=False, trim_rule=None, update=False):
+ def scale_vocab(self, min_count=None, sample=None, dry_run=False,
+ keep_raw_vocab=False, trim_rule=None, update=False):
"""
Apply vocabulary settings for `min_count` (discarding less-frequent words)
and `sample` (controlling the downsampling of more-frequent words).
@@ -648,12 +659,16 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab
drop_total += v
original_unique_total = len(retain_words) + drop_unique
retain_unique_pct = len(retain_words) * 100 / max(original_unique_total, 1)
- logger.info("min_count=%d retains %i unique words (%i%% of original %i, drops %i)",
- min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique)
+ logger.info(
+ "min_count=%d retains %i unique words (%i%% of original %i, drops %i)",
+ min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique
+ )
original_total = retain_total + drop_total
retain_pct = retain_total * 100 / max(original_total, 1)
- logger.info("min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)",
- min_count, retain_total, retain_pct, original_total, drop_total)
+ logger.info(
+ "min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)",
+ min_count, retain_total, retain_pct, original_total, drop_total
+ )
else:
logger.info("Updating model with new vocabulary")
new_total = pre_exist_total = 0
@@ -677,10 +692,12 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab
original_unique_total = len(pre_exist_words) + len(new_words) + drop_unique
pre_exist_unique_pct = len(pre_exist_words) * 100 / max(original_unique_total, 1)
new_unique_pct = len(new_words) * 100 / max(original_unique_total, 1)
- logger.info("""New added %i unique words (%i%% of original %i)
- and increased the count of %i pre-existing words (%i%% of original %i)""",
- len(new_words), new_unique_pct, original_unique_total,
- len(pre_exist_words), pre_exist_unique_pct, original_unique_total)
+ logger.info(
+ "New added %i unique words (%i%% of original %i) "
+ "and increased the count of %i pre-existing words (%i%% of original %i)",
+ len(new_words), new_unique_pct, original_unique_total, len(pre_exist_words),
+ pre_exist_unique_pct, original_unique_total
+ )
retain_words = new_words + pre_exist_words
retain_total = new_total + pre_exist_total
@@ -713,15 +730,16 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab
self.raw_vocab = defaultdict(int)
logger.info("sample=%g downsamples %i most-common words", sample, downsample_unique)
- logger.info("downsampling leaves estimated %i word corpus (%.1f%% of prior %i)",
- downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total)
-
- # return from each step: words-affected, resulting-corpus-size
- report_values = {'drop_unique': drop_unique, 'retain_total': retain_total,
- 'downsample_unique': downsample_unique, 'downsample_total': int(downsample_total)}
+ logger.info(
+ "downsampling leaves estimated %i word corpus (%.1f%% of prior %i)",
+ downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total
+ )
- # print extra memory estimates
- report_values['memory'] = self.estimate_memory(vocab_size=len(retain_words))
+ # return from each step: words-affected, resulting-corpus-size, extra memory estimates
+ report_values = {
+ 'drop_unique': drop_unique, 'retain_total': retain_total, 'downsample_unique': downsample_unique,
+ 'downsample_total': int(downsample_total), 'memory': self.estimate_memory(vocab_size=len(retain_words))
+ }
return report_values
@@ -787,8 +805,7 @@ def _raw_word_count(self, job):
return sum(len(sentence) for sentence in job)
def train(self, sentences, total_examples=None, total_words=None,
- epochs=None, start_alpha=None, end_alpha=None,
- word_count=0,
+ epochs=None, start_alpha=None, end_alpha=None, word_count=0,
queue_factor=2, report_delay=1.0, compute_loss=None):
"""
Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
@@ -804,11 +821,13 @@ def train(self, sentences, total_examples=None, total_words=None,
explicit `epochs` argument MUST be provided. In the common and recommended case, where `train()`
is only called once, the model's cached `iter` value should be supplied as `epochs` value.
"""
- if (self.model_trimmed_post_training):
+ if self.model_trimmed_post_training:
raise RuntimeError("Parameters for training were discarded using model_trimmed_post_training method")
if FAST_VERSION < 0:
- warnings.warn("C extension not loaded for Word2Vec, training will be slow. "
- "Install a C compiler and reinstall gensim for fast training.")
+ warnings.warn(
+ "C extension not loaded for Word2Vec, training will be slow. "
+ "Install a C compiler and reinstall gensim for fast training."
+ )
self.neg_labels = []
if self.negative > 0:
# precompute negative labels optimization for pure-python training
@@ -822,8 +841,8 @@ def train(self, sentences, total_examples=None, total_words=None,
logger.info(
"training model with %i workers on %i vocabulary and %i features, "
"using sg=%s hs=%s sample=%s negative=%s window=%s",
- self.workers, len(self.wv.vocab), self.layer1_size, self.sg,
- self.hs, self.sample, self.negative, self.window)
+ self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative, self.window
+ )
if not self.wv.vocab:
raise RuntimeError("you must first build vocabulary before training the model")
@@ -834,10 +853,15 @@ def train(self, sentences, total_examples=None, total_words=None,
raise ValueError(
"The number of sentences in the training corpus is missing. Did you load the model via KeyedVectors.load_word2vec_format?"
"Models loaded via load_word2vec_format don't support further training. "
- "Instead start with a blank model, scan_vocab on the new corpus, intersect_word2vec_format with the old model, then train.")
+ "Instead start with a blank model, scan_vocab on the new corpus, "
+ "intersect_word2vec_format with the old model, then train."
+ )
if total_words is None and total_examples is None:
- raise ValueError("You must specify either total_examples or total_words, for proper alpha and progress calculations. The usual value is total_examples=model.corpus_count.")
+ raise ValueError(
+ "You must specify either total_examples or total_words, for proper alpha and progress calculations. "
+ "The usual value is total_examples=model.corpus_count."
+ )
if epochs is None:
raise ValueError("You must specify an explict epochs count. The usual value is epochs=model.iter.")
start_alpha = start_alpha or self.alpha
@@ -872,9 +896,7 @@ def job_producer():
pushed_words, pushed_examples = 0, 0
next_alpha = start_alpha
if next_alpha > self.min_alpha_yet_reached:
- logger.warning(
- "Effective 'alpha' higher than previous training cycles"
- )
+ logger.warning("Effective 'alpha' higher than previous training cycles")
self.min_alpha_yet_reached = next_alpha
job_no = 0
@@ -890,7 +912,8 @@ def job_producer():
# no => submit the existing job
logger.debug(
"queueing job #%i (%i words, %i sentences) at alpha %.05f",
- job_no, batch_size, len(job_batch), next_alpha)
+ job_no, batch_size, len(job_batch), next_alpha
+ )
job_no += 1
job_queue.put((job_batch, next_alpha))
@@ -914,15 +937,15 @@ def job_producer():
if job_batch:
logger.debug(
"queueing job #%i (%i words, %i sentences) at alpha %.05f",
- job_no, batch_size, len(job_batch), next_alpha)
+ job_no, batch_size, len(job_batch), next_alpha
+ )
job_no += 1
job_queue.put((job_batch, next_alpha))
if job_no == 0 and self.train_count == 0:
logger.warning(
"train() called with an empty iterator (if not intended, "
- "be sure to provide a corpus that offers restartable "
- "iteration = an iterable)."
+ "be sure to provide a corpus that offers restartable iteration = an iterable)."
)
# give the workers heads up that they can finish -- no more work!
@@ -967,34 +990,31 @@ def job_producer():
logger.info(
"PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i",
100.0 * example_count / total_examples, trained_word_count / elapsed,
- utils.qsize(job_queue), utils.qsize(progress_queue))
+ utils.qsize(job_queue), utils.qsize(progress_queue)
+ )
else:
# words-based progress %
logger.info(
"PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i",
100.0 * raw_word_count / total_words, trained_word_count / elapsed,
- utils.qsize(job_queue), utils.qsize(progress_queue))
+ utils.qsize(job_queue), utils.qsize(progress_queue)
+ )
next_report = elapsed + report_delay
# all done; report the final stats
elapsed = default_timer() - start
logger.info(
"training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s",
- raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed)
+ raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed
+ )
if job_tally < 10 * self.workers:
- logger.warning(
- "under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay"
- )
+ logger.warning("under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay")
# check that the input corpus hasn't changed during iteration
if total_examples and total_examples != example_count:
- logger.warning(
- "supplied example count (%i) did not equal expected count (%i)", example_count, total_examples
- )
+ logger.warning("supplied example count (%i) did not equal expected count (%i)", example_count, total_examples)
if total_words and total_words != raw_word_count:
- logger.warning(
- "supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words
- )
+ logger.warning("supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words)
self.train_count += 1 # number of times train() has been called
self.total_train_time += elapsed
@@ -1021,21 +1041,25 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor
"""
if FAST_VERSION < 0:
- warnings.warn("C extension compilation failed, scoring will be slow. "
- "Install a C compiler and reinstall gensim for fastness.")
+ warnings.warn(
+ "C extension compilation failed, scoring will be slow. "
+ "Install a C compiler and reinstall gensim for fastness."
+ )
logger.info(
"scoring sentences with %i workers on %i vocabulary and %i features, "
"using sg=%s hs=%s sample=%s and negative=%s",
- self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative)
+ self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative
+ )
if not self.wv.vocab:
raise RuntimeError("you must first build vocabulary before scoring new data")
if not self.hs:
- raise RuntimeError("We have currently only implemented score \
- for the hierarchical softmax scheme, so you need to have \
- run word2vec with hs=1 and negative=0 for this to work.")
+ raise RuntimeError(
+ "We have currently only implemented score for the hierarchical softmax scheme, "
+ "so you need to have run word2vec with hs=1 and negative=0 for this to work."
+ )
def worker_loop():
"""Compute log probability for each sentence, lifting lists of sentences from the jobs queue."""
@@ -1081,15 +1105,14 @@ def worker_loop():
if (job_no - 1) * chunksize > total_sentences:
logger.warning(
"terminating after %i sentences (set higher total_sentences if you want more).",
- total_sentences)
+ total_sentences
+ )
job_no -= 1
raise StopIteration()
logger.debug("putting job #%i in the queue", job_no)
job_queue.put(items)
except StopIteration:
- logger.info(
- "reached end of input; waiting to finish %i outstanding jobs",
- job_no - done_jobs + 1)
+ logger.info("reached end of input; waiting to finish %i outstanding jobs", job_no - done_jobs + 1)
for _ in xrange(self.workers):
job_queue.put(None) # give the workers heads up that they can finish -- no more work!
push_done = True
@@ -1102,7 +1125,8 @@ def worker_loop():
if elapsed >= next_report:
logger.info(
"PROGRESS: at %.2f%% sentences, %.0f sentences/s",
- 100.0 * sentence_count, sentence_count / elapsed)
+ 100.0 * sentence_count, sentence_count / elapsed
+ )
next_report = elapsed + report_delay # don't flood log, wait report_delay seconds
else:
# loop ended by job count; really done
@@ -1114,7 +1138,8 @@ def worker_loop():
self.clear_sims()
logger.info(
"scoring %i sentences took %.1fs, %.0f sentences/s",
- sentence_count, elapsed, sentence_count / elapsed)
+ sentence_count, elapsed, sentence_count / elapsed
+ )
return sentence_scores[:sentence_count]
def clear_sims(self):
@@ -1141,9 +1166,10 @@ def update_weights(self):
# Raise an error if an online update is run before initial training on a corpus
if not len(self.wv.syn0):
- raise RuntimeError("You cannot do an online vocabulary-update of a model which has no prior vocabulary. "
- "First build the vocabulary of your model with a corpus "
- "before doing an online update.")
+ raise RuntimeError(
+ "You cannot do an online vocabulary-update of a model which has no prior vocabulary. "
+ "First build the vocabulary of your model with a corpus before doing an online update."
+ )
self.wv.syn0 = vstack([self.wv.syn0, newsyn0])
@@ -1192,16 +1218,16 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut
training. Use 1.0 to allow further training updates of merged vectors.
"""
overlap_count = 0
- logger.info("loading projection weights from %s" % (fname))
+ logger.info("loading projection weights from %s", fname)
with utils.smart_open(fname) as fin:
header = utils.to_unicode(fin.readline(), encoding=encoding)
- vocab_size, vector_size = map(int, header.split()) # throws for invalid file format
+ vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format
if not vector_size == self.vector_size:
raise ValueError("incompatible vector size %d in file %s" % (vector_size, fname))
# TOCONSIDER: maybe mismatched vectors still useful enough to merge (truncating/padding)?
if binary:
binary_len = dtype(REAL).itemsize * vector_size
- for line_no in xrange(vocab_size):
+ for _ in xrange(vocab_size):
# mixed text and binary: read text first, then binary
word = []
while True:
@@ -1220,13 +1246,13 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut
for line_no, line in enumerate(fin):
parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
if len(parts) != vector_size + 1:
- raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
- word, weights = parts[0], list(map(REAL, parts[1:]))
+ raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no)
+ word, weights = parts[0], [REAL(x) for x in parts[1:]]
if word in self.wv.vocab:
overlap_count += 1
self.wv.syn0[self.wv.vocab[word].index] = weights
self.syn0_lockf[self.wv.vocab[word].index] = lockf # lock-factor: 0.0 stops further changes
- logger.info("merged %d vectors into %s matrix from %s" % (overlap_count, self.wv.syn0.shape, fname))
+ logger.info("merged %d vectors into %s matrix from %s", overlap_count, self.wv.syn0.shape, fname)
def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None):
"""
@@ -1301,9 +1327,10 @@ def n_similarity(self, ws1, ws2):
def predict_output_word(self, context_words_list, topn=10):
"""Report the probability distribution of the center word given the context words as input to the trained model."""
if not self.negative:
- raise RuntimeError("We have currently only implemented predict_output_word "
- "for the negative sampling scheme, so you need to have "
- "run word2vec with negative > 0 for this to work.")
+ raise RuntimeError(
+ "We have currently only implemented predict_output_word for the negative sampling scheme, "
+ "so you need to have run word2vec with negative > 0 for this to work."
+ )
if not hasattr(self.wv, 'syn0') or not hasattr(self, 'syn1neg'):
raise RuntimeError("Parameters required for predicting the output words not found.")
@@ -1344,8 +1371,10 @@ def estimate_memory(self, vocab_size=None, report=None):
if self.negative:
report['syn1neg'] = vocab_size * self.layer1_size * dtype(REAL).itemsize
report['total'] = sum(report.values())
- logger.info("estimated required memory for %i words and %i dimensions: %i bytes",
- vocab_size, self.vector_size, report['total'])
+ logger.info(
+ "estimated required memory for %i words and %i dimensions: %i bytes",
+ vocab_size, self.vector_size, report['total']
+ )
return report
@staticmethod
@@ -1375,7 +1404,11 @@ def __str__(self):
return "%s(vocab=%s, size=%s, alpha=%s)" % (self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha)
def _minimize_model(self, save_syn1=False, save_syn1neg=False, save_syn0_lockf=False):
- warnings.warn("This method would be deprecated in the future. Keep just_word_vectors = model.wv to retain just the KeyedVectors instance for read-only querying of word vectors.")
+ warnings.warn(
+ "This method would be deprecated in the future. "
+ "Keep just_word_vectors = model.wv to retain just the KeyedVectors instance "
+ "for read-only querying of word vectors."
+ )
if save_syn1 and save_syn1neg and save_syn0_lockf:
return
if hasattr(self, 'syn1') and not save_syn1:
@@ -1581,19 +1614,19 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
self.input_files = [self.source] # force code compatibility with list of files
elif os.path.isdir(self.source):
self.source = os.path.join(self.source, '') # ensures os-specific slash at end of path
- logging.debug('reading directory ' + self.source)
+ logging.debug('reading directory %s', self.source)
self.input_files = os.listdir(self.source)
self.input_files = [self.source + file for file in self.input_files] # make full paths
self.input_files.sort() # makes sure it happens in filename order
else: # not a file or a directory, then we can't do anything with it
raise ValueError('input is neither a file nor a path')
- logging.info('files read into PathLineSentences:' + '\n'.join(self.input_files))
+ logging.info('files read into PathLineSentences:%s', '\n'.join(self.input_files))
def __iter__(self):
- '''iterate through the files'''
+ """iterate through the files"""
for file_name in self.input_files:
- logging.info('reading file ' + file_name)
+ logging.info('reading file %s', file_name)
with utils.smart_open(file_name) as fin:
for line in itertools.islice(fin, self.limit):
line = utils.to_unicode(line).split()
@@ -1649,7 +1682,8 @@ def __iter__(self):
model = Word2Vec(
corpus, size=args.size, min_count=args.min_count, workers=args.threads,
window=args.window, sample=args.sample, sg=skipgram, hs=args.hs,
- negative=args.negative, cbow_mean=1, iter=args.iter)
+ negative=args.negative, cbow_mean=1, iter=args.iter
+ )
if args.output:
outfile = args.output
diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx
index b60f158c65..fe988fc24c 100755
--- a/gensim/models/word2vec_inner.pyx
+++ b/gensim/models/word2vec_inner.pyx
@@ -579,7 +579,7 @@ cdef void score_pair_sg_hs(
row2 = word_point[b] * size
f = our_dot(&size, &syn0[row1], &ONE, &syn1[row2], &ONE)
sgn = (-1)**word_code[b] # ch function: 0-> 1, 1 -> -1
- f = sgn*f
+ f *= sgn
if f <= -MAX_EXP or f >= MAX_EXP:
continue
f = LOG_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
@@ -673,7 +673,7 @@ cdef void score_pair_cbow_hs(
row2 = word_point[b] * size
f = our_dot(&size, neu1, &ONE, &syn1[row2], &ONE)
sgn = (-1)**word_code[b] # ch function: 0-> 1, 1 -> -1
- f = sgn*f
+ f *= sgn
if f <= -MAX_EXP or f >= MAX_EXP:
continue
f = LOG_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py
index bc02663e04..1f450a457a 100644
--- a/gensim/models/wrappers/dtmmodel.py
+++ b/gensim/models/wrappers/dtmmodel.py
@@ -41,9 +41,9 @@ class DtmModel(utils.SaveLoad):
"""
- def __init__(
- self, dtm_path, corpus=None, time_slices=None, mode='fit', model='dtm', num_topics=100, id2word=None, prefix=None,
- lda_sequence_min_iter=6, lda_sequence_max_iter=20, lda_max_em_iter=10, alpha=0.01, top_chain_var=0.005, rng_seed=0, initialize_lda=True):
+ def __init__(self, dtm_path, corpus=None, time_slices=None, mode='fit', model='dtm', num_topics=100,
+ id2word=None, prefix=None, lda_sequence_min_iter=6, lda_sequence_max_iter=20, lda_max_em_iter=10,
+ alpha=0.01, top_chain_var=0.005, rng_seed=0, initialize_lda=True):
"""
`dtm_path` is path to the dtm executable, e.g. `C:/dtm/dtm-win64.exe`.
@@ -88,7 +88,7 @@ def __init__(
try:
lencorpus = len(corpus)
- except Exception:
+ except TypeError:
logger.warning("input corpus stream has no len(); counting documents")
lencorpus = sum(1 for _ in corpus)
if lencorpus == 0:
@@ -97,8 +97,10 @@ def __init__(
raise ValueError("""There is a text without words in the input corpus.
This breaks method='fixed' (The DIM model).""")
if lencorpus != sum(time_slices):
- raise ValueError("mismatched timeslices %{slices} for corpus of len {clen}".format(
- slices=sum(time_slices), clen=lencorpus))
+ raise ValueError(
+ "mismatched timeslices %{slices} for corpus of len {clen}"
+ .format(slices=sum(time_slices), clen=lencorpus)
+ )
self.lencorpus = lencorpus
if prefix is None:
rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_'
@@ -171,7 +173,7 @@ def convert_input(self, corpus, time_slices):
Serialize documents in LDA-C format to a temporary text file,.
"""
- logger.info("serializing temporary corpus to %s" % self.fcorpustxt())
+ logger.info("serializing temporary corpus to %s", self.fcorpustxt())
# write out the corpus in a file format that DTM understands:
corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus)
@@ -187,17 +189,25 @@ def train(self, corpus, time_slices, mode, model):
"""
self.convert_input(corpus, time_slices)
- arguments = "--ntopics={p0} --model={mofrl} --mode={p1} --initialize_lda={p2} --corpus_prefix={p3} --outname={p4} --alpha={p5}".format(
- p0=self.num_topics, mofrl=model, p1=mode, p2=self.initialize_lda, p3=self.fcorpus(), p4=self.foutname(), p5=self.alpha)
+ arguments = \
+ "--ntopics={p0} --model={mofrl} --mode={p1} --initialize_lda={p2} --corpus_prefix={p3} " \
+ "--outname={p4} --alpha={p5}".format(
+ p0=self.num_topics, mofrl=model, p1=mode, p2=self.initialize_lda,
+ p3=self.fcorpus(), p4=self.foutname(), p5=self.alpha
+ )
- params = "--lda_max_em_iter={p0} --lda_sequence_min_iter={p1} --lda_sequence_max_iter={p2} --top_chain_var={p3} --rng_seed={p4} ".format(
- p0=self.lda_max_em_iter, p1=self.lda_sequence_min_iter, p2=self.lda_sequence_max_iter, p3=self.top_chain_var, p4=self.rng_seed)
+ params = \
+ "--lda_max_em_iter={p0} --lda_sequence_min_iter={p1} --lda_sequence_max_iter={p2} " \
+ "--top_chain_var={p3} --rng_seed={p4} ".format(
+ p0=self.lda_max_em_iter, p1=self.lda_sequence_min_iter, p2=self.lda_sequence_max_iter,
+ p3=self.top_chain_var, p4=self.rng_seed
+ )
arguments = arguments + " " + params
- logger.info("training DTM with args %s" % arguments)
+ logger.info("training DTM with args %s", arguments)
cmd = [self.dtm_path] + arguments.split()
- logger.info("Running command %s" % cmd)
+ logger.info("Running command %s", cmd)
check_output(args=cmd, stderr=PIPE)
self.em_steps = np.loadtxt(self.fem_steps())
@@ -255,13 +265,6 @@ def show_topics(self, num_topics=10, times=5, num_words=10, log=False, formatted
else:
num_topics = min(num_topics, self.num_topics)
chosen_topics = range(num_topics)
- # add a little random jitter, to randomize results around the same
- # alpha
- # sort_alpha = self.alpha + 0.0001 * \
- # numpy.random.rand(len(self.alpha))
- # sorted_topics = list(numpy.argsort(sort_alpha))
- # chosen_topics = sorted_topics[: topics / 2] + \
- # sorted_topics[-topics / 2:]
if times < 0 or times >= len(self.time_slices):
times = len(self.time_slices)
@@ -278,9 +281,6 @@ def show_topics(self, num_topics=10, times=5, num_words=10, log=False, formatted
else:
topic = self.show_topic(i, time, num_words=num_words)
shown.append(topic)
- # if log:
- # logger.info("topic #%i (%.3f): %s" % (i, self.alpha[i],
- # topic))
return shown
def show_topic(self, topicid, time, topn=50, num_words=None):
@@ -290,25 +290,30 @@ def show_topic(self, topicid, time, topn=50, num_words=None):
"""
if num_words is not None: # deprecated num_words is used
- logger.warning("The parameter num_words for show_topic() would be deprecated in the updated version.")
- logger.warning("Please use topn instead.")
+ logger.warning(
+ "The parameter num_words for show_topic() would be deprecated in the updated version. "
+ "Please use topn instead."
+ )
topn = num_words
topics = self.lambda_[:, :, time]
topic = topics[topicid]
- # liklihood to probability
+ # likelihood to probability
topic = np.exp(topic)
# normalize to probability dist
topic = topic / topic.sum()
# sort according to prob
bestn = matutils.argsort(topic, topn, reverse=True)
- beststr = [(topic[id], self.id2word[id]) for id in bestn]
+ beststr = [(topic[idx], self.id2word[idx]) for idx in bestn]
return beststr
def print_topic(self, topicid, time, topn=10, num_words=None):
"""Return the given topic, formatted as a string."""
if num_words is not None: # deprecated num_words is used
- warnings.warn("The parameter num_words for print_topic() would be deprecated in the updated version. Please use topn instead.")
+ warnings.warn(
+ "The parameter num_words for print_topic() would be deprecated in the updated version. "
+ "Please use topn instead."
+ )
topn = num_words
return ' + '.join(['%.3f*%s' % v for v in self.show_topic(topicid, time, topn)])
@@ -320,7 +325,7 @@ def dtm_vis(self, corpus, time):
input parameter is the year to do the visualisation.
"""
topic_term = np.exp(self.lambda_[:, :, time]) / np.exp(self.lambda_[:, :, time]).sum()
- topic_term = topic_term * self.num_topics
+ topic_term *= self.num_topics
doc_topic = self.gamma_
diff --git a/gensim/models/wrappers/fasttext.py b/gensim/models/wrappers/fasttext.py
index 839cb46633..c2c8ee0688 100644
--- a/gensim/models/wrappers/fasttext.py
+++ b/gensim/models/wrappers/fasttext.py
@@ -19,7 +19,7 @@
Example:
>>> from gensim.models.wrappers import FastText
->>> model = fasttext.FastText.train('/Users/kofola/fastText/fasttext', corpus_file='text8')
+>>> model = FastText.train('/Users/kofola/fastText/fasttext', corpus_file='text8')
>>> print model['forests'] # prints vector for given out-of-vocabulary word
.. [1] https://github.com/facebookresearch/fastText#enriching-word-vectors-with-subword-information
@@ -41,6 +41,11 @@
logger = logging.getLogger(__name__)
+try:
+ FileNotFoundError
+except NameError:
+ FileNotFoundError = IOError
+
FASTTEXT_FILEFORMAT_MAGIC = 793712314
@@ -146,7 +151,7 @@ def initialize_word_vectors(self):
@classmethod
def train(cls, ft_path, corpus_file, output_file=None, model='cbow', size=100, alpha=0.025, window=5, min_count=5,
- word_ngrams=1, loss='ns', sample=1e-3, negative=5, iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12):
+ word_ngrams=1, loss='ns', sample=1e-3, negative=5, iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12):
"""
`ft_path` is the path to the FastText executable, e.g. `/home/kofola/fastText/fasttext`.
@@ -212,7 +217,7 @@ def train(cls, ft_path, corpus_file, output_file=None, model='cbow', size=100, a
cmd.append("-%s" % option)
cmd.append(str(value))
- output = utils.check_output(args=cmd) # noqa:F841
+ utils.check_output(args=cmd)
model = cls.load_fasttext_format(output_file)
cls.delete_training_files(output_file)
return model
@@ -265,17 +270,17 @@ def load_model_params(self, file_handle):
magic, version = self.struct_unpack(file_handle, '@2i')
if magic == FASTTEXT_FILEFORMAT_MAGIC: # newer format
self.new_format = True
- dim, ws, epoch, minCount, neg, _, loss, model, bucket, minn, maxn, _, t = self.struct_unpack(file_handle, '@12i1d')
+ dim, ws, epoch, min_count, neg, _, loss, model, bucket, minn, maxn, _, t = \
+ self.struct_unpack(file_handle, '@12i1d')
else: # older format
self.new_format = False
dim = magic
ws = version
- epoch, minCount, neg, _, loss, model, bucket, minn, maxn, _, t = self.struct_unpack(file_handle, '@10i1d')
+ epoch, min_count, neg, _, loss, model, bucket, minn, maxn, _, t = self.struct_unpack(file_handle, '@10i1d')
# Parameters stored by [Args::save](https://github.com/facebookresearch/fastText/blob/master/src/args.cc)
self.vector_size = dim
self.window = ws
self.iter = epoch
- self.min_count = minCount
+ self.min_count = min_count
self.negative = neg
self.hs = loss == 1
self.sg = model == 2
@@ -307,7 +312,9 @@ def load_dict(self, file_handle, encoding='utf8'):
# For more info : https://github.com/facebookresearch/fastText/issues/218
assert word == "__label__", (
- 'mismatched vocab_size ({}) and nwords ({}), extra word "{}"'.format(vocab_size, nwords, word))
+ 'mismatched vocab_size ({}) and nwords ({}), extra word "{}"'
+ .format(vocab_size, nwords, word)
+ )
continue # don't add word to vocab
self.wv.vocab[word] = Vocab(index=i, count=count)
@@ -320,7 +327,8 @@ def load_dict(self, file_handle, encoding='utf8'):
# expecting to log this warning only for pretrained french vector, wiki.fr
logger.warning(
"mismatch between final vocab size (%s words), and expected vocab size (%s words)",
- len(self.wv.vocab), vocab_size)
+ len(self.wv.vocab), vocab_size
+ )
if self.new_format:
for j in range(pruneidx_size):
@@ -332,7 +340,9 @@ def load_vectors(self, file_handle):
num_vectors, dim = self.struct_unpack(file_handle, '@2q')
# Vectors stored by [Matrix::save](https://github.com/facebookresearch/fastText/blob/master/src/matrix.cc)
assert self.vector_size == dim, (
- 'mismatch between vector size in model params ({}) and model vectors ({})'.format(self.vector_size, dim))
+ 'mismatch between vector size in model params ({}) and model vectors ({})'
+ .format(self.vector_size, dim)
+ )
float_size = struct.calcsize('@f')
if float_size == 4:
dtype = np.dtype(np.float32)
@@ -343,8 +353,10 @@ def load_vectors(self, file_handle):
self.wv.syn0_all = np.fromfile(file_handle, dtype=dtype, count=num_vectors * dim)
self.wv.syn0_all = self.wv.syn0_all.reshape((num_vectors, dim))
assert self.wv.syn0_all.shape == (self.bucket + len(self.wv.vocab), self.vector_size), \
- 'mismatch between actual weight matrix shape {} and expected shape {}'.format(
- self.wv.syn0_all.shape, (self.bucket + len(self.wv.vocab), self.vector_size))
+ 'mismatch between actual weight matrix shape {} and expected shape {}'\
+ .format(
+ self.wv.syn0_all.shape, (self.bucket + len(self.wv.vocab), self.vector_size)
+ )
self.init_ngrams()
@@ -378,7 +390,10 @@ def init_ngrams(self):
ngram_weights = self.wv.syn0_all
- logger.info("loading weights for %s words for fastText model from %s", len(self.wv.vocab), self.file_name)
+ logger.info(
+ "loading weights for %s words for fastText model from %s",
+ len(self.wv.vocab), self.file_name
+ )
for w, vocab in self.wv.vocab.items():
word_ngrams = self.compute_ngrams(w, self.wv.min_n, self.wv.max_n)
@@ -386,7 +401,10 @@ def init_ngrams(self):
self.wv.syn0[vocab.index] += np.array(ngram_weights[self.wv.ngrams[word_ngram]])
self.wv.syn0[vocab.index] /= (len(word_ngrams) + 1)
- logger.info("loaded %s weight matrix for fastText model from %s", self.wv.syn0.shape, self.file_name)
+ logger.info(
+ "loaded %s weight matrix for fastText model from %s",
+ self.wv.syn0.shape, self.file_name
+ )
@staticmethod
def compute_ngrams(word, min_n, max_n):
diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py
index a4e435810f..19c93e5f6c 100644
--- a/gensim/models/wrappers/ldamallet.py
+++ b/gensim/models/wrappers/ldamallet.py
@@ -143,7 +143,7 @@ def convert_input(self, corpus, infer=False, serialize_corpus=True):
self.corpus2mallet(corpus, fout)
# convert the text file above into MALLET's internal format
- cmd = self.mallet_path + ' import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input %s --output %s'
+ cmd = self.mallet_path + " import-file --preserve-case --keep-sequence --remove-stopwords --token-regex \"\S+\" --input %s --output %s"
if infer:
cmd += ' --use-pipe-from ' + self.fcorpusmallet()
cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer')
@@ -158,8 +158,10 @@ def train(self, corpus):
'--num-threads %s --output-state %s --output-doc-topics %s --output-topic-keys %s '\
'--num-iterations %s --inferencer-filename %s --doc-topics-threshold %s'
cmd = cmd % (
- self.fcorpusmallet(), self.num_topics, self.alpha, self.optimize_interval, self.workers,
- self.fstate(), self.fdoctopics(), self.ftopickeys(), self.iterations, self.finferencer(), self.topic_threshold)
+ self.fcorpusmallet(), self.num_topics, self.alpha, self.optimize_interval,
+ self.workers, self.fstate(), self.fdoctopics(), self.ftopickeys(), self.iterations,
+ self.finferencer(), self.topic_threshold
+ )
# NOTE "--keep-sequence-bigrams" / "--use-ngrams true" poorer results + runs out of memory
logger.info("training MALLET LDA with %s", cmd)
check_output(args=cmd, shell=True)
@@ -176,7 +178,10 @@ def __getitem__(self, bow, iterations=100):
self.convert_input(bow, infer=True)
cmd = self.mallet_path + ' infer-topics --input %s --inferencer %s --output-doc-topics %s --num-iterations %s --doc-topics-threshold %s'
- cmd = cmd % (self.fcorpusmallet() + '.infer', self.finferencer(), self.fdoctopics() + '.infer', iterations, self.topic_threshold)
+ cmd = cmd % (
+ self.fcorpusmallet() + '.infer', self.finferencer(),
+ self.fdoctopics() + '.infer', iterations, self.topic_threshold
+ )
logger.info("inferring topics with MALLET LDA '%s'", cmd)
check_output(args=cmd, shell=True)
result = list(self.read_doctopics(self.fdoctopics() + '.infer'))
@@ -255,13 +260,11 @@ def show_topic(self, topicid, topn=10, num_words=None):
topn = num_words
if self.word_topics is None:
- logger.warning(
- "Run train or load_word_topics before showing topics."
- )
+ logger.warning("Run train or load_word_topics before showing topics.")
topic = self.word_topics[topicid]
topic = topic / topic.sum() # normalize to probability dist
bestn = matutils.argsort(topic, topn, reverse=True)
- beststr = [(self.id2word[id], topic[id]) for id in bestn]
+ beststr = [(self.id2word[idx], topic[idx]) for idx in bestn]
return beststr
def get_version(self, direc_path):
@@ -305,14 +308,9 @@ def read_doctopics(self, fname, eps=1e-6, renorm=True):
# the MALLET doctopic format changed in 2.0.8 to exclude the id,
# this handles the file differently dependent on the pattern
if len(parts) == 2 * self.num_topics:
- doc = [(id_, weight)
- for id_, weight in zip(map(int, parts[::2]),
- map(float, parts[1::2]))
- if abs(weight) > eps]
+ doc = [(int(id_), float(weight)) for id_, weight in zip(*[iter(parts)] * 2) if abs(float(weight)) > eps]
elif len(parts) == self.num_topics and mallet_version != '2.0.7':
- doc = [(id_, weight)
- for id_, weight in enumerate(map(float, parts))
- if abs(weight) > eps]
+ doc = [(id_, float(weight)) for id_, weight in enumerate(parts) if abs(float(weight)) > eps]
else:
if mallet_version == "2.0.7":
"""
@@ -375,6 +373,7 @@ def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50):
model_gensim = LdaModel(
id2word=mallet_model.id2word, num_topics=mallet_model.num_topics,
alpha=mallet_model.alpha, iterations=iterations,
- gamma_threshold=gamma_threshold)
+ gamma_threshold=gamma_threshold
+ )
model_gensim.expElogbeta[:] = mallet_model.wordtopics
return model_gensim
diff --git a/gensim/models/wrappers/ldavowpalwabbit.py b/gensim/models/wrappers/ldavowpalwabbit.py
index afa19c4327..ede5074b99 100644
--- a/gensim/models/wrappers/ldavowpalwabbit.py
+++ b/gensim/models/wrappers/ldavowpalwabbit.py
@@ -68,7 +68,7 @@
from gensim import utils, matutils
from gensim.models.ldamodel import LdaModel
-LOG = logging.getLogger(__name__)
+logger = logging.getLogger(__name__)
class LdaVowpalWabbit(utils.SaveLoad):
@@ -140,11 +140,10 @@ def __init__(self, vw_path, corpus=None, num_topics=100, id2word=None,
if self.id2word is None:
if corpus is None:
- raise ValueError('at least one of corpus/id2word must be '
- 'specified, to establish input space '
- 'dimensionality')
- LOG.warning('no word id mapping provided; initializing from '
- 'corpus, assuming identity')
+ raise ValueError(
+ "at least one of corpus/id2word must be specified, to establish input space dimensionality"
+ )
+ logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
self.id2word = utils.dict_from_corpus(corpus)
self.num_terms = len(self.id2word)
elif len(self.id2word) > 0:
@@ -153,8 +152,7 @@ def __init__(self, vw_path, corpus=None, num_topics=100, id2word=None,
self.num_terms = 0
if self.num_terms == 0:
- raise ValueError('cannot compute LDA over an empty collection '
- '(no terms)')
+ raise ValueError("cannot compute LDA over an empty collection (no terms)")
# LDA parameters
self.num_topics = num_topics
@@ -186,7 +184,7 @@ def __init__(self, vw_path, corpus=None, num_topics=100, id2word=None,
def train(self, corpus):
"""Clear any existing model state, and train on given corpus."""
- LOG.debug('Training new model from corpus')
+ logger.debug('Training new model from corpus')
# reset any existing offset, model, or topics generated
self.offset = self._initial_offset
@@ -206,7 +204,7 @@ def update(self, corpus):
if not os.path.exists(self._model_filename):
return self.train(corpus)
- LOG.debug('Updating exiting model from corpus')
+        logger.debug('Updating existing model from corpus')
# reset any existing topics generated
self._topics = None
@@ -228,12 +226,10 @@ def log_perplexity(self, chunk):
vw_data = self._predict(chunk)[1]
corpus_words = sum(cnt for document in chunk for _, cnt in document)
bound = -vw_data['average_loss']
- LOG.info("%.3f per-word bound, %.1f perplexity estimate based on a "
- "held-out corpus of %i documents with %i words",
- bound,
- numpy.exp2(-bound),
- vw_data['corpus_size'],
- corpus_words)
+ logger.info(
+ "%.3f per-word bound, %.1f perplexity estimate based on a held-out corpus of %i documents with %i words",
+ bound, numpy.exp2(-bound), vw_data['corpus_size'], corpus_words
+ )
return bound
def get_topics(self):
@@ -267,7 +263,7 @@ def show_topics(self, num_topics=10, num_words=10,
shown.append(topic)
if log:
- LOG.info("topic #%i (%.3f): %s", i, self.alpha, topic)
+ logger.info("topic #%i (%.3f): %s", i, self.alpha, topic)
return shown
@@ -287,12 +283,12 @@ def save(self, fname, *args, **kwargs):
# Vowpal Wabbit uses its own binary model file, read this into
# variable before serialising this object - keeps all data
# self contained within a single serialised file
- LOG.debug("Reading model bytes from '%s'", self._model_filename)
+ logger.debug("Reading model bytes from '%s'", self._model_filename)
with utils.smart_open(self._model_filename, 'rb') as fhandle:
self._model_data = fhandle.read()
if os.path.exists(self._topics_filename):
- LOG.debug("Reading topic bytes from '%s'", self._topics_filename)
+ logger.debug("Reading topic bytes from '%s'", self._topics_filename)
with utils.smart_open(self._topics_filename, 'rb') as fhandle:
self._topics_data = fhandle.read()
@@ -310,13 +306,13 @@ def load(cls, fname, *args, **kwargs):
if lda_vw._model_data:
# Vowpal Wabbit operates on its own binary model file - deserialise
# to file at load time, making it immediately ready for use
- LOG.debug("Writing model bytes to '%s'", lda_vw._model_filename)
+ logger.debug("Writing model bytes to '%s'", lda_vw._model_filename)
with utils.smart_open(lda_vw._model_filename, 'wb') as fhandle:
fhandle.write(lda_vw._model_data)
lda_vw._model_data = None # no need to keep in memory after this
if lda_vw._topics_data:
- LOG.debug("Writing topic bytes to '%s'", lda_vw._topics_filename)
+ logger.debug("Writing topic bytes to '%s'", lda_vw._topics_filename)
with utils.smart_open(lda_vw._topics_filename, 'wb') as fhandle:
fhandle.write(lda_vw._topics_data)
lda_vw._topics_data = None
@@ -326,23 +322,25 @@ def load(cls, fname, *args, **kwargs):
def __del__(self):
"""Cleanup the temporary directory used by this wrapper."""
if self.cleanup_files and self.tmp_dir:
- LOG.debug("Recursively deleting: %s", self.tmp_dir)
+ logger.debug("Recursively deleting: %s", self.tmp_dir)
shutil.rmtree(self.tmp_dir)
def _init_temp_dir(self, prefix='tmp'):
"""Create a working temporary directory with given prefix."""
self.tmp_dir = tempfile.mkdtemp(prefix=prefix)
- LOG.info('using %s as temp dir', self.tmp_dir)
+ logger.info('using %s as temp dir', self.tmp_dir)
def _get_vw_predict_command(self, corpus_size):
"""Get list of command line arguments for running prediction."""
- cmd = [self.vw_path,
- '--testonly', # don't update model with this data
- '--lda_D', str(corpus_size),
- '-i', self._model_filename, # load existing binary model
- '-d', self._corpus_filename,
- '--learning_rate', '0', # possibly not needed, but harmless
- '-p', self._predict_filename]
+ cmd = [
+ self.vw_path,
+ '--testonly', # don't update model with this data
+ '--lda_D', str(corpus_size),
+ '-i', self._model_filename, # load existing binary model
+ '-d', self._corpus_filename,
+ '--learning_rate', '0', # possibly not needed, but harmless
+ '-p', self._predict_filename
+ ]
if self.random_seed is not None:
cmd.extend(['--random_seed', str(self.random_seed)])
@@ -355,27 +353,31 @@ def _get_vw_train_command(self, corpus_size, update=False):
If 'update' is set to True, this specifies that we're further training
an existing model.
"""
- cmd = [self.vw_path,
- '-d', self._corpus_filename,
- '--power_t', str(self.decay),
- '--initial_t', str(self.offset),
- '--minibatch', str(self.chunksize),
- '--lda_D', str(corpus_size),
- '--passes', str(self.passes),
- '--cache_file', self._cache_filename,
- '--lda_epsilon', str(self.gamma_threshold),
- '--readable_model', self._topics_filename,
- '-k', # clear cache
- '-f', self._model_filename]
+ cmd = [
+ self.vw_path,
+ '-d', self._corpus_filename,
+ '--power_t', str(self.decay),
+ '--initial_t', str(self.offset),
+ '--minibatch', str(self.chunksize),
+ '--lda_D', str(corpus_size),
+ '--passes', str(self.passes),
+ '--cache_file', self._cache_filename,
+ '--lda_epsilon', str(self.gamma_threshold),
+ '--readable_model', self._topics_filename,
+ '-k', # clear cache
+ '-f', self._model_filename
+ ]
if update:
cmd.extend(['-i', self._model_filename])
else:
# these params are read from model file if updating
- cmd.extend(['--lda', str(self.num_topics),
- '-b', str(_bit_length(self.num_terms)),
- '--lda_alpha', str(self.alpha),
- '--lda_rho', str(self.eta)])
+ cmd.extend([
+ '--lda', str(self.num_topics),
+ '-b', str(_bit_length(self.num_terms)),
+ '--lda_alpha', str(self.alpha),
+ '--lda_rho', str(self.eta)
+ ])
if self.random_seed is not None:
cmd.extend(['--random_seed', str(self.random_seed)])
@@ -393,8 +395,7 @@ def _load_vw_topics(self):
of:
...
"""
- topics = numpy.zeros((self.num_topics, self.num_terms),
- dtype=numpy.float32)
+ topics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float32)
with utils.smart_open(self._topics_filename) as topics_file:
found_data = False
@@ -437,8 +438,7 @@ def _predict(self, chunk):
vw_data = _parse_vw_output(_run_vw_command(cmd))
vw_data['corpus_size'] = corpus_size
- predictions = numpy.zeros((corpus_size, self.num_topics),
- dtype=numpy.float32)
+ predictions = numpy.zeros((corpus_size, self.num_topics), dtype=numpy.float32)
with utils.smart_open(self._predict_filename) as fhandle:
for i, line in enumerate(fhandle):
@@ -524,7 +524,7 @@ def write_corpus_as_vw(corpus, filename):
Returns the number of lines written.
"""
- LOG.debug("Writing corpus to: %s", filename)
+ logger.debug("Writing corpus to: %s", filename)
corpus_size = 0
with utils.smart_open(filename, 'wb') as corpus_file:
@@ -552,16 +552,14 @@ def _parse_vw_output(text):
def _run_vw_command(cmd):
"""Execute given Vowpal Wabbit command, log stdout and stderr."""
- LOG.info("Running Vowpal Wabbit command: %s", ' '.join(cmd))
+ logger.info("Running Vowpal Wabbit command: %s", ' '.join(cmd))
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
output = proc.communicate()[0].decode('utf-8')
- LOG.debug("Vowpal Wabbit output: %s", output)
+ logger.debug("Vowpal Wabbit output: %s", output)
if proc.returncode != 0:
- raise subprocess.CalledProcessError(proc.returncode,
- ' '.join(cmd),
- output=output)
+ raise subprocess.CalledProcessError(proc.returncode, ' '.join(cmd), output=output)
return output
@@ -588,6 +586,7 @@ def vwmodel2ldamodel(vw_model, iterations=50):
model_gensim = LdaModel(
num_topics=vw_model.num_topics, id2word=vw_model.id2word, chunksize=vw_model.chunksize,
passes=vw_model.passes, alpha=vw_model.alpha, eta=vw_model.eta, decay=vw_model.decay,
- offset=vw_model.offset, iterations=iterations, gamma_threshold=vw_model.gamma_threshold)
+ offset=vw_model.offset, iterations=iterations, gamma_threshold=vw_model.gamma_threshold
+ )
model_gensim.expElogbeta[:] = vw_model._get_topics()
return model_gensim
diff --git a/gensim/models/wrappers/varembed.py b/gensim/models/wrappers/varembed.py
index eab1e0217c..30bf859ec7 100644
--- a/gensim/models/wrappers/varembed.py
+++ b/gensim/models/wrappers/varembed.py
@@ -18,14 +18,10 @@
"""
import logging
-import sys
-
import numpy as np
-from gensim.models.keyedvectors import KeyedVectors
-
-# utility fnc for pickling, common scipy operations etc
from gensim import utils
+from gensim.models.keyedvectors import KeyedVectors
from gensim.models.word2vec import Vocab
logger = logging.getLogger(__name__)
@@ -56,25 +52,21 @@ def load_varembed_format(cls, vectors, morfessor_model=None):
result = cls()
if vectors is None:
raise Exception("Please provide vectors binary to load varembed model")
- D = utils.unpickle(vectors)
- word_to_ix = D['word_to_ix']
- morpho_to_ix = D['morpho_to_ix']
- word_embeddings = D['word_embeddings']
- morpho_embeddings = D['morpheme_embeddings']
+ d = utils.unpickle(vectors)
+ word_to_ix = d['word_to_ix']
+ morpho_to_ix = d['morpho_to_ix']
+ word_embeddings = d['word_embeddings']
+ morpho_embeddings = d['morpheme_embeddings']
result.load_word_embeddings(word_embeddings, word_to_ix)
if morfessor_model:
- if sys.version_info >= (2, 7): # Morfessor is only supported for Python 2.7 and above.
- try:
- import morfessor
- morfessor_model = morfessor.MorfessorIO().read_binary_model_file(morfessor_model)
- result.add_morphemes_to_embeddings(morfessor_model, morpho_embeddings, morpho_to_ix)
- except ImportError:
- # Morfessor Package not found.
- logger.error('Could not import morfessor. Not using morpheme embeddings')
- raise ImportError('Could not import morfessor.')
- else:
- # Raise exception in Python 2.6 or earlier.
- raise Exception('Using Morphemes requires Python 2.7 and above. Morfessor is not supported in python 2.6')
+ try:
+ import morfessor
+ morfessor_model = morfessor.MorfessorIO().read_binary_model_file(morfessor_model)
+ result.add_morphemes_to_embeddings(morfessor_model, morpho_embeddings, morpho_to_ix)
+ except ImportError:
+ # Morfessor Package not found.
+ logger.error('Could not import morfessor. Not using morpheme embeddings')
+ raise ImportError('Could not import morfessor.')
logger.info('Loaded varembed model vectors from %s', vectors)
return result
@@ -105,6 +97,10 @@ def add_morphemes_to_embeddings(self, morfessor_model, morpho_embeddings, morpho
"""
for word in self.vocab:
morpheme_embedding = np.array(
- [morpho_embeddings[morpho_to_ix.get(m, -1)] for m in morfessor_model.viterbi_segment(word)[0]]).sum(axis=0)
+ [
+ morpho_embeddings[morpho_to_ix.get(m, -1)]
+ for m in morfessor_model.viterbi_segment(word)[0]
+ ]
+ ).sum(axis=0)
self.syn0[self.vocab[word].index] += morpheme_embedding
logger.info("Added morphemes to word vectors")
diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py
index 8426af1d82..c31cd28adc 100644
--- a/gensim/models/wrappers/wordrank.py
+++ b/gensim/models/wrappers/wordrank.py
@@ -96,13 +96,23 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1,
cooccurrence_shuf_file = os.path.join(meta_dir, 'wiki.toy')
meta_file = os.path.join(meta_dir, 'meta')
- cmd_vocab_count = [os.path.join(wr_path, 'glove', 'vocab_count'), '-min-count', str(min_count), '-max-vocab', str(max_vocab_size)]
- cmd_cooccurence_count = [os.path.join(wr_path, 'glove', 'cooccur'), '-memory', str(memory), '-vocab-file', temp_vocab_file, '-window-size', str(window), '-symmetric', str(symmetric)]
+ cmd_vocab_count = [
+ os.path.join(wr_path, 'glove', 'vocab_count'),
+ '-min-count', str(min_count), '-max-vocab', str(max_vocab_size)
+ ]
+ cmd_cooccurence_count = [
+ os.path.join(wr_path, 'glove', 'cooccur'), '-memory', str(memory),
+ '-vocab-file', temp_vocab_file, '-window-size', str(window), '-symmetric', str(symmetric)
+ ]
cmd_shuffle_cooccurences = [os.path.join(wr_path, 'glove', 'shuffle'), '-memory', str(memory)]
cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file]
commands = [cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences]
- input_fnames = [os.path.join(meta_dir, os.path.split(corpus_file)[-1]), os.path.join(meta_dir, os.path.split(corpus_file)[-1]), cooccurrence_file]
+ input_fnames = [
+ os.path.join(meta_dir, os.path.split(corpus_file)[-1]),
+ os.path.join(meta_dir, os.path.split(corpus_file)[-1]),
+ cooccurrence_file
+ ]
output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file]
logger.info("Prepare training data (%s) using glove code", ", ".join(input_fnames))
@@ -116,22 +126,24 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1,
utils.check_output(w, args=cmd_del_vocab_freq)
with smart_open(vocab_file, 'rb') as f:
- numwords = sum(1 for line in f)
+ numwords = sum(1 for _ in f)
with smart_open(cooccurrence_shuf_file, 'rb') as f:
- numlines = sum(1 for line in f)
+ numlines = sum(1 for _ in f)
with smart_open(meta_file, 'wb') as f:
- meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format(numwords, numwords, numlines, cooccurrence_shuf_file.split('/')[-1], numwords, vocab_file.split('/')[-1])
+ meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format(
+ numwords, numwords, numlines, cooccurrence_shuf_file.split('/')[-1],
+ numwords, vocab_file.split('/')[-1]
+ )
f.write(meta_info.encode('utf-8'))
if iter % dump_period == 0:
iter += 1
else:
logger.warning(
- 'Resultant embedding will be from %d iterations rather than the input %d iterations, '
- 'as wordrank dumps the embedding only at dump_period intervals. '
- 'Input an appropriate combination of parameters (iter, dump_period) such that '
- '"iter mod dump_period" is zero.', iter - (iter % dump_period), iter
- )
+                "Resultant embedding will be from %d iterations rather than the input %d iterations, "
+                "as wordrank dumps the embedding only at dump_period intervals. Input an appropriate "
+                "combination of parameters (iter, dump_period) such that \"iter mod dump_period\" is zero.",
+ iter - (iter % dump_period), iter
+ )
wr_args = {
'path': meta_dir,
@@ -151,20 +163,21 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1,
}
# run wordrank executable with wr_args
- cmd = ['mpirun', '-np']
- cmd.append(str(np))
- cmd.append(os.path.join(wr_path, 'wordrank'))
+ cmd = ['mpirun', '-np', str(np), os.path.join(wr_path, 'wordrank')]
for option, value in wr_args.items():
cmd.append('--%s' % option)
cmd.append(str(value))
logger.info("Running wordrank binary")
- output = utils.check_output(args=cmd) # noqa:F841
+ utils.check_output(args=cmd)
# use embeddings from max. iteration's dump
max_iter_dump = iter - (iter % dump_period)
os.rename('model_word_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.words'))
os.rename('model_context_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.contexts'))
- model = cls.load_wordrank_model(os.path.join(model_dir, 'wordrank.words'), vocab_file, os.path.join(model_dir, 'wordrank.contexts'), sorted_vocab, ensemble)
+ model = cls.load_wordrank_model(
+ os.path.join(model_dir, 'wordrank.words'), vocab_file,
+ os.path.join(model_dir, 'wordrank.contexts'), sorted_vocab, ensemble
+ )
if cleanup_files:
rmtree(model_dir)
diff --git a/gensim/nosy.py b/gensim/nosy.py
index 2913e1e694..0606166449 100644
--- a/gensim/nosy.py
+++ b/gensim/nosy.py
@@ -27,7 +27,7 @@
DEFAULTARGS = '--with-color -exe' # -w tests'
-def checkSum():
+def check_sum():
"""
Return a long which can be used to know if any .py files have changed.
"""
@@ -44,10 +44,9 @@ def checkSum():
val = 0
try:
while True:
- if checkSum() != val:
- val = checkSum()
- os.system('%s %s %s' % (EXECUTABLE, DEFAULTARGS,
- ' '.join(sys.argv[1:])))
+ if check_sum() != val:
+ val = check_sum()
+ os.system('%s %s %s' % (EXECUTABLE, DEFAULTARGS, ' '.join(sys.argv[1:])))
print(datetime.datetime.now().__str__())
print('=' * 77)
time.sleep(1)
diff --git a/gensim/parsing/porter.py b/gensim/parsing/porter.py
index a22b8b94d1..048e056418 100644
--- a/gensim/parsing/porter.py
+++ b/gensim/parsing/porter.py
@@ -363,10 +363,10 @@ def stem(self, w):
return self.b[:self.k + 1]
def stem_sentence(self, txt):
- return " ".join(map(self.stem, txt.split()))
+ return " ".join(self.stem(x) for x in txt.split())
def stem_documents(self, docs):
- return map(self.stem_sentence, docs)
+ return [self.stem_sentence(x) for x in docs]
if __name__ == '__main__':
diff --git a/gensim/parsing/preprocessing.py b/gensim/parsing/preprocessing.py
index a92eb98656..ab25361f60 100644
--- a/gensim/parsing/preprocessing.py
+++ b/gensim/parsing/preprocessing.py
@@ -117,8 +117,11 @@ def stem_text(text):
stem = stem_text
-DEFAULT_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation, strip_multiple_whitespaces,
- strip_numeric, remove_stopwords, strip_short, stem_text]
+DEFAULT_FILTERS = [
+ lambda x: x.lower(), strip_tags, strip_punctuation,
+ strip_multiple_whitespaces, strip_numeric,
+ remove_stopwords, strip_short, stem_text
+]
def preprocess_string(s, filters=DEFAULT_FILTERS):
diff --git a/gensim/scripts/glove2word2vec.py b/gensim/scripts/glove2word2vec.py
index 4f13de4524..0667440f80 100644
--- a/gensim/scripts/glove2word2vec.py
+++ b/gensim/scripts/glove2word2vec.py
@@ -17,7 +17,6 @@
which contains the number of vectors and their dimensionality (two integers).
"""
-import os
import sys
import logging
import argparse
@@ -30,7 +29,7 @@
def get_glove_info(glove_file_name):
"""Return the number of vectors and dimensions in a file in GloVe format."""
with smart_open(glove_file_name) as f:
- num_lines = sum(1 for line in f)
+ num_lines = sum(1 for _ in f)
with smart_open(glove_file_name) as f:
num_dims = len(f.readline().split()) - 1
return num_lines, num_dims
@@ -53,19 +52,9 @@ def glove2word2vec(glove_input_file, word2vec_output_file):
logging.root.setLevel(level=logging.INFO)
logger.info("running %s", ' '.join(sys.argv))
- # check and process cmdline input
- program = os.path.basename(sys.argv[0])
- if len(sys.argv) < 2:
- print(globals()['__doc__'] % locals())
- sys.exit(1)
-
parser = argparse.ArgumentParser()
- parser.add_argument(
- "-i", "--input", required=True,
- help="Input file, in gloVe format (read-only).")
- parser.add_argument(
- "-o", "--output", required=True,
- help="Output file, in word2vec text format (will be overwritten).")
+ parser.add_argument("-i", "--input", required=True, help="Input file, in gloVe format (read-only).")
+ parser.add_argument("-o", "--output", required=True, help="Output file, in word2vec text format (will be overwritten).")
args = parser.parse_args()
# do the actual conversion
diff --git a/gensim/scripts/make_wiki_online.py b/gensim/scripts/make_wiki_online.py
index 66985a566e..37c437f3e1 100755
--- a/gensim/scripts/make_wiki_online.py
+++ b/gensim/scripts/make_wiki_online.py
@@ -55,7 +55,7 @@
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)
- logger.info("running %s" % ' '.join(sys.argv))
+ logger.info("running %s", ' '.join(sys.argv))
# check and process input arguments
if len(sys.argv) < 3:
@@ -107,4 +107,4 @@
# ~4h; result file is 15GB! bzip2'ed down to 4.5GB
MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)
- logger.info("finished running %s" % program)
+ logger.info("finished running %s", program)
diff --git a/gensim/scripts/make_wiki_online_lemma.py b/gensim/scripts/make_wiki_online_lemma.py
index 66985a566e..37c437f3e1 100755
--- a/gensim/scripts/make_wiki_online_lemma.py
+++ b/gensim/scripts/make_wiki_online_lemma.py
@@ -55,7 +55,7 @@
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)
- logger.info("running %s" % ' '.join(sys.argv))
+ logger.info("running %s", ' '.join(sys.argv))
# check and process input arguments
if len(sys.argv) < 3:
@@ -107,4 +107,4 @@
# ~4h; result file is 15GB! bzip2'ed down to 4.5GB
MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)
- logger.info("finished running %s" % program)
+ logger.info("finished running %s", program)
diff --git a/gensim/scripts/make_wiki_online_nodebug.py b/gensim/scripts/make_wiki_online_nodebug.py
index 66985a566e..37c437f3e1 100755
--- a/gensim/scripts/make_wiki_online_nodebug.py
+++ b/gensim/scripts/make_wiki_online_nodebug.py
@@ -55,7 +55,7 @@
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)
- logger.info("running %s" % ' '.join(sys.argv))
+ logger.info("running %s", ' '.join(sys.argv))
# check and process input arguments
if len(sys.argv) < 3:
@@ -107,4 +107,4 @@
# ~4h; result file is 15GB! bzip2'ed down to 4.5GB
MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)
- logger.info("finished running %s" % program)
+ logger.info("finished running %s", program)
diff --git a/gensim/scripts/make_wikicorpus.py b/gensim/scripts/make_wikicorpus.py
index 66985a566e..37c437f3e1 100755
--- a/gensim/scripts/make_wikicorpus.py
+++ b/gensim/scripts/make_wikicorpus.py
@@ -55,7 +55,7 @@
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)
- logger.info("running %s" % ' '.join(sys.argv))
+ logger.info("running %s", ' '.join(sys.argv))
# check and process input arguments
if len(sys.argv) < 3:
@@ -107,4 +107,4 @@
# ~4h; result file is 15GB! bzip2'ed down to 4.5GB
MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)
- logger.info("finished running %s" % program)
+ logger.info("finished running %s", program)
diff --git a/gensim/scripts/word2vec2tensor.py b/gensim/scripts/word2vec2tensor.py
index 88cab79d25..6bb1301a59 100644
--- a/gensim/scripts/word2vec2tensor.py
+++ b/gensim/scripts/word2vec2tensor.py
@@ -37,13 +37,13 @@
def word2vec2tensor(word2vec_model_path, tensor_filename, binary=False):
- '''
+ """
Convert Word2Vec mode to 2D tensor TSV file and metadata file
Args:
- param1 (str): word2vec model file path
- param2 (str): filename prefix
- param2 (bool): set True to use a binary Word2Vec model, defaults to False
- '''
+ word2vec_model_path (str): word2vec model file path
+ tensor_filename (str): filename prefix
+ binary (bool): set True to use a binary Word2Vec model, defaults to False
+ """
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_model_path, binary=binary)
outfiletsv = tensor_filename + '_tensor.tsv'
outfiletsvmeta = tensor_filename + '_metadata.tsv'
@@ -52,11 +52,11 @@ def word2vec2tensor(word2vec_model_path, tensor_filename, binary=False):
with open(outfiletsvmeta, 'w+') as file_metadata:
for word in model.index2word:
file_metadata.write(gensim.utils.to_utf8(word) + gensim.utils.to_utf8('\n'))
- vector_row = '\t'.join(map(str, model[word]))
+ vector_row = '\t'.join(str(x) for x in model[word])
file_vector.write(vector_row + '\n')
- logger.info("2D tensor file saved to %s" % outfiletsv)
- logger.info("Tensor metadata file saved to %s" % outfiletsvmeta)
+ logger.info("2D tensor file saved to %s", outfiletsv)
+ logger.info("Tensor metadata file saved to %s", outfiletsvmeta)
if __name__ == "__main__":
@@ -64,24 +64,12 @@ def word2vec2tensor(word2vec_model_path, tensor_filename, binary=False):
logging.root.setLevel(level=logging.INFO)
logger.info("running %s", ' '.join(sys.argv))
- # check and process cmdline input
- program = os.path.basename(sys.argv[0])
- if len(sys.argv) < 2:
- print(globals()['__doc__'] % locals())
- sys.exit(1)
-
parser = argparse.ArgumentParser()
- parser.add_argument(
- "-i", "--input", required=True,
- help="Input word2vec model")
- parser.add_argument(
- "-o", "--output", required=True,
- help="Output tensor file name prefix")
- parser.add_argument("-b", "--binary",
- required=False,
- help="If word2vec model in binary format, set True, else False")
+ parser.add_argument("-i", "--input", required=True, help="Input word2vec model")
+ parser.add_argument("-o", "--output", required=True, help="Output tensor file name prefix")
+ parser.add_argument("-b", "--binary", required=False, help="If word2vec model in binary format, set True, else False")
args = parser.parse_args()
word2vec2tensor(args.input, args.output, args.binary)
- logger.info("finished running %s", program)
+ logger.info("finished running %s", os.path.basename(sys.argv[0]))
diff --git a/gensim/scripts/word2vec_standalone.py b/gensim/scripts/word2vec_standalone.py
index 52baea6f4c..878e588613 100644
--- a/gensim/scripts/word2vec_standalone.py
+++ b/gensim/scripts/word2vec_standalone.py
@@ -61,17 +61,8 @@
if __name__ == "__main__":
- logging.basicConfig(
- format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
- level=logging.INFO)
+ logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
logger.info("running %s", " ".join(sys.argv))
-
- # check and process cmdline input
- program = os.path.basename(sys.argv[0])
- if len(sys.argv) < 2:
- print(globals()['__doc__'] % locals())
- sys.exit(1)
-
seterr(all='raise') # don't ignore numpy errors
parser = argparse.ArgumentParser()
@@ -107,7 +98,8 @@
model = Word2Vec(
corpus, size=args.size, min_count=args.min_count, workers=args.threads,
window=args.window, sample=args.sample, alpha=args.alpha, sg=skipgram,
- hs=args.hs, negative=args.negative, cbow_mean=1, iter=args.iter)
+ hs=args.hs, negative=args.negative, cbow_mean=1, iter=args.iter
+ )
if args.output:
outfile = args.output
@@ -124,4 +116,4 @@
questions_file = args.accuracy
model.accuracy(questions_file)
- logger.info("finished running %s", program)
+ logger.info("finished running %s", os.path.basename(sys.argv[0]))
diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py
index efe71159d3..5e93c6f8cf 100755
--- a/gensim/similarities/docsim.py
+++ b/gensim/similarities/docsim.py
@@ -105,7 +105,7 @@ def __getstate__(self):
return result
def __str__(self):
- return ("%s Shard(%i documents in %s)" % (self.cls.__name__, len(self), self.fullname()))
+ return "%s Shard(%i documents in %s)" % (self.cls.__name__, len(self), self.fullname())
def get_index(self):
if not hasattr(self, 'index'):
@@ -209,8 +209,9 @@ def __len__(self):
return len(self.fresh_docs) + sum([len(shard) for shard in self.shards])
def __str__(self):
- return ("Similarity index with %i documents in %i shards (stored under %s)" %
- (len(self), len(self.shards), self.output_prefix))
+ return "Similarity index with %i documents in %i shards (stored under %s)" % (
+ len(self), len(self.shards), self.output_prefix
+ )
def add_documents(self, corpus):
"""
@@ -262,8 +263,9 @@ def close_shard(self):
# consider the shard sparse if its density is < 30%
issparse = 0.3 > 1.0 * self.fresh_nnz / (len(self.fresh_docs) * self.num_features)
if issparse:
- index = SparseMatrixSimilarity(self.fresh_docs, num_terms=self.num_features,
- num_docs=len(self.fresh_docs), num_nnz=self.fresh_nnz)
+ index = SparseMatrixSimilarity(
+ self.fresh_docs, num_terms=self.num_features, num_docs=len(self.fresh_docs), num_nnz=self.fresh_nnz
+ )
else:
index = MatrixSimilarity(self.fresh_docs, num_features=self.num_features)
logger.info("creating %s shard #%s", 'sparse' if issparse else 'dense', shardid)
@@ -334,8 +336,7 @@ def __getitem__(self, query):
# the following uses a lot of lazy evaluation and (optionally) parallel
# processing, to improve query latency and minimize memory footprint.
offsets = numpy.cumsum([0] + [len(shard) for shard in self.shards])
- convert = lambda doc, shard_no: [(doc_index + offsets[shard_no], sim)
- for doc_index, sim in doc]
+ convert = lambda doc, shard_no: [(doc_index + offsets[shard_no], sim) for doc_index, sim in doc]
is_corpus, query = utils.is_corpus(query)
is_corpus = is_corpus or hasattr(query, 'ndim') and query.ndim > 1 and query.shape[0] > 1
if not is_corpus:
@@ -370,8 +371,7 @@ def vector_by_id(self, docpos):
if docpos < pos:
break
if not self.shards or docpos < 0 or docpos >= pos:
- raise ValueError("invalid document position: %s (must be 0 <= x < %s)" %
- (docpos, len(self)))
+ raise ValueError("invalid document position: %s (must be 0 <= x < %s)" % (docpos, len(self)))
result = shard.get_document_id(docpos - pos + len(shard))
return result
@@ -458,7 +458,6 @@ def destroy(self):
for fname in glob.glob(self.output_prefix + '*'):
logger.info("deleting %s", fname)
os.remove(fname)
-# endclass Similarity
class MatrixSimilarity(interfaces.SimilarityABC):
@@ -496,7 +495,10 @@ class for description of the other parameters.
if corpus is not None:
if self.num_features <= 0:
- raise ValueError("cannot index a corpus with zero features (you must specify either `num_features` or a non-empty corpus in the constructor)")
+ raise ValueError(
+ "cannot index a corpus with zero features (you must specify either `num_features` "
+ "or a non-empty corpus in the constructor)"
+ )
logger.info("creating matrix with %i documents and %i features", corpus_len, num_features)
self.index = numpy.empty(shape=(corpus_len, num_features), dtype=dtype)
# iterate over corpus, populating the numpy index matrix with (normalized)
@@ -535,7 +537,8 @@ def get_similarities(self, query):
if is_corpus:
query = numpy.asarray(
[matutils.sparse2full(vec, self.num_features) for vec in query],
- dtype=self.index.dtype)
+ dtype=self.index.dtype
+ )
else:
if scipy.sparse.issparse(query):
query = query.toarray() # convert sparse to dense
@@ -553,7 +556,6 @@ def get_similarities(self, query):
def __str__(self):
return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.index.shape[1])
-# endclass MatrixSimilarity
class WmdSimilarity(interfaces.SimilarityABC):
@@ -638,7 +640,6 @@ def get_similarities(self, query):
def __str__(self):
return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.w2v_model.wv.syn0.shape[1])
-# endclass WmdSimilarity
class SparseMatrixSimilarity(interfaces.SimilarityABC):
@@ -689,7 +690,8 @@ def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num
matutils.unitvec(v)) for v in corpus)
self.index = matutils.corpus2csc(
corpus, num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz,
- dtype=dtype, printprogress=10000).T
+ dtype=dtype, printprogress=10000
+ ).T
# convert to Compressed Sparse Row for efficient row slicing and multiplications
self.index = self.index.tocsr() # currently no-op, CSC.T is already CSR
@@ -736,4 +738,3 @@ def get_similarities(self, query):
# otherwise, return a 2d matrix (#queries x #index)
result = result.toarray().T
return result
-# endclass SparseMatrixSimilarity
diff --git a/gensim/similarities/index.py b/gensim/similarities/index.py
index d0ca879225..e9323f6998 100644
--- a/gensim/similarities/index.py
+++ b/gensim/similarities/index.py
@@ -17,7 +17,9 @@
try:
from annoy import AnnoyIndex
except ImportError:
- raise ImportError("Annoy has not been installed, if you wish to use the annoy indexer, please run `pip install annoy`")
+ raise ImportError(
+        "Annoy has not been installed. If you wish to use the annoy indexer, please run `pip install annoy`"
+ )
class AnnoyIndexer(object):
@@ -49,7 +51,8 @@ def load(self, fname):
fname_dict = fname + '.d'
if not (os.path.exists(fname) and os.path.exists(fname_dict)):
raise IOError(
- "Can't find index files '%s' and '%s' - Unable to restore AnnoyIndexer state." % (fname, fname_dict))
+ "Can't find index files '%s' and '%s' - Unable to restore AnnoyIndexer state." % (fname, fname_dict)
+ )
else:
with smart_open(fname_dict) as f:
d = _pickle.loads(f.read())
diff --git a/gensim/sklearn_api/atmodel.py b/gensim/sklearn_api/atmodel.py
index 0217350d3a..d3128243a6 100644
--- a/gensim/sklearn_api/atmodel.py
+++ b/gensim/sklearn_api/atmodel.py
@@ -23,10 +23,10 @@ class AuthorTopicTransformer(TransformerMixin, BaseEstimator):
"""
def __init__(self, num_topics=100, id2word=None, author2doc=None, doc2author=None,
- chunksize=2000, passes=1, iterations=50, decay=0.5, offset=1.0,
- alpha='symmetric', eta='symmetric', update_every=1, eval_every=10,
- gamma_threshold=0.001, serialized=False, serialization_path=None,
- minimum_probability=0.01, random_state=None):
+ chunksize=2000, passes=1, iterations=50, decay=0.5, offset=1.0,
+ alpha='symmetric', eta='symmetric', update_every=1, eval_every=10,
+ gamma_threshold=0.001, serialized=False, serialization_path=None,
+ minimum_probability=0.01, random_state=None):
"""
Sklearn wrapper for AuthorTopic model. See gensim.models.AuthorTopicModel for parameter details.
"""
@@ -55,11 +55,14 @@ def fit(self, X, y=None):
Fit the model according to the given training data.
Calls gensim.models.AuthorTopicModel
"""
- self.gensim_model = models.AuthorTopicModel(corpus=X, num_topics=self.num_topics, id2word=self.id2word,
+ self.gensim_model = models.AuthorTopicModel(
+ corpus=X, num_topics=self.num_topics, id2word=self.id2word,
author2doc=self.author2doc, doc2author=self.doc2author, chunksize=self.chunksize, passes=self.passes,
iterations=self.iterations, decay=self.decay, offset=self.offset, alpha=self.alpha, eta=self.eta,
- update_every=self.update_every, eval_every=self.eval_every, gamma_threshold=self.gamma_threshold, serialized=self.serialized,
- serialization_path=self.serialization_path, minimum_probability=self.minimum_probability, random_state=self.random_state)
+ update_every=self.update_every, eval_every=self.eval_every, gamma_threshold=self.gamma_threshold,
+ serialized=self.serialized, serialization_path=self.serialization_path,
+ minimum_probability=self.minimum_probability, random_state=self.random_state
+ )
return self
def transform(self, author_names):
@@ -69,7 +72,9 @@ def transform(self, author_names):
"""
# The input as array of array
if self.gensim_model is None:
- raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.")
+ raise NotFittedError(
+ "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
+ )
check = lambda x: [x] if not isinstance(x, list) else x
author_names = check(author_names)
@@ -88,11 +93,14 @@ def partial_fit(self, X, author2doc=None, doc2author=None):
Train model over X.
"""
if self.gensim_model is None:
- self.gensim_model = models.AuthorTopicModel(corpus=X, num_topics=self.num_topics, id2word=self.id2word,
+ self.gensim_model = models.AuthorTopicModel(
+ corpus=X, num_topics=self.num_topics, id2word=self.id2word,
author2doc=self.author2doc, doc2author=self.doc2author, chunksize=self.chunksize, passes=self.passes,
iterations=self.iterations, decay=self.decay, offset=self.offset, alpha=self.alpha, eta=self.eta,
- update_every=self.update_every, eval_every=self.eval_every, gamma_threshold=self.gamma_threshold, serialized=self.serialized,
- serialization_path=self.serialization_path, minimum_probability=self.minimum_probability, random_state=self.random_state)
+ update_every=self.update_every, eval_every=self.eval_every, gamma_threshold=self.gamma_threshold,
+ serialized=self.serialized, serialization_path=self.serialization_path,
+ minimum_probability=self.minimum_probability, random_state=self.random_state
+ )
self.gensim_model.update(corpus=X, author2doc=author2doc, doc2author=doc2author)
return self
diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py
index 05d496d9b1..14163f1600 100644
--- a/gensim/sklearn_api/d2vmodel.py
+++ b/gensim/sklearn_api/d2vmodel.py
@@ -22,13 +22,10 @@ class D2VTransformer(TransformerMixin, BaseEstimator):
Base Doc2Vec module
"""
- def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0,
- dm_tag_count=1, docvecs=None, docvecs_mapfile=None,
- comment=None, trim_rule=None, size=100, alpha=0.025,
- window=5, min_count=5, max_vocab_size=None, sample=1e-3,
- seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5,
- cbow_mean=1, hashfxn=hash, iter=5, sorted_vocab=1,
- batch_words=10000):
+ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None,
+ docvecs_mapfile=None, comment=None, trim_rule=None, size=100, alpha=0.025, window=5, min_count=5,
+ max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, cbow_mean=1,
+ hashfxn=hash, iter=5, sorted_vocab=1, batch_words=10000):
"""
Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details.
"""
@@ -66,14 +63,16 @@ def fit(self, X, y=None):
Fit the model according to the given training data.
Calls gensim.models.Doc2Vec
"""
- self.gensim_model = models.Doc2Vec(documents=X, dm_mean=self.dm_mean, dm=self.dm,
+ self.gensim_model = models.Doc2Vec(
+ documents=X, dm_mean=self.dm_mean, dm=self.dm,
dbow_words=self.dbow_words, dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count,
docvecs=self.docvecs, docvecs_mapfile=self.docvecs_mapfile, comment=self.comment,
trim_rule=self.trim_rule, size=self.size, alpha=self.alpha, window=self.window,
min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample,
seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, hs=self.hs,
negative=self.negative, cbow_mean=self.cbow_mean, hashfxn=self.hashfxn,
- iter=self.iter, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words)
+ iter=self.iter, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words
+ )
return self
def transform(self, docs):
@@ -83,7 +82,9 @@ def transform(self, docs):
or a single document like : ['calculus', 'mathematical']
"""
if self.gensim_model is None:
- raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.")
+ raise NotFittedError(
+ "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
+ )
# The input as array of array
check = lambda x: [x] if isinstance(x[0], string_types) else x
diff --git a/gensim/sklearn_api/hdp.py b/gensim/sklearn_api/hdp.py
index 92265a5e8f..d1dcec01a5 100644
--- a/gensim/sklearn_api/hdp.py
+++ b/gensim/sklearn_api/hdp.py
@@ -23,10 +23,8 @@ class HdpTransformer(TransformerMixin, BaseEstimator):
Base HDP module
"""
- def __init__(self, id2word, max_chunks=None, max_time=None,
- chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1,
- gamma=1, eta=0.01, scale=1.0, var_converge=0.0001,
- outputdir=None, random_state=None):
+ def __init__(self, id2word, max_chunks=None, max_time=None, chunksize=256, kappa=1.0, tau=64.0, K=15, T=150,
+ alpha=1, gamma=1, eta=0.01, scale=1.0, var_converge=0.0001, outputdir=None, random_state=None):
"""
Sklearn api for HDP model. See gensim.models.HdpModel for parameter details.
"""
@@ -57,10 +55,12 @@ def fit(self, X, y=None):
else:
corpus = X
- self.gensim_model = models.HdpModel(corpus=corpus, id2word=self.id2word, max_chunks=self.max_chunks,
+ self.gensim_model = models.HdpModel(
+ corpus=corpus, id2word=self.id2word, max_chunks=self.max_chunks,
max_time=self.max_time, chunksize=self.chunksize, kappa=self.kappa, tau=self.tau,
K=self.K, T=self.T, alpha=self.alpha, gamma=self.gamma, eta=self.eta, scale=self.scale,
- var_converge=self.var_converge, outputdir=self.outputdir, random_state=self.random_state)
+ var_converge=self.var_converge, outputdir=self.outputdir, random_state=self.random_state
+ )
return self
def transform(self, docs):
@@ -72,7 +72,9 @@ def transform(self, docs):
or a single document like : [(4, 1), (7, 1)]
"""
if self.gensim_model is None:
- raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.")
+ raise NotFittedError(
+ "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
+ )
# The input as array of array
check = lambda x: [x] if isinstance(x[0], tuple) else x
@@ -82,7 +84,7 @@ def transform(self, docs):
max_num_topics = 0
for k, v in enumerate(docs):
X[k] = self.gensim_model[v]
- max_num_topics = max(max_num_topics, max(list(map(lambda x: x[0], X[k]))) + 1)
+ max_num_topics = max(max_num_topics, max(x[0] for x in X[k]) + 1)
for k, v in enumerate(X):
# returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
@@ -99,10 +101,12 @@ def partial_fit(self, X):
X = matutils.Sparse2Corpus(X)
if self.gensim_model is None:
- self.gensim_model = models.HdpModel(id2word=self.id2word, max_chunks=self.max_chunks,
+ self.gensim_model = models.HdpModel(
+ id2word=self.id2word, max_chunks=self.max_chunks,
max_time=self.max_time, chunksize=self.chunksize, kappa=self.kappa, tau=self.tau,
K=self.K, T=self.T, alpha=self.alpha, gamma=self.gamma, eta=self.eta, scale=self.scale,
- var_converge=self.var_converge, outputdir=self.outputdir, random_state=self.random_state)
+ var_converge=self.var_converge, outputdir=self.outputdir, random_state=self.random_state
+ )
self.gensim_model.update(corpus=X)
return self
diff --git a/gensim/sklearn_api/ldamodel.py b/gensim/sklearn_api/ldamodel.py
index 107353e2df..77d539e616 100644
--- a/gensim/sklearn_api/ldamodel.py
+++ b/gensim/sklearn_api/ldamodel.py
@@ -24,11 +24,9 @@ class LdaTransformer(TransformerMixin, BaseEstimator):
Base LDA module
"""
- def __init__(
- self, num_topics=100, id2word=None, chunksize=2000, passes=1,
- update_every=1, alpha='symmetric', eta=None, decay=0.5,
- offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001,
- minimum_probability=0.01, random_state=None, scorer='perplexity'):
+ def __init__(self, num_topics=100, id2word=None, chunksize=2000, passes=1, update_every=1, alpha='symmetric',
+ eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001,
+ minimum_probability=0.01, random_state=None, scorer='perplexity'):
"""
Sklearn wrapper for LDA model. See gensim.model.LdaModel for parameter details.
@@ -63,12 +61,14 @@ def fit(self, X, y=None):
else:
corpus = X
- self.gensim_model = models.LdaModel(corpus=corpus, num_topics=self.num_topics, id2word=self.id2word,
+ self.gensim_model = models.LdaModel(
+ corpus=corpus, num_topics=self.num_topics, id2word=self.id2word,
chunksize=self.chunksize, passes=self.passes, update_every=self.update_every,
alpha=self.alpha, eta=self.eta, decay=self.decay, offset=self.offset,
eval_every=self.eval_every, iterations=self.iterations,
gamma_threshold=self.gamma_threshold, minimum_probability=self.minimum_probability,
- random_state=self.random_state)
+ random_state=self.random_state
+ )
return self
def transform(self, docs):
@@ -109,11 +109,13 @@ def partial_fit(self, X):
X = matutils.Sparse2Corpus(X)
if self.gensim_model is None:
- self.gensim_model = models.LdaModel(num_topics=self.num_topics, id2word=self.id2word,
+ self.gensim_model = models.LdaModel(
+ num_topics=self.num_topics, id2word=self.id2word,
chunksize=self.chunksize, passes=self.passes, update_every=self.update_every,
alpha=self.alpha, eta=self.eta, decay=self.decay, offset=self.offset,
eval_every=self.eval_every, iterations=self.iterations, gamma_threshold=self.gamma_threshold,
- minimum_probability=self.minimum_probability, random_state=self.random_state)
+ minimum_probability=self.minimum_probability, random_state=self.random_state
+ )
self.gensim_model.update(corpus=X)
return self
diff --git a/gensim/sklearn_api/ldaseqmodel.py b/gensim/sklearn_api/ldaseqmodel.py
index 25b50bf95e..6b96d8d6fa 100644
--- a/gensim/sklearn_api/ldaseqmodel.py
+++ b/gensim/sklearn_api/ldaseqmodel.py
@@ -22,9 +22,9 @@ class LdaSeqTransformer(TransformerMixin, BaseEstimator):
Base LdaSeq module
"""
- def __init__(self, time_slice=None, id2word=None, alphas=0.01, num_topics=10,
- initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10,
- random_state=None, lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100):
+ def __init__(self, time_slice=None, id2word=None, alphas=0.01, num_topics=10, initialize='gensim', sstats=None,
+ lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10, random_state=None,
+ lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100):
"""
Sklearn wrapper for LdaSeq model. See gensim.models.LdaSeqModel for parameter details.
"""
@@ -50,11 +50,13 @@ def fit(self, X, y=None):
Fit the model according to the given training data.
Calls gensim.models.LdaSeqModel
"""
- self.gensim_model = models.LdaSeqModel(corpus=X, time_slice=self.time_slice, id2word=self.id2word,
+ self.gensim_model = models.LdaSeqModel(
+ corpus=X, time_slice=self.time_slice, id2word=self.id2word,
alphas=self.alphas, num_topics=self.num_topics, initialize=self.initialize, sstats=self.sstats,
lda_model=self.lda_model, obs_variance=self.obs_variance, chain_variance=self.chain_variance,
passes=self.passes, random_state=self.random_state, lda_inference_max_iter=self.lda_inference_max_iter,
- em_min_iter=self.em_min_iter, em_max_iter=self.em_max_iter, chunksize=self.chunksize)
+ em_min_iter=self.em_min_iter, em_max_iter=self.em_max_iter, chunksize=self.chunksize
+ )
return self
def transform(self, docs):
diff --git a/gensim/sklearn_api/lsimodel.py b/gensim/sklearn_api/lsimodel.py
index c44240b2ba..776af6f5da 100644
--- a/gensim/sklearn_api/lsimodel.py
+++ b/gensim/sklearn_api/lsimodel.py
@@ -24,8 +24,7 @@ class LsiTransformer(TransformerMixin, BaseEstimator):
Base LSI module
"""
- def __init__(self, num_topics=200, id2word=None, chunksize=20000,
- decay=1.0, onepass=True, power_iters=2, extra_samples=100):
+ def __init__(self, num_topics=200, id2word=None, chunksize=20000, decay=1.0, onepass=True, power_iters=2, extra_samples=100):
"""
Sklearn wrapper for LSI model. See gensim.model.LsiModel for parameter details.
"""
@@ -48,8 +47,10 @@ def fit(self, X, y=None):
else:
corpus = X
- self.gensim_model = models.LsiModel(corpus=corpus, num_topics=self.num_topics, id2word=self.id2word, chunksize=self.chunksize,
- decay=self.decay, onepass=self.onepass, power_iters=self.power_iters, extra_samples=self.extra_samples)
+ self.gensim_model = models.LsiModel(
+ corpus=corpus, num_topics=self.num_topics, id2word=self.id2word, chunksize=self.chunksize,
+ decay=self.decay, onepass=self.onepass, power_iters=self.power_iters, extra_samples=self.extra_samples
+ )
return self
def transform(self, docs):
@@ -61,7 +62,9 @@ def transform(self, docs):
or a single document like : [(4, 1), (7, 1)]
"""
if self.gensim_model is None:
- raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.")
+ raise NotFittedError(
+ "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
+ )
# The input as array of array
check = lambda x: [x] if isinstance(x[0], tuple) else x
@@ -82,8 +85,10 @@ def partial_fit(self, X):
X = matutils.Sparse2Corpus(X)
if self.gensim_model is None:
- self.gensim_model = models.LsiModel(num_topics=self.num_topics, id2word=self.id2word, chunksize=self.chunksize,
- decay=self.decay, onepass=self.onepass, power_iters=self.power_iters, extra_samples=self.extra_samples)
+ self.gensim_model = models.LsiModel(
+ num_topics=self.num_topics, id2word=self.id2word, chunksize=self.chunksize, decay=self.decay,
+ onepass=self.onepass, power_iters=self.power_iters, extra_samples=self.extra_samples
+ )
self.gensim_model.add_documents(corpus=X)
return self
diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py
index 8a944f0235..ad00c51c0e 100644
--- a/gensim/sklearn_api/phrases.py
+++ b/gensim/sklearn_api/phrases.py
@@ -21,8 +21,7 @@ class PhrasesTransformer(TransformerMixin, BaseEstimator):
Base Phrases module
"""
- def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000,
- delimiter=b'_', progress_per=10000):
+ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000, delimiter=b'_', progress_per=10000):
"""
Sklearn wrapper for Phrases model.
"""
@@ -37,8 +36,10 @@ def fit(self, X, y=None):
"""
Fit the model according to the given training data.
"""
- self.gensim_model = models.Phrases(sentences=X, min_count=self.min_count, threshold=self.threshold,
- max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per)
+ self.gensim_model = models.Phrases(
+ sentences=X, min_count=self.min_count, threshold=self.threshold,
+ max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per
+ )
return self
def transform(self, docs):
@@ -61,8 +62,10 @@ def transform(self, docs):
def partial_fit(self, X):
if self.gensim_model is None:
- self.gensim_model = models.Phrases(sentences=X, min_count=self.min_count, threshold=self.threshold,
- max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per)
+ self.gensim_model = models.Phrases(
+ sentences=X, min_count=self.min_count, threshold=self.threshold,
+ max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per
+ )
self.gensim_model.add_vocab(X)
return self
diff --git a/gensim/sklearn_api/rpmodel.py b/gensim/sklearn_api/rpmodel.py
index 8673c7d39e..62395e0bce 100644
--- a/gensim/sklearn_api/rpmodel.py
+++ b/gensim/sklearn_api/rpmodel.py
@@ -47,7 +47,9 @@ def transform(self, docs):
or a single document like : [(0, 1.0), (1, 1.0), (2, 1.0)]
"""
if self.gensim_model is None:
- raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.")
+ raise NotFittedError(
+ "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
+ )
# The input as array of array
check = lambda x: [x] if isinstance(x[0], tuple) else x
diff --git a/gensim/sklearn_api/text2bow.py b/gensim/sklearn_api/text2bow.py
index e5a96e6551..6beb126d0d 100644
--- a/gensim/sklearn_api/text2bow.py
+++ b/gensim/sklearn_api/text2bow.py
@@ -34,7 +34,7 @@ def fit(self, X, y=None):
"""
Fit the model according to the given training data.
"""
- tokenized_docs = list(map(lambda x: list(self.tokenizer(x)), X))
+ tokenized_docs = [list(self.tokenizer(x)) for x in X]
self.gensim_model = Dictionary(documents=tokenized_docs, prune_at=self.prune_at)
return self
@@ -43,12 +43,14 @@ def transform(self, docs):
Return the BOW format for the input documents.
"""
if self.gensim_model is None:
- raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.")
+ raise NotFittedError(
+ "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
+ )
# input as python lists
check = lambda x: [x] if isinstance(x, string_types) else x
docs = check(docs)
- tokenized_docs = list(map(lambda x: list(self.tokenizer(x)), docs))
+ tokenized_docs = [list(self.tokenizer(x)) for x in docs]
X = [[] for _ in range(0, len(tokenized_docs))]
for k, v in enumerate(tokenized_docs):
@@ -61,6 +63,6 @@ def partial_fit(self, X):
if self.gensim_model is None:
self.gensim_model = Dictionary(prune_at=self.prune_at)
- tokenized_docs = list(map(lambda x: list(self.tokenizer(x)), X))
+ tokenized_docs = [list(self.tokenizer(x)) for x in X]
self.gensim_model.add_documents(tokenized_docs)
return self
diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py
index ca34af6b40..414c597dc1 100644
--- a/gensim/sklearn_api/tfidf.py
+++ b/gensim/sklearn_api/tfidf.py
@@ -21,8 +21,8 @@ class TfIdfTransformer(TransformerMixin, BaseEstimator):
Base Tf-Idf module
"""
- def __init__(self, corpus=None, id2word=None, dictionary=None,
- wlocal=gensim.utils.identity, wglobal=gensim.models.tfidfmodel.df2idf, normalize=True):
+ def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity,
+ wglobal=gensim.models.tfidfmodel.df2idf, normalize=True):
"""
Sklearn wrapper for Tf-Idf model.
"""
@@ -46,7 +46,9 @@ def transform(self, docs):
Return the transformed documents after multiplication with the tf-idf matrix.
"""
if self.gensim_model is None:
- raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.")
+ raise NotFittedError(
+ "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
+ )
# input as python lists
check = lambda x: [x] if isinstance(x[0], tuple) else x
diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py
index 32d3e2ffa7..6ddea2eb90 100644
--- a/gensim/sklearn_api/w2vmodel.py
+++ b/gensim/sklearn_api/w2vmodel.py
@@ -23,10 +23,9 @@ class W2VTransformer(TransformerMixin, BaseEstimator):
Base Word2Vec module
"""
- def __init__(self, size=100, alpha=0.025, window=5, min_count=5,
- max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
- sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
- trim_rule=None, sorted_vocab=1, batch_words=10000):
+ def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1,
+ workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
+ trim_rule=None, sorted_vocab=1, batch_words=10000):
"""
Sklearn wrapper for Word2Vec model. See gensim.models.Word2Vec for parameter details.
"""
@@ -56,12 +55,14 @@ def fit(self, X, y=None):
Fit the model according to the given training data.
Calls gensim.models.Word2Vec
"""
- self.gensim_model = models.Word2Vec(sentences=X, size=self.size, alpha=self.alpha,
+ self.gensim_model = models.Word2Vec(
+ sentences=X, size=self.size, alpha=self.alpha,
window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size,
sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha,
sg=self.sg, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean,
hashfxn=self.hashfxn, iter=self.iter, null_word=self.null_word, trim_rule=self.trim_rule,
- sorted_vocab=self.sorted_vocab, batch_words=self.batch_words)
+ sorted_vocab=self.sorted_vocab, batch_words=self.batch_words
+ )
return self
def transform(self, words):
@@ -69,7 +70,9 @@ def transform(self, words):
Return the word-vectors for the input list of words.
"""
if self.gensim_model is None:
- raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.")
+ raise NotFittedError(
+ "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
+ )
# The input as array of array
check = lambda x: [x] if isinstance(x, six.string_types) else x
@@ -83,4 +86,7 @@ def transform(self, words):
return np.reshape(np.array(X), (len(words), self.size))
def partial_fit(self, X):
- raise NotImplementedError("'partial_fit' has not been implemented for W2VTransformer. However, the model can be updated with a fixed vocabulary using Gensim API call.")
+ raise NotImplementedError(
+ "'partial_fit' has not been implemented for W2VTransformer. "
+ "However, the model can be updated with a fixed vocabulary using Gensim API call."
+ )
diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py
index d634a32b54..1fb11a8d77 100644
--- a/gensim/summarization/bm25.py
+++ b/gensim/summarization/bm25.py
@@ -18,7 +18,7 @@ class BM25(object):
def __init__(self, corpus):
self.corpus_size = len(corpus)
- self.avgdl = sum(map(lambda x: float(len(x)), corpus)) / self.corpus_size
+ self.avgdl = sum(float(len(x)) for x in corpus) / self.corpus_size
self.corpus = corpus
self.f = []
self.df = {}
@@ -62,7 +62,7 @@ def get_scores(self, document, average_idf):
def get_bm25_weights(corpus):
bm25 = BM25(corpus)
- average_idf = sum(map(lambda k: float(bm25.idf[k]), bm25.idf.keys())) / len(bm25.idf.keys())
+ average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)
weights = []
for doc in corpus:
diff --git a/gensim/summarization/graph.py b/gensim/summarization/graph.py
index 8424873e35..c35a59a25d 100644
--- a/gensim/summarization/graph.py
+++ b/gensim/summarization/graph.py
@@ -242,8 +242,7 @@ def del_edge(self, edge):
self.del_edge_labeling((v, u))
def del_edge_labeling(self, edge):
- keys = [edge]
- keys.append(edge[::-1])
+ keys = [edge, edge[::-1]]
for key in keys:
for mapping in [self.edge_properties, self.edge_attr]:
diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py
index b24e6f1f04..1630c9389d 100644
--- a/gensim/summarization/keywords.py
+++ b/gensim/summarization/keywords.py
@@ -30,7 +30,7 @@ def _get_pos_filters():
return frozenset(INCLUDING_FILTER), frozenset(EXCLUDING_FILTER)
-def _get_words_for_graph(tokens, pos_filter):
+def _get_words_for_graph(tokens, pos_filter=None):
if pos_filter is None:
include_filters, exclude_filters = _get_pos_filters()
else:
@@ -97,7 +97,7 @@ def _process_text(graph, tokens, split_text):
def _queue_iterator(queue):
iterations = queue.qsize()
- for i in xrange(iterations):
+ for _ in xrange(iterations):
var = queue.get()
yield var
queue.put(var)
@@ -197,7 +197,8 @@ def _format_results(_keywords, combined_keywords, split, scores):
return "\n".join(combined_keywords)
-def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=['NN', 'JJ'], lemmatize=False, deacc=True):
+def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'),
+ lemmatize=False, deacc=True):
# Gets a dict of word -> lemma
text = to_unicode(text)
tokens = _clean_text_by_word(text, deacc=deacc)
diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py
index c067c23faf..3307a2280f 100644
--- a/gensim/summarization/summarizer.py
+++ b/gensim/summarization/summarizer.py
@@ -151,7 +151,7 @@ def summarize_corpus(corpus, ratio=0.2):
# Warns the user if there are too few documents.
if len(corpus) < INPUT_MIN_LENGTH:
- logger.warning("Input corpus is expected to have at least " + str(INPUT_MIN_LENGTH) + " documents.")
+ logger.warning("Input corpus is expected to have at least %d documents.", INPUT_MIN_LENGTH)
graph = _build_graph(hashable_corpus)
_set_graph_edge_weights(graph)
@@ -205,7 +205,7 @@ def summarize(text, ratio=0.2, word_count=None, split=False):
# Warns if the text is too short.
if len(sentences) < INPUT_MIN_LENGTH:
- logger.warning("Input text is expected to have at least " + str(INPUT_MIN_LENGTH) + " sentences.")
+ logger.warning("Input text is expected to have at least %d sentences.", INPUT_MIN_LENGTH)
corpus = _build_corpus(sentences)
diff --git a/gensim/summarization/textcleaner.py b/gensim/summarization/textcleaner.py
index 404c44b18e..fa6a56b887 100644
--- a/gensim/summarization/textcleaner.py
+++ b/gensim/summarization/textcleaner.py
@@ -97,7 +97,7 @@ def clean_text_by_word(text, deacc=True):
else:
tags = None
units = merge_syntactic_units(original_words, filtered_words, tags)
- return dict((unit.text, unit) for unit in units)
+ return {unit.text: unit for unit in units}
def tokenize_by_word(text):
diff --git a/gensim/test/basetests.py b/gensim/test/basetmtests.py
similarity index 91%
rename from gensim/test/basetests.py
rename to gensim/test/basetmtests.py
index a22bfe2d30..e8cb1d259d 100644
--- a/gensim/test/basetests.py
+++ b/gensim/test/basetmtests.py
@@ -13,27 +13,27 @@
class TestBaseTopicModel(object):
- def testPrintTopic(self):
+ def test_print_topic(self):
topics = self.model.show_topics(formatted=True)
for topic_no, topic in topics:
self.assertTrue(isinstance(topic_no, int))
self.assertTrue(isinstance(topic, str) or isinstance(topic, unicode)) # noqa:F821
- def testPrintTopics(self):
+ def test_print_topics(self):
topics = self.model.print_topics()
for topic_no, topic in topics:
self.assertTrue(isinstance(topic_no, int))
self.assertTrue(isinstance(topic, str) or isinstance(topic, unicode)) # noqa:F821
- def testShowTopic(self):
+ def test_show_topic(self):
topic = self.model.show_topic(1)
for k, v in topic:
self.assertTrue(isinstance(k, six.string_types))
self.assertTrue(isinstance(v, (np.floating, float)))
- def testShowTopics(self):
+ def test_show_topics(self):
topics = self.model.show_topics(formatted=False)
for topic_no, topic in topics:
@@ -43,7 +43,7 @@ def testShowTopics(self):
self.assertTrue(isinstance(k, six.string_types))
self.assertTrue(isinstance(v, (np.floating, float)))
- def testGetTopics(self):
+ def test_get_topics(self):
topics = self.model.get_topics()
vocab_size = len(self.model.id2word)
for topic in topics:
diff --git a/gensim/test/simspeed.py b/gensim/test/simspeed.py
index 4c1ffcab6f..7ba25fc2ea 100755
--- a/gensim/test/simspeed.py
+++ b/gensim/test/simspeed.py
@@ -27,7 +27,7 @@
if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
- logging.info("running %s" % " ".join(sys.argv))
+ logging.info("running %s", " ".join(sys.argv))
# check and process cmdline input
program = os.path.basename(sys.argv[0])
@@ -54,8 +54,10 @@
# because it needs to convert sparse vecs to np arrays and normalize them to
# unit length=extra work, which #3 avoids.
query = list(itertools.islice(corpus_dense, 1000))
- logging.info("test 1 (dense): dense corpus of %i docs vs. index (%i documents, %i dense features)" %
- (len(query), len(index_dense), index_dense.num_features))
+ logging.info(
+ "test 1 (dense): dense corpus of %i docs vs. index (%i documents, %i dense features)",
+ len(query), len(index_dense), index_dense.num_features
+ )
for chunksize in [1, 4, 8, 16, 64, 128, 256, 512, 1024]:
start = time()
if chunksize > 1:
@@ -68,13 +70,17 @@
assert len(sims) == len(query) # make sure we have one result for each query document
taken = time() - start
queries = math.ceil(1.0 * len(query) / chunksize)
- logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)" %
- (chunksize, taken, len(query) / taken, queries / taken))
+ logging.info(
+ "chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)",
+ chunksize, taken, len(query) / taken, queries / taken
+ )
# Same comment as for test #1 but vs. test #4.
query = list(itertools.islice(corpus_sparse, 1000))
- logging.info("test 2 (sparse): sparse corpus of %i docs vs. sparse index (%i documents, %i features, %.2f%% density)" %
- (len(query), len(corpus_sparse), index_sparse.index.shape[1], density))
+ logging.info(
+ "test 2 (sparse): sparse corpus of %i docs vs. sparse index (%i documents, %i features, %.2f%% density)",
+ len(query), len(corpus_sparse), index_sparse.index.shape[1], density
+ )
for chunksize in [1, 5, 10, 100, 500, 1000]:
start = time()
if chunksize > 1:
@@ -87,28 +93,37 @@
assert len(sims) == len(query) # make sure we have one result for each query document
taken = time() - start
queries = math.ceil(1.0 * len(query) / chunksize)
- logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)" %
- (chunksize, taken, len(query) / taken, queries / taken))
-
- logging.info("test 3 (dense): similarity of all vs. all (%i documents, %i dense features)" %
- (len(corpus_dense), index_dense.num_features))
+ logging.info(
+ "chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)",
+ chunksize, taken, len(query) / taken, queries / taken
+ )
+
+ logging.info(
+ "test 3 (dense): similarity of all vs. all (%i documents, %i dense features)",
+ len(corpus_dense), index_dense.num_features
+ )
for chunksize in [0, 1, 4, 8, 16, 64, 128, 256, 512, 1024]:
index_dense.chunksize = chunksize
start = time()
# `sims` stores the entire N x N sim matrix in memory!
# this is not necessary, but i added it to test the accuracy of the result
# (=report mean diff below)
- sims = [sim for sim in index_dense]
+ sims = list(index_dense)
taken = time() - start
sims = np.asarray(sims)
if chunksize == 0:
- logging.info("chunksize=%i, time=%.4fs (%.2f docs/s)" % (chunksize, taken, len(corpus_dense) / taken))
+ logging.info(
+ "chunksize=%i, time=%.4fs (%.2f docs/s)",
+ chunksize, taken, len(corpus_dense) / taken
+ )
unchunksizeed = sims
else:
queries = math.ceil(1.0 * len(corpus_dense) / chunksize)
diff = np.mean(np.abs(unchunksizeed - sims))
- logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s), meandiff=%.3e" %
- (chunksize, taken, len(corpus_dense) / taken, queries / taken, diff))
+ logging.info(
+ "chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s), meandiff=%.3e",
+ chunksize, taken, len(corpus_dense) / taken, queries / taken, diff
+ )
del sims
index_dense.num_best = 10
@@ -116,32 +131,41 @@
for chunksize in [0, 1, 4, 8, 16, 64, 128, 256, 512, 1024]:
index_dense.chunksize = chunksize
start = time()
- sims = [sim for sim in index_dense]
+ sims = list(index_dense)
taken = time() - start
if chunksize == 0:
queries = len(corpus_dense)
else:
queries = math.ceil(1.0 * len(corpus_dense) / chunksize)
- logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)" %
- (chunksize, taken, len(corpus_dense) / taken, queries / taken))
+ logging.info(
+ "chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)",
+ chunksize, taken, len(corpus_dense) / taken, queries / taken
+ )
index_dense.num_best = None
- logging.info("test 5 (sparse): similarity of all vs. all (%i documents, %i features, %.2f%% density)" %
- (len(corpus_sparse), index_sparse.index.shape[1], density))
+ logging.info(
+ "test 5 (sparse): similarity of all vs. all (%i documents, %i features, %.2f%% density)",
+ len(corpus_sparse), index_sparse.index.shape[1], density
+ )
for chunksize in [0, 5, 10, 100, 500, 1000, 5000]:
index_sparse.chunksize = chunksize
start = time()
- sims = [sim for sim in index_sparse]
+ sims = list(index_sparse)
taken = time() - start
sims = np.asarray(sims)
if chunksize == 0:
- logging.info("chunksize=%i, time=%.4fs (%.2f docs/s)" % (chunksize, taken, len(corpus_sparse) / taken))
+ logging.info(
+ "chunksize=%i, time=%.4fs (%.2f docs/s)",
+ chunksize, taken, len(corpus_sparse) / taken
+ )
unchunksizeed = sims
else:
queries = math.ceil(1.0 * len(corpus_sparse) / chunksize)
diff = np.mean(np.abs(unchunksizeed - sims))
- logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s), meandiff=%.3e" %
- (chunksize, taken, len(corpus_sparse) / taken, queries / taken, diff))
+ logging.info(
+ "chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s), meandiff=%.3e",
+ chunksize, taken, len(corpus_sparse) / taken, queries / taken, diff
+ )
del sims
index_sparse.num_best = 10
@@ -149,14 +173,16 @@
for chunksize in [0, 5, 10, 100, 500, 1000, 5000]:
index_sparse.chunksize = chunksize
start = time()
- sims = [sim for sim in index_sparse]
+ sims = list(index_sparse)
taken = time() - start
if chunksize == 0:
queries = len(corpus_sparse)
else:
queries = math.ceil(1.0 * len(corpus_sparse) / chunksize)
- logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)" %
- (chunksize, taken, len(corpus_sparse) / taken, queries / taken))
+ logging.info(
+ "chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)",
+ chunksize, taken, len(corpus_sparse) / taken, queries / taken
+ )
index_sparse.num_best = None
- logging.info("finished running %s" % program)
+ logging.info("finished running %s", program)
diff --git a/gensim/test/simspeed2.py b/gensim/test/simspeed2.py
index 334730a6f1..931caef950 100755
--- a/gensim/test/simspeed2.py
+++ b/gensim/test/simspeed2.py
@@ -25,7 +25,7 @@
if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
- logging.info("running %s" % " ".join(sys.argv))
+ logging.info("running %s", " ".join(sys.argv))
# check and process cmdline input
program = os.path.basename(sys.argv[0])
@@ -47,8 +47,10 @@
density = 100.0 * sum(shard.num_nnz for shard in index_sparse.shards) / (len(index_sparse) * sparse_features)
- logging.info("test 1 (dense): similarity of all vs. all (%i documents, %i dense features)" %
- (len(corpus_dense), index_dense.num_features))
+ logging.info(
+ "test 1 (dense): similarity of all vs. all (%i documents, %i dense features)",
+ len(corpus_dense), index_dense.num_features
+ )
for chunksize in [1, 8, 32, 64, 128, 256, 512, 1024, index_dense.shardsize]:
index_dense.chunksize = chunksize
start = time()
@@ -56,8 +58,10 @@
pass
taken = time() - start
queries = math.ceil(1.0 * len(corpus_dense) / chunksize)
- logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)" %
- (chunksize, taken, len(corpus_dense) / taken, queries / taken))
+ logging.info(
+ "chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)",
+ chunksize, taken, len(corpus_dense) / taken, queries / taken
+ )
index_dense.num_best = 10
logging.info("test 2 (dense): as above, but only ask for the top-10 most similar for each document")
@@ -67,12 +71,17 @@
sims = [sim for sim in index_dense]
taken = time() - start
queries = math.ceil(1.0 * len(corpus_dense) / chunksize)
- logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)" %
- (chunksize, taken, len(corpus_dense) / taken, queries / taken))
+ logging.info(
+ "chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)",
+ chunksize, taken, len(corpus_dense) / taken, queries / taken
+ )
index_dense.num_best = None
- logging.info("test 3 (sparse): similarity of all vs. all (%i documents, %i features, %.2f%% density)" %
- (len(corpus_sparse), index_sparse.num_features, density))
+ logging.info(
+ "test 3 (sparse): similarity of all vs. all (%i documents, %i features, %.2f%% density)",
+ len(corpus_sparse), index_sparse.num_features, density
+ )
+
for chunksize in [1, 5, 10, 100, 256, 500, 1000, index_sparse.shardsize]:
index_sparse.chunksize = chunksize
start = time()
@@ -80,8 +89,10 @@
pass
taken = time() - start
queries = math.ceil(1.0 * len(corpus_sparse) / chunksize)
- logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)" %
- (chunksize, taken, len(corpus_sparse) / taken, queries / taken))
+ logging.info(
+ "chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)",
+ chunksize, taken, len(corpus_sparse) / taken, queries / taken
+ )
index_sparse.num_best = 10
logging.info("test 4 (sparse): as above, but only ask for the top-10 most similar for each document")
@@ -92,8 +103,10 @@
pass
taken = time() - start
queries = math.ceil(1.0 * len(corpus_sparse) / chunksize)
- logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)" %
- (chunksize, taken, len(corpus_sparse) / taken, queries / taken))
+ logging.info(
+ "chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)",
+ chunksize, taken, len(corpus_sparse) / taken, queries / taken
+ )
index_sparse.num_best = None
# Difference between test #5 and test #1 is that the query in #5 is a gensim iterable
@@ -101,8 +114,10 @@
# because it needs to convert sparse vecs to numpy arrays and normalize them to
# unit length=extra work, which #1 avoids.
query = list(itertools.islice(corpus_dense, 1000))
- logging.info("test 5 (dense): dense corpus of %i docs vs. index (%i documents, %i dense features)" %
- (len(query), len(index_dense), index_dense.num_features))
+ logging.info(
+ "test 5 (dense): dense corpus of %i docs vs. index (%i documents, %i dense features)",
+ len(query), len(index_dense), index_dense.num_features
+ )
for chunksize in [1, 8, 32, 64, 128, 256, 512, 1024]:
start = time()
if chunksize > 1:
@@ -114,13 +129,17 @@
_ = index_dense[vec]
taken = time() - start
queries = math.ceil(1.0 * len(query) / chunksize)
- logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)" %
- (chunksize, taken, len(query) / taken, queries / taken))
+ logging.info(
+ "chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)",
+ chunksize, taken, len(query) / taken, queries / taken
+ )
# Same comment as for test #5.
query = list(itertools.islice(corpus_dense, 1000))
- logging.info("test 6 (sparse): sparse corpus of %i docs vs. sparse index (%i documents, %i features, %.2f%% density)" %
- (len(query), len(corpus_sparse), index_sparse.num_features, density))
+ logging.info(
+ "test 6 (sparse): sparse corpus of %i docs vs. sparse index (%i documents, %i features, %.2f%% density)",
+ len(query), len(corpus_sparse), index_sparse.num_features, density
+ )
for chunksize in [1, 5, 10, 100, 500, 1000]:
start = time()
if chunksize > 1:
@@ -132,7 +151,9 @@
_ = index_sparse[vec]
taken = time() - start
queries = math.ceil(1.0 * len(query) / chunksize)
- logging.info("chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)" %
- (chunksize, taken, len(query) / taken, queries / taken))
+ logging.info(
+ "chunksize=%i, time=%.4fs (%.2f docs/s, %.2f queries/s)",
+ chunksize, taken, len(query) / taken, queries / taken
+ )
- logging.info("finished running %s" % program)
+ logging.info("finished running %s", program)
diff --git a/gensim/test/svd_error.py b/gensim/test/svd_error.py
index 4f204c1147..e6ab11bb78 100755
--- a/gensim/test/svd_error.py
+++ b/gensim/test/svd_error.py
@@ -51,7 +51,7 @@
def norm2(a):
"""Spectral norm ("norm 2") of a symmetric matrix `a`."""
if COMPUTE_NORM2:
- logging.info("computing spectral norm of a %s matrix" % str(a.shape))
+ logging.info("computing spectral norm of a %s matrix", str(a.shape))
return scipy.linalg.eigvalsh(a).max() # much faster than np.linalg.norm(2)
else:
return np.nan
@@ -65,8 +65,10 @@ def print_error(name, aat, u, s, ideal_nf, ideal_n2):
err = -np.dot(u, np.dot(np.diag(s), u.T))
err += aat
nf, n2 = np.linalg.norm(err), norm2(err)
- print('%s error: norm_frobenius=%f (/ideal=%g), norm2=%f (/ideal=%g), RMSE=%g' %
- (name, nf, nf / ideal_nf, n2, n2 / ideal_n2, rmse(err)))
+ print(
+ '%s error: norm_frobenius=%f (/ideal=%g), norm2=%f (/ideal=%g), RMSE=%g' %
+ (name, nf, nf / ideal_nf, n2, n2 / ideal_n2, rmse(err))
+ )
sys.stdout.flush()
@@ -82,7 +84,7 @@ def __iter__(self):
if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
- logging.info("running %s" % " ".join(sys.argv))
+ logging.info("running %s", " ".join(sys.argv))
program = os.path.basename(sys.argv[0])
# do we have enough cmd line arguments?
@@ -105,7 +107,7 @@ def __iter__(self):
m = int(sys.argv[3])
else:
m = mm.num_terms
- logging.info("using %i documents and %i features" % (n, m))
+ logging.info("using %i documents and %i features", n, m)
corpus = ClippedCorpus(mm, n, m)
id2word = gensim.utils.FakeDict(m)
@@ -137,7 +139,7 @@ def __iter__(self):
print_error("baseline", aat,
np.zeros((m, factors)), np.zeros((factors)), ideal_fro, ideal_n2)
if sparsesvd:
- logging.info("computing SVDLIBC SVD for %i factors" % (factors))
+ logging.info("computing SVDLIBC SVD for %i factors", factors)
taken = time.time()
corpus_ram = gensim.matutils.corpus2csc(corpus, num_terms=m)
ut, s, vt = sparsesvd(corpus_ram, factors)
@@ -152,30 +154,41 @@ def __iter__(self):
del u
for power_iters in POWER_ITERS:
for chunksize in CHUNKSIZE:
- logging.info("computing incremental SVD for %i factors, %i power iterations, chunksize %i" %
- (factors, power_iters, chunksize))
+ logging.info(
+ "computing incremental SVD for %i factors, %i power iterations, chunksize %i",
+ factors, power_iters, chunksize
+ )
taken = time.time()
gensim.models.lsimodel.P2_EXTRA_ITERS = power_iters
- model = gensim.models.LsiModel(corpus, id2word=id2word, num_topics=factors,
- chunksize=chunksize, power_iters=power_iters)
+ model = gensim.models.LsiModel(
+ corpus, id2word=id2word, num_topics=factors,
+ chunksize=chunksize, power_iters=power_iters
+ )
taken = time.time() - taken
u, s = model.projection.u.astype(np.float32), model.projection.s.astype(np.float32)**2
del model
- print("incremental SVD for %i factors, %i power iterations, chunksize %i took %s s (spectrum %f .. %f)" %
- (factors, power_iters, chunksize, taken, s[0], s[-1]))
+ print(
+ "incremental SVD for %i factors, %i power iterations, "
+ "chunksize %i took %s s (spectrum %f .. %f)" %
+ (factors, power_iters, chunksize, taken, s[0], s[-1])
+ )
print_error('incremental SVD', aat, u, s, ideal_fro, ideal_n2)
del u
- logging.info("computing multipass SVD for %i factors, %i power iterations" %
- (factors, power_iters,))
+ logging.info("computing multipass SVD for %i factors, %i power iterations", factors, power_iters)
taken = time.time()
- model = gensim.models.LsiModel(corpus, id2word=id2word, num_topics=factors, chunksize=2000,
- onepass=False, power_iters=power_iters)
+ model = gensim.models.LsiModel(
+ corpus, id2word=id2word, num_topics=factors, chunksize=2000,
+ onepass=False, power_iters=power_iters
+ )
taken = time.time() - taken
u, s = model.projection.u.astype(np.float32), model.projection.s.astype(np.float32)**2
del model
- print("multipass SVD for %i factors, %i power iterations took %s s (spectrum %f .. %f)" %
- (factors, power_iters, taken, s[0], s[-1]))
+ print(
+ "multipass SVD for %i factors, "
+ "%i power iterations took %s s (spectrum %f .. %f)" %
+ (factors, power_iters, taken, s[0], s[-1])
+ )
print_error('multipass SVD', aat, u, s, ideal_fro, ideal_n2)
del u
- logging.info("finished running %s" % program)
+ logging.info("finished running %s", program)
diff --git a/gensim/test/test_atmodel.py b/gensim/test/test_atmodel.py
index 17cf1619f9..97bffde623 100644
--- a/gensim/test/test_atmodel.py
+++ b/gensim/test/test_atmodel.py
@@ -26,7 +26,7 @@
from gensim.corpora import mmcorpus, Dictionary
from gensim.models import atmodel
from gensim import matutils
-from gensim.test import basetests
+from gensim.test import basetmtests
# TODO:
# Test that computing the bound on new unseen documents works as expected (this is somewhat different
@@ -40,23 +40,38 @@
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
# set up vars used in testing ("Deerwester" from the web tutorial)
-texts = [['human', 'interface', 'computer'],
- ['survey', 'user', 'computer', 'system', 'response', 'time'],
- ['eps', 'user', 'interface', 'system'],
- ['system', 'human', 'system', 'eps'],
- ['user', 'response', 'time'],
- ['trees'],
- ['graph', 'trees'],
- ['graph', 'minors', 'trees'],
- ['graph', 'minors', 'survey']]
+texts = [
+ ['human', 'interface', 'computer'],
+ ['survey', 'user', 'computer', 'system', 'response', 'time'],
+ ['eps', 'user', 'interface', 'system'],
+ ['system', 'human', 'system', 'eps'],
+ ['user', 'response', 'time'],
+ ['trees'],
+ ['graph', 'trees'],
+ ['graph', 'minors', 'trees'],
+ ['graph', 'minors', 'survey']
+]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# Assign some authors randomly to the documents above.
-author2doc = {'john': [0, 1, 2, 3, 4, 5, 6], 'jane': [2, 3, 4, 5, 6, 7, 8], 'jack': [0, 2, 4, 6, 8], 'jill': [1, 3, 5, 7]}
-doc2author = {0: ['john', 'jack'], 1: ['john', 'jill'], 2: ['john', 'jane', 'jack'], 3: ['john', 'jane', 'jill'],
- 4: ['john', 'jane', 'jack'], 5: ['john', 'jane', 'jill'], 6: ['john', 'jane', 'jack'], 7: ['jane', 'jill'],
- 8: ['jane', 'jack']}
+author2doc = {
+ 'john': [0, 1, 2, 3, 4, 5, 6],
+ 'jane': [2, 3, 4, 5, 6, 7, 8],
+ 'jack': [0, 2, 4, 6, 8],
+ 'jill': [1, 3, 5, 7]
+}
+doc2author = {
+ 0: ['john', 'jack'],
+ 1: ['john', 'jill'],
+ 2: ['john', 'jane', 'jack'],
+ 3: ['john', 'jane', 'jill'],
+ 4: ['john', 'jane', 'jack'],
+ 5: ['john', 'jane', 'jill'],
+ 6: ['john', 'jane', 'jack'],
+ 7: ['jane', 'jill'],
+ 8: ['jane', 'jack']
+}
# More data with new and old authors (to test update method).
# Although the text is just a subset of the previous, the model
@@ -73,7 +88,7 @@ def testfile(test_fname=''):
return os.path.join(tempfile.gettempdir(), fname)
-class TestAuthorTopicModel(unittest.TestCase, basetests.TestBaseTopicModel):
+class TestAuthorTopicModel(unittest.TestCase, basetmtests.TestBaseTopicModel):
def setUp(self):
self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
self.class_ = atmodel.AuthorTopicModel
@@ -101,8 +116,10 @@ def testTransform(self):
passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1) # must contain the same values, up to re-ordering
if passed:
break
- logging.warning("Author-topic model failed to converge on attempt %i (got %s, expected %s)" %
- (i, sorted(vec), sorted(expected)))
+ logging.warning(
+ "Author-topic model failed to converge on attempt %i (got %s, expected %s)",
+ i, sorted(vec), sorted(expected)
+ )
self.assertTrue(passed)
def testBasic(self):
@@ -117,8 +134,14 @@ def testBasic(self):
def testAuthor2docMissing(self):
# Check that the results are the same if author2doc is constructed automatically from doc2author.
- model = self.class_(corpus, author2doc=author2doc, doc2author=doc2author, id2word=dictionary, num_topics=2, random_state=0)
- model2 = self.class_(corpus, doc2author=doc2author, id2word=dictionary, num_topics=2, random_state=0)
+ model = self.class_(
+ corpus, author2doc=author2doc, doc2author=doc2author,
+ id2word=dictionary, num_topics=2, random_state=0
+ )
+ model2 = self.class_(
+ corpus, doc2author=doc2author, id2word=dictionary,
+ num_topics=2, random_state=0
+ )
# Compare Jill's topics before in both models.
jill_topics = model.get_author_topics('jill')
@@ -129,8 +152,14 @@ def testAuthor2docMissing(self):
def testDoc2authorMissing(self):
# Check that the results are the same if doc2author is constructed automatically from author2doc.
- model = self.class_(corpus, author2doc=author2doc, doc2author=doc2author, id2word=dictionary, num_topics=2, random_state=0)
- model2 = self.class_(corpus, author2doc=author2doc, id2word=dictionary, num_topics=2, random_state=0)
+ model = self.class_(
+ corpus, author2doc=author2doc, doc2author=doc2author,
+ id2word=dictionary, num_topics=2, random_state=0
+ )
+ model2 = self.class_(
+ corpus, author2doc=author2doc, id2word=dictionary,
+ num_topics=2, random_state=0
+ )
# Compare Jill's topics before in both models.
jill_topics = model.get_author_topics('jill')
@@ -185,7 +214,10 @@ def testUpdateNewDataNewAuthor(self):
def testSerialized(self):
# Test the model using serialized corpora. Basic tests, plus test of update functionality.
- model = self.class_(self.corpus, author2doc=author2doc, id2word=dictionary, num_topics=2, serialized=True, serialization_path=datapath('testcorpus_serialization.mm'))
+ model = self.class_(
+ self.corpus, author2doc=author2doc, id2word=dictionary, num_topics=2,
+ serialized=True, serialization_path=datapath('testcorpus_serialization.mm')
+ )
jill_topics = model.get_author_topics('jill')
jill_topics = matutils.sparse2full(jill_topics, model.num_topics)
@@ -216,7 +248,10 @@ def testTransformSerialized(self):
# better random initialization
         for i in range(25): # restart at most 25 times
# create the transformation model
- model = self.class_(id2word=dictionary, num_topics=2, passes=100, random_state=0, serialized=True, serialization_path=datapath('testcorpus_serialization.mm'))
+ model = self.class_(
+ id2word=dictionary, num_topics=2, passes=100, random_state=0,
+ serialized=True, serialization_path=datapath('testcorpus_serialization.mm')
+ )
model.update(self.corpus, author2doc)
jill_topics = model.get_author_topics('jill')
@@ -234,13 +269,21 @@ def testTransformSerialized(self):
remove(datapath('testcorpus_serialization.mm'))
if passed:
break
- logging.warning("Author-topic model failed to converge on attempt %i (got %s, expected %s)" %
- (i, sorted(vec), sorted(expected)))
+ logging.warning(
+ "Author-topic model failed to converge on attempt %i (got %s, expected %s)",
+ i, sorted(vec), sorted(expected)
+ )
self.assertTrue(passed)
def testAlphaAuto(self):
- model1 = self.class_(corpus, author2doc=author2doc, id2word=dictionary, alpha='symmetric', passes=10, num_topics=2)
- modelauto = self.class_(corpus, author2doc=author2doc, id2word=dictionary, alpha='auto', passes=10, num_topics=2)
+ model1 = self.class_(
+ corpus, author2doc=author2doc, id2word=dictionary,
+ alpha='symmetric', passes=10, num_topics=2
+ )
+ modelauto = self.class_(
+ corpus, author2doc=author2doc, id2word=dictionary,
+ alpha='auto', passes=10, num_topics=2
+ )
# did we learn something?
self.assertFalse(all(np.equal(model1.alpha, modelauto.alpha)))
@@ -301,8 +344,14 @@ def testAlpha(self):
self.assertRaises(ValueError, self.class_, **kwargs)
def testEtaAuto(self):
- model1 = self.class_(corpus, author2doc=author2doc, id2word=dictionary, eta='symmetric', passes=10, num_topics=2)
- modelauto = self.class_(corpus, author2doc=author2doc, id2word=dictionary, eta='auto', passes=10, num_topics=2)
+ model1 = self.class_(
+ corpus, author2doc=author2doc, id2word=dictionary,
+ eta='symmetric', passes=10, num_topics=2
+ )
+ modelauto = self.class_(
+ corpus, author2doc=author2doc, id2word=dictionary,
+ eta='auto', passes=10, num_topics=2
+ )
# did we learn something?
self.assertFalse(all(np.equal(model1.eta, modelauto.eta)))
@@ -388,7 +437,10 @@ def testGetTopicTerms(self):
def testGetAuthorTopics(self):
- model = self.class_(corpus, author2doc=author2doc, id2word=dictionary, num_topics=2, passes=100, random_state=np.random.seed(0))
+ model = self.class_(
+ corpus, author2doc=author2doc, id2word=dictionary, num_topics=2,
+ passes=100, random_state=np.random.seed(0)
+ )
author_topics = []
for a in model.id2author.values():
@@ -402,7 +454,10 @@ def testGetAuthorTopics(self):
def testTermTopics(self):
- model = self.class_(corpus, author2doc=author2doc, id2word=dictionary, num_topics=2, passes=100, random_state=np.random.seed(0))
+ model = self.class_(
+ corpus, author2doc=author2doc, id2word=dictionary, num_topics=2,
+ passes=100, random_state=np.random.seed(0)
+ )
# check with word_type
result = model.get_term_topics(2)
@@ -436,7 +491,7 @@ def testPasses(self):
for test_rhot in test_rhots:
model.update(corpus, author2doc)
- msg = ", ".join(map(str, [passes, model.num_updates, model.state.numdocs]))
+ msg = "{}, {}, {}".format(passes, model.num_updates, model.state.numdocs)
self.assertAlmostEqual(final_rhot(), test_rhot, msg=msg)
self.assertEqual(model.state.numdocs, len(corpus) * len(test_rhots))
diff --git a/gensim/test/test_big.py b/gensim/test/test_big.py
index 5e6972bd1f..abf19c63c7 100644
--- a/gensim/test/test_big.py
+++ b/gensim/test/test_big.py
@@ -38,7 +38,7 @@ def __iter__(self):
doc_len = np.random.poisson(self.doc_len)
ids = np.random.randint(0, len(self.dictionary), doc_len)
if self.words_only:
- yield [str(id) for id in ids]
+ yield [str(idx) for idx in ids]
else:
weights = np.random.poisson(3, doc_len)
yield sorted(zip(ids, weights))
@@ -53,21 +53,21 @@ def testWord2Vec(self):
model = gensim.models.Word2Vec(corpus, size=300, workers=4)
model.save(testfile(), ignore=['syn1'])
del model
- model = gensim.models.Word2Vec.load(testfile())
+ gensim.models.Word2Vec.load(testfile())
def testLsiModel(self):
corpus = BigCorpus(num_docs=50000)
model = gensim.models.LsiModel(corpus, num_topics=500, id2word=corpus.dictionary)
model.save(testfile())
del model
- model = gensim.models.LsiModel.load(testfile())
+ gensim.models.LsiModel.load(testfile())
def testLdaModel(self):
corpus = BigCorpus(num_docs=5000)
model = gensim.models.LdaModel(corpus, num_topics=500, id2word=corpus.dictionary)
model.save(testfile())
del model
- model = gensim.models.LdaModel.load(testfile())
+ gensim.models.LdaModel.load(testfile())
if __name__ == '__main__':
diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py
index 039db55a48..229d87d1df 100644
--- a/gensim/test/test_coherencemodel.py
+++ b/gensim/test/test_coherencemodel.py
@@ -53,32 +53,39 @@ def setUp(self):
# `topics1` is clearly better as it has a clear distinction between system-human
# interaction and graphs. Hence both the coherence measures for `topics1` should be
# greater.
- self.topics1 = [['human', 'computer', 'system', 'interface'],
- ['graph', 'minors', 'trees', 'eps']]
- self.topics2 = [['user', 'graph', 'minors', 'system'],
- ['time', 'graph', 'survey', 'minors']]
+ self.topics1 = [
+ ['human', 'computer', 'system', 'interface'],
+ ['graph', 'minors', 'trees', 'eps']
+ ]
+ self.topics2 = [
+ ['user', 'graph', 'minors', 'system'],
+ ['time', 'graph', 'survey', 'minors']
+ ]
self.ldamodel = LdaModel(
corpus=self.corpus, id2word=self.dictionary, num_topics=2,
- passes=0, iterations=0)
+ passes=0, iterations=0
+ )
mallet_home = os.environ.get('MALLET_HOME', None)
self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet') if mallet_home else None
if self.mallet_path:
self.malletmodel = LdaMallet(
mallet_path=self.mallet_path, corpus=self.corpus,
- id2word=self.dictionary, num_topics=2, iterations=0)
+ id2word=self.dictionary, num_topics=2, iterations=0
+ )
vw_path = os.environ.get('VOWPAL_WABBIT_PATH', None)
if not vw_path:
logging.info(
- "Environment variable 'VOWPAL_WABBIT_PATH' not specified,"
- " skipping sanity checks for LDA Model")
+ "Environment variable 'VOWPAL_WABBIT_PATH' not specified, skipping sanity checks for LDA Model"
+ )
self.vw_path = None
else:
self.vw_path = vw_path
self.vwmodel = LdaVowpalWabbit(
self.vw_path, corpus=self.corpus, id2word=self.dictionary,
- num_topics=2, passes=0)
+ num_topics=2, passes=0
+ )
def check_coherence_measure(self, coherence):
"""Check provided topic coherence algorithm on given topics"""
@@ -179,20 +186,24 @@ def testErrors(self):
# not providing dictionary
self.assertRaises(
ValueError, CoherenceModel, topics=self.topics1, corpus=self.corpus,
- coherence='u_mass')
+ coherence='u_mass'
+ )
# not providing texts for c_v and instead providing corpus
self.assertRaises(
ValueError, CoherenceModel, topics=self.topics1, corpus=self.corpus,
- dictionary=self.dictionary, coherence='c_v')
+ dictionary=self.dictionary, coherence='c_v'
+ )
# not providing corpus or texts for u_mass
self.assertRaises(
ValueError, CoherenceModel, topics=self.topics1, dictionary=self.dictionary,
- coherence='u_mass')
+ coherence='u_mass'
+ )
def testPersistence(self):
fname = testfile()
model = CoherenceModel(
- topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass')
+ topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass'
+ )
model.save(fname)
model2 = CoherenceModel.load(fname)
self.assertTrue(model.get_coherence() == model2.get_coherence())
@@ -200,7 +211,8 @@ def testPersistence(self):
def testPersistenceCompressed(self):
fname = testfile() + '.gz'
model = CoherenceModel(
- topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass')
+ topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass'
+ )
model.save(fname)
model2 = CoherenceModel.load(fname)
self.assertTrue(model.get_coherence() == model2.get_coherence())
@@ -208,7 +220,8 @@ def testPersistenceCompressed(self):
def testPersistenceAfterProbabilityEstimationUsingCorpus(self):
fname = testfile()
model = CoherenceModel(
- topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass')
+ topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass'
+ )
model.estimate_probabilities()
model.save(fname)
model2 = CoherenceModel.load(fname)
@@ -218,7 +231,8 @@ def testPersistenceAfterProbabilityEstimationUsingCorpus(self):
def testPersistenceAfterProbabilityEstimationUsingTexts(self):
fname = testfile()
model = CoherenceModel(
- topics=self.topics1, texts=self.texts, dictionary=self.dictionary, coherence='c_v')
+ topics=self.topics1, texts=self.texts, dictionary=self.dictionary, coherence='c_v'
+ )
model.estimate_probabilities()
model.save(fname)
model2 = CoherenceModel.load(fname)
diff --git a/gensim/test/test_corpora_dictionary.py b/gensim/test/test_corpora_dictionary.py
index e5a5786613..f6c7d8b43c 100644
--- a/gensim/test/test_corpora_dictionary.py
+++ b/gensim/test/test_corpora_dictionary.py
@@ -43,7 +43,8 @@ def setUp(self):
['trees'],
['graph', 'trees'],
['graph', 'minors', 'trees'],
- ['graph', 'minors', 'survey']]
+ ['graph', 'minors', 'survey']
+ ]
def testDocFreqOneDoc(self):
texts = [['human', 'interface', 'computer']]
@@ -102,9 +103,10 @@ def testBuild(self):
self.assertEqual(sorted(d.dfs.keys()), expected_keys)
self.assertEqual(sorted(d.dfs.values()), expected_values)
- expected_keys = sorted(['computer', 'eps', 'graph', 'human',
- 'interface', 'minors', 'response', 'survey',
- 'system', 'time', 'trees', 'user'])
+ expected_keys = sorted([
+ 'computer', 'eps', 'graph', 'human', 'interface',
+ 'minors', 'response', 'survey', 'system', 'time', 'trees', 'user'
+ ])
expected_values = list(range(12))
self.assertEqual(sorted(d.token2id.keys()), expected_keys)
self.assertEqual(sorted(d.token2id.values()), expected_values)
@@ -127,21 +129,21 @@ def testFilterKeepTokens_keepTokens(self):
# provide keep_tokens argument, keep the tokens given
d = Dictionary(self.texts)
d.filter_extremes(no_below=3, no_above=1.0, keep_tokens=['human', 'survey'])
- expected = set(['graph', 'trees', 'human', 'system', 'user', 'survey'])
+ expected = {'graph', 'trees', 'human', 'system', 'user', 'survey'}
self.assertEqual(set(d.token2id.keys()), expected)
def testFilterKeepTokens_unchangedFunctionality(self):
# do not provide keep_tokens argument, filter_extremes functionality is unchanged
d = Dictionary(self.texts)
d.filter_extremes(no_below=3, no_above=1.0)
- expected = set(['graph', 'trees', 'system', 'user'])
+ expected = {'graph', 'trees', 'system', 'user'}
self.assertEqual(set(d.token2id.keys()), expected)
def testFilterKeepTokens_unseenToken(self):
# do provide keep_tokens argument with unseen tokens, filter_extremes functionality is unchanged
d = Dictionary(self.texts)
d.filter_extremes(no_below=3, no_above=1.0, keep_tokens=['unknown_token'])
- expected = set(['graph', 'trees', 'system', 'user'])
+ expected = {'graph', 'trees', 'system', 'user'}
self.assertEqual(set(d.token2id.keys()), expected)
def testFilterMostFrequent(self):
@@ -160,7 +162,8 @@ def testFilterTokens(self):
expected = {
'computer': 0, 'eps': 8, 'graph': 10, 'human': 1,
'interface': 2, 'minors': 11, 'response': 3, 'survey': 4,
- 'system': 5, 'time': 6, 'trees': 9, 'user': 7}
+ 'system': 5, 'time': 6, 'trees': 9, 'user': 7
+ }
del expected[removed_word]
self.assertEqual(sorted(d.token2id.keys()), sorted(expected.keys()))
@@ -186,7 +189,8 @@ def test_saveAsText(self):
small_text = [
["prvé", "slovo"],
["slovo", "druhé"],
- ["druhé", "slovo"]]
+ ["druhé", "slovo"]
+ ]
d = Dictionary(small_text)
@@ -264,7 +268,8 @@ def test_from_corpus(self):
"The generation of random binary unordered trees",
"The intersection graph of paths in trees",
"Graph minors IV Widths of trees and well quasi ordering",
- "Graph minors A survey"]
+ "Graph minors A survey"
+ ]
stoplist = set('for a of the and to in'.split())
texts = [
[word for word in document.lower().split() if word not in stoplist]
diff --git a/gensim/test/test_corpora_hashdictionary.py b/gensim/test/test_corpora_hashdictionary.py
index 6f314d65ee..808246dc59 100644
--- a/gensim/test/test_corpora_hashdictionary.py
+++ b/gensim/test/test_corpora_hashdictionary.py
@@ -36,7 +36,8 @@ def setUp(self):
['trees'],
['graph', 'trees'],
['graph', 'minors', 'trees'],
- ['graph', 'minors', 'survey']]
+ ['graph', 'minors', 'survey']
+ ]
def testDocFreqOneDoc(self):
texts = [['human', 'interface', 'computer']]
@@ -92,7 +93,7 @@ def testDebugMode(self):
# two words
texts = [['human', 'cat']]
d = HashDictionary(texts, debug=True, myhash=zlib.adler32)
- expected = {9273: set(['cat']), 31002: set(['human'])}
+ expected = {9273: {'cat'}, 31002: {'human'}}
self.assertEqual(d.id2token, expected)
# now the same thing, with debug off
@@ -105,8 +106,17 @@ def testRange(self):
# all words map to the same id
d = HashDictionary(self.texts, id_range=1, debug=True)
dfs = {0: 9}
- id2token = {0: set(['minors', 'graph', 'system', 'trees', 'eps', 'computer', 'survey', 'user', 'human', 'time', 'interface', 'response'])}
- token2id = {'minors': 0, 'graph': 0, 'system': 0, 'trees': 0, 'eps': 0, 'computer': 0, 'survey': 0, 'user': 0, 'human': 0, 'time': 0, 'interface': 0, 'response': 0}
+ id2token = {
+ 0: {
+ 'minors', 'graph', 'system', 'trees', 'eps', 'computer',
+ 'survey', 'user', 'human', 'time', 'interface', 'response'
+ }
+ }
+ token2id = {
+ 'minors': 0, 'graph': 0, 'system': 0, 'trees': 0,
+ 'eps': 0, 'computer': 0, 'survey': 0, 'user': 0,
+ 'human': 0, 'time': 0, 'interface': 0, 'response': 0
+ }
self.assertEqual(d.dfs, dfs)
self.assertEqual(d.id2token, id2token)
self.assertEqual(d.token2id, token2id)
@@ -114,29 +124,31 @@ def testRange(self):
# 2 ids: 0/1 for even/odd number of bytes in the word
d = HashDictionary(self.texts, id_range=2, myhash=lambda key: len(key))
dfs = {0: 7, 1: 7}
- id2token = {0: set(['minors', 'system', 'computer', 'survey', 'user', 'time', 'response']), 1: set(['interface', 'graph', 'trees', 'eps', 'human'])}
- token2id = {'minors': 0, 'graph': 1, 'system': 0, 'trees': 1, 'eps': 1, 'computer': 0, 'survey': 0, 'user': 0, 'human': 1, 'time': 0, 'interface': 1, 'response': 0}
+ id2token = {
+ 0: {'minors', 'system', 'computer', 'survey', 'user', 'time', 'response'},
+ 1: {'interface', 'graph', 'trees', 'eps', 'human'}
+ }
+ token2id = {
+ 'minors': 0, 'graph': 1, 'system': 0, 'trees': 1, 'eps': 1, 'computer': 0,
+ 'survey': 0, 'user': 0, 'human': 1, 'time': 0, 'interface': 1, 'response': 0
+ }
self.assertEqual(d.dfs, dfs)
self.assertEqual(d.id2token, id2token)
self.assertEqual(d.token2id, token2id)
def testBuild(self):
d = HashDictionary(self.texts, myhash=zlib.adler32)
- expected = {5232: 2,
- 5798: 3,
- 10608: 2,
- 12466: 2,
- 12736: 3,
- 15001: 2,
- 18451: 3,
- 23844: 3,
- 28591: 2,
- 29104: 2,
- 31002: 2,
- 31049: 2}
+ expected = {
+ 5232: 2, 5798: 3, 10608: 2, 12466: 2, 12736: 3, 15001: 2,
+ 18451: 3, 23844: 3, 28591: 2, 29104: 2, 31002: 2, 31049: 2
+ }
self.assertEqual(d.dfs, expected)
- expected = {'minors': 15001, 'graph': 18451, 'system': 5798, 'trees': 23844, 'eps': 31049, 'computer': 10608, 'survey': 28591, 'user': 12736, 'human': 31002, 'time': 29104, 'interface': 12466, 'response': 5232}
+ expected = {
+ 'minors': 15001, 'graph': 18451, 'system': 5798, 'trees': 23844,
+ 'eps': 31049, 'computer': 10608, 'survey': 28591, 'user': 12736,
+ 'human': 31002, 'time': 29104, 'interface': 12466, 'response': 5232
+ }
for ex in expected:
self.assertEqual(d.token2id[ex], expected[ex])
@@ -149,7 +161,10 @@ def testFilter(self):
d = HashDictionary(self.texts, myhash=zlib.adler32)
d.filter_extremes(no_below=0, no_above=0.3)
- expected = {29104: 2, 31049: 2, 28591: 2, 5232: 2, 10608: 2, 12466: 2, 15001: 2, 31002: 2}
+ expected = {
+ 29104: 2, 31049: 2, 28591: 2, 5232: 2,
+ 10608: 2, 12466: 2, 15001: 2, 31002: 2
+ }
self.assertEqual(d.dfs, expected)
d = HashDictionary(self.texts, myhash=zlib.adler32)
diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py
index ad4eb4c976..9a66c50b0a 100644
--- a/gensim/test/test_doc2vec.py
+++ b/gensim/test/test_doc2vec.py
@@ -229,20 +229,26 @@ def test_dbow_hs(self):
def test_dmm_hs(self):
"""Test DM/mean doc2vec training."""
- model = doc2vec.Doc2Vec(list_corpus, dm=1, dm_mean=1, size=24, window=4, hs=1, negative=0,
- alpha=0.05, min_count=2, iter=20)
+ model = doc2vec.Doc2Vec(
+ list_corpus, dm=1, dm_mean=1, size=24, window=4,
+ hs=1, negative=0, alpha=0.05, min_count=2, iter=20
+ )
self.model_sanity(model)
def test_dms_hs(self):
"""Test DM/sum doc2vec training."""
- model = doc2vec.Doc2Vec(list_corpus, dm=1, dm_mean=0, size=24, window=4, hs=1, negative=0,
- alpha=0.05, min_count=2, iter=20)
+ model = doc2vec.Doc2Vec(
+ list_corpus, dm=1, dm_mean=0, size=24, window=4, hs=1,
+ negative=0, alpha=0.05, min_count=2, iter=20
+ )
self.model_sanity(model)
def test_dmc_hs(self):
"""Test DM/concatenate doc2vec training."""
- model = doc2vec.Doc2Vec(list_corpus, dm=1, dm_concat=1, size=24, window=4, hs=1, negative=0,
- alpha=0.05, min_count=2, iter=20)
+ model = doc2vec.Doc2Vec(
+ list_corpus, dm=1, dm_concat=1, size=24, window=4,
+ hs=1, negative=0, alpha=0.05, min_count=2, iter=20
+ )
self.model_sanity(model)
def test_dbow_neg(self):
@@ -252,20 +258,26 @@ def test_dbow_neg(self):
def test_dmm_neg(self):
"""Test DM/mean doc2vec training."""
- model = doc2vec.Doc2Vec(list_corpus, dm=1, dm_mean=1, size=24, window=4, hs=0, negative=10,
- alpha=0.05, min_count=2, iter=20)
+ model = doc2vec.Doc2Vec(
+ list_corpus, dm=1, dm_mean=1, size=24, window=4, hs=0,
+ negative=10, alpha=0.05, min_count=2, iter=20
+ )
self.model_sanity(model)
def test_dms_neg(self):
"""Test DM/sum doc2vec training."""
- model = doc2vec.Doc2Vec(list_corpus, dm=1, dm_mean=0, size=24, window=4, hs=0, negative=10,
- alpha=0.05, min_count=2, iter=20)
+ model = doc2vec.Doc2Vec(
+ list_corpus, dm=1, dm_mean=0, size=24, window=4, hs=0,
+ negative=10, alpha=0.05, min_count=2, iter=20
+ )
self.model_sanity(model)
def test_dmc_neg(self):
"""Test DM/concatenate doc2vec training."""
- model = doc2vec.Doc2Vec(list_corpus, dm=1, dm_concat=1, size=24, window=4, hs=0, negative=10,
- alpha=0.05, min_count=2, iter=20)
+ model = doc2vec.Doc2Vec(
+ list_corpus, dm=1, dm_concat=1, size=24, window=4, hs=0,
+ negative=10, alpha=0.05, min_count=2, iter=20
+ )
self.model_sanity(model)
def test_parallel(self):
@@ -296,10 +308,14 @@ def test_deterministic_neg(self):
def test_deterministic_dmc(self):
"""Test doc2vec results identical with identical RNG seed."""
# bigger, dmc
- model = doc2vec.Doc2Vec(DocsLeeCorpus(), dm=1, dm_concat=1, size=24, window=4, hs=1, negative=3,
- seed=42, workers=1)
- model2 = doc2vec.Doc2Vec(DocsLeeCorpus(), dm=1, dm_concat=1, size=24, window=4, hs=1, negative=3,
- seed=42, workers=1)
+ model = doc2vec.Doc2Vec(
+ DocsLeeCorpus(), dm=1, dm_concat=1, size=24,
+ window=4, hs=1, negative=3, seed=42, workers=1
+ )
+ model2 = doc2vec.Doc2Vec(
+ DocsLeeCorpus(), dm=1, dm_concat=1, size=24,
+ window=4, hs=1, negative=3, seed=42, workers=1
+ )
self.models_equal(model, model2)
def test_mixed_tag_types(self):
@@ -341,12 +357,18 @@ def test_delete_temporary_training_data(self):
self.assertTrue(not hasattr(model, 'syn0_lockf'))
self.assertTrue(model.docvecs and not hasattr(model.docvecs, 'doctag_syn0'))
self.assertTrue(model.docvecs and not hasattr(model.docvecs, 'doctag_syn0_lockf'))
- model = doc2vec.Doc2Vec(list_corpus, dm=1, dm_mean=1, size=24, window=4, hs=1, negative=0, alpha=0.05, min_count=2, iter=20)
+ model = doc2vec.Doc2Vec(
+ list_corpus, dm=1, dm_mean=1, size=24, window=4, hs=1,
+ negative=0, alpha=0.05, min_count=2, iter=20
+ )
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
self.assertTrue(model.docvecs and hasattr(model.docvecs, 'doctag_syn0'))
self.assertTrue(hasattr(model, 'syn1'))
self.model_sanity(model, keep_training=False)
- model = doc2vec.Doc2Vec(list_corpus, dm=1, dm_mean=1, size=24, window=4, hs=0, negative=1, alpha=0.05, min_count=2, iter=20)
+ model = doc2vec.Doc2Vec(
+ list_corpus, dm=1, dm_mean=1, size=24, window=4, hs=0,
+ negative=1, alpha=0.05, min_count=2, iter=20
+ )
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
self.model_sanity(model, keep_training=False)
self.assertTrue(hasattr(model, 'syn1neg'))
@@ -436,11 +458,13 @@ def read_su_sentiment_rotten_tomatoes(dirname, lowercase=True):
http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
has been expanded. It's not too big, so compose entirely into memory.
"""
- logging.info("loading corpus from %s" % dirname)
+ logging.info("loading corpus from %s", dirname)
# many mangled chars in sentences (datasetSentences.txt)
- chars_sst_mangled = ['à', 'á', 'â', 'ã', 'æ', 'ç', 'è', 'é', 'í',
- 'í', 'ï', 'ñ', 'ó', 'ô', 'ö', 'û', 'ü']
+ chars_sst_mangled = [
+ 'à', 'á', 'â', 'ã', 'æ', 'ç', 'è', 'é', 'í',
+ 'í', 'ï', 'ñ', 'ó', 'ô', 'ö', 'û', 'ü'
+ ]
sentence_fixups = [(char.encode('utf-8').decode('latin1'), char) for char in chars_sst_mangled]
# more junk, and the replace necessary for sentence-phrase consistency
sentence_fixups.extend([
@@ -502,8 +526,10 @@ def read_su_sentiment_rotten_tomatoes(dirname, lowercase=True):
assert len([phrase for phrase in phrases if phrase.split == 'test']) == 2210 # 'test'
assert len([phrase for phrase in phrases if phrase.split == 'dev']) == 1100 # 'dev'
- logging.info("loaded corpus with %i sentences and %i phrases from %s",
- len(info_by_sentence), len(phrases), dirname)
+ logging.info(
+ "loaded corpus with %i sentences and %i phrases from %s",
+ len(info_by_sentence), len(phrases), dirname
+ )
return phrases
diff --git a/gensim/test/test_dtm.py b/gensim/test/test_dtm.py
index 81b48374ab..231bbb1932 100644
--- a/gensim/test/test_dtm.py
+++ b/gensim/test/test_dtm.py
@@ -39,7 +39,8 @@ def testDtm(self):
model = gensim.models.wrappers.DtmModel(
self.dtm_path, self.corpus, self.time_slices, num_topics=2,
id2word=self.id2word, model='dtm', initialize_lda=True,
- rng_seed=1)
+ rng_seed=1
+ )
topics = model.show_topics(num_topics=2, times=2, num_words=10)
self.assertEqual(len(topics), 4)
@@ -52,7 +53,8 @@ def testDim(self):
model = gensim.models.wrappers.DtmModel(
self.dtm_path, self.corpus, self.time_slices, num_topics=2,
id2word=self.id2word, model='fixed', initialize_lda=True,
- rng_seed=1)
+ rng_seed=1
+ )
topics = model.show_topics(num_topics=2, times=2, num_words=10)
self.assertEqual(len(topics), 4)
@@ -67,7 +69,8 @@ def testCalledProcessError(self):
gensim.models.wrappers.DtmModel(
self.dtm_path, self.corpus, self.time_slices, num_topics=2,
id2word=self.id2word, model='dtm', initialize_lda=False,
- rng_seed=1)
+ rng_seed=1
+ )
if __name__ == '__main__':
diff --git a/gensim/test/test_fasttext_wrapper.py b/gensim/test/test_fasttext_wrapper.py
index 6a7a7b09ed..3c00a8f35a 100644
--- a/gensim/test/test_fasttext_wrapper.py
+++ b/gensim/test/test_fasttext_wrapper.py
@@ -56,7 +56,8 @@ def testTraining(self):
return # Use self.skipTest once python < 2.7 is no longer supported
vocab_size, model_size = 1763, 10
trained_model = fasttext.FastText.train(
- self.ft_path, self.corpus_file, size=model_size, output_file=testfile())
+ self.ft_path, self.corpus_file, size=model_size, output_file=testfile()
+ )
self.assertEqual(trained_model.wv.syn0.shape, (vocab_size, model_size))
self.assertEqual(len(trained_model.wv.vocab), vocab_size)
@@ -72,11 +73,13 @@ def testMinCount(self):
logger.info("FT_HOME env variable not set, skipping test")
return # Use self.skipTest once python < 2.7 is no longer supported
test_model_min_count_5 = fasttext.FastText.train(
- self.ft_path, self.corpus_file, output_file=testfile(), size=10, min_count=5)
+ self.ft_path, self.corpus_file, output_file=testfile(), size=10, min_count=5
+ )
self.assertTrue('forests' not in test_model_min_count_5.wv.vocab)
test_model_min_count_1 = fasttext.FastText.train(
- self.ft_path, self.corpus_file, output_file=testfile(), size=10, min_count=1)
+ self.ft_path, self.corpus_file, output_file=testfile(), size=10, min_count=1
+ )
self.assertTrue('forests' in test_model_min_count_1.wv.vocab)
def testModelSize(self):
@@ -85,7 +88,8 @@ def testModelSize(self):
logger.info("FT_HOME env variable not set, skipping test")
return # Use self.skipTest once python < 2.7 is no longer supported
test_model_size_20 = fasttext.FastText.train(
- self.ft_path, self.corpus_file, output_file=testfile(), size=20)
+ self.ft_path, self.corpus_file, output_file=testfile(), size=20
+ )
self.assertEqual(test_model_size_20.vector_size, 20)
self.assertEqual(test_model_size_20.wv.syn0.shape[1], 20)
self.assertEqual(test_model_size_20.wv.syn0_all.shape[1], 20)
@@ -245,8 +249,10 @@ def testNSimilarity(self):
self.assertEqual(self.test_model.n_similarity(['the'], ['and']), self.test_model.n_similarity(['and'], ['the']))
# Out of vocab check
self.assertTrue(numpy.allclose(self.test_model.n_similarity(['night', 'nights'], ['nights', 'night']), 1.0))
- self.assertEqual(self.test_model.n_similarity(['night'], ['nights']),
- self.test_model.n_similarity(['nights'], ['night']))
+ self.assertEqual(
+ self.test_model.n_similarity(['night'], ['nights']),
+ self.test_model.n_similarity(['nights'], ['night'])
+ )
def testSimilarity(self):
"""Test similarity for in-vocab and out-of-vocab words"""
diff --git a/gensim/test/test_glove2word2vec.py b/gensim/test/test_glove2word2vec.py
index b638ca927d..2226bf9fd2 100644
--- a/gensim/test/test_glove2word2vec.py
+++ b/gensim/test/test_glove2word2vec.py
@@ -31,7 +31,10 @@ def setUp(self):
self.output_file = testfile()
def testConversion(self):
- output = check_output(args=['python', '-m', 'gensim.scripts.glove2word2vec', '--input', self.datapath, '--output', self.output_file]) # noqa:F841
+ check_output(args=[
+ 'python', '-m', 'gensim.scripts.glove2word2vec',
+ '--input', self.datapath, '--output', self.output_file
+ ])
# test that the converted model loads successfully
try:
self.test_model = gensim.models.KeyedVectors.load_word2vec_format(self.output_file)
@@ -40,7 +43,9 @@ def testConversion(self):
if os.path.isfile(os.path.join(self.output_file)):
self.fail('model file %s was created but could not be loaded.' % self.output_file)
else:
- self.fail('model file %s creation failed, check the parameters and input file format.' % self.output_file)
+ self.fail(
+ 'model file %s creation failed, check the parameters and input file format.' % self.output_file
+ )
if __name__ == '__main__':
diff --git a/gensim/test/test_hdpmodel.py b/gensim/test/test_hdpmodel.py
index 647b31ad7e..b3cf8bdde1 100644
--- a/gensim/test/test_hdpmodel.py
+++ b/gensim/test/test_hdpmodel.py
@@ -17,7 +17,7 @@
from gensim.corpora import mmcorpus, Dictionary
from gensim.models import hdpmodel
-from gensim.test import basetests
+from gensim.test import basetmtests
import numpy as np
@@ -26,15 +26,17 @@
# set up vars used in testing ("Deerwester" from the web tutorial)
-texts = [['human', 'interface', 'computer'],
- ['survey', 'user', 'computer', 'system', 'response', 'time'],
- ['eps', 'user', 'interface', 'system'],
- ['system', 'human', 'system', 'eps'],
- ['user', 'response', 'time'],
- ['trees'],
- ['graph', 'trees'],
- ['graph', 'minors', 'trees'],
- ['graph', 'minors', 'survey']]
+texts = [
+ ['human', 'interface', 'computer'],
+ ['survey', 'user', 'computer', 'system', 'response', 'time'],
+ ['eps', 'user', 'interface', 'system'],
+ ['system', 'human', 'system', 'eps'],
+ ['user', 'response', 'time'],
+ ['trees'],
+ ['graph', 'trees'],
+ ['graph', 'minors', 'trees'],
+ ['graph', 'minors', 'survey']
+]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
@@ -44,7 +46,7 @@ def testfile():
return os.path.join(tempfile.gettempdir(), 'gensim_models.tst')
-class TestHdpModel(unittest.TestCase, basetests.TestBaseTopicModel):
+class TestHdpModel(unittest.TestCase, basetmtests.TestBaseTopicModel):
def setUp(self):
self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
self.class_ = hdpmodel.HdpModel
diff --git a/gensim/test/test_keras_integration.py b/gensim/test/test_keras_integration.py
index 7f321c6565..996f1b7f7b 100644
--- a/gensim/test/test_keras_integration.py
+++ b/gensim/test/test_keras_integration.py
@@ -79,7 +79,10 @@ def testEmbeddingLayerCosineSim(self):
word_a = 'graph'
word_b = 'trees'
- output = model.predict([np.asarray([keras_w2v_model.wv.vocab[word_a].index]), np.asarray([keras_w2v_model.wv.vocab[word_b].index])])
+ output = model.predict([
+ np.asarray([keras_w2v_model.wv.vocab[word_a].index]),
+ np.asarray([keras_w2v_model.wv.vocab[word_b].index])
+ ])
# output is the cosine distance between the two words (as a similarity measure)
self.assertTrue(type(output[0][0][0]) == np.float32) # verify that a float is returned
@@ -113,7 +116,7 @@ def testEmbeddingLayer20NewsGroup(self):
texts_w2v.append(sentence.split(' '))
labels.append(label_id)
except Exception:
- None
+ pass
# Vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer()
@@ -128,11 +131,11 @@ def testEmbeddingLayer20NewsGroup(self):
y_train = labels
# prepare the embedding layer using the wrapper
- Keras_w2v = self.model_twenty_ng
- Keras_w2v.build_vocab(texts_w2v)
- Keras_w2v.train(texts, total_examples=Keras_w2v.corpus_count, epochs=Keras_w2v.iter)
- Keras_w2v_wv = Keras_w2v.wv
- embedding_layer = Keras_w2v_wv.get_embedding_layer()
+ keras_w2v = self.model_twenty_ng
+ keras_w2v.build_vocab(texts_w2v)
+ keras_w2v.train(texts, total_examples=keras_w2v.corpus_count, epochs=keras_w2v.iter)
+ keras_w2v_wv = keras_w2v.wv
+ embedding_layer = keras_w2v_wv.get_embedding_layer()
# create a 1D convnet to solve our classification task
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
diff --git a/gensim/test/test_keywords.py b/gensim/test/test_keywords.py
index 8a3be3af5c..76bd448d5c 100644
--- a/gensim/test/test_keywords.py
+++ b/gensim/test/test_keywords.py
@@ -35,7 +35,7 @@ def test_text_keywords(self):
with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kw.txt"), mode="r") as f:
kw = f.read().strip().split("\n")
- self.assertEqual(set(map(str, generated_keywords)), set(map(str, kw)))
+ self.assertEqual({str(x) for x in generated_keywords}, {str(x) for x in kw})
def test_text_keywords_words(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
@@ -55,13 +55,13 @@ def test_text_keywords_pos(self):
text = f.read()
# calculate keywords using only certain parts of speech
- generated_keywords_NNVBJJ = keywords(text, pos_filter=['NN', 'VB', 'JJ'], ratio=0.3, split=True)
+ generated_keywords_nnvbjj = keywords(text, pos_filter=['NN', 'VB', 'JJ'], ratio=0.3, split=True)
# To be compared to the reference.
with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kwpos.txt"), mode="r") as f:
kw = f.read().strip().split("\n")
- self.assertEqual(set(map(str, generated_keywords_NNVBJJ)), set(map(str, kw)))
+ self.assertEqual({str(x) for x in generated_keywords_nnvbjj}, {str(x) for x in kw})
def test_text_summarization_raises_exception_on_short_input_text(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
diff --git a/gensim/test/test_ldamallet_wrapper.py b/gensim/test/test_ldamallet_wrapper.py
index a7d64839bb..5ed4486e16 100644
--- a/gensim/test/test_ldamallet_wrapper.py
+++ b/gensim/test/test_ldamallet_wrapper.py
@@ -21,21 +21,23 @@
from gensim.models.wrappers import ldamallet
from gensim import matutils
from gensim.models import ldamodel
-from gensim.test import basetests
+from gensim.test import basetmtests
module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
# set up vars used in testing ("Deerwester" from the web tutorial)
-texts = [['human', 'interface', 'computer'],
- ['survey', 'user', 'computer', 'system', 'response', 'time'],
- ['eps', 'user', 'interface', 'system'],
- ['system', 'human', 'system', 'eps'],
- ['user', 'response', 'time'],
- ['trees'],
- ['graph', 'trees'],
- ['graph', 'minors', 'trees'],
- ['graph', 'minors', 'survey']]
+texts = [
+ ['human', 'interface', 'computer'],
+ ['survey', 'user', 'computer', 'system', 'response', 'time'],
+ ['eps', 'user', 'interface', 'system'],
+ ['system', 'human', 'system', 'eps'],
+ ['user', 'response', 'time'],
+ ['trees'],
+ ['graph', 'trees'],
+ ['graph', 'minors', 'trees'],
+ ['graph', 'minors', 'survey']
+]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
@@ -46,7 +48,7 @@ def testfile():
return os.path.join(tempfile.gettempdir(), 'gensim_models.tst')
-class TestLdaMallet(unittest.TestCase, basetests.TestBaseTopicModel):
+class TestLdaMallet(unittest.TestCase, basetmtests.TestBaseTopicModel):
def setUp(self):
mallet_home = os.environ.get('MALLET_HOME', None)
self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet') if mallet_home else None
@@ -72,8 +74,10 @@ def testTransform(self):
passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1) # must contain the same values, up to re-ordering
if passed:
break
- logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)" %
- (i, sorted(vec), sorted(expected)))
+ logging.warning(
+ "LDA failed to converge on attempt %i (got %s, expected %s)",
+ i, sorted(vec), sorted(expected)
+ )
self.assertTrue(passed)
def testSparseTransform(self):
@@ -82,7 +86,9 @@ def testSparseTransform(self):
passed = False
for i in range(5): # restart at most 5 times
# create the sparse transformation model with the appropriate topic_threshold
- model = ldamallet.LdaMallet(self.mallet_path, corpus, id2word=dictionary, num_topics=2, iterations=200, topic_threshold=0.5)
+ model = ldamallet.LdaMallet(
+ self.mallet_path, corpus, id2word=dictionary, num_topics=2, iterations=200, topic_threshold=0.5
+ )
# transform one document
doc = list(corpus)[0]
transformed = model[doc]
@@ -91,8 +97,10 @@ def testSparseTransform(self):
passed = np.allclose(sorted(vec), sorted(expected), atol=1e-2) # must contain the same values, up to re-ordering
if passed:
break
- logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)" %
- (i, sorted(vec), sorted(expected)))
+ logging.warning(
+ "LDA failed to converge on attempt %i (got %s, expected %s)",
+ i, sorted(vec), sorted(expected)
+ )
self.assertTrue(passed)
def testMallet2Model(self):
diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py
index 71cc10f70c..c1d35c2661 100644
--- a/gensim/test/test_ldamodel.py
+++ b/gensim/test/test_ldamodel.py
@@ -22,22 +22,24 @@
from gensim.corpora import mmcorpus, Dictionary
from gensim.models import ldamodel, ldamulticore
from gensim import matutils, utils
-from gensim.test import basetests
+from gensim.test import basetmtests
module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
# set up vars used in testing ("Deerwester" from the web tutorial)
-texts = [['human', 'interface', 'computer'],
- ['survey', 'user', 'computer', 'system', 'response', 'time'],
- ['eps', 'user', 'interface', 'system'],
- ['system', 'human', 'system', 'eps'],
- ['user', 'response', 'time'],
- ['trees'],
- ['graph', 'trees'],
- ['graph', 'minors', 'trees'],
- ['graph', 'minors', 'survey']]
+texts = [
+ ['human', 'interface', 'computer'],
+ ['survey', 'user', 'computer', 'system', 'response', 'time'],
+ ['eps', 'user', 'interface', 'system'],
+ ['system', 'human', 'system', 'eps'],
+ ['user', 'response', 'time'],
+ ['trees'],
+ ['graph', 'trees'],
+ ['graph', 'minors', 'trees'],
+ ['graph', 'minors', 'survey']
+]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
@@ -54,7 +56,7 @@ def testRandomState():
assert(isinstance(utils.get_random_state(testcase), np.random.RandomState))
-class TestLdaModel(unittest.TestCase, basetests.TestBaseTopicModel):
+class TestLdaModel(unittest.TestCase, basetmtests.TestBaseTopicModel):
def setUp(self):
self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
self.class_ = ldamodel.LdaModel
@@ -79,8 +81,7 @@ def testTransform(self):
passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1) # must contain the same values, up to re-ordering
if passed:
break
- logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)" %
- (i, sorted(vec), sorted(expected)))
+ logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)", i, sorted(vec), sorted(expected))
self.assertTrue(passed)
def testAlphaAuto(self):
@@ -231,7 +232,9 @@ def testGetTopicTerms(self):
def testGetDocumentTopics(self):
- model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes=100, random_state=np.random.seed(0))
+ model = self.class_(
+ self.corpus, id2word=dictionary, num_topics=2, passes=100, random_state=np.random.seed(0)
+ )
doc_topics = model.get_document_topics(self.corpus)
@@ -264,7 +267,9 @@ def testGetDocumentTopics(self):
doc_topic_count_na = 0
word_phi_count_na = 0
- all_topics = model.get_document_topics(self.corpus, minimum_probability=0.8, minimum_phi_value=1.0, per_word_topics=True)
+ all_topics = model.get_document_topics(
+ self.corpus, minimum_probability=0.8, minimum_phi_value=1.0, per_word_topics=True
+ )
self.assertEqual(model.state.numdocs, len(corpus))
@@ -313,7 +318,9 @@ def testGetDocumentTopics(self):
def testTermTopics(self):
- model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes=100, random_state=np.random.seed(0))
+ model = self.class_(
+ self.corpus, id2word=dictionary, num_topics=2, passes=100, random_state=np.random.seed(0)
+ )
# check with word_type
result = model.get_term_topics(2)
@@ -355,7 +362,7 @@ def testPasses(self):
for test_rhot in test_rhots:
model.update(self.corpus)
- msg = ", ".join(map(str, [passes, model.num_updates, model.state.numdocs]))
+ msg = ", ".join(str(x) for x in [passes, model.num_updates, model.state.numdocs])
self.assertAlmostEqual(final_rhot(), test_rhot, msg=msg)
self.assertEqual(model.state.numdocs, len(corpus) * len(test_rhots))
@@ -379,7 +386,7 @@ def testPasses(self):
# model = self.class_(id2word=dictionary, num_topics=2, passes=200, eta=eta)
# model.update(self.corpus)
- # topics = [dict((word, p) for p, word in model.show_topic(j, topn=None)) for j in range(2)]
+ # topics = [{word: p for p, word in model.show_topic(j, topn=None)} for j in range(2)]
# # check that the word 'system' in the topic we seeded got a high weight,
# # and the word 'trees' (the main word in the other topic) a low weight --
@@ -413,8 +420,8 @@ def testModelCompatibilityWithPythonVersions(self):
self.assertTrue(np.allclose(model_2_7.expElogbeta, model_3_5.expElogbeta))
tstvec = []
self.assertTrue(np.allclose(model_2_7[tstvec], model_3_5[tstvec])) # try projecting an empty vector
- id2word_2_7 = dict((k, v) for k, v in model_2_7.id2word.iteritems())
- id2word_3_5 = dict((k, v) for k, v in model_3_5.id2word.iteritems())
+ id2word_2_7 = dict(model_2_7.id2word.iteritems())
+ id2word_3_5 = dict(model_3_5.id2word.iteritems())
self.assertEqual(set(id2word_2_7.keys()), set(id2word_3_5.keys()))
def testPersistenceIgnore(self):
diff --git a/gensim/test/test_ldaseqmodel.py b/gensim/test/test_ldaseqmodel.py
index 4ca7b104fc..d38c01868c 100644
--- a/gensim/test/test_ldaseqmodel.py
+++ b/gensim/test/test_ldaseqmodel.py
@@ -20,25 +20,175 @@ class TestLdaSeq(unittest.TestCase):
# we are setting up a DTM model and fitting it, and checking topic-word and doc-topic results.
def setUp(self):
texts = [
- [u'senior', u'studios', u'studios', u'studios', u'creators', u'award', u'mobile', u'currently', u'challenges', u'senior', u'summary', u'senior', u'motivated', u'creative', u'senior'],
- [u'performs', u'engineering', u'tasks', u'infrastructure', u'focusing', u'primarily', u'programming', u'interaction', u'designers', u'engineers', u'leadership', u'teams', u'teams', u'crews', u'responsibilities', u'engineering', u'quality', u'functional', u'functional', u'teams', u'organizing', u'prioritizing', u'technical', u'decisions', u'engineering', u'participates', u'participates', u'reviews', u'participates', u'hiring', u'conducting', u'interviews'],
- [u'feedback', u'departments', u'define', u'focusing', u'engineering', u'teams', u'crews', u'facilitate', u'engineering', u'departments', u'deadlines', u'milestones', u'typically', u'spends', u'designing', u'developing', u'updating', u'bugs', u'mentoring', u'engineers', u'define', u'schedules', u'milestones', u'participating'],
- [u'reviews', u'interviews', u'sized', u'teams', u'interacts', u'disciplines', u'knowledge', u'skills', u'knowledge', u'knowledge', u'xcode', u'scripting', u'debugging', u'skills', u'skills', u'knowledge', u'disciplines', u'animation', u'networking', u'expertise', u'competencies', u'oral', u'skills', u'management', u'skills', u'proven', u'effectively', u'teams', u'deadline', u'environment', u'bachelor', u'minimum', u'shipped', u'leadership', u'teams', u'location', u'resumes', u'jobs', u'candidates', u'openings', u'jobs'],
- [u'maryland', u'client', u'producers', u'electricity', u'operates', u'storage', u'utility', u'retail', u'customers', u'engineering', u'consultant', u'maryland', u'summary', u'technical', u'technology', u'departments', u'expertise', u'maximizing', u'output', u'reduces', u'operating', u'participates', u'areas', u'engineering', u'conducts', u'testing', u'solve', u'supports', u'environmental', u'understands', u'objectives', u'operates', u'responsibilities', u'handles', u'complex', u'engineering', u'aspects', u'monitors', u'quality', u'proficiency', u'optimization', u'recommendations', u'supports', u'personnel', u'troubleshooting', u'commissioning', u'startup', u'shutdown', u'supports', u'procedure', u'operating', u'units', u'develops', u'simulations', u'troubleshooting', u'tests', u'enhancing', u'solving', u'develops', u'estimates', u'schedules', u'scopes', u'understands', u'technical', u'management', u'utilize', u'routine', u'conducts', u'hazards', u'utilizing', u'hazard', u'operability', u'methodologies', u'participates', u'startup', u'reviews', u'pssr', u'participate', u'teams', u'participate', u'regulatory', u'audits', u'define', u'scopes', u'budgets', u'schedules', u'technical', u'management', u'environmental', u'awareness', u'interfacing', u'personnel', u'interacts', u'regulatory', u'departments', u'input', u'objectives', u'identifying', u'introducing', u'concepts', u'solutions', u'peers', u'customers', u'coworkers', u'knowledge', u'skills', u'engineering', u'quality', u'engineering'],
- [u'commissioning', u'startup', u'knowledge', u'simulators', u'technologies', u'knowledge', u'engineering', u'techniques', u'disciplines', u'leadership', u'skills', u'proven', u'engineers', u'oral', u'skills', u'technical', u'skills', u'analytically', u'solve', u'complex', u'interpret', u'proficiency', u'simulation', u'knowledge', u'applications', u'manipulate', u'applications', u'engineering'],
- [u'calculations', u'programs', u'matlab', u'excel', u'independently', u'environment', u'proven', u'skills', u'effectively', u'multiple', u'tasks', u'planning', u'organizational', u'management', u'skills', u'rigzone', u'jobs', u'developer', u'exceptional', u'strategies', u'junction', u'exceptional', u'strategies', u'solutions', u'solutions', u'biggest', u'insurers', u'operates', u'investment'],
- [u'vegas', u'tasks', u'electrical', u'contracting', u'expertise', u'virtually', u'electrical', u'developments', u'institutional', u'utilities', u'technical', u'experts', u'relationships', u'credibility', u'contractors', u'utility', u'customers', u'customer', u'relationships', u'consistently', u'innovations', u'profile', u'construct', u'envision', u'dynamic', u'complex', u'electrical', u'management', u'grad', u'internship', u'electrical', u'engineering', u'infrastructures', u'engineers', u'documented', u'management', u'engineering', u'quality', u'engineering', u'electrical', u'engineers', u'complex', u'distribution', u'grounding', u'estimation', u'testing', u'procedures', u'voltage', u'engineering'],
- [u'troubleshooting', u'installation', u'documentation', u'bsee', u'certification', u'electrical', u'voltage', u'cabling', u'electrical', u'engineering', u'candidates', u'electrical', u'internships', u'oral', u'skills', u'organizational', u'prioritization', u'skills', u'skills', u'excel', u'cadd', u'calculation', u'autocad', u'mathcad', u'skills', u'skills', u'customer', u'relationships', u'solving', u'ethic', u'motivation', u'tasks', u'budget', u'affirmative', u'diversity', u'workforce', u'gender', u'orientation', u'disability', u'disabled', u'veteran', u'vietnam', u'veteran', u'qualifying', u'veteran', u'diverse', u'candidates', u'respond', u'developing', u'workplace', u'reflects', u'diversity', u'communities', u'reviews', u'electrical', u'contracting', u'southwest', u'electrical', u'contractors'],
- [u'intern', u'electrical', u'engineering', u'idexx', u'laboratories', u'validating', u'idexx', u'integrated', u'hardware', u'entails', u'planning', u'debug', u'validation', u'engineers', u'validation', u'methodologies', u'healthcare', u'platforms', u'brightest', u'solve', u'challenges', u'innovation', u'technology', u'idexx', u'intern', u'idexx', u'interns', u'supplement', u'interns', u'teams', u'roles', u'competitive', u'interns', u'idexx', u'interns', u'participate', u'internships', u'mentors', u'seminars', u'topics', u'leadership', u'workshops', u'relevant', u'planning', u'topics', u'intern', u'presentations', u'mixers', u'applicants', u'ineligible', u'laboratory', u'compliant', u'idexx', u'laboratories', u'healthcare', u'innovation', u'practicing', u'veterinarians', u'diagnostic', u'technology', u'idexx', u'enhance', u'veterinarians', u'efficiency', u'economically', u'idexx', u'worldwide', u'diagnostic', u'tests', u'tests', u'quality', u'headquartered', u'idexx', u'laboratories', u'employs', u'customers', u'qualifications', u'applicants', u'idexx', u'interns', u'potential', u'demonstrated', u'portfolio', u'recommendation', u'resumes', u'marketing', u'location', u'americas', u'verification', u'validation', u'schedule', u'overtime', u'idexx', u'laboratories', u'reviews', u'idexx', u'laboratories', u'nasdaq', u'healthcare', u'innovation', u'practicing', u'veterinarians'],
- [u'location', u'duration', u'temp', u'verification', u'validation', u'tester', u'verification', u'validation', u'middleware', u'specifically', u'testing', u'applications', u'clinical', u'laboratory', u'regulated', u'environment', u'responsibilities', u'complex', u'hardware', u'testing', u'clinical', u'analyzers', u'laboratory', u'graphical', u'interfaces', u'complex', u'sample', u'sequencing', u'protocols', u'developers', u'correction', u'tracking', u'tool', u'timely', u'troubleshoot', u'testing', u'functional', u'manual', u'automated', u'participate', u'ongoing'],
- [u'testing', u'coverage', u'planning', u'documentation', u'testing', u'validation', u'corrections', u'monitor', u'implementation', u'recurrence', u'operating', u'statistical', u'quality', u'testing', u'global', u'multi', u'teams', u'travel', u'skills', u'concepts', u'waterfall', u'agile', u'methodologies', u'debugging', u'skills', u'complex', u'automated', u'instrumentation', u'environment', u'hardware', u'mechanical', u'components', u'tracking', u'lifecycle', u'management', u'quality', u'organize', u'define', u'priorities', u'organize', u'supervision', u'aggressive', u'deadlines', u'ambiguity', u'analyze', u'complex', u'situations', u'concepts', u'technologies', u'verbal', u'skills', u'effectively', u'technical', u'clinical', u'diverse', u'strategy', u'clinical', u'chemistry', u'analyzer', u'laboratory', u'middleware', u'basic', u'automated', u'testing', u'biomedical', u'engineering', u'technologists', u'laboratory', u'technology', u'availability', u'click', u'attach'],
- [u'scientist', u'linux', u'asrc', u'scientist', u'linux', u'asrc', u'technology', u'solutions', u'subsidiary', u'asrc', u'engineering', u'technology', u'contracts'],
- [u'multiple', u'agencies', u'scientists', u'engineers', u'management', u'personnel', u'allows', u'solutions', u'complex', u'aeronautics', u'aviation', u'management', u'aviation', u'engineering', u'hughes', u'technical', u'technical', u'aviation', u'evaluation', u'engineering', u'management', u'technical', u'terminal', u'surveillance', u'programs', u'currently', u'scientist', u'travel', u'responsibilities', u'develops', u'technology', u'modifies', u'technical', u'complex', u'reviews', u'draft', u'conformity', u'completeness', u'testing', u'interface', u'hardware', u'regression', u'impact', u'reliability', u'maintainability', u'factors', u'standardization', u'skills', u'travel', u'programming', u'linux', u'environment', u'cisco', u'knowledge', u'terminal', u'environment', u'clearance', u'clearance', u'input', u'output', u'digital', u'automatic', u'terminal', u'management', u'controller', u'termination', u'testing', u'evaluating', u'policies', u'procedure', u'interface', u'installation', u'verification', u'certification', u'core', u'avionic', u'programs', u'knowledge', u'procedural', u'testing', u'interfacing', u'hardware', u'regression', u'impact', u'reliability', u'maintainability', u'factors', u'standardization', u'missions', u'asrc', u'subsidiaries', u'affirmative', u'employers', u'applicants', u'disability', u'veteran', u'technology', u'location', u'airport', u'bachelor', u'schedule', u'travel', u'contributor', u'management', u'asrc', u'reviews'],
- [u'technical', u'solarcity', u'niche', u'vegas', u'overview', u'resolving', u'customer', u'clients', u'expanding', u'engineers', u'developers', u'responsibilities', u'knowledge', u'planning', u'adapt', u'dynamic', u'environment', u'inventive', u'creative', u'solarcity', u'lifecycle', u'responsibilities', u'technical', u'analyzing', u'diagnosing', u'troubleshooting', u'customers', u'ticketing', u'console', u'escalate', u'knowledge', u'engineering', u'timely', u'basic', u'phone', u'functionality', u'customer', u'tracking', u'knowledgebase', u'rotation', u'configure', u'deployment', u'sccm', u'technical', u'deployment', u'deploy', u'hardware', u'solarcity', u'bachelor', u'knowledge', u'dell', u'laptops', u'analytical', u'troubleshooting', u'solving', u'skills', u'knowledge', u'databases', u'preferably', u'server', u'preferably', u'monitoring', u'suites', u'documentation', u'procedures', u'knowledge', u'entries', u'verbal', u'skills', u'customer', u'skills', u'competitive', u'solar', u'package', u'insurance', u'vacation', u'savings', u'referral', u'eligibility', u'equity', u'performers', u'solarcity', u'affirmative', u'diversity', u'workplace', u'applicants', u'orientation', u'disability', u'veteran', u'careerrookie'],
- [u'embedded', u'exelis', u'junction', u'exelis', u'embedded', u'acquisition', u'networking', u'capabilities', u'classified', u'customer', u'motivated', u'develops', u'tests', u'innovative', u'solutions', u'minimal', u'supervision', u'paced', u'environment', u'enjoys', u'assignments', u'interact', u'multi', u'disciplined', u'challenging', u'focused', u'embedded', u'developments', u'spanning', u'engineering', u'lifecycle', u'specification', u'enhancement', u'applications', u'embedded', u'freescale', u'applications', u'android', u'platforms', u'interface', u'customers', u'developers', u'refine', u'specifications', u'architectures'],
- [u'java', u'programming', u'scripts', u'python', u'debug', u'debugging', u'emulators', u'regression', u'revisions', u'specialized', u'setups', u'capabilities', u'subversion', u'technical', u'documentation', u'multiple', u'engineering', u'techexpousa', u'reviews'],
- [u'modeler', u'semantic', u'modeling', u'models', u'skills', u'ontology', u'resource', u'framework', u'schema', u'technologies', u'hadoop', u'warehouse', u'oracle', u'relational', u'artifacts', u'models', u'dictionaries', u'models', u'interface', u'specifications', u'documentation', u'harmonization', u'mappings', u'aligned', u'coordinate', u'technical', u'peer', u'reviews', u'stakeholder', u'communities', u'impact', u'domains', u'relationships', u'interdependencies', u'models', u'define', u'analyze', u'legacy', u'models', u'corporate', u'databases', u'architectural', u'alignment', u'customer', u'expertise', u'harmonization', u'modeling', u'modeling', u'consulting', u'stakeholders', u'quality', u'models', u'storage', u'agile', u'specifically', u'focus', u'modeling', u'qualifications', u'bachelors', u'accredited', u'modeler', u'encompass', u'evaluation', u'skills', u'knowledge', u'modeling', u'techniques', u'resource', u'framework', u'schema', u'technologies', u'unified', u'modeling', u'technologies', u'schemas', u'ontologies', u'sybase', u'knowledge', u'skills', u'interpersonal', u'skills', u'customers', u'clearance', u'applicants', u'eligibility', u'classified', u'clearance', u'polygraph', u'techexpousa', u'solutions', u'partnership', u'solutions', u'integration'],
- [u'technologies', u'junction', u'develops', u'maintains', u'enhances', u'complex', u'diverse', u'intensive', u'analytics', u'algorithm', u'manipulation', u'management', u'documented', u'individually', u'reviews', u'tests', u'components', u'adherence', u'resolves', u'utilizes', u'methodologies', u'environment', u'input', u'components', u'hardware', u'offs', u'reuse', u'cots', u'gots', u'synthesis', u'components', u'tasks', u'individually', u'analyzes', u'modifies', u'debugs', u'corrects', u'integrates', u'operating', u'environments', u'develops', u'queries', u'databases', u'repositories', u'recommendations', u'improving', u'documentation', u'develops', u'implements', u'algorithms', u'functional', u'assists', u'developing', u'executing', u'procedures', u'components', u'reviews', u'documentation', u'solutions', u'analyzing', u'conferring', u'users', u'engineers', u'analyzing', u'investigating', u'areas', u'adapt', u'hardware', u'mathematical', u'models', u'predict', u'outcome', u'implement', u'complex', u'database', u'repository', u'interfaces', u'queries', u'bachelors', u'accredited', u'substituted', u'bachelors', u'firewalls', u'ipsec', u'vpns', u'technology', u'administering', u'servers', u'apache', u'jboss', u'tomcat', u'developing', u'interfaces', u'firefox', u'internet', u'explorer', u'operating', u'mainframe', u'linux', u'solaris', u'virtual', u'scripting', u'programming', u'oriented', u'programming', u'ajax', u'script', u'procedures', u'cobol', u'cognos', u'fusion', u'focus', u'html', u'java', u'java', u'script', u'jquery', u'perl', u'visual', u'basic', u'powershell', u'cots', u'cots', u'oracle', u'apex', u'integration', u'competitive', u'package', u'bonus', u'corporate', u'equity', u'tuition', u'reimbursement', u'referral', u'bonus', u'holidays', u'insurance', u'flexible', u'disability', u'insurance'],
+ [u'senior', u'studios', u'studios', u'studios', u'creators', u'award', u'mobile', u'currently',
+ u'challenges', u'senior', u'summary', u'senior', u'motivated', u'creative', u'senior'],
+ [u'performs', u'engineering', u'tasks', u'infrastructure', u'focusing', u'primarily',
+ u'programming', u'interaction', u'designers', u'engineers', u'leadership', u'teams',
+ u'teams', u'crews', u'responsibilities', u'engineering', u'quality', u'functional',
+ u'functional', u'teams', u'organizing', u'prioritizing', u'technical', u'decisions',
+ u'engineering', u'participates', u'participates', u'reviews', u'participates',
+ u'hiring', u'conducting', u'interviews'],
+ [u'feedback', u'departments', u'define', u'focusing', u'engineering', u'teams', u'crews',
+ u'facilitate', u'engineering', u'departments', u'deadlines', u'milestones', u'typically',
+ u'spends', u'designing', u'developing', u'updating', u'bugs', u'mentoring', u'engineers',
+ u'define', u'schedules', u'milestones', u'participating'],
+ [u'reviews', u'interviews', u'sized', u'teams', u'interacts', u'disciplines', u'knowledge',
+ u'skills', u'knowledge', u'knowledge', u'xcode', u'scripting', u'debugging', u'skills',
+ u'skills', u'knowledge', u'disciplines', u'animation', u'networking', u'expertise',
+ u'competencies', u'oral', u'skills', u'management', u'skills', u'proven', u'effectively',
+ u'teams', u'deadline', u'environment', u'bachelor', u'minimum', u'shipped', u'leadership',
+ u'teams', u'location', u'resumes', u'jobs', u'candidates', u'openings', u'jobs'],
+ [u'maryland', u'client', u'producers', u'electricity', u'operates', u'storage', u'utility',
+ u'retail', u'customers', u'engineering', u'consultant', u'maryland', u'summary', u'technical',
+ u'technology', u'departments', u'expertise', u'maximizing', u'output', u'reduces', u'operating',
+ u'participates', u'areas', u'engineering', u'conducts', u'testing', u'solve', u'supports',
+ u'environmental', u'understands', u'objectives', u'operates', u'responsibilities', u'handles',
+ u'complex', u'engineering', u'aspects', u'monitors', u'quality', u'proficiency', u'optimization',
+ u'recommendations', u'supports', u'personnel', u'troubleshooting', u'commissioning', u'startup',
+ u'shutdown', u'supports', u'procedure', u'operating', u'units', u'develops', u'simulations',
+ u'troubleshooting', u'tests', u'enhancing', u'solving', u'develops', u'estimates', u'schedules',
+ u'scopes', u'understands', u'technical', u'management', u'utilize', u'routine', u'conducts',
+ u'hazards', u'utilizing', u'hazard', u'operability', u'methodologies', u'participates', u'startup',
+ u'reviews', u'pssr', u'participate', u'teams', u'participate', u'regulatory', u'audits', u'define',
+ u'scopes', u'budgets', u'schedules', u'technical', u'management', u'environmental', u'awareness',
+ u'interfacing', u'personnel', u'interacts', u'regulatory', u'departments', u'input', u'objectives',
+ u'identifying', u'introducing', u'concepts', u'solutions', u'peers', u'customers', u'coworkers',
+ u'knowledge', u'skills', u'engineering', u'quality', u'engineering'],
+ [u'commissioning', u'startup', u'knowledge', u'simulators', u'technologies', u'knowledge',
+ u'engineering', u'techniques', u'disciplines', u'leadership', u'skills', u'proven',
+ u'engineers', u'oral', u'skills', u'technical', u'skills', u'analytically', u'solve',
+ u'complex', u'interpret', u'proficiency', u'simulation', u'knowledge', u'applications',
+ u'manipulate', u'applications', u'engineering'],
+ [u'calculations', u'programs', u'matlab', u'excel', u'independently', u'environment',
+ u'proven', u'skills', u'effectively', u'multiple', u'tasks', u'planning', u'organizational',
+ u'management', u'skills', u'rigzone', u'jobs', u'developer', u'exceptional', u'strategies',
+ u'junction', u'exceptional', u'strategies', u'solutions', u'solutions', u'biggest',
+ u'insurers', u'operates', u'investment'],
+ [u'vegas', u'tasks', u'electrical', u'contracting', u'expertise', u'virtually', u'electrical',
+ u'developments', u'institutional', u'utilities', u'technical', u'experts', u'relationships',
+ u'credibility', u'contractors', u'utility', u'customers', u'customer', u'relationships',
+ u'consistently', u'innovations', u'profile', u'construct', u'envision', u'dynamic', u'complex',
+ u'electrical', u'management', u'grad', u'internship', u'electrical', u'engineering',
+ u'infrastructures', u'engineers', u'documented', u'management', u'engineering',
+ u'quality', u'engineering', u'electrical', u'engineers', u'complex', u'distribution',
+ u'grounding', u'estimation', u'testing', u'procedures', u'voltage', u'engineering'],
+ [u'troubleshooting', u'installation', u'documentation', u'bsee', u'certification',
+ u'electrical', u'voltage', u'cabling', u'electrical', u'engineering', u'candidates',
+ u'electrical', u'internships', u'oral', u'skills', u'organizational', u'prioritization',
+ u'skills', u'skills', u'excel', u'cadd', u'calculation', u'autocad', u'mathcad',
+ u'skills', u'skills', u'customer', u'relationships', u'solving', u'ethic', u'motivation',
+ u'tasks', u'budget', u'affirmative', u'diversity', u'workforce', u'gender', u'orientation',
+ u'disability', u'disabled', u'veteran', u'vietnam', u'veteran', u'qualifying', u'veteran',
+ u'diverse', u'candidates', u'respond', u'developing', u'workplace', u'reflects', u'diversity',
+ u'communities', u'reviews', u'electrical', u'contracting', u'southwest', u'electrical', u'contractors'],
+ [u'intern', u'electrical', u'engineering', u'idexx', u'laboratories', u'validating', u'idexx',
+ u'integrated', u'hardware', u'entails', u'planning', u'debug', u'validation', u'engineers',
+ u'validation', u'methodologies', u'healthcare', u'platforms', u'brightest', u'solve',
+ u'challenges', u'innovation', u'technology', u'idexx', u'intern', u'idexx', u'interns',
+ u'supplement', u'interns', u'teams', u'roles', u'competitive', u'interns', u'idexx',
+ u'interns', u'participate', u'internships', u'mentors', u'seminars', u'topics', u'leadership',
+ u'workshops', u'relevant', u'planning', u'topics', u'intern', u'presentations', u'mixers',
+ u'applicants', u'ineligible', u'laboratory', u'compliant', u'idexx', u'laboratories', u'healthcare',
+ u'innovation', u'practicing', u'veterinarians', u'diagnostic', u'technology', u'idexx', u'enhance',
+ u'veterinarians', u'efficiency', u'economically', u'idexx', u'worldwide', u'diagnostic', u'tests',
+ u'tests', u'quality', u'headquartered', u'idexx', u'laboratories', u'employs', u'customers',
+ u'qualifications', u'applicants', u'idexx', u'interns', u'potential', u'demonstrated', u'portfolio',
+ u'recommendation', u'resumes', u'marketing', u'location', u'americas', u'verification', u'validation',
+ u'schedule', u'overtime', u'idexx', u'laboratories', u'reviews', u'idexx', u'laboratories',
+ u'nasdaq', u'healthcare', u'innovation', u'practicing', u'veterinarians'],
+ [u'location', u'duration', u'temp', u'verification', u'validation', u'tester', u'verification',
+ u'validation', u'middleware', u'specifically', u'testing', u'applications', u'clinical',
+ u'laboratory', u'regulated', u'environment', u'responsibilities', u'complex', u'hardware',
+ u'testing', u'clinical', u'analyzers', u'laboratory', u'graphical', u'interfaces', u'complex',
+ u'sample', u'sequencing', u'protocols', u'developers', u'correction', u'tracking',
+ u'tool', u'timely', u'troubleshoot', u'testing', u'functional', u'manual',
+ u'automated', u'participate', u'ongoing'],
+ [u'testing', u'coverage', u'planning', u'documentation', u'testing', u'validation',
+ u'corrections', u'monitor', u'implementation', u'recurrence', u'operating', u'statistical',
+ u'quality', u'testing', u'global', u'multi', u'teams', u'travel', u'skills', u'concepts',
+ u'waterfall', u'agile', u'methodologies', u'debugging', u'skills', u'complex', u'automated',
+ u'instrumentation', u'environment', u'hardware', u'mechanical', u'components', u'tracking',
+ u'lifecycle', u'management', u'quality', u'organize', u'define', u'priorities', u'organize',
+ u'supervision', u'aggressive', u'deadlines', u'ambiguity', u'analyze', u'complex', u'situations',
+ u'concepts', u'technologies', u'verbal', u'skills', u'effectively', u'technical', u'clinical',
+ u'diverse', u'strategy', u'clinical', u'chemistry', u'analyzer', u'laboratory', u'middleware',
+ u'basic', u'automated', u'testing', u'biomedical', u'engineering', u'technologists',
+ u'laboratory', u'technology', u'availability', u'click', u'attach'],
+ [u'scientist', u'linux', u'asrc', u'scientist', u'linux', u'asrc', u'technology',
+ u'solutions', u'subsidiary', u'asrc', u'engineering', u'technology', u'contracts'],
+ [u'multiple', u'agencies', u'scientists', u'engineers', u'management', u'personnel',
+ u'allows', u'solutions', u'complex', u'aeronautics', u'aviation', u'management', u'aviation',
+ u'engineering', u'hughes', u'technical', u'technical', u'aviation', u'evaluation',
+ u'engineering', u'management', u'technical', u'terminal', u'surveillance', u'programs',
+ u'currently', u'scientist', u'travel', u'responsibilities', u'develops', u'technology',
+ u'modifies', u'technical', u'complex', u'reviews', u'draft', u'conformity', u'completeness',
+ u'testing', u'interface', u'hardware', u'regression', u'impact', u'reliability',
+ u'maintainability', u'factors', u'standardization', u'skills', u'travel', u'programming',
+ u'linux', u'environment', u'cisco', u'knowledge', u'terminal', u'environment', u'clearance',
+ u'clearance', u'input', u'output', u'digital', u'automatic', u'terminal', u'management',
+ u'controller', u'termination', u'testing', u'evaluating', u'policies', u'procedure', u'interface',
+ u'installation', u'verification', u'certification', u'core', u'avionic', u'programs', u'knowledge',
+ u'procedural', u'testing', u'interfacing', u'hardware', u'regression', u'impact',
+ u'reliability', u'maintainability', u'factors', u'standardization', u'missions', u'asrc', u'subsidiaries',
+ u'affirmative', u'employers', u'applicants', u'disability', u'veteran', u'technology', u'location',
+ u'airport', u'bachelor', u'schedule', u'travel', u'contributor', u'management', u'asrc', u'reviews'],
+ [u'technical', u'solarcity', u'niche', u'vegas', u'overview', u'resolving', u'customer',
+ u'clients', u'expanding', u'engineers', u'developers', u'responsibilities', u'knowledge',
+ u'planning', u'adapt', u'dynamic', u'environment', u'inventive', u'creative', u'solarcity',
+ u'lifecycle', u'responsibilities', u'technical', u'analyzing', u'diagnosing', u'troubleshooting',
+ u'customers', u'ticketing', u'console', u'escalate', u'knowledge', u'engineering', u'timely',
+ u'basic', u'phone', u'functionality', u'customer', u'tracking', u'knowledgebase', u'rotation',
+ u'configure', u'deployment', u'sccm', u'technical', u'deployment', u'deploy', u'hardware',
+ u'solarcity', u'bachelor', u'knowledge', u'dell', u'laptops', u'analytical', u'troubleshooting',
+ u'solving', u'skills', u'knowledge', u'databases', u'preferably', u'server', u'preferably',
+ u'monitoring', u'suites', u'documentation', u'procedures', u'knowledge', u'entries', u'verbal',
+ u'skills', u'customer', u'skills', u'competitive', u'solar', u'package', u'insurance', u'vacation',
+ u'savings', u'referral', u'eligibility', u'equity', u'performers', u'solarcity', u'affirmative',
+ u'diversity', u'workplace', u'applicants', u'orientation', u'disability', u'veteran', u'careerrookie'],
+ [u'embedded', u'exelis', u'junction', u'exelis', u'embedded', u'acquisition', u'networking',
+ u'capabilities', u'classified', u'customer', u'motivated', u'develops', u'tests',
+ u'innovative', u'solutions', u'minimal', u'supervision', u'paced', u'environment', u'enjoys',
+ u'assignments', u'interact', u'multi', u'disciplined', u'challenging', u'focused', u'embedded',
+ u'developments', u'spanning', u'engineering', u'lifecycle', u'specification', u'enhancement',
+ u'applications', u'embedded', u'freescale', u'applications', u'android', u'platforms',
+ u'interface', u'customers', u'developers', u'refine', u'specifications', u'architectures'],
+ [u'java', u'programming', u'scripts', u'python', u'debug', u'debugging', u'emulators',
+ u'regression', u'revisions', u'specialized', u'setups', u'capabilities', u'subversion',
+ u'technical', u'documentation', u'multiple', u'engineering', u'techexpousa', u'reviews'],
+ [u'modeler', u'semantic', u'modeling', u'models', u'skills', u'ontology', u'resource',
+ u'framework', u'schema', u'technologies', u'hadoop', u'warehouse', u'oracle', u'relational',
+ u'artifacts', u'models', u'dictionaries', u'models', u'interface', u'specifications',
+ u'documentation', u'harmonization', u'mappings', u'aligned', u'coordinate', u'technical',
+ u'peer', u'reviews', u'stakeholder', u'communities', u'impact', u'domains', u'relationships',
+ u'interdependencies', u'models', u'define', u'analyze', u'legacy', u'models', u'corporate',
+ u'databases', u'architectural', u'alignment', u'customer', u'expertise', u'harmonization',
+ u'modeling', u'modeling', u'consulting', u'stakeholders', u'quality', u'models', u'storage',
+ u'agile', u'specifically', u'focus', u'modeling', u'qualifications', u'bachelors', u'accredited',
+ u'modeler', u'encompass', u'evaluation', u'skills', u'knowledge', u'modeling', u'techniques',
+ u'resource', u'framework', u'schema', u'technologies', u'unified', u'modeling', u'technologies',
+ u'schemas', u'ontologies', u'sybase', u'knowledge', u'skills', u'interpersonal', u'skills',
+ u'customers', u'clearance', u'applicants', u'eligibility', u'classified', u'clearance',
+ u'polygraph', u'techexpousa', u'solutions', u'partnership', u'solutions', u'integration'],
+ [u'technologies', u'junction', u'develops', u'maintains', u'enhances', u'complex', u'diverse',
+ u'intensive', u'analytics', u'algorithm', u'manipulation', u'management', u'documented',
+ u'individually', u'reviews', u'tests', u'components', u'adherence', u'resolves', u'utilizes',
+ u'methodologies', u'environment', u'input', u'components', u'hardware', u'offs', u'reuse', u'cots',
+ u'gots', u'synthesis', u'components', u'tasks', u'individually', u'analyzes', u'modifies',
+ u'debugs', u'corrects', u'integrates', u'operating', u'environments', u'develops', u'queries',
+ u'databases', u'repositories', u'recommendations', u'improving', u'documentation', u'develops',
+ u'implements', u'algorithms', u'functional', u'assists', u'developing', u'executing', u'procedures',
+ u'components', u'reviews', u'documentation', u'solutions', u'analyzing', u'conferring',
+ u'users', u'engineers', u'analyzing', u'investigating', u'areas', u'adapt', u'hardware',
+ u'mathematical', u'models', u'predict', u'outcome', u'implement', u'complex', u'database',
+ u'repository', u'interfaces', u'queries', u'bachelors', u'accredited', u'substituted',
+ u'bachelors', u'firewalls', u'ipsec', u'vpns', u'technology', u'administering', u'servers',
+ u'apache', u'jboss', u'tomcat', u'developing', u'interfaces', u'firefox', u'internet',
+ u'explorer', u'operating', u'mainframe', u'linux', u'solaris', u'virtual', u'scripting',
+ u'programming', u'oriented', u'programming', u'ajax', u'script', u'procedures', u'cobol',
+ u'cognos', u'fusion', u'focus', u'html', u'java', u'java', u'script', u'jquery', u'perl',
+ u'visual', u'basic', u'powershell', u'cots', u'cots', u'oracle', u'apex', u'integration',
+ u'competitive', u'package', u'bonus', u'corporate', u'equity', u'tuition', u'reimbursement',
+ u'referral', u'bonus', u'holidays', u'insurance', u'flexible', u'disability', u'insurance'],
[u'technologies', u'disability', u'accommodation', u'recruiter', u'techexpousa'],
['bank', 'river', 'shore', 'water'],
['river', 'water', 'flow', 'fast', 'tree'],
@@ -56,7 +206,10 @@ def setUp(self):
sstats = np.loadtxt(datapath('sstats_test.txt'))
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
- self.ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus, id2word=dictionary, num_topics=2, time_slice=[10, 10, 11], initialize='own', sstats=sstats)
+ self.ldaseq = ldaseqmodel.LdaSeqModel(
+ corpus=corpus, id2word=dictionary, num_topics=2,
+ time_slice=[10, 10, 11], initialize='own', sstats=sstats
+ )
# testing topic word proportions
def testTopicWord(self):
diff --git a/gensim/test/test_ldavowpalwabbit_wrapper.py b/gensim/test/test_ldavowpalwabbit_wrapper.py
index 3cf6f9f6bb..d14723de59 100644
--- a/gensim/test/test_ldavowpalwabbit_wrapper.py
+++ b/gensim/test/test_ldavowpalwabbit_wrapper.py
@@ -33,11 +33,11 @@
# set up vars used in testing ("Deerwester" from the web tutorial)
TOPIC_WORDS = [
-'cat lion leopard mouse jaguar lynx cheetah tiger kitten puppy'.split(),
-'engine car wheel brakes tyre motor suspension cylinder exhaust clutch'.split(),
-'alice bob robert tim sue rachel dave harry alex jim'.split(),
-'c cplusplus go python haskell scala java ruby csharp erlang'.split(),
-'eggs ham mushrooms cereal coffee beans tea juice sausages bacon'.split()
+ 'cat lion leopard mouse jaguar lynx cheetah tiger kitten puppy'.split(),
+ 'engine car wheel brakes tyre motor suspension cylinder exhaust clutch'.split(),
+ 'alice bob robert tim sue rachel dave harry alex jim'.split(),
+ 'c cplusplus go python haskell scala java ruby csharp erlang'.split(),
+ 'eggs ham mushrooms cereal coffee beans tea juice sausages bacon'.split()
]
@@ -71,30 +71,29 @@ def test_save_load(self):
"""Test loading/saving LdaVowpalWabbit model."""
if not self.vw_path: # for python 2.6
return
- lda = LdaVowpalWabbit(self.vw_path,
- corpus=self.corpus,
- passes=10,
- chunksize=256,
- id2word=self.dictionary,
- cleanup_files=True,
- alpha=0.1,
- eta=0.1,
- num_topics=len(TOPIC_WORDS),
- random_seed=1)
+ lda = LdaVowpalWabbit(
+ self.vw_path, corpus=self.corpus, passes=10, chunksize=256,
+ id2word=self.dictionary, cleanup_files=True, alpha=0.1,
+ eta=0.1, num_topics=len(TOPIC_WORDS), random_seed=1
+ )
with tempfile.NamedTemporaryFile() as fhandle:
lda.save(fhandle.name)
lda2 = LdaVowpalWabbit.load(fhandle.name)
# ensure public fields are saved/loaded correctly
- saved_fields = [lda.alpha, lda.chunksize, lda.cleanup_files,
- lda.decay, lda.eta, lda.gamma_threshold,
- lda.id2word, lda.num_terms, lda.num_topics,
- lda.passes, lda.random_seed, lda.vw_path]
- loaded_fields = [lda2.alpha, lda2.chunksize, lda2.cleanup_files,
- lda2.decay, lda2.eta, lda2.gamma_threshold,
- lda2.id2word, lda2.num_terms, lda2.num_topics,
- lda2.passes, lda2.random_seed, lda2.vw_path]
+ saved_fields = [
+ lda.alpha, lda.chunksize, lda.cleanup_files,
+ lda.decay, lda.eta, lda.gamma_threshold,
+ lda.id2word, lda.num_terms, lda.num_topics,
+ lda.passes, lda.random_seed, lda.vw_path
+ ]
+ loaded_fields = [
+ lda2.alpha, lda2.chunksize, lda2.cleanup_files,
+ lda2.decay, lda2.eta, lda2.gamma_threshold,
+ lda2.id2word, lda2.num_terms, lda2.num_topics,
+ lda2.passes, lda2.random_seed, lda2.vw_path
+ ]
self.assertEqual(saved_fields, loaded_fields)
# ensure topic matrices are saved/loaded correctly
@@ -106,16 +105,11 @@ def test_model_update(self):
"""Test updating existing LdaVowpalWabbit model."""
if not self.vw_path: # for python 2.6
return
- lda = LdaVowpalWabbit(self.vw_path,
- corpus=[self.corpus[0]],
- passes=10,
- chunksize=256,
- id2word=self.dictionary,
- cleanup_files=True,
- alpha=0.1,
- eta=0.1,
- num_topics=len(TOPIC_WORDS),
- random_seed=1)
+ lda = LdaVowpalWabbit(
+ self.vw_path, corpus=[self.corpus[0]], passes=10, chunksize=256,
+ id2word=self.dictionary, cleanup_files=True, alpha=0.1,
+ eta=0.1, num_topics=len(TOPIC_WORDS), random_seed=1
+ )
lda.update(self.corpus[1:])
result = lda.log_perplexity(self.corpus)
@@ -126,16 +120,10 @@ def test_perplexity(self):
"""Test LdaVowpalWabbit perplexity is within expected range."""
if not self.vw_path: # for python 2.6
return
- lda = LdaVowpalWabbit(self.vw_path,
- corpus=self.corpus,
- passes=10,
- chunksize=256,
- id2word=self.dictionary,
- cleanup_files=True,
- alpha=0.1,
- eta=0.1,
- num_topics=len(TOPIC_WORDS),
- random_seed=1)
+        lda = LdaVowpalWabbit(
+            self.vw_path, corpus=self.corpus, passes=10, chunksize=256,
+            id2word=self.dictionary, cleanup_files=True, alpha=0.1,
+            eta=0.1, num_topics=len(TOPIC_WORDS), random_seed=1
+        )
# varies, but should be between -1 and -5
result = lda.log_perplexity(self.corpus)
@@ -147,16 +135,11 @@ def test_topic_coherence(self):
if not self.vw_path: # for python 2.6
return
corpus, dictionary = get_corpus()
- lda = LdaVowpalWabbit(self.vw_path,
- corpus=corpus,
- passes=10,
- chunksize=256,
- id2word=dictionary,
- cleanup_files=True,
- alpha=0.1,
- eta=0.1,
- num_topics=len(TOPIC_WORDS),
- random_seed=1)
+ lda = LdaVowpalWabbit(
+ self.vw_path, corpus=corpus, passes=10, chunksize=256,
+ id2word=dictionary, cleanup_files=True, alpha=0.1,
+ eta=0.1, num_topics=len(TOPIC_WORDS), random_seed=1
+ )
lda.print_topics(5, 10)
# map words in known topic to an ID
@@ -198,11 +181,13 @@ def test_corpus_to_vw(self):
"""Test corpus to Vowpal Wabbit format conversion."""
if not self.vw_path: # for python 2.6
return
- corpus = [[(0, 5), (7, 1), (5, 3), (0, 2)],
- [(7, 2), (2, 1), (3, 11)],
- [(1, 1)],
- [],
- [(5, 2), (0, 1)]]
+ corpus = [
+ [(0, 5), (7, 1), (5, 3), (0, 2)],
+ [(7, 2), (2, 1), (3, 11)],
+ [(1, 1)],
+ [],
+ [(5, 2), (0, 1)]
+ ]
expected = """
| 0:5 7:1 5:3 0:2
| 7:2 2:1 3:11
diff --git a/gensim/test/test_lee.py b/gensim/test/test_lee.py
index b51101c8b1..33cce71e52 100644
--- a/gensim/test/test_lee.py
+++ b/gensim/test/test_lee.py
@@ -63,7 +63,7 @@ def setUp(self):
# read the human similarity data
sim_matrix = np.loadtxt(os.path.join(pre_path, sim_file))
sim_m_size = np.shape(sim_matrix)[0]
- human_sim_vector = sim_matrix[matutils.triu_indices(sim_m_size, 1)]
+ human_sim_vector = sim_matrix[np.triu_indices(sim_m_size, 1)]
def test_corpus(self):
"""availability and integrity of corpus"""
@@ -100,36 +100,12 @@ def test_lee(self):
for i, par1 in enumerate(corpus_lsi):
for j, par2 in enumerate(corpus_lsi):
res[i, j] = matutils.cossim(par1, par2)
- flat = res[matutils.triu_indices(len(corpus), 1)]
+ flat = res[np.triu_indices(len(corpus), 1)]
cor = np.corrcoef(flat, human_sim_vector)[0, 1]
- logging.info("LSI correlation coefficient is %s" % cor)
+ logging.info("LSI correlation coefficient is %s", cor)
self.assertTrue(cor > 0.6)
- # def test_lee_mallet(self):
- # global bg_corpus, corpus, bg_corpus2, corpus2
-
- # # create a dictionary and corpus (bag of words)
- # dictionary = corpora.Dictionary(bg_corpus2)
- # bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus2]
- # corpus = [dictionary.doc2bow(text) for text in corpus2]
-
- # # initialize an LDA transformation from background corpus
- # lda = models.wrappers.LdaMallet('/Users/kofola/Downloads/mallet-2.0.7/bin/mallet',
- # corpus=bg_corpus, id2word=dictionary, num_topics=200, optimize_interval=10)
- # corpus_lda = lda[corpus]
-
- # # compute pairwise similarity matrix and extract upper triangular
- # res = np.zeros((len(corpus), len(corpus)))
- # for i, par1 in enumerate(corpus_lda):
- # for j, par2 in enumerate(corpus_lda):
- # res[i, j] = matutils.cossim(par1, par2)
- # flat = res[matutils.triu_indices(len(corpus), 1)]
-
- # cor = np.corrcoef(flat, human_sim_vector)[0, 1]
- # logging.info("LDA correlation coefficient is %s" % cor)
- # self.assertTrue(cor > 0.35)
-
if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
diff --git a/gensim/test/test_logentropy_model.py b/gensim/test/test_logentropy_model.py
index 07f982dad9..22ca09be0d 100644
--- a/gensim/test/test_logentropy_model.py
+++ b/gensim/test/test_logentropy_model.py
@@ -25,15 +25,17 @@
# set up vars used in testing ("Deerwester" from the web tutorial)
-texts = [['human', 'interface', 'computer'],
- ['survey', 'user', 'computer', 'system', 'response', 'time'],
- ['eps', 'user', 'interface', 'system'],
- ['system', 'human', 'system', 'eps'],
- ['user', 'response', 'time'],
- ['trees'],
- ['graph', 'trees'],
- ['graph', 'minors', 'trees'],
- ['graph', 'minors', 'survey']]
+texts = [
+ ['human', 'interface', 'computer'],
+ ['survey', 'user', 'computer', 'system', 'response', 'time'],
+ ['eps', 'user', 'interface', 'system'],
+ ['system', 'human', 'system', 'eps'],
+ ['user', 'response', 'time'],
+ ['trees'],
+ ['graph', 'trees'],
+ ['graph', 'minors', 'trees'],
+ ['graph', 'minors', 'survey']
+]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
@@ -56,9 +58,11 @@ def testTransform(self):
doc = list(self.corpus_ok)[0]
transformed = model[doc]
- expected = [(0, 0.3748900964125389),
- (1, 0.30730215324230725),
- (3, 1.20941755462856)]
+ expected = [
+ (0, 0.3748900964125389),
+ (1, 0.30730215324230725),
+ (3, 1.20941755462856)
+ ]
self.assertTrue(np.allclose(transformed, expected))
def testPersistence(self):
@@ -78,7 +82,6 @@ def testPersistenceCompressed(self):
self.assertTrue(model.entr == model2.entr)
tstvec = []
self.assertTrue(np.allclose(model[tstvec], model2[tstvec]))
-# endclass TestLogEntropyModel
if __name__ == '__main__':
diff --git a/gensim/test/test_lsimodel.py b/gensim/test/test_lsimodel.py
index e2c32bda66..6a1d2ef995 100644
--- a/gensim/test/test_lsimodel.py
+++ b/gensim/test/test_lsimodel.py
@@ -21,7 +21,7 @@
from gensim import matutils
from gensim.corpora import mmcorpus, Dictionary
from gensim.models import lsimodel
-from gensim.test import basetests
+from gensim.test import basetmtests
module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
@@ -31,15 +31,17 @@ def datapath(fname):
# set up vars used in testing ("Deerwester" from the web tutorial)
-texts = [['human', 'interface', 'computer'],
- ['survey', 'user', 'computer', 'system', 'response', 'time'],
- ['eps', 'user', 'interface', 'system'],
- ['system', 'human', 'system', 'eps'],
- ['user', 'response', 'time'],
- ['trees'],
- ['graph', 'trees'],
- ['graph', 'minors', 'trees'],
- ['graph', 'minors', 'survey']]
+texts = [
+ ['human', 'interface', 'computer'],
+ ['survey', 'user', 'computer', 'system', 'response', 'time'],
+ ['eps', 'user', 'interface', 'system'],
+ ['system', 'human', 'system', 'eps'],
+ ['user', 'response', 'time'],
+ ['trees'],
+ ['graph', 'trees'],
+ ['graph', 'minors', 'trees'],
+ ['graph', 'minors', 'survey']
+]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
@@ -49,7 +51,7 @@ def testfile():
return os.path.join(tempfile.gettempdir(), 'gensim_models.tst')
-class TestLsiModel(unittest.TestCase, basetests.TestBaseTopicModel):
+class TestLsiModel(unittest.TestCase, basetmtests.TestBaseTopicModel):
def setUp(self):
self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
self.model = lsimodel.LsiModel(self.corpus, num_topics=2)
@@ -84,7 +86,8 @@ def testCorpusTransform(self):
[0.01274618, -0.49016181],
[0.04888203, -1.11294699],
[0.08063836, -1.56345594],
- [0.27381003, -1.34694159]])
+ [0.27381003, -1.34694159]
+ ])
self.assertTrue(np.allclose(abs(got), abs(expected))) # must equal up to sign
def testOnlineTransform(self):
@@ -178,7 +181,7 @@ def testDocsProcessed(self):
self.assertEqual(self.model.docs_processed, 9)
self.assertEqual(self.model.docs_processed, self.corpus.num_docs)
- def testGetTopics(self):
+ def test_get_topics(self):
topics = self.model.get_topics()
vocab_size = len(self.model.id2word)
for topic in topics:
@@ -188,8 +191,6 @@ def testGetTopics(self):
# LSI topics are not probability distributions
# self.assertAlmostEqual(np.sum(topic), 1.0, 5)
-# endclass TestLsiModel
-
if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
diff --git a/gensim/test/test_normmodel.py b/gensim/test/test_normmodel.py
index 77221b4a4d..339680d085 100644
--- a/gensim/test/test_normmodel.py
+++ b/gensim/test/test_normmodel.py
@@ -65,18 +65,22 @@ def test_sparseCSRInput_l1(self):
def test_numpyndarrayInput_l1(self):
"""Test for np ndarray input for l1 transformation"""
- ndarray_matrix = np.array([[1, 0, 2],
- [0, 0, 3],
- [4, 5, 6]])
+ ndarray_matrix = np.array([
+ [1, 0, 2],
+ [0, 0, 3],
+ [4, 5, 6]
+ ])
normalized = self.model_l1.normalize(ndarray_matrix)
# Check if output is of same type
self.assertTrue(isinstance(normalized, np.ndarray))
# Check if output is correct
- expected = np.array([[0.04761905, 0., 0.0952381],
- [0., 0., 0.14285714],
- [0.19047619, 0.23809524, 0.28571429]])
+ expected = np.array([
+ [0.04761905, 0., 0.0952381],
+ [0., 0., 0.14285714],
+ [0.19047619, 0.23809524, 0.28571429]
+ ])
self.assertTrue(np.allclose(normalized, expected))
# Test if error is raised on unsupported input type
@@ -101,25 +105,31 @@ def test_sparseCSRInput_l2(self):
self.assertTrue(issparse(normalized))
# Check if output is correct
- expected = np.array([[0.10482848, 0., 0.20965697],
- [0., 0., 0.31448545],
- [0.41931393, 0.52414242, 0.6289709]])
+ expected = np.array([
+ [0.10482848, 0., 0.20965697],
+ [0., 0., 0.31448545],
+ [0.41931393, 0.52414242, 0.6289709]
+ ])
self.assertTrue(np.allclose(normalized.toarray(), expected))
def test_numpyndarrayInput_l2(self):
"""Test for np ndarray input for l2 transformation"""
- ndarray_matrix = np.array([[1, 0, 2],
- [0, 0, 3],
- [4, 5, 6]])
+ ndarray_matrix = np.array([
+ [1, 0, 2],
+ [0, 0, 3],
+ [4, 5, 6]
+ ])
normalized = self.model_l2.normalize(ndarray_matrix)
# Check if output is of same type
self.assertTrue(isinstance(normalized, np.ndarray))
# Check if output is correct
- expected = np.array([[0.10482848, 0., 0.20965697],
- [0., 0., 0.31448545],
- [0.41931393, 0.52414242, 0.6289709]])
+ expected = np.array([
+ [0.10482848, 0., 0.20965697],
+ [0., 0., 0.31448545],
+ [0.41931393, 0.52414242, 0.6289709]
+ ])
self.assertTrue(np.allclose(normalized, expected))
# Test if error is raised on unsupported input type
diff --git a/gensim/test/test_parsing.py b/gensim/test/test_parsing.py
index fd0429e9f6..02ca13fb6b 100644
--- a/gensim/test/test_parsing.py
+++ b/gensim/test/test_parsing.py
@@ -36,8 +36,7 @@
for many searching purposes, a little fuzziness would help. """
-dataset = map(lambda x: strip_punctuation2(x.lower()),
- [doc1, doc2, doc3, doc4])
+dataset = [strip_punctuation2(x.lower()) for x in [doc1, doc2, doc3, doc4]]
# doc1 and doc2 have class 0, doc3 and doc4 have class 1
classes = np.array([[1, 0], [1, 0], [0, 1], [0, 1]])
@@ -45,40 +44,33 @@
class TestPreprocessing(unittest.TestCase):
def testStripNumeric(self):
- self.assertEqual(strip_numeric("salut les amis du 59"),
- "salut les amis du ")
+ self.assertEqual(strip_numeric("salut les amis du 59"), "salut les amis du ")
def testStripShort(self):
- self.assertEqual(strip_short("salut les amis du 59", 3),
- "salut les amis")
+ self.assertEqual(strip_short("salut les amis du 59", 3), "salut les amis")
def testStripTags(self):
- self.assertEqual(strip_tags("Hello World!"),
- "Hello World!")
+ self.assertEqual(strip_tags("Hello World!"), "Hello World!")
def testStripMultipleWhitespaces(self):
- self.assertEqual(strip_multiple_whitespaces("salut les\r\nloulous!"),
- "salut les loulous!")
+ self.assertEqual(strip_multiple_whitespaces("salut les\r\nloulous!"), "salut les loulous!")
def testStripNonAlphanum(self):
- self.assertEqual(strip_non_alphanum("toto nf-kappa titi"),
- "toto nf kappa titi")
+ self.assertEqual(strip_non_alphanum("toto nf-kappa titi"), "toto nf kappa titi")
def testSplitAlphanum(self):
- self.assertEqual(split_alphanum("toto diet1 titi"),
- "toto diet 1 titi")
- self.assertEqual(split_alphanum("toto 1diet titi"),
- "toto 1 diet titi")
+ self.assertEqual(split_alphanum("toto diet1 titi"), "toto diet 1 titi")
+ self.assertEqual(split_alphanum("toto 1diet titi"), "toto 1 diet titi")
def testStripStopwords(self):
- self.assertEqual(remove_stopwords("the world is square"),
- "world square")
+ self.assertEqual(remove_stopwords("the world is square"), "world square")
def testStemText(self):
- target = "while it is quit us to be abl to search a larg " + \
- "collect of document almost instantli for a joint occurr " + \
- "of a collect of exact words, for mani search purposes, " + \
- "a littl fuzzi would help."
+        target = (
+            "while it is quit us to be abl to search a larg "
+            "collect of document almost instantli for a joint occurr "
+            "of a collect of exact words, for mani search purposes, "
+            "a littl fuzzi would help."
+        )
self.assertEqual(stem_text(doc5), target)
diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py
index 5397d6e4c3..868947defb 100644
--- a/gensim/test/test_phrases.py
+++ b/gensim/test/test_phrases.py
@@ -133,11 +133,7 @@ def testExportPhrases(self):
for phrase, score in bigram.export_phrases(sentences):
seen_bigrams.add(phrase)
- assert seen_bigrams == set([
- b'response time',
- b'graph minors',
- b'human interface'
- ])
+ assert seen_bigrams == {b'response time', b'graph minors', b'human interface'}
def testMultipleBigramsSingleEntry(self):
""" a single entry should produce multiple bigrams. """
@@ -149,10 +145,7 @@ def testMultipleBigramsSingleEntry(self):
for phrase, score in bigram.export_phrases(test_sentences):
seen_bigrams.add(phrase)
- assert seen_bigrams == set([
- b'graph minors',
- b'human interface'
- ])
+ assert seen_bigrams == {b'graph minors', b'human interface'}
def testScoringDefault(self):
""" test the default scoring, from the mikolov word2vec paper """
@@ -164,10 +157,10 @@ def testScoringDefault(self):
for phrase, score in bigram.export_phrases(test_sentences):
seen_scores.add(round(score, 3))
- assert seen_scores == set([
+ assert seen_scores == {
5.167, # score for graph minors
3.444 # score for human interface
- ])
+ }
def testScoringNpmi(self):
""" test normalized pointwise mutual information scoring """
@@ -179,10 +172,10 @@ def testScoringNpmi(self):
for phrase, score in bigram.export_phrases(test_sentences):
seen_scores.add(round(score, 3))
- assert seen_scores == set([
+ assert seen_scores == {
.882, # score for graph minors
.714 # score for human interface
- ])
+ }
def testBadParameters(self):
"""Test the phrases module with bad parameters."""
diff --git a/gensim/test/test_rpmodel.py b/gensim/test/test_rpmodel.py
index 2de5dd6546..94c1abce84 100644
--- a/gensim/test/test_rpmodel.py
+++ b/gensim/test/test_rpmodel.py
@@ -26,15 +26,17 @@
# set up vars used in testing ("Deerwester" from the web tutorial)
-texts = [['human', 'interface', 'computer'],
- ['survey', 'user', 'computer', 'system', 'response', 'time'],
- ['eps', 'user', 'interface', 'system'],
- ['system', 'human', 'system', 'eps'],
- ['user', 'response', 'time'],
- ['trees'],
- ['graph', 'trees'],
- ['graph', 'minors', 'trees'],
- ['graph', 'minors', 'survey']]
+texts = [
+ ['human', 'interface', 'computer'],
+ ['survey', 'user', 'computer', 'system', 'response', 'time'],
+ ['eps', 'user', 'interface', 'system'],
+ ['system', 'human', 'system', 'eps'],
+ ['user', 'response', 'time'],
+ ['trees'],
+ ['graph', 'trees'],
+ ['graph', 'minors', 'trees'],
+ ['graph', 'minors', 'survey']
+]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
@@ -80,7 +82,6 @@ def testPersistenceCompressed(self):
self.assertTrue(np.allclose(model.projection, model2.projection))
tstvec = []
self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector
-# endclass TestRpModel
if __name__ == '__main__':
diff --git a/gensim/test/test_segmentation.py b/gensim/test/test_segmentation.py
index a4c9356a26..512121a055 100644
--- a/gensim/test/test_segmentation.py
+++ b/gensim/test/test_segmentation.py
@@ -20,30 +20,40 @@
class TestSegmentation(unittest.TestCase):
def setUp(self):
- self.topics = [array([9, 4, 6]), array([9, 10, 7]), array([5, 2, 7])]
+ self.topics = [
+ array([9, 4, 6]),
+ array([9, 10, 7]),
+ array([5, 2, 7])
+ ]
def testSOnePre(self):
"""Test s_one_pre segmentation."""
actual = segmentation.s_one_pre(self.topics)
- expected = [[(4, 9), (6, 9), (6, 4)],
- [(10, 9), (7, 9), (7, 10)],
- [(2, 5), (7, 5), (7, 2)]]
+ expected = [
+ [(4, 9), (6, 9), (6, 4)],
+ [(10, 9), (7, 9), (7, 10)],
+ [(2, 5), (7, 5), (7, 2)]
+ ]
self.assertTrue(np.allclose(actual, expected))
def testSOneOne(self):
"""Test s_one_one segmentation."""
actual = segmentation.s_one_one(self.topics)
- expected = [[(9, 4), (9, 6), (4, 9), (4, 6), (6, 9), (6, 4)],
- [(9, 10), (9, 7), (10, 9), (10, 7), (7, 9), (7, 10)],
- [(5, 2), (5, 7), (2, 5), (2, 7), (7, 5), (7, 2)]]
+ expected = [
+ [(9, 4), (9, 6), (4, 9), (4, 6), (6, 9), (6, 4)],
+ [(9, 10), (9, 7), (10, 9), (10, 7), (7, 9), (7, 10)],
+ [(5, 2), (5, 7), (2, 5), (2, 7), (7, 5), (7, 2)]
+ ]
self.assertTrue(np.allclose(actual, expected))
def testSOneSet(self):
"""Test s_one_set segmentation."""
actual = segmentation.s_one_set(self.topics)
- expected = [[(9, array([9, 4, 6])), (4, array([9, 4, 6])), (6, array([9, 4, 6]))],
- [(9, array([9, 10, 7])), (10, array([9, 10, 7])), (7, array([9, 10, 7]))],
- [(5, array([5, 2, 7])), (2, array([5, 2, 7])), (7, array([5, 2, 7]))]]
+ expected = [
+ [(9, array([9, 4, 6])), (4, array([9, 4, 6])), (6, array([9, 4, 6]))],
+ [(9, array([9, 10, 7])), (10, array([9, 10, 7])), (7, array([9, 10, 7]))],
+ [(5, array([5, 2, 7])), (2, array([5, 2, 7])), (7, array([5, 2, 7]))]
+ ]
for s_i in range(len(actual)):
for j in range(len(actual[s_i])):
self.assertEqual(actual[s_i][j][0], expected[s_i][j][0])
diff --git a/gensim/test/test_sharded_corpus.py b/gensim/test/test_sharded_corpus.py
index 871048ea4e..cc70ee8f49 100644
--- a/gensim/test/test_sharded_corpus.py
+++ b/gensim/test/test_sharded_corpus.py
@@ -87,8 +87,7 @@ def test_sparse_serialization(self):
no_exception = True
try:
- dataset = ShardedCorpus(self.tmp_fname, self.data, shardsize=100, # noqa:F841
- dim=self.dim, sparse_serialization=True)
+ ShardedCorpus(self.tmp_fname, self.data, shardsize=100, dim=self.dim, sparse_serialization=True)
except Exception:
no_exception = False
raise
@@ -97,9 +96,10 @@ def test_sparse_serialization(self):
def test_getitem_dense2dense(self):
- corpus = ShardedCorpus(self.tmp_fname, self.data, shardsize=100,
- dim=self.dim, sparse_serialization=False,
- sparse_retrieval=False)
+ corpus = ShardedCorpus(
+ self.tmp_fname, self.data, shardsize=100, dim=self.dim,
+ sparse_serialization=False, sparse_retrieval=False
+ )
item = corpus[3]
self.assertTrue(isinstance(item, np.ndarray))
@@ -117,9 +117,10 @@ def test_getitem_dense2dense(self):
def test_getitem_dense2sparse(self):
- corpus = ShardedCorpus(self.tmp_fname, self.data, shardsize=100,
- dim=self.dim, sparse_serialization=False,
- sparse_retrieval=True)
+ corpus = ShardedCorpus(
+ self.tmp_fname, self.data, shardsize=100, dim=self.dim,
+ sparse_serialization=False, sparse_retrieval=True
+ )
item = corpus[3]
self.assertTrue(isinstance(item, sparse.csr_matrix))
@@ -138,13 +139,15 @@ def test_getitem_dense2sparse(self):
def test_getitem_sparse2sparse(self):
sp_tmp_fname = self.tmp_fname + '.sparse'
- corpus = ShardedCorpus(sp_tmp_fname, self.data, shardsize=100,
- dim=self.dim, sparse_serialization=True,
- sparse_retrieval=True)
+ corpus = ShardedCorpus(
+ sp_tmp_fname, self.data, shardsize=100, dim=self.dim,
+ sparse_serialization=True, sparse_retrieval=True
+ )
- dense_corpus = ShardedCorpus(self.tmp_fname, self.data, shardsize=100,
- dim=self.dim, sparse_serialization=False,
- sparse_retrieval=True)
+ dense_corpus = ShardedCorpus(
+ self.tmp_fname, self.data, shardsize=100, dim=self.dim,
+ sparse_serialization=False, sparse_retrieval=True
+ )
item = corpus[3]
self.assertTrue(isinstance(item, sparse.csr_matrix))
@@ -168,13 +171,15 @@ def test_getitem_sparse2sparse(self):
def test_getitem_sparse2dense(self):
sp_tmp_fname = self.tmp_fname + '.sparse'
- corpus = ShardedCorpus(sp_tmp_fname, self.data, shardsize=100,
- dim=self.dim, sparse_serialization=True,
- sparse_retrieval=False)
+ corpus = ShardedCorpus(
+ sp_tmp_fname, self.data, shardsize=100, dim=self.dim,
+ sparse_serialization=True, sparse_retrieval=False
+ )
- dense_corpus = ShardedCorpus(self.tmp_fname, self.data, shardsize=100,
- dim=self.dim, sparse_serialization=False,
- sparse_retrieval=False)
+ dense_corpus = ShardedCorpus(
+ self.tmp_fname, self.data, shardsize=100, dim=self.dim,
+ sparse_serialization=False, sparse_retrieval=False
+ )
item = corpus[3]
self.assertTrue(isinstance(item, np.ndarray))
@@ -195,9 +200,10 @@ def test_getitem_sparse2dense(self):
def test_getitem_dense2gensim(self):
- corpus = ShardedCorpus(self.tmp_fname, self.data, shardsize=100,
- dim=self.dim, sparse_serialization=False,
- gensim=True)
+ corpus = ShardedCorpus(
+ self.tmp_fname, self.data, shardsize=100, dim=self.dim,
+ sparse_serialization=False, gensim=True
+ )
item = corpus[3]
self.assertTrue(isinstance(item, list))
@@ -211,8 +217,7 @@ def test_getitem_dense2gensim(self):
self.assertTrue(isinstance(dslice[0][0], tuple))
iscorp, _ = is_corpus(dslice)
- self.assertTrue(iscorp, "Is the object returned by slice notation "
- "a gensim corpus?")
+ self.assertTrue(iscorp, "Is the object returned by slice notation a gensim corpus?")
ilist = corpus[[2, 3, 4, 5]]
self.assertTrue(next(ilist) == corpus[2])
@@ -235,8 +240,7 @@ def test_getitem_dense2gensim(self):
str(dslice[i][j])))
iscorp, _ = is_corpus(ilist)
- self.assertTrue(iscorp, "Is the object returned by list notation "
- "a gensim corpus?")
+ self.assertTrue(iscorp, "Is the object returned by list notation a gensim corpus?")
def test_resize(self):
@@ -252,8 +256,6 @@ def test_resize(self):
fname = dataset._shard_name(n)
self.assertTrue(os.path.isfile(fname))
-##############################################################################
-
if __name__ == '__main__':
suite = unittest.TestSuite()
diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py
index 9ce5263df7..93c0f8a3f7 100644
--- a/gensim/test/test_similarities.py
+++ b/gensim/test/test_similarities.py
@@ -36,20 +36,21 @@
# set up vars used in testing ("Deerwester" from the web tutorial)
-texts = [['human', 'interface', 'computer'],
- ['survey', 'user', 'computer', 'system', 'response', 'time'],
- ['eps', 'user', 'interface', 'system'],
- ['system', 'human', 'system', 'eps'],
- ['user', 'response', 'time'],
- ['trees'],
- ['graph', 'trees'],
- ['graph', 'minors', 'trees'],
- ['graph', 'minors', 'survey']]
+texts = [
+ ['human', 'interface', 'computer'],
+ ['survey', 'user', 'computer', 'system', 'response', 'time'],
+ ['eps', 'user', 'interface', 'system'],
+ ['system', 'human', 'system', 'eps'],
+ ['user', 'response', 'time'],
+ ['trees'],
+ ['graph', 'trees'],
+ ['graph', 'minors', 'trees'],
+ ['graph', 'minors', 'survey']
+]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
-sentences = [doc2vec.TaggedDocument(words, [i])
- for i, words in enumerate(texts)]
+sentences = [doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(texts)]
def testfile():
@@ -78,7 +79,7 @@ def testFull(self, num_best=None, shardsize=100):
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.70710677, 0.70710677, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.57735026, 0.57735026, 0.57735026],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.57735026, 0.0, 0.0, 0.0, 0.0, 0.57735026, 0.57735026],
- ], dtype=numpy.float32)
+ ], dtype=numpy.float32)
# HACK: dictionary can be in different order, so compare in sorted order
self.assertTrue(numpy.allclose(sorted(expected.flat), sorted(index.index.flat)))
index.num_best = num_best
@@ -137,15 +138,17 @@ def testChunking(self):
[0.99999994, 0.23570226, 0.28867513, 0.23570226, 0.0, 0.0, 0.0, 0.0, 0.0],
[0.23570226, 1.0, 0.40824831, 0.33333334, 0.70710677, 0.0, 0.0, 0.0, 0.23570226],
[0.28867513, 0.40824831, 1.0, 0.61237246, 0.28867513, 0.0, 0.0, 0.0, 0.0]
- ], dtype=numpy.float32)
+ ], dtype=numpy.float32)
self.assertTrue(numpy.allclose(expected, sims))
# test the same thing but with num_best
index.num_best = 3
sims = index[query]
- expected = [[(0, 0.99999994), (2, 0.28867513), (1, 0.23570226)],
- [(1, 1.0), (4, 0.70710677), (2, 0.40824831)],
- [(2, 1.0), (3, 0.61237246), (1, 0.40824831)]]
+ expected = [
+ [(0, 0.99999994), (2, 0.28867513), (1, 0.23570226)],
+ [(1, 1.0), (4, 0.70710677), (2, 0.40824831)],
+ [(2, 1.0), (3, 0.61237246), (1, 0.40824831)]
+ ]
self.assertTrue(numpy.allclose(expected, sims))
if self.cls == similarities.Similarity:
index.destroy()
@@ -166,7 +169,7 @@ def testIter(self):
[0.0, 0.0, 0.0, 0.0, 0.0, 0.70710677, 0.99999994, 0.81649655, 0.40824828],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.57735026, 0.81649655, 0.99999994, 0.66666663],
[0.0, 0.23570226, 0.0, 0.0, 0.0, 0.0, 0.40824828, 0.66666663, 0.99999994]
- ], dtype=numpy.float32)
+ ], dtype=numpy.float32)
self.assertTrue(numpy.allclose(expected, sims))
if self.cls == similarities.Similarity:
index.destroy()
diff --git a/gensim/test/test_similarity_metrics.py b/gensim/test/test_similarity_metrics.py
index 23bf0e60f7..27066ff09d 100644
--- a/gensim/test/test_similarity_metrics.py
+++ b/gensim/test/test_similarity_metrics.py
@@ -24,15 +24,17 @@
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
# set up vars used in testing ("Deerwester" from the web tutorial)
-texts = [['human', 'interface', 'computer'],
- ['survey', 'user', 'computer', 'system', 'response', 'time'],
- ['eps', 'user', 'interface', 'system'],
- ['system', 'human', 'system', 'eps'],
- ['user', 'response', 'time'],
- ['trees'],
- ['graph', 'trees'],
- ['graph', 'minors', 'trees'],
- ['graph', 'minors', 'survey']]
+texts = [
+ ['human', 'interface', 'computer'],
+ ['survey', 'user', 'computer', 'system', 'response', 'time'],
+ ['eps', 'user', 'interface', 'system'],
+ ['system', 'human', 'system', 'eps'],
+ ['user', 'response', 'time'],
+ ['trees'],
+ ['graph', 'trees'],
+ ['graph', 'minors', 'trees'],
+ ['graph', 'minors', 'survey']
+]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py
index 0e17905c2b..07411aa9b9 100644
--- a/gensim/test/test_sklearn_api.py
+++ b/gensim/test/test_sklearn_api.py
@@ -43,10 +43,19 @@
]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
-author2doc = {'john': [0, 1, 2, 3, 4, 5, 6], 'jane': [2, 3, 4, 5, 6, 7, 8], 'jack': [0, 2, 4, 6, 8], 'jill': [1, 3, 5, 7]}
+author2doc = {
+ 'john': [0, 1, 2, 3, 4, 5, 6],
+ 'jane': [2, 3, 4, 5, 6, 7, 8],
+ 'jack': [0, 2, 4, 6, 8],
+ 'jill': [1, 3, 5, 7]
+}
texts_new = texts[0:3]
-author2doc_new = {'jill': [0], 'bob': [0, 1], 'sally': [1, 2]}
+author2doc_new = {
+ 'jill': [0],
+ 'bob': [0, 1],
+ 'sally': [1, 2]
+}
dictionary_new = Dictionary(texts_new)
corpus_new = [dictionary_new.doc2bow(text) for text in texts_new]
@@ -91,11 +100,16 @@
['geometry', 'is', 'the', 'study', 'of', 'shape'],
['algebra', 'is', 'the', 'study', 'of', 'generalizations', 'of', 'arithmetic', 'operations'],
['differential', 'calculus', 'is', 'related', 'to', 'rates', 'of', 'change', 'and', 'slopes', 'of', 'curves'],
- ['integral', 'calculus', 'is', 'realted', 'to', 'accumulation', 'of', 'quantities', 'and', 'the', 'areas', 'under', 'and', 'between', 'curves'],
- ['physics', 'is', 'the', 'natural', 'science', 'that', 'involves', 'the', 'study', 'of', 'matter', 'and', 'its', 'motion', 'and', 'behavior', 'through', 'space', 'and', 'time'],
+ ['integral', 'calculus', 'is', 'realted', 'to', 'accumulation', 'of', 'quantities', 'and',
+ 'the', 'areas', 'under', 'and', 'between', 'curves'],
+ ['physics', 'is', 'the', 'natural', 'science', 'that', 'involves', 'the', 'study', 'of', 'matter',
+ 'and', 'its', 'motion', 'and', 'behavior', 'through', 'space', 'and', 'time'],
['the', 'main', 'goal', 'of', 'physics', 'is', 'to', 'understand', 'how', 'the', 'universe', 'behaves'],
- ['physics', 'also', 'makes', 'significant', 'contributions', 'through', 'advances', 'in', 'new', 'technologies', 'that', 'arise', 'from', 'theoretical', 'breakthroughs'],
- ['advances', 'in', 'the', 'understanding', 'of', 'electromagnetism', 'or', 'nuclear', 'physics', 'led', 'directly', 'to', 'the', 'development', 'of', 'new', 'products', 'that', 'have', 'dramatically', 'transformed', 'modern', 'day', 'society']
+ ['physics', 'also', 'makes', 'significant', 'contributions', 'through', 'advances', 'in', 'new',
+ 'technologies', 'that', 'arise', 'from', 'theoretical', 'breakthroughs'],
+ ['advances', 'in', 'the', 'understanding', 'of', 'electromagnetism', 'or', 'nuclear', 'physics',
+ 'led', 'directly', 'to', 'the', 'development', 'of', 'new', 'products', 'that', 'have', 'dramatically',
+ 'transformed', 'modern', 'day', 'society']
]
d2v_sentences = [models.doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(w2v_texts)]
@@ -129,7 +143,9 @@
class TestLdaWrapper(unittest.TestCase):
def setUp(self):
numpy.random.seed(0) # set fixed seed to get similar values everytime
- self.model = LdaTransformer(id2word=dictionary, num_topics=2, passes=100, minimum_probability=0, random_state=numpy.random.seed(0))
+ self.model = LdaTransformer(
+ id2word=dictionary, num_topics=2, passes=100, minimum_probability=0, random_state=numpy.random.seed(0)
+ )
self.model.fit(corpus)
def testTransform(self):
@@ -157,11 +173,16 @@ def testPartialFit(self):
def testConsistencyWithGensimModel(self):
# training an LdaTransformer with `num_topics`=10
- self.model = LdaTransformer(id2word=dictionary, num_topics=10, passes=100, minimum_probability=0, random_state=numpy.random.seed(0))
+ self.model = LdaTransformer(
+ id2word=dictionary, num_topics=10, passes=100, minimum_probability=0, random_state=numpy.random.seed(0)
+ )
self.model.fit(corpus)
# training a Gensim LdaModel with the same params
- gensim_ldamodel = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, passes=100, minimum_probability=0, random_state=numpy.random.seed(0))
+ gensim_ldamodel = models.LdaModel(
+ corpus=corpus, id2word=dictionary, num_topics=10, passes=100,
+ minimum_probability=0, random_state=numpy.random.seed(0)
+ )
texts_new = ['graph', 'eulerian']
bow = self.model.id2word.doc2bow(texts_new)
@@ -190,7 +211,7 @@ def testPipeline(self):
uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
cache = pickle.loads(uncompressed_content)
data = cache
- id2word = Dictionary(map(lambda x: x.split(), data.data))
+ id2word = Dictionary([x.split() for x in data.data])
corpus = [id2word.doc2bow(i.split()) for i in data.data]
numpy.random.mtrand.RandomState(1) # set seed for getting same result
clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
@@ -238,7 +259,10 @@ def testPersistence(self):
self.assertTrue(passed)
def testModelNotFitted(self):
- lda_wrapper = LdaTransformer(id2word=dictionary, num_topics=2, passes=100, minimum_probability=0, random_state=numpy.random.seed(0))
+ lda_wrapper = LdaTransformer(
+ id2word=dictionary, num_topics=2, passes=100,
+ minimum_probability=0, random_state=numpy.random.seed(0)
+ )
texts_new = ['graph', 'eulerian']
bow = lda_wrapper.id2word.doc2bow(texts_new)
self.assertRaises(NotFittedError, lda_wrapper.transform, bow)
@@ -280,7 +304,7 @@ def testPipeline(self):
uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
cache = pickle.loads(uncompressed_content)
data = cache
- id2word = Dictionary(map(lambda x: x.split(), data.data))
+ id2word = Dictionary([x.split() for x in data.data])
corpus = [id2word.doc2bow(i.split()) for i in data.data]
numpy.random.mtrand.RandomState(1) # set seed for getting same result
clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
@@ -336,14 +360,14 @@ def testModelNotFitted(self):
class TestLdaSeqWrapper(unittest.TestCase):
def setUp(self):
- self.model = LdaSeqTransformer(id2word=dictionary_ldaseq, num_topics=2, time_slice=[10, 10, 11], initialize='gensim')
+ self.model = LdaSeqTransformer(
+ id2word=dictionary_ldaseq, num_topics=2, time_slice=[10, 10, 11], initialize='gensim'
+ )
self.model.fit(corpus_ldaseq)
def testTransform(self):
# transforming two documents
- docs = []
- docs.append(list(corpus_ldaseq)[0])
- docs.append(list(corpus_ldaseq)[1])
+ docs = [list(corpus_ldaseq)[0], list(corpus_ldaseq)[1]]
transformed_vecs = self.model.transform(docs)
self.assertEqual(transformed_vecs.shape[0], 2)
self.assertEqual(transformed_vecs.shape[1], self.model.num_topics)
@@ -363,7 +387,7 @@ def testPipeline(self):
data = cache
test_data = data.data[0:2]
test_target = data.target[0:2]
- id2word = Dictionary(map(lambda x: x.split(), test_data))
+ id2word = Dictionary([x.split() for x in test_data])
corpus = [id2word.doc2bow(i.split()) for i in test_data]
model = LdaSeqTransformer(id2word=id2word, num_topics=2, time_slice=[1, 1, 1], initialize='gensim')
clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
@@ -412,9 +436,7 @@ def setUp(self):
def testTransform(self):
        # transform two documents
- docs = []
- docs.append(list(self.corpus)[0])
- docs.append(list(self.corpus)[1])
+ docs = [list(self.corpus)[0], list(self.corpus)[1]]
matrix = self.model.transform(docs)
self.assertEqual(matrix.shape[0], 2)
self.assertEqual(matrix.shape[1], self.model.num_topics)
@@ -433,7 +455,7 @@ def testPipeline(self):
uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
cache = pickle.loads(uncompressed_content)
data = cache
- id2word = Dictionary(map(lambda x: x.split(), data.data))
+ id2word = Dictionary([x.split() for x in data.data])
corpus = [id2word.doc2bow(i.split()) for i in data.data]
numpy.random.mtrand.RandomState(1) # set seed for getting same result
clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
@@ -514,11 +536,13 @@ def testPipeline(self):
class_dict = {'mathematics': 1, 'physics': 0}
train_data = [
- ('calculus', 'mathematics'), ('mathematical', 'mathematics'), ('geometry', 'mathematics'), ('operations', 'mathematics'), ('curves', 'mathematics'),
- ('natural', 'physics'), ('nuclear', 'physics'), ('science', 'physics'), ('electromagnetism', 'physics'), ('natural', 'physics')
+ ('calculus', 'mathematics'), ('mathematical', 'mathematics'),
+ ('geometry', 'mathematics'), ('operations', 'mathematics'),
+ ('curves', 'mathematics'), ('natural', 'physics'), ('nuclear', 'physics'),
+ ('science', 'physics'), ('electromagnetism', 'physics'), ('natural', 'physics')
]
- train_input = list(map(lambda x: x[0], train_data))
- train_target = list(map(lambda x: class_dict[x[1]], train_data))
+ train_input = [x[0] for x in train_data]
+ train_target = [class_dict[x[1]] for x in train_data]
clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
clf.fit(model.transform(train_input), train_target)
@@ -648,10 +672,7 @@ def setUp(self):
def testTransform(self):
        # transform multiple documents
- docs = []
- docs.append(w2v_texts[0])
- docs.append(w2v_texts[1])
- docs.append(w2v_texts[2])
+ docs = [w2v_texts[0], w2v_texts[1], w2v_texts[2]]
matrix = self.model.transform(docs)
self.assertEqual(matrix.shape[0], 3)
self.assertEqual(matrix.shape[1], self.model.size)
@@ -682,8 +703,8 @@ def testPipeline(self):
(['calculus', 'mathematical'], 'mathematics'), (['geometry', 'operations', 'curves'], 'mathematics'),
(['natural', 'nuclear'], 'physics'), (['science', 'electromagnetism', 'natural'], 'physics')
]
- train_input = list(map(lambda x: x[0], train_data))
- train_target = list(map(lambda x: class_dict[x[1]], train_data))
+ train_input = [x[0] for x in train_data]
+ train_target = [class_dict[x[1]] for x in train_data]
clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
clf.fit(model.transform(train_input), train_target)
@@ -737,7 +758,7 @@ def testTransform(self):
doc = ['computer system interface time computer system']
bow_vec = self.model.transform(doc)[0]
expected_values = [1, 1, 2, 2] # comparing only the word-counts
- values = list(map(lambda x: x[1], bow_vec))
+ values = [x[1] for x in bow_vec]
self.assertEqual(sorted(expected_values), sorted(values))
def testSetGetParams(self):
@@ -794,8 +815,11 @@ def testTransform(self):
        # transform multiple documents
docs = [corpus[0], corpus[1]]
transformed_docs = self.model.transform(docs)
- expected_docs = [[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)],
- [(3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555), (8, 0.44424552527467476)]]
+ expected_docs = [
+ [(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)],
+ [(3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555),
+ (6, 0.44424552527467476), (7, 0.3244870206138555), (8, 0.44424552527467476)]
+ ]
self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1]))
@@ -815,7 +839,7 @@ def testPipeline(self):
uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
cache = pickle.loads(uncompressed_content)
data = cache
- id2word = Dictionary(map(lambda x: x.split(), data.data))
+ id2word = Dictionary([x.split() for x in data.data])
corpus = [id2word.doc2bow(i.split()) for i in data.data]
tfidf_model = TfIdfTransformer()
tfidf_model.fit(corpus)
@@ -854,14 +878,20 @@ def testTransform(self):
        # transform one document
doc = self.corpus[0]
transformed_doc = self.model.transform(doc)
- expected_doc = [[0.81043386270128193, 0.049357139518070477, 0.035840906753517532, 0.026542006926698079, 0.019925705902962578, 0.014776690981729117, 0.011068909979528148]]
+ expected_doc = [
+ [0.81043386270128193, 0.049357139518070477, 0.035840906753517532,
+ 0.026542006926698079, 0.019925705902962578, 0.014776690981729117, 0.011068909979528148]
+ ]
self.assertTrue(numpy.allclose(transformed_doc, expected_doc, atol=1e-2))
        # transform multiple documents
docs = [self.corpus[0], self.corpus[1]]
transformed_docs = self.model.transform(docs)
- expected_docs = [[0.81043386270128193, 0.049357139518070477, 0.035840906753517532, 0.026542006926698079, 0.019925705902962578, 0.014776690981729117, 0.011068909979528148],
- [0.03795908, 0.39542609, 0.50650585, 0.0151082, 0.01132749, 0., 0.]]
+ expected_docs = [
+ [0.81043386270128193, 0.049357139518070477, 0.035840906753517532,
+ 0.026542006926698079, 0.019925705902962578, 0.014776690981729117, 0.011068909979528148],
+ [0.03795908, 0.39542609, 0.50650585, 0.0151082, 0.01132749, 0., 0.]
+ ]
self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0], atol=1e-2))
self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1], atol=1e-2))
@@ -881,7 +911,7 @@ def testPipeline(self):
uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
cache = pickle.loads(uncompressed_content)
data = cache
- id2word = Dictionary(map(lambda x: x.split(), data.data))
+ id2word = Dictionary([x.split() for x in data.data])
corpus = [id2word.doc2bow(i.split()) for i in data.data]
model = HdpTransformer(id2word=id2word)
clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
diff --git a/gensim/test/test_tfidfmodel.py b/gensim/test/test_tfidfmodel.py
index bb00b5482d..65e2939857 100644
--- a/gensim/test/test_tfidfmodel.py
+++ b/gensim/test/test_tfidfmodel.py
@@ -25,15 +25,17 @@
# set up vars used in testing ("Deerwester" from the web tutorial)
-texts = [['human', 'interface', 'computer'],
- ['survey', 'user', 'computer', 'system', 'response', 'time'],
- ['eps', 'user', 'interface', 'system'],
- ['system', 'human', 'system', 'eps'],
- ['user', 'response', 'time'],
- ['trees'],
- ['graph', 'trees'],
- ['graph', 'minors', 'trees'],
- ['graph', 'minors', 'survey']]
+texts = [
+ ['human', 'interface', 'computer'],
+ ['survey', 'user', 'computer', 'system', 'response', 'time'],
+ ['eps', 'user', 'interface', 'system'],
+ ['system', 'human', 'system', 'eps'],
+ ['user', 'response', 'time'],
+ ['trees'],
+ ['graph', 'trees'],
+ ['graph', 'minors', 'trees'],
+ ['graph', 'minors', 'survey']
+]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
diff --git a/gensim/test/test_wikicorpus.py b/gensim/test/test_wikicorpus.py
index 9bdbcbdb8d..ca81d6e51a 100644
--- a/gensim/test/test_wikicorpus.py
+++ b/gensim/test/test_wikicorpus.py
@@ -28,7 +28,7 @@ class TestWikiCorpus(unittest.TestCase):
# #TODO: sporadic failure to be investigated
# def test_get_texts_returns_generator_of_lists(self):
- # logger.debug("Current Python Version is " + str(sys.version_info))
+ # logger.debug("Current Python Version is %s", str(sys.version_info))
# if sys.version_info < (2, 7, 0):
# return
#
diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 29ae713b90..81123ccd7a 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -140,14 +140,18 @@ def test_sg_neg_online(self):
def test_cbow_hs_online(self):
"""Test CBOW w/ hierarchical softmax"""
- model = word2vec.Word2Vec(sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0,
- min_count=3, iter=10, seed=42, workers=2)
+ model = word2vec.Word2Vec(
+ sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0,
+ min_count=3, iter=10, seed=42, workers=2
+ )
self.onlineSanity(model)
def test_cbow_neg_online(self):
"""Test CBOW w/ negative sampling"""
- model = word2vec.Word2Vec(sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15,
- min_count=5, iter=10, seed=42, workers=2, sample=0)
+ model = word2vec.Word2Vec(
+ sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15,
+ min_count=5, iter=10, seed=42, workers=2, sample=0
+ )
self.onlineSanity(model)
def testPersistence(self):
@@ -241,7 +245,9 @@ def testPersistenceWord2VecFormat(self):
self.assertTrue(np.allclose(model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human']))
limited_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True, limit=3)
self.assertEquals(len(limited_model_kv.syn0), 3)
- half_precision_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True, datatype=np.float16)
+ half_precision_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(
+ testfile(), binary=True, datatype=np.float16
+ )
self.assertEquals(binary_model_kv.syn0.nbytes, half_precision_model_kv.syn0.nbytes * 2)
def testNoTrainingCFormat(self):
@@ -284,7 +290,9 @@ def testPersistenceWord2VecFormatNonBinary(self):
norm_only_model = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=False)
norm_only_model.init_sims(True)
self.assertFalse(np.allclose(model['human'], norm_only_model['human'], atol=1e-6))
- self.assertTrue(np.allclose(model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human'], atol=1e-4))
+ self.assertTrue(np.allclose(
+ model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human'], atol=1e-4
+ ))
def testPersistenceWord2VecFormatWithVocab(self):
"""Test storing/loading the entire model and vocabulary in word2vec format."""
@@ -450,14 +458,18 @@ def test_sg_neg(self):
def test_cbow_hs(self):
"""Test CBOW w/ hierarchical softmax"""
- model = word2vec.Word2Vec(sg=0, cbow_mean=1, alpha=0.05, window=8, hs=1, negative=0,
- min_count=5, iter=10, workers=2, batch_words=1000)
+ model = word2vec.Word2Vec(
+ sg=0, cbow_mean=1, alpha=0.05, window=8, hs=1, negative=0,
+ min_count=5, iter=10, workers=2, batch_words=1000
+ )
self.model_sanity(model)
def test_cbow_neg(self):
"""Test CBOW w/ negative sampling"""
- model = word2vec.Word2Vec(sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15,
- min_count=5, iter=10, workers=2, sample=0)
+ model = word2vec.Word2Vec(
+ sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15,
+ min_count=5, iter=10, workers=2, sample=0
+ )
self.model_sanity(model)
def test_cosmul(self):
@@ -654,8 +666,10 @@ def testBuildVocabWarning(self, l):
@log_capture()
def testTrainWarning(self, l):
"""Test if warning is raised if alpha rises during subsequent calls to train()"""
- sentences = [['human'],
- ['graph', 'trees']]
+ sentences = [
+ ['human'],
+ ['graph', 'trees']
+ ]
model = word2vec.Word2Vec(min_count=1)
model.build_vocab(sentences)
for epoch in range(10):
@@ -814,6 +828,7 @@ def assertLess(self, a, b, msg=None):
if __name__ == '__main__':
logging.basicConfig(
format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
- level=logging.DEBUG)
+ level=logging.DEBUG
+ )
logging.info("using optimization %s", word2vec.FAST_VERSION)
unittest.main()
diff --git a/gensim/test/test_wordrank_wrapper.py b/gensim/test/test_wordrank_wrapper.py
index 8f8d5b5f9d..4ecb9f7c70 100644
--- a/gensim/test/test_wordrank_wrapper.py
+++ b/gensim/test/test_wordrank_wrapper.py
@@ -36,7 +36,10 @@ def setUp(self):
self.wr_file = datapath('test_glove.txt')
if not self.wr_path:
return
- self.test_model = wordrank.Wordrank.train(self.wr_path, self.corpus_file, self.out_name, iter=6, dump_period=5, period=5, np=2, cleanup_files=True)
+ self.test_model = wordrank.Wordrank.train(
+ self.wr_path, self.corpus_file, self.out_name, iter=6,
+ dump_period=5, period=5, np=2, cleanup_files=True
+ )
def testLoadWordrankFormat(self):
"""Test model successfully loaded from Wordrank format file"""
diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py
index 247d8c146d..a3b3463391 100644
--- a/gensim/topic_coherence/direct_confirmation_measure.py
+++ b/gensim/topic_coherence/direct_confirmation_measure.py
@@ -9,7 +9,6 @@
"""
import logging
-
import numpy as np
logger = logging.getLogger(__name__)
@@ -38,8 +37,10 @@ def log_conditional_probability(segmented_topics, accumulator):
for w_prime, w_star in s_i:
w_star_count = accumulator[w_star]
if w_star_count == 0:
- raise ValueError("Topic with id %d not found in corpus used to compute coherence. "
- "Try using a larger corpus with a smaller vocobulary and/or setting a smaller value of `topn` for `CoherenceModel`." % (w_star))
+ raise ValueError(
+ "Topic with id %d not found in corpus used to compute coherence. "
+ "Try using a larger corpus with a smaller vocabulary and/or setting a smaller value of `topn` for `CoherenceModel`." % w_star
+ )
co_occur_count = accumulator[w_prime, w_star]
m_lc_i = np.log(((co_occur_count / num_docs) + EPSILON) / (w_star_count / num_docs))
diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py
index 7832494a5c..1ddd70cbb0 100644
--- a/gensim/topic_coherence/probability_estimation.py
+++ b/gensim/topic_coherence/probability_estimation.py
@@ -11,8 +11,7 @@
import itertools
import logging
-from gensim.topic_coherence.text_analysis import \
- CorpusAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator
+from gensim.topic_coherence.text_analysis import CorpusAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator
logger = logging.getLogger(__name__)
diff --git a/gensim/topic_coherence/segmentation.py b/gensim/topic_coherence/segmentation.py
index 9097036914..2db0d695d2 100644
--- a/gensim/topic_coherence/segmentation.py
+++ b/gensim/topic_coherence/segmentation.py
@@ -27,18 +27,18 @@ def s_one_pre(topics):
topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...]
Returns:
- s_one_pre : list of list of (W', W*) tuples for all unique topic ids
+ s_one_pre_res : list of list of (W', W*) tuples for all unique topic ids
"""
- s_one_pre = []
+ s_one_pre_res = []
for top_words in topics:
s_one_pre_t = []
for w_prime_index, w_prime in enumerate(top_words[1:]):
for w_star in top_words[:w_prime_index + 1]:
s_one_pre_t.append((w_prime, w_star))
- s_one_pre.append(s_one_pre_t)
+ s_one_pre_res.append(s_one_pre_t)
- return s_one_pre
+ return s_one_pre_res
def s_one_one(topics):
@@ -55,9 +55,9 @@ def s_one_one(topics):
topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...]
Returns:
- s_one_one : list of list of (W', W*) tuples for all unique topic ids
+ s_one_one_res : list of list of (W', W*) tuples for all unique topic ids
"""
- s_one_one = []
+ s_one_one_res = []
for top_words in topics:
s_one_one_t = []
@@ -67,9 +67,9 @@ def s_one_one(topics):
continue
else:
s_one_one_t.append((w_prime, w_star))
- s_one_one.append(s_one_one_t)
+ s_one_one_res.append(s_one_one_t)
- return s_one_one
+ return s_one_one_res
def s_one_set(topics):
@@ -87,14 +87,14 @@ def s_one_set(topics):
topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...]
Returns:
- s_one_set : list of list of (W', W*) tuples for all unique topic ids.
+ s_one_set_res : list of list of (W', W*) tuples for all unique topic ids.
"""
- s_one_set = []
+ s_one_set_res = []
for top_words in topics:
s_one_set_t = []
for w_prime in top_words:
s_one_set_t.append((w_prime, top_words))
- s_one_set.append(s_one_set_t)
+ s_one_set_res.append(s_one_set_t)
- return s_one_set
+ return s_one_set_res
diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py
index 7305fe9792..3254a34885 100644
--- a/gensim/topic_coherence/text_analysis.py
+++ b/gensim/topic_coherence/text_analysis.py
@@ -64,9 +64,7 @@ def num_docs(self):
def num_docs(self, num):
self._num_docs = num
if self._num_docs % self.log_every == 0:
- logger.info(
- "%s accumulated stats from %d documents",
- self.__class__.__name__, self._num_docs)
+ logger.info("%s accumulated stats from %d documents", self.__class__.__name__, self._num_docs)
def analyze_text(self, text, doc_num=None):
raise NotImplementedError("Base classes should implement analyze_text.")
@@ -369,9 +367,7 @@ def queue_all_texts(self, q, texts, window_size):
before = self._num_docs / self.log_every
self._num_docs += sum(len(doc) - window_size + 1 for doc in batch)
if before < (self._num_docs / self.log_every):
- logger.info(
- "%d batches submitted to accumulate stats from %d documents (%d virtual)",
- (batch_num + 1), (batch_num + 1) * self.batch_size, self._num_docs)
+ logger.info("%d batches submitted to accumulate stats from %d documents (%d virtual)", (batch_num + 1), (batch_num + 1) * self.batch_size, self._num_docs)
def terminate_workers(self, input_q, output_q, workers, interrupted=False):
"""Wait until all workers have transmitted their WordOccurrenceAccumulator instances,
@@ -414,9 +410,7 @@ def merge_accumulators(self, accumulators):
# Workers do partial accumulation, so none of the co-occurrence matrices are symmetrized.
# This is by design, to avoid unnecessary matrix additions/conversions during accumulation.
accumulator._symmetrize()
- logger.info(
- "accumulated word occurrence stats for %d virtual documents",
- accumulator.num_docs)
+ logger.info("accumulated word occurrence stats for %d virtual documents", accumulator.num_docs)
return accumulator
@@ -435,9 +429,7 @@ def run(self):
try:
self._run()
except KeyboardInterrupt:
- logger.info(
- "%s interrupted after processing %d documents",
- self.__class__.__name__, self.accumulator.num_docs)
+ logger.info("%s interrupted after processing %d documents", self.__class__.__name__, self.accumulator.num_docs)
except Exception:
logger.exception("worker encountered unexpected exception")
finally:
@@ -455,13 +447,9 @@ def _run(self):
self.accumulator.partial_accumulate(docs, self.window_size)
n_docs += len(docs)
- logger.debug(
- "completed batch %d; %d documents processed (%d virtual)",
- batch_num, n_docs, self.accumulator.num_docs)
+ logger.debug("completed batch %d; %d documents processed (%d virtual)", batch_num, n_docs, self.accumulator.num_docs)
- logger.debug(
- "finished all batches; %d documents processed (%d virtual)",
- n_docs, self.accumulator.num_docs)
+ logger.debug("finished all batches; %d documents processed (%d virtual)", n_docs, self.accumulator.num_docs)
def reply_to_master(self):
logger.info("serializing accumulator to return to master...")
diff --git a/gensim/utils.py b/gensim/utils.py
index 47d7bc98cd..10555d2b51 100644
--- a/gensim/utils.py
+++ b/gensim/utils.py
@@ -106,12 +106,12 @@ def _synched(func):
@wraps(func)
def _synchronizer(self, *args, **kwargs):
tlock = getattr(self, tlockname)
- logger.debug("acquiring lock %r for %s" % (tlockname, func.__name__))
+ logger.debug("acquiring lock %r for %s", tlockname, func.__name__)
with tlock: # use lock as a context manager to perform safe acquire/release pairs
- logger.debug("acquired lock %r for %s" % (tlockname, func.__name__))
+ logger.debug("acquired lock %r for %s", tlockname, func.__name__)
result = func(self, *args, **kwargs)
- logger.debug("releasing lock %r for %s" % (tlockname, func.__name__))
+ logger.debug("releasing lock %r for %s", tlockname, func.__name__)
return result
return _synchronizer
return _synched
@@ -181,8 +181,7 @@ def copytree_hardlink(source, dest):
shutil.copy2 = copy2
-def tokenize(text, lowercase=False, deacc=False, encoding='utf8', errors="strict", to_lower=False,
- lower=False):
+def tokenize(text, lowercase=False, deacc=False, encoding='utf8', errors="strict", to_lower=False, lower=False):
"""
Iteratively yield tokens as unicode strings, removing accent marks
    and optionally lowercasing the unicode string by assigning True
@@ -275,7 +274,7 @@ def load(cls, fname, mmap=None):
is encountered.
"""
- logger.info("loading %s object from %s" % (cls.__name__, fname))
+ logger.info("loading %s object from %s", cls.__name__, fname)
compress, subname = SaveLoad._adapt_by_suffix(fname)
@@ -292,17 +291,16 @@ def _load_specials(self, fname, mmap, compress, subname):
"""
mmap_error = lambda x, y: IOError(
'Cannot mmap compressed object %s in file %s. ' % (x, y) +
- 'Use `load(fname, mmap=None)` or uncompress files manually.')
+ 'Use `load(fname, mmap=None)` or uncompress files manually.'
+ )
for attrib in getattr(self, '__recursive_saveloads', []):
cfname = '.'.join((fname, attrib))
- logger.info("loading %s recursively from %s.* with mmap=%s" % (
- attrib, cfname, mmap))
+ logger.info("loading %s recursively from %s.* with mmap=%s", attrib, cfname, mmap)
getattr(self, attrib)._load_specials(cfname, mmap, compress, subname)
for attrib in getattr(self, '__numpys', []):
- logger.info("loading %s from %s with mmap=%s" % (
- attrib, subname(fname, attrib), mmap))
+ logger.info("loading %s from %s with mmap=%s", attrib, subname(fname, attrib), mmap)
if compress:
if mmap:
@@ -315,8 +313,7 @@ def _load_specials(self, fname, mmap, compress, subname):
setattr(self, attrib, val)
for attrib in getattr(self, '__scipys', []):
- logger.info("loading %s from %s with mmap=%s" % (
- attrib, subname(fname, attrib), mmap))
+ logger.info("loading %s from %s with mmap=%s", attrib, subname(fname, attrib), mmap)
sparse = unpickle(subname(fname, attrib))
if compress:
if mmap:
@@ -334,7 +331,7 @@ def _load_specials(self, fname, mmap, compress, subname):
setattr(self, attrib, sparse)
for attrib in getattr(self, '__ignoreds', []):
- logger.info("setting ignored attribute %s to None" % (attrib))
+ logger.info("setting ignored attribute %s to None", attrib)
setattr(self, attrib, None)
@staticmethod
@@ -346,10 +343,9 @@ def _adapt_by_suffix(fname):
else:
compress = False
subname = lambda *args: '.'.join(list(args) + ['npy'])
- return (compress, subname)
+ return compress, subname
- def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2,
- ignore=frozenset(), pickle_protocol=2):
+ def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2):
"""
Save the object to file (also see `load`).
@@ -370,9 +366,7 @@ def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2,
in both Python 2 and 3.
"""
- logger.info(
- "saving %s object under %s, separately %s" % (
- self.__class__.__name__, fname, separately))
+ logger.info("saving %s object under %s, separately %s", self.__class__.__name__, fname, separately)
compress, subname = SaveLoad._adapt_by_suffix(fname)
@@ -419,17 +413,14 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol,
if hasattr(val, '_save_specials'): # better than 'isinstance(val, SaveLoad)' if IPython reloading
recursive_saveloads.append(attrib)
cfname = '.'.join((fname, attrib))
- restores.extend(val._save_specials(
- cfname, None, sep_limit, ignore,
- pickle_protocol, compress, subname))
+ restores.extend(val._save_specials(cfname, None, sep_limit, ignore, pickle_protocol, compress, subname))
try:
numpys, scipys, ignoreds = [], [], []
for attrib, val in iteritems(asides):
if isinstance(val, np.ndarray) and attrib not in ignore:
numpys.append(attrib)
- logger.info("storing np array '%s' to %s" % (
- attrib, subname(fname, attrib)))
+ logger.info("storing np array '%s' to %s", attrib, subname(fname, attrib))
if compress:
np.savez_compressed(subname(fname, attrib), val=np.ascontiguousarray(val))
@@ -438,15 +429,15 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol,
elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and attrib not in ignore:
scipys.append(attrib)
- logger.info("storing scipy.sparse array '%s' under %s" % (
- attrib, subname(fname, attrib)))
+ logger.info("storing scipy.sparse array '%s' under %s", attrib, subname(fname, attrib))
if compress:
np.savez_compressed(
subname(fname, attrib, 'sparse'),
data=val.data,
indptr=val.indptr,
- indices=val.indices)
+ indices=val.indices
+ )
else:
np.save(subname(fname, attrib, 'data'), val.data)
np.save(subname(fname, attrib, 'indptr'), val.indptr)
@@ -461,7 +452,7 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol,
finally:
val.data, val.indptr, val.indices = data, indptr, indices
else:
- logger.info("not storing attribute %s" % (attrib))
+ logger.info("not storing attribute %s", attrib)
ignoreds.append(attrib)
self.__dict__['__numpys'] = numpys
@@ -475,8 +466,7 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol,
raise
return restores + [(self, asides)]
- def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2,
- ignore=frozenset(), pickle_protocol=2):
+ def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2):
"""
Save the object to file (also see `load`).
@@ -504,11 +494,10 @@ def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2,
"""
try:
_pickle.dump(self, fname_or_handle, protocol=pickle_protocol)
- logger.info("saved %s object" % self.__class__.__name__)
+ logger.info("saved %s object", self.__class__.__name__)
except TypeError: # `fname_or_handle` does not have write attribute
self._smart_save(fname_or_handle, separately, sep_limit, ignore,
pickle_protocol=pickle_protocol)
-# endclass SaveLoad
def identity(p):
@@ -864,10 +853,8 @@ def run(self):
qsize = self.q.qsize()
except NotImplementedError:
qsize = '?'
- logger.debug("prepared another chunk of %i documents (qsize=%s)" %
- (len(wrapped_chunk[0]), qsize))
+ logger.debug("prepared another chunk of %i documents (qsize=%s)", len(wrapped_chunk[0]), qsize)
self.q.put(wrapped_chunk.pop(), block=True)
-# endclass InputQueue
if os.name == 'nt':
@@ -957,7 +944,7 @@ def revdict(d):
result (which one is kept is arbitrary).
"""
- return dict((v, k) for (k, v) in iteritems(dict(d)))
+ return {v: k for (k, v) in iteritems(dict(d))}
def toptexts(query, texts, index, n=10):
@@ -974,10 +961,7 @@ def toptexts(query, texts, index, n=10):
sims = index[query] # perform a similarity query against the corpus
sims = sorted(enumerate(sims), key=lambda item: -item[1])
- result = []
- for topid, topcosine in sims[:n]: # only consider top-n most similar docs
- result.append((topid, topcosine, texts[topid]))
- return result
+ return [(topid, topcosine, texts[topid]) for topid, topcosine in sims[:n]] # only consider top-n most similar docs
def randfname(prefix='gensim'):
@@ -997,7 +981,7 @@ def upload_chunked(server, docs, chunksize=1000, preprocess=None):
start = 0
for chunk in grouper(docs, chunksize):
end = start + len(chunk)
- logger.info("uploading documents %i-%i" % (start, end - 1))
+ logger.info("uploading documents %i-%i", start, end - 1)
if preprocess is not None:
pchunk = []
for doc in chunk:
@@ -1039,7 +1023,7 @@ def pyro_daemon(name, obj, random_suffix=False, ip=None, port=None, ns_conf=None
uri = daemon.register(obj, name)
ns.remove(name)
ns.register(name, uri)
- logger.info("%s registered with nameserver (URI '%s')" % (name, uri))
+ logger.info("%s registered with nameserver (URI '%s')", name, uri)
daemon.requestLoop()
@@ -1054,8 +1038,7 @@ def has_pattern():
return False
-def lemmatize(
- content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False,
+def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False,
stopwords=frozenset(), min_length=2, max_length=15):
"""
This function is only available when the optional 'pattern' package is installed.
@@ -1109,9 +1092,7 @@ def mock_data_row(dim=1000, prob_nnz=0.5, lam=1.0):
"""
nnz = np.random.uniform(size=(dim,))
- data = [(i, float(np.random.poisson(lam=lam) + 1.0))
- for i in xrange(dim) if nnz[i] < prob_nnz]
- return data
+ return [(i, float(np.random.poisson(lam=lam) + 1.0)) for i in xrange(dim) if nnz[i] < prob_nnz]
def mock_data(n_items=1000, dim=1000, prob_nnz=0.5, lam=1.0):
@@ -1120,9 +1101,7 @@ def mock_data(n_items=1000, dim=1000, prob_nnz=0.5, lam=1.0):
to be used as a mock corpus.
"""
- data = [mock_data_row(dim=dim, prob_nnz=prob_nnz, lam=lam)
- for _ in xrange(n_items)]
- return data
+ return [mock_data_row(dim=dim, prob_nnz=prob_nnz, lam=lam) for _ in xrange(n_items)]
def prune_vocab(vocab, min_reduce, trim_rule=None):
@@ -1138,8 +1117,7 @@ def prune_vocab(vocab, min_reduce, trim_rule=None):
if not keep_vocab_item(w, vocab[w], min_reduce, trim_rule): # vocab[w] <= min_reduce:
result += vocab[w]
del vocab[w]
- logger.info("pruned out %i tokens with count <=%i (before %i, after %i)",
- old_len - len(vocab), min_reduce, old_len, len(vocab))
+ logger.info("pruned out %i tokens with count <=%i (before %i, after %i)", old_len - len(vocab), min_reduce, old_len, len(vocab))
return result
diff --git a/setup.py b/setup.py
index afeff174cd..90fd484c13 100644
--- a/setup.py
+++ b/setup.py
@@ -107,7 +107,7 @@ def finalize_options(self):
cmdclass = {'build_ext': custom_build_ext}
-WHEELHOUSE_UPLOADER_COMMANDS = set(['fetch_artifacts', 'upload_all'])
+WHEELHOUSE_UPLOADER_COMMANDS = {'fetch_artifacts', 'upload_all'}
if WHEELHOUSE_UPLOADER_COMMANDS.intersection(sys.argv):
import wheelhouse_uploader.cmd
cmdclass.update(vars(wheelhouse_uploader.cmd))