From f3cf463c0f0e28c97c9a3b319a58a7e099092041 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Mon, 14 Jan 2019 15:33:59 +0100 Subject: [PATCH] Implement Levenshtein term similarity matrix and fast SCM between corpora (#2016) * Wrap docstring for WordEmbeddingsKeyedVectors.similarity_matrix * Add the gensim.models.levenshtein module * Add projected density to term similarity matrix logs * Add tests for the gensim.models.levenshtein.similarity_matrix function * Separate similarity_matrix methods into director and builder classes. * Add symmetric parameter to SparseTermSimilarityMatrix * Add corpus support to SparseTermSimilarityMatrix.inner_product * Replace scipy.sparse.dok_matrix.has_key with the in operator * Fix handling of unicode in Python 3 in levsim * Remove temporary method similarity of LevenshteinSimilarityIndex * Move models.term_similarity, and levenshtein to similarities * Make python-Levenshtein a conditional import * Add default values to gensim.similarities.levenshtein.levsim arguments * Remove extraneous addition operators from @deprecated annotations * Remove @deprecated annotation from tests * Merge test_term_similarity, and test_levenshtein with test_similarities * Reword TermSimilarityIndex docstring * Consume no more than topn similarities produced by a TermSimilarityIndex * Use short uints (<64b) for dok_matrix keys and num_nonzero array * Write to matrix_nonzero only when building a symmetric matrix * Ensure UniformTermSimilarityIndex does not yield only topn - 1 values * Document _shortest_uint_dtype * Add soft cosine measure benchmark, part 1 * Add soft cosine measure benchmark, part 2 * Make similarity_matrix support non-contiguous dictionaries Closes #2041 * Support fast inner product between a document and a corpus * Support fast inner product between a document and a corpus (python 2.7) * Add faster sparse matrix slicing * Make Soft Cosine Measure support non-contiguous dictionaries * Remove gensim::similarities::levenshtein::similarity_matrix facade * Implement SoftCosineSimilarity using the inner_product method * Fix flake8 warnings * Make Soft Cosine Measure support non-contiguous dictionaries (cont) * Remove parallelization in gensim::similarities::levenshtein * Document future work * Update Soft Cosine Measure benchmark after commits 093d569, and c316b95 * Update SCM tutorial after PR 2016 * Add example to gensim::similarities::termsim::SparseTermSimilarityMatrix * Add max_distance kwarg to gensim::similarities::levenshtein::levsim * Replace max_distance kwarg in levsim with min_similarity, add tests * Remove conditional expression from levsim * Use less confusing wording in docstring for min_similarity / max_distance * Defer thresholding in LevenshteinSimilarityIndex.most_similar to levsim * Allow None value of nonzero_limit parameter in SparseTermSimilarityMatrix * Add positive_definite parameter to SparseTermSimilarityMatrix * Split test_building test into a number of atomic unit tests * Presort dictionary keys in UniformTermSimilarityIndex constructor * Make documentation of SparseTermSimilarityMatrix more accurate * Make SparseTermSimilarityMatrix expect negative similarities * Avoid expensive array copying in dot_product * Update SCM tutorial, and benchmark after PR 2016 * Remove fluff from stderr in the SCM tutorial notebook * Add a paper reference to the SCM tutorial notebook * Directly import Levenshtein package in levdist * Use embedded URI instead of indirect hyperlink target in documentation * Assume that max of lens is
always an integer * Make LevenshteinSimilarityIndex.most_similar easier to read * Make LevenshteinSimilarityIndex.most_similar easier to read * Add an ordering test for LevenshteinSimilarityIndex.most_similar * Make WordEmbeddingSimilarityIndex.most_similar easier to read --- docs/notebooks/soft_cosine_benchmark.ipynb | 4605 ++++++++++++++++++++ docs/notebooks/soft_cosine_tutorial.ipynb | 125 +- gensim/matutils.py | 10 +- gensim/models/__init__.py | 2 +- gensim/models/keyedvectors.py | 139 +- gensim/similarities/__init__.py | 2 + gensim/similarities/docsim.py | 77 +- gensim/similarities/levenshtein.py | 153 + gensim/similarities/termsim.py | 394 ++ gensim/test/test_keyedvectors.py | 99 +- gensim/test/test_similarities.py | 393 +- setup.py | 1 + 12 files changed, 5771 insertions(+), 229 deletions(-) create mode 100644 docs/notebooks/soft_cosine_benchmark.ipynb create mode 100644 gensim/similarities/levenshtein.py create mode 100644 gensim/similarities/termsim.py diff --git a/docs/notebooks/soft_cosine_benchmark.ipynb b/docs/notebooks/soft_cosine_benchmark.ipynb new file mode 100644 index 0000000000..9421b84c17 --- /dev/null +++ b/docs/notebooks/soft_cosine_benchmark.ipynb @@ -0,0 +1,4605 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Benchmark: Implement Levenshtein term similarity matrix and fast SCM between corpora ([RaRe-Technologies/gensim PR #2016][#2016])\n", + "\n", + " [#2016]: https://github.com/RaRe-Technologies/gensim/pull/2016 (Implement Levenshtein term similarity matrix and fast SCM between corpora - Pull Request #2016)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "d429fedf094e00c4bb5c27589d5befb53b2e4b13\r\n" + ] + } + ], + "source": [ + "!git rev-parse HEAD" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from copy import deepcopy\n", + "from datetime import timedelta\n", + "from itertools import product\n", + "import logging\n", + "from math import floor, ceil, log10\n", + "import pickle\n", + "from random import sample, seed, shuffle\n", + "from time import time\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "from tqdm import tqdm_notebook\n", + "\n", + "def tqdm(iterable, total=None, desc=None):\n", + " if total is None:\n", + " total = len(iterable)\n", + " for num_done, element in enumerate(tqdm_notebook(iterable, total=total)):\n", + " logger.info(\"%s: %d / %d\", desc, num_done, total)\n", + " yield element\n", + "\n", + "from gensim.corpora import Dictionary\n", + "import gensim.downloader as api\n", + "from gensim.similarities.index import AnnoyIndexer\n", + "from gensim.similarities import SparseTermSimilarityMatrix\n", + "from gensim.similarities import UniformTermSimilarityIndex\n", + "from gensim.similarities import LevenshteinSimilarityIndex\n", + "from gensim.models import WordEmbeddingSimilarityIndex\n", + "from gensim.utils import simple_preprocess\n", + "\n", + "RANDOM_SEED = 12345\n", + "\n", + "logger = logging.getLogger()\n", + "fhandler = logging.FileHandler(filename='matrix_speed.log', mode='a')\n", + "formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')\n", + "fhandler.setFormatter(formatter)\n", + "logger.addHandler(fhandler)\n", + "logger.setLevel(logging.INFO)\n", + "\n", + "pd.set_option('display.max_rows', None, 'display.max_seq_items', None)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Repeatedly run a benchmark callable given various configurations and\n", + "get a list of results.\n", + "\n", + "Return a list of results of repeatedly running a benchmark callable.\n", + "\n", + "Parameters\n", + "----------\n", + "benchmark : callable tuple -> dict\n", + " A benchmark callable that accepts a configuration and returns results.\n", + "configurations : iterable of tuple\n", + " An iterable of configurations that are used for calling the benchmark function.\n", + "results_filename : str\n", + " A filename of a file that will be used to persistently store the results using\n", + " pickle. If the file exists, then the function will load the stored results\n", + " instead of calling the benchmark callable.\n", + "\n", + "Returns\n", + "-------\n", + "iterable of tuple\n", + " The return values of the individual invocations of the benchmark callable.\n", + "\n", + "\"\"\"\n", + "def benchmark_results(benchmark, configurations, results_filename):\n", + " try:\n", + " with open(results_filename, \"rb\") as file:\n", + " results = pickle.load(file)\n", + " except IOError:\n", + " configurations = list(configurations)\n", + " shuffle(configurations)\n", + " results = list(tqdm(\n", + " (benchmark(configuration) for configuration in configurations),\n", + " total=len(configurations), desc=\"benchmark\"))\n", + " with open(results_filename, \"wb\") as file:\n", + " pickle.dump(results, file)\n", + " return results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Implement Levenshtein term similarity matrix\n", + "\n", + "In Gensim PR [#1827][], we added a base implementation of the soft cosine measure (SCM). The base implementation would create term similarity matrices using a single complex procedure. In the Gensim PR [#2016][], we split the procedure into:\n", + "\n", + "- **TermSimilarityIndex** builder classes that produce the $k$ most similar terms for a given term $t$ that are distinct from $t$ along with the term similarities, and\n", + "- the **SparseTermSimilarityMatrix** director class that constructs term similarity matrices and consumes term similarities produced by **TermSimilarityIndex** instances.\n", + "\n", + "One of the benefits of this separation is that we can easily measure the speed at which a **TermSimilarityIndex** builder class produces term similarities and compare this speed with the speed at which the **SparseTermSimilarityMatrix** director class consumes term similarities. This allows us to see which of the classes are a bottleneck that slows down the construction of term similarity matrices.\n", + "\n", + "In this notebook, we measure all the currently available builder and director classes. For the measurements, we use the [Google News word embeddings][word2vec-google-news-300] distributed with the C implementation of Word2Vec. 
From the word embeddings, we will derive a dictionary of 2.01M terms.\n", + "\n", + " [word2vec-google-news-300]: https://github.com/mmihaltz/word2vec-GoogleNews-vectors (word2vec-GoogleNews-vectors)\n", + " [#1827]: https://github.com/RaRe-Technologies/gensim/pull/1827 (Implement Soft Cosine Measure - Pull Request #1827)\n", + " [#2016]: https://github.com/RaRe-Technologies/gensim/pull/2016 (Implement Levenshtein term similarity matrix and fast SCM between corpora - Pull Request #2016)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "full_model = api.load(\"word2vec-google-news-300\")\n", + "\n", + "try:\n", + " full_dictionary = Dictionary.load(\"matrix_speed.dictionary\")\n", + "except IOError:\n", + " full_dictionary = Dictionary([[term] for term in full_model.vocab.keys()])\n", + " full_dictionary.save(\"matrix_speed.dictionary\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Director class benchmark\n", + "#### SparseTermSimilarityMatrix\n", + "First, we measure the speed at which the **SparseTermSimilarityMatrix** director class consumes term similarities." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "def benchmark(configuration):\n", + " dictionary, nonzero_limit, symmetric, positive_definite, repetition = configuration\n", + " index = UniformTermSimilarityIndex(dictionary)\n", + " \n", + " start_time = time()\n", + " matrix = SparseTermSimilarityMatrix(\n", + " index, dictionary, nonzero_limit=nonzero_limit, symmetric=symmetric,\n", + " positive_definite=positive_definite, dtype=np.float16).matrix\n", + " end_time = time()\n", + " \n", + " duration = end_time - start_time\n", + " return {\n", + " \"dictionary_size\": len(dictionary),\n", + " \"nonzero_limit\": nonzero_limit,\n", + " \"matrix_nonzero\": matrix.nnz,\n", + " \"repetition\": repetition,\n", + " \"symmetric\": symmetric,\n", + " \"positive_definite\": positive_definite,\n", + " \"duration\": duration, }" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4aef903a70e24247ad3c889237ed4c48", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=4), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "dictionary_sizes = [10**k for k in range(3, int(ceil(log10(len(full_dictionary)))))]\n", + "seed(RANDOM_SEED)\n", + "dictionaries = []\n", + "for size in tqdm(dictionary_sizes, desc=\"dictionaries\"):\n", + " dictionary = Dictionary([sample(list(full_dictionary.values()), size)])\n", + " dictionaries.append(dictionary)\n", + "dictionaries.append(full_dictionary)\n", + "nonzero_limits = [1, 10, 100]\n", + "symmetry = (True, False)\n", + "positive_definiteness = (True, False)\n", + "repetitions = range(10)\n", + "\n", + "configurations = product(dictionaries, nonzero_limits, symmetry, positive_definiteness, repetitions)\n", + "results = benchmark_results(benchmark, configurations, \"matrix_speed.director_results\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following tables show how long it takes to construct a term similarity matrix (the **duration** column), how many nonzero elements there are in the matrix 
(the **matrix_nonzero** column) and the mean term similarity consumption speed (the **consumption_speed** column) as we vary the dictionary size (the **dictionary_size** column), the maximum number of nonzero elements outside the diagonal in every column of the matrix (the **nonzero_limit** column), the matrix symmetry constraint (the **symmetric** column), and the matrix positive definiteness constraint (the **positive_definite** column). Ten independent measurements were taken. The top table shows the mean values and the bottom table shows the standard deviations.\n", + "\n", + "We can see that the symmetry and positive definiteness constraints severely limit the number of nonzero elements in the resulting matrix. This in turn increases the consumption speed, since we end up throwing away most of the elements that we consume. The effects of the dictionary size on the mean term similarity consumption speed are minor to none." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(results)\n", + "df[\"consumption_speed\"] = df.dictionary_size * df.nonzero_limit / df.duration\n", + "df = df.groupby([\"dictionary_size\", \"nonzero_limit\", \"symmetric\", \"positive_definite\"])\n", + "\n", + "def display(df):\n", + "    df[\"duration\"] = [timedelta(0, duration) for duration in df[\"duration\"]]\n", + "    df[\"matrix_nonzero\"] = [int(nonzero) for nonzero in df[\"matrix_nonzero\"]]\n", + "    df[\"consumption_speed\"] = [\"%.02f Kword pairs / s\" % (speed / 1000) for speed in df[\"consumption_speed\"]]\n", + "    return df" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
durationmatrix_nonzeroconsumption_speed
dictionary_sizenonzero_limitsymmetricpositive_definite
100001FalseFalse00:00:00.4355332000022.96 Kword pairs / s
True00:00:00.4926062000020.30 Kword pairs / s
TrueFalse00:00:00.1855631000253.90 Kword pairs / s
True00:00:00.2404711000241.59 Kword pairs / s
10FalseFalse00:00:02.68783611000037.21 Kword pairs / s
True00:00:00.61549220000162.49 Kword pairs / s
TrueFalse00:00:00.50118810118199.53 Kword pairs / s
True00:00:01.3805861001072.44 Kword pairs / s
100FalseFalse00:00:25.262807101000039.58 Kword pairs / s
True00:00:01.13252420000883.02 Kword pairs / s
TrueFalse00:00:03.59566620198278.13 Kword pairs / s
True00:00:11.8189121010084.61 Kword pairs / s
20100001FalseFalse00:01:31.786585402000021.90 Kword pairs / s
True00:01:40.954580402000019.91 Kword pairs / s
TrueFalse00:00:39.050064201000251.48 Kword pairs / s
True00:00:49.238437201000240.82 Kword pairs / s
10FalseFalse00:09:35.4703732211000034.93 Kword pairs / s
True00:02:02.9203344020000163.52 Kword pairs / s
TrueFalse00:01:39.5766932010118201.88 Kword pairs / s
True00:04:35.646501201001072.92 Kword pairs / s
100FalseFalse01:42:01.74756820301000032.88 Kword pairs / s
True00:03:36.4207784020000928.75 Kword pairs / s
TrueFalse00:10:58.4340602020198305.30 Kword pairs / s
True00:39:40.319479201010084.44 Kword pairs / s
\n", + "
" + ], + "text/plain": [ + " duration \\\n", + "dictionary_size nonzero_limit symmetric positive_definite \n", + "10000 1 False False 00:00:00.435533 \n", + " True 00:00:00.492606 \n", + " True False 00:00:00.185563 \n", + " True 00:00:00.240471 \n", + " 10 False False 00:00:02.687836 \n", + " True 00:00:00.615492 \n", + " True False 00:00:00.501188 \n", + " True 00:00:01.380586 \n", + " 100 False False 00:00:25.262807 \n", + " True 00:00:01.132524 \n", + " True False 00:00:03.595666 \n", + " True 00:00:11.818912 \n", + "2010000 1 False False 00:01:31.786585 \n", + " True 00:01:40.954580 \n", + " True False 00:00:39.050064 \n", + " True 00:00:49.238437 \n", + " 10 False False 00:09:35.470373 \n", + " True 00:02:02.920334 \n", + " True False 00:01:39.576693 \n", + " True 00:04:35.646501 \n", + " 100 False False 01:42:01.747568 \n", + " True 00:03:36.420778 \n", + " True False 00:10:58.434060 \n", + " True 00:39:40.319479 \n", + "\n", + " matrix_nonzero \\\n", + "dictionary_size nonzero_limit symmetric positive_definite \n", + "10000 1 False False 20000 \n", + " True 20000 \n", + " True False 10002 \n", + " True 10002 \n", + " 10 False False 110000 \n", + " True 20000 \n", + " True False 10118 \n", + " True 10010 \n", + " 100 False False 1010000 \n", + " True 20000 \n", + " True False 20198 \n", + " True 10100 \n", + "2010000 1 False False 4020000 \n", + " True 4020000 \n", + " True False 2010002 \n", + " True 2010002 \n", + " 10 False False 22110000 \n", + " True 4020000 \n", + " True False 2010118 \n", + " True 2010010 \n", + " 100 False False 203010000 \n", + " True 4020000 \n", + " True False 2020198 \n", + " True 2010100 \n", + "\n", + " consumption_speed \n", + "dictionary_size nonzero_limit symmetric positive_definite \n", + "10000 1 False False 22.96 Kword pairs / s \n", + " True 20.30 Kword pairs / s \n", + " True False 53.90 Kword pairs / s \n", + " True 41.59 Kword pairs / s \n", + " 10 False False 37.21 Kword pairs / s \n", + " True 162.49 Kword pairs / s \n", + " True False 199.53 Kword pairs / s \n", + " True 72.44 Kword pairs / s \n", + " 100 False False 39.58 Kword pairs / s \n", + " True 883.02 Kword pairs / s \n", + " True False 278.13 Kword pairs / s \n", + " True 84.61 Kword pairs / s \n", + "2010000 1 False False 21.90 Kword pairs / s \n", + " True 19.91 Kword pairs / s \n", + " True False 51.48 Kword pairs / s \n", + " True 40.82 Kword pairs / s \n", + " 10 False False 34.93 Kword pairs / s \n", + " True 163.52 Kword pairs / s \n", + " True False 201.88 Kword pairs / s \n", + " True 72.92 Kword pairs / s \n", + " 100 False False 32.88 Kword pairs / s \n", + " True 928.75 Kword pairs / s \n", + " True False 305.30 Kword pairs / s \n", + " True 84.44 Kword pairs / s " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.mean()).loc[\n", + " [10000, len(full_dictionary)], :, :].loc[\n", + " :, [\"duration\", \"matrix_nonzero\", \"consumption_speed\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
durationmatrix_nonzeroconsumption_speed
dictionary_sizenonzero_limitsymmetricpositive_definite
100001FalseFalse00:00:00.00533400.28 Kword pairs / s
True00:00:00.00407200.17 Kword pairs / s
TrueFalse00:00:00.00312400.90 Kword pairs / s
True00:00:00.00179700.31 Kword pairs / s
10FalseFalse00:00:00.01198600.17 Kword pairs / s
True00:00:00.00597201.59 Kword pairs / s
TrueFalse00:00:00.00286901.15 Kword pairs / s
True00:00:00.01141100.60 Kword pairs / s
100FalseFalse00:00:00.11111800.17 Kword pairs / s
True00:00:00.00761105.94 Kword pairs / s
TrueFalse00:00:00.03087502.38 Kword pairs / s
True00:00:00.05019800.36 Kword pairs / s
20100001FalseFalse00:00:00.76730500.18 Kword pairs / s
True00:00:00.17243200.03 Kword pairs / s
TrueFalse00:00:00.34623900.46 Kword pairs / s
True00:00:00.17707500.15 Kword pairs / s
10FalseFalse00:00:05.15665500.31 Kword pairs / s
True00:00:00.63167600.83 Kword pairs / s
TrueFalse00:00:01.21606702.41 Kword pairs / s
True00:00:00.54777300.14 Kword pairs / s
100FalseFalse00:04:10.37103501.24 Kword pairs / s
True00:00:00.63441602.73 Kword pairs / s
TrueFalse00:00:06.58676703.05 Kword pairs / s
True00:00:09.03093200.32 Kword pairs / s
\n", + "
" + ], + "text/plain": [ + " duration \\\n", + "dictionary_size nonzero_limit symmetric positive_definite \n", + "10000 1 False False 00:00:00.005334 \n", + " True 00:00:00.004072 \n", + " True False 00:00:00.003124 \n", + " True 00:00:00.001797 \n", + " 10 False False 00:00:00.011986 \n", + " True 00:00:00.005972 \n", + " True False 00:00:00.002869 \n", + " True 00:00:00.011411 \n", + " 100 False False 00:00:00.111118 \n", + " True 00:00:00.007611 \n", + " True False 00:00:00.030875 \n", + " True 00:00:00.050198 \n", + "2010000 1 False False 00:00:00.767305 \n", + " True 00:00:00.172432 \n", + " True False 00:00:00.346239 \n", + " True 00:00:00.177075 \n", + " 10 False False 00:00:05.156655 \n", + " True 00:00:00.631676 \n", + " True False 00:00:01.216067 \n", + " True 00:00:00.547773 \n", + " 100 False False 00:04:10.371035 \n", + " True 00:00:00.634416 \n", + " True False 00:00:06.586767 \n", + " True 00:00:09.030932 \n", + "\n", + " matrix_nonzero \\\n", + "dictionary_size nonzero_limit symmetric positive_definite \n", + "10000 1 False False 0 \n", + " True 0 \n", + " True False 0 \n", + " True 0 \n", + " 10 False False 0 \n", + " True 0 \n", + " True False 0 \n", + " True 0 \n", + " 100 False False 0 \n", + " True 0 \n", + " True False 0 \n", + " True 0 \n", + "2010000 1 False False 0 \n", + " True 0 \n", + " True False 0 \n", + " True 0 \n", + " 10 False False 0 \n", + " True 0 \n", + " True False 0 \n", + " True 0 \n", + " 100 False False 0 \n", + " True 0 \n", + " True False 0 \n", + " True 0 \n", + "\n", + " consumption_speed \n", + "dictionary_size nonzero_limit symmetric positive_definite \n", + "10000 1 False False 0.28 Kword pairs / s \n", + " True 0.17 Kword pairs / s \n", + " True False 0.90 Kword pairs / s \n", + " True 0.31 Kword pairs / s \n", + " 10 False False 0.17 Kword pairs / s \n", + " True 1.59 Kword pairs / s \n", + " True False 1.15 Kword pairs / s \n", + " True 0.60 Kword pairs / s \n", + " 100 False False 0.17 Kword pairs / s \n", + " True 5.94 Kword pairs / s \n", + " True False 2.38 Kword pairs / s \n", + " True 0.36 Kword pairs / s \n", + "2010000 1 False False 0.18 Kword pairs / s \n", + " True 0.03 Kword pairs / s \n", + " True False 0.46 Kword pairs / s \n", + " True 0.15 Kword pairs / s \n", + " 10 False False 0.31 Kword pairs / s \n", + " True 0.83 Kword pairs / s \n", + " True False 2.41 Kword pairs / s \n", + " True 0.14 Kword pairs / s \n", + " 100 False False 1.24 Kword pairs / s \n", + " True 2.73 Kword pairs / s \n", + " True False 3.05 Kword pairs / s \n", + " True 0.32 Kword pairs / s " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.apply(lambda x: (x - x.mean()).std())).loc[\n", + " [10000, len(full_dictionary)], :, :].loc[\n", + " :, [\"duration\", \"matrix_nonzero\", \"consumption_speed\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Builder class benchmark\n", + "#### UniformTermSimilarityIndex\n", + "First, we measure the speed at which the **UniformTermSimilarityIndex** builder class produces term similarities. **UniformTermSimilarityIndex** is a dummy class that just generates a sequence of constants. It produces much more term similarities per second than the **SparseTermSimilarityMatrix** is capable of consuming and its results will serve as an upper limit." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def benchmark(configuration):\n", + "    dictionary, nonzero_limit, repetition = configuration\n", + "    \n", + "    start_time = time()\n", + "    index = UniformTermSimilarityIndex(dictionary)\n", + "    end_time = time()\n", + "    constructor_duration = end_time - start_time\n", + "    \n", + "    start_time = time()\n", + "    for term in dictionary.values():\n", + "        for _j, _k in zip(index.most_similar(term, topn=nonzero_limit), range(nonzero_limit)):\n", + "            pass\n", + "    end_time = time()\n", + "    production_duration = end_time - start_time\n", + "    \n", + "    return {\n", + "        \"dictionary_size\": len(dictionary),\n", + "        \"nonzero_limit\": nonzero_limit,\n", + "        \"repetition\": repetition,\n", + "        \"constructor_duration\": constructor_duration,\n", + "        \"production_duration\": production_duration, }" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "nonzero_limits = [1, 10, 100, 1000]\n", + "\n", + "configurations = product(dictionaries, nonzero_limits, repetitions)\n", + "results = benchmark_results(benchmark, configurations, \"matrix_speed.builder_results.uniform\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following tables show how long it takes to retrieve the most similar terms for all terms in a dictionary (the **production_duration** column) and the mean term similarity production speed (the **production_speed** column) as we vary the dictionary size (the **dictionary_size** column), and the maximum number of most similar terms that will be retrieved (the **nonzero_limit** column). Ten independent measurements were taken. The top table shows the mean values and the bottom table shows the standard deviations.\n", + "\n", + "The **production_speed** is proportional to **nonzero_limit**." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(results)\n", + "df[\"processing_speed\"] = df.dictionary_size ** 2 / df.production_duration\n", + "df[\"production_speed\"] = df.dictionary_size * df.nonzero_limit / df.production_duration\n", + "df = df.groupby([\"dictionary_size\", \"nonzero_limit\"])\n", + "\n", + "def display(df):\n", + "    df[\"constructor_duration\"] = [timedelta(0, duration) for duration in df[\"constructor_duration\"]]\n", + "    df[\"production_duration\"] = [timedelta(0, duration) for duration in df[\"production_duration\"]]\n", + "    df[\"processing_speed\"] = [\"%.02f Kword pairs / s\" % (speed / 1000) for speed in df[\"processing_speed\"]]\n", + "    df[\"production_speed\"] = [\"%.02f Kword pairs / s\" % (speed / 1000) for speed in df[\"production_speed\"]]\n", + "    return df" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
production_durationproduction_speed
dictionary_sizenonzero_limit
1000100:00:00.002973336.41 Kword pairs / s
1000:00:00.0053721861.64 Kword pairs / s
10000:00:00.0267523738.79 Kword pairs / s
100000:00:00.2902653449.16 Kword pairs / s
2010000100:00:06.318446318.12 Kword pairs / s
1000:00:10.7836111863.96 Kword pairs / s
10000:00:53.1086443785.04 Kword pairs / s
100000:09:45.1037413437.36 Kword pairs / s
\n", + "
" + ], + "text/plain": [ + " production_duration production_speed\n", + "dictionary_size nonzero_limit \n", + "1000 1 00:00:00.002973 336.41 Kword pairs / s\n", + " 10 00:00:00.005372 1861.64 Kword pairs / s\n", + " 100 00:00:00.026752 3738.79 Kword pairs / s\n", + " 1000 00:00:00.290265 3449.16 Kword pairs / s\n", + "2010000 1 00:00:06.318446 318.12 Kword pairs / s\n", + " 10 00:00:10.783611 1863.96 Kword pairs / s\n", + " 100 00:00:53.108644 3785.04 Kword pairs / s\n", + " 1000 00:09:45.103741 3437.36 Kword pairs / s" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.mean()).loc[\n", + " [1000, len(full_dictionary)], :, :].loc[\n", + " :, [\"production_duration\", \"production_speed\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
production_durationproduction_speed
dictionary_sizenonzero_limit
1000100:00:00.0000171.93 Kword pairs / s
1000:00:00.00006221.50 Kword pairs / s
10000:00:00.00040856.66 Kword pairs / s
100000:00:00.010500123.82 Kword pairs / s
2010000100:00:00.0234951.18 Kword pairs / s
1000:00:00.0355876.16 Kword pairs / s
10000:00:00.53576537.76 Kword pairs / s
100000:00:15.03781689.56 Kword pairs / s
\n", + "
" + ], + "text/plain": [ + " production_duration production_speed\n", + "dictionary_size nonzero_limit \n", + "1000 1 00:00:00.000017 1.93 Kword pairs / s\n", + " 10 00:00:00.000062 21.50 Kword pairs / s\n", + " 100 00:00:00.000408 56.66 Kword pairs / s\n", + " 1000 00:00:00.010500 123.82 Kword pairs / s\n", + "2010000 1 00:00:00.023495 1.18 Kword pairs / s\n", + " 10 00:00:00.035587 6.16 Kword pairs / s\n", + " 100 00:00:00.535765 37.76 Kword pairs / s\n", + " 1000 00:00:15.037816 89.56 Kword pairs / s" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.apply(lambda x: (x - x.mean()).std())).loc[\n", + " [1000, len(full_dictionary)], :, :].loc[\n", + " :, [\"production_duration\", \"production_speed\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### LevenshteinSimilarityIndex\n", + "Next, we measure the speed at which the **LevenshteinSimilarityIndex** builder class produces term similarities. **LevenshteinSimilarityIndex** is currently just a naïve implementation that produces much fewer term similarities per second than the **SparseTermSimilarityMatrix** class is capable of consuming." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def benchmark(configuration):\n", + " dictionary, nonzero_limit, query_terms, repetition = configuration\n", + " \n", + " start_time = time()\n", + " index = LevenshteinSimilarityIndex(dictionary)\n", + " end_time = time()\n", + " constructor_duration = end_time - start_time\n", + " \n", + " start_time = time()\n", + " for term in query_terms:\n", + " for _j, _k in zip(index.most_similar(term, topn=nonzero_limit), range(nonzero_limit)):\n", + " pass\n", + " end_time = time()\n", + " production_duration = end_time - start_time\n", + " \n", + " return {\n", + " \"dictionary_size\": len(dictionary),\n", + " \"mean_query_term_length\": np.mean([len(term) for term in query_terms]),\n", + " \"nonzero_limit\": nonzero_limit,\n", + " \"repetition\": repetition,\n", + " \"constructor_duration\": constructor_duration,\n", + " \"production_duration\": production_duration, }" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "nonzero_limits = [1, 10, 100]\n", + "seed(RANDOM_SEED)\n", + "min_dictionary = sorted((len(dictionary), dictionary) for dictionary in dictionaries)[0][1]\n", + "query_terms = sample(list(min_dictionary.values()), 10)\n", + "\n", + "configurations = product(dictionaries, nonzero_limits, [query_terms], repetitions)\n", + "results = benchmark_results(benchmark, configurations, \"matrix_speed.builder_results.levenshtein\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following tables show how long it takes to retrieve the most similar terms for ten randomly sampled terms from a dictionary (the **production_duration** column), the mean term similarity production speed (the **production_speed** column) and the mean term similarity processing speed (the **processing_speed** column) as we vary the dictionary size (the **dictionary_size** column), and the maximum number of most similar terms that will be retrieved (the **nonzero_limit** column). Ten independendent measurements were taken. The top table shows the mean values and the bottom table shows the standard deviations.\n", + "\n", + "The **production_speed** is proportional to **nonzero_limit / dictionary_size**. 
The **processing_speed** is constant." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(results)\n", + "df[\"processing_speed\"] = df.dictionary_size * len(query_terms) / df.production_duration\n", + "df[\"production_speed\"] = df.nonzero_limit * len(query_terms) / df.production_duration\n", + "df = df.groupby([\"dictionary_size\", \"nonzero_limit\"])\n", + "\n", + "def display(df):\n", + " df[\"constructor_duration\"] = [timedelta(0, duration) for duration in df[\"constructor_duration\"]]\n", + " df[\"production_duration\"] = [timedelta(0, duration) for duration in df[\"production_duration\"]]\n", + " df[\"processing_speed\"] = [\"%.02f Kword pairs / s\" % (speed / 1000) for speed in df[\"processing_speed\"]]\n", + " df[\"production_speed\"] = [\"%.02f word pairs / s\" % speed for speed in df[\"production_speed\"]]\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
production_durationproduction_speedprocessing_speed
dictionary_sizenonzero_limit
1000100:00:00.055994178.61 word pairs / s178.61 Kword pairs / s
1000:00:00.0560971782.70 word pairs / s178.27 Kword pairs / s
10000:00:00.05621217791.65 word pairs / s177.92 Kword pairs / s
1000000100:01:20.6180700.12 word pairs / s124.05 Kword pairs / s
1000:01:20.0482381.25 word pairs / s124.92 Kword pairs / s
10000:01:20.06499912.49 word pairs / s124.90 Kword pairs / s
2010000100:02:44.0693990.06 word pairs / s122.51 Kword pairs / s
1000:02:43.9146010.61 word pairs / s122.63 Kword pairs / s
10000:02:43.8924086.10 word pairs / s122.64 Kword pairs / s
\n", + "
" + ], + "text/plain": [ + " production_duration production_speed \\\n", + "dictionary_size nonzero_limit \n", + "1000 1 00:00:00.055994 178.61 word pairs / s \n", + " 10 00:00:00.056097 1782.70 word pairs / s \n", + " 100 00:00:00.056212 17791.65 word pairs / s \n", + "1000000 1 00:01:20.618070 0.12 word pairs / s \n", + " 10 00:01:20.048238 1.25 word pairs / s \n", + " 100 00:01:20.064999 12.49 word pairs / s \n", + "2010000 1 00:02:44.069399 0.06 word pairs / s \n", + " 10 00:02:43.914601 0.61 word pairs / s \n", + " 100 00:02:43.892408 6.10 word pairs / s \n", + "\n", + " processing_speed \n", + "dictionary_size nonzero_limit \n", + "1000 1 178.61 Kword pairs / s \n", + " 10 178.27 Kword pairs / s \n", + " 100 177.92 Kword pairs / s \n", + "1000000 1 124.05 Kword pairs / s \n", + " 10 124.92 Kword pairs / s \n", + " 100 124.90 Kword pairs / s \n", + "2010000 1 122.51 Kword pairs / s \n", + " 10 122.63 Kword pairs / s \n", + " 100 122.64 Kword pairs / s " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.mean()).loc[\n", + " [1000, 1000000, len(full_dictionary)], :].loc[\n", + " :, [\"production_duration\", \"production_speed\", \"processing_speed\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
production_durationproduction_speedprocessing_speed
dictionary_sizenonzero_limit
1000100:00:00.0006732.16 word pairs / s2.16 Kword pairs / s
1000:00:00.00040913.06 word pairs / s1.31 Kword pairs / s
10000:00:00.000621196.80 word pairs / s1.97 Kword pairs / s
1000000100:00:00.8106610.00 word pairs / s1.23 Kword pairs / s
1000:00:00.1100130.00 word pairs / s0.17 Kword pairs / s
10000:00:00.1649590.03 word pairs / s0.26 Kword pairs / s
2010000100:00:01.1592730.00 word pairs / s0.85 Kword pairs / s
1000:00:00.4290110.00 word pairs / s0.32 Kword pairs / s
10000:00:00.4336870.02 word pairs / s0.32 Kword pairs / s
\n", + "
" + ], + "text/plain": [ + " production_duration production_speed \\\n", + "dictionary_size nonzero_limit \n", + "1000 1 00:00:00.000673 2.16 word pairs / s \n", + " 10 00:00:00.000409 13.06 word pairs / s \n", + " 100 00:00:00.000621 196.80 word pairs / s \n", + "1000000 1 00:00:00.810661 0.00 word pairs / s \n", + " 10 00:00:00.110013 0.00 word pairs / s \n", + " 100 00:00:00.164959 0.03 word pairs / s \n", + "2010000 1 00:00:01.159273 0.00 word pairs / s \n", + " 10 00:00:00.429011 0.00 word pairs / s \n", + " 100 00:00:00.433687 0.02 word pairs / s \n", + "\n", + " processing_speed \n", + "dictionary_size nonzero_limit \n", + "1000 1 2.16 Kword pairs / s \n", + " 10 1.31 Kword pairs / s \n", + " 100 1.97 Kword pairs / s \n", + "1000000 1 1.23 Kword pairs / s \n", + " 10 0.17 Kword pairs / s \n", + " 100 0.26 Kword pairs / s \n", + "2010000 1 0.85 Kword pairs / s \n", + " 10 0.32 Kword pairs / s \n", + " 100 0.32 Kword pairs / s " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.apply(lambda x: (x - x.mean()).std())).loc[\n", + " [1000, 1000000, len(full_dictionary)], :].loc[\n", + " :, [\"production_duration\", \"production_speed\", \"processing_speed\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### WordEmbeddingSimilarityIndex\n", + "Lastly, we measure the speed at which the **WordEmbeddingSimilarityIndex** builder class constructs an instance and produces term similarities. Gensim currently supports slow and precise nearest neighbor search, and also approximate nearest neighbor search using [ANNOY][]. We evaluate both options.\n", + "\n", + " [ANNOY]: https://github.com/spotify/annoy (Approximate Nearest Neighbors in C++/Python optimized for memory usage and loading/saving to disk)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "def benchmark(configuration):\n", + " (model, dictionary), nonzero_limit, annoy_n_trees, query_terms, repetition = configuration\n", + " use_annoy = annoy_n_trees > 0\n", + " model.init_sims()\n", + " \n", + " start_time = time()\n", + " if use_annoy:\n", + " annoy = AnnoyIndexer(model, annoy_n_trees)\n", + " kwargs = {\"indexer\": annoy}\n", + " else:\n", + " kwargs = {}\n", + " index = WordEmbeddingSimilarityIndex(model, kwargs=kwargs)\n", + " end_time = time()\n", + " constructor_duration = end_time - start_time\n", + " \n", + " start_time = time()\n", + " for term in query_terms:\n", + " for _j, _k in zip(index.most_similar(term, topn=nonzero_limit), range(nonzero_limit)):\n", + " pass\n", + " end_time = time()\n", + " production_duration = end_time - start_time\n", + " \n", + " return {\n", + " \"dictionary_size\": len(dictionary),\n", + " \"mean_query_term_length\": np.mean([len(term) for term in query_terms]),\n", + " \"nonzero_limit\": nonzero_limit,\n", + " \"use_annoy\": use_annoy,\n", + " \"annoy_n_trees\": annoy_n_trees,\n", + " \"repetition\": repetition,\n", + " \"constructor_duration\": constructor_duration,\n", + " \"production_duration\": production_duration, }" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "842bb1a60f814110a8f20eb44a973397", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=5), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "models = []\n", + "for dictionary in tqdm(dictionaries, desc=\"models\"):\n", + " if dictionary == full_dictionary:\n", + " models.append(full_model)\n", + " continue\n", + " model = full_model.__class__(full_model.vector_size)\n", + " model.vocab = {word: deepcopy(full_model.vocab[word]) for word in dictionary.values()}\n", + " model.index2entity = []\n", + " vector_indices = []\n", + " for index, word in enumerate(full_model.index2entity):\n", + " if word in model.vocab.keys():\n", + " model.index2entity.append(word)\n", + " model.vocab[word].index = len(vector_indices)\n", + " vector_indices.append(index)\n", + " model.vectors = full_model.vectors[vector_indices]\n", + " models.append(model)\n", + "annoy_n_trees = [0] + [10**k for k in range(3)]\n", + "seed(RANDOM_SEED)\n", + "query_terms = sample(list(min_dictionary.values()), 1000)\n", + "\n", + "configurations = product(zip(models, dictionaries), nonzero_limits, annoy_n_trees, [query_terms], repetitions)\n", + "results = benchmark_results(benchmark, configurations, \"matrix_speed.builder_results.wordembeddings\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following tables show how long it takes to construct an ANNOY index and the builder class instance (the **constructor_duration** column), how long it takes to retrieve the most similar terms for 1,000 randomly sampled terms from a dictionary (the **production_duration** column), the mean term similarity production speed (the **production_speed** column) and the mean term similarity processing speed (the **processing_speed** column) as we vary the dictionary size (the **dictionary_size** column), the maximum number of most similar terms that will be retrieved (the **nonzero_limit** column), and the number of constructed ANNOY trees (the **annoy_n_trees** column). Ten independendent measurements were taken. The top table shows the mean values and the bottom table shows the standard deviations.\n", + "\n", + "If we do not use ANNOY (**annoy_n_trees**${}=0$), then **production_speed** is proportional to **nonzero_limit / dictionary_size**. \n", + "If we do use ANNOY (**annoy_n_trees**${}>0$), then **production_speed** is proportional to **nonzero_limit / (annoy_n_trees)**${}^{1/2}$." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(results)\n", + "df[\"processing_speed\"] = df.dictionary_size * len(query_terms) / df.production_duration\n", + "df[\"production_speed\"] = df.nonzero_limit * len(query_terms) / df.production_duration\n", + "df = df.groupby([\"dictionary_size\", \"nonzero_limit\", \"annoy_n_trees\"])\n", + "\n", + "def display(df):\n", + " df[\"constructor_duration\"] = [timedelta(0, duration) for duration in df[\"constructor_duration\"]]\n", + " df[\"production_duration\"] = [timedelta(0, duration) for duration in df[\"production_duration\"]]\n", + " df[\"processing_speed\"] = [\"%.02f Kword pairs / s\" % (speed / 1000) for speed in df[\"processing_speed\"]]\n", + " df[\"production_speed\"] = [\"%.02f Kword pairs / s\" % (speed / 1000) for speed in df[\"production_speed\"]]\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
constructor_durationproduction_durationproduction_speedprocessing_speed
dictionary_sizenonzero_limitannoy_n_trees
10000001000:00:00.00000700:00:19.9629770.05 Kword pairs / s50094.22 Kword pairs / s
100:00:30.26879700:00:00.09701110.32 Kword pairs / s10320061.76 Kword pairs / s
10000:06:23.41598200:00:00.1608706.24 Kword pairs / s6236688.27 Kword pairs / s
100000:00:00.00000800:00:22.8683724.37 Kword pairs / s43729.34 Kword pairs / s
100:00:31.15487600:00:00.156238641.91 Kword pairs / s6419086.99 Kword pairs / s
10000:06:23.29057200:00:01.29744577.13 Kword pairs / s771277.71 Kword pairs / s
20100001000:00:00.00000700:01:55.3032160.01 Kword pairs / s17432.79 Kword pairs / s
100:01:34.00419600:00:00.1904635.25 Kword pairs / s10561607.14 Kword pairs / s
10000:23:29.79600600:00:00.3395002.96 Kword pairs / s5954865.50 Kword pairs / s
100000:00:00.00000700:02:11.9268610.76 Kword pairs / s15236.46 Kword pairs / s
100:01:35.81341400:00:00.301120332.38 Kword pairs / s6680879.02 Kword pairs / s
10000:23:05.15539900:00:03.03152733.42 Kword pairs / s671683.05 Kword pairs / s
\n", + "
" + ], + "text/plain": [ + " constructor_duration \\\n", + "dictionary_size nonzero_limit annoy_n_trees \n", + "1000000 1 0 00:00:00.000007 \n", + " 1 00:00:30.268797 \n", + " 100 00:06:23.415982 \n", + " 100 0 00:00:00.000008 \n", + " 1 00:00:31.154876 \n", + " 100 00:06:23.290572 \n", + "2010000 1 0 00:00:00.000007 \n", + " 1 00:01:34.004196 \n", + " 100 00:23:29.796006 \n", + " 100 0 00:00:00.000007 \n", + " 1 00:01:35.813414 \n", + " 100 00:23:05.155399 \n", + "\n", + " production_duration \\\n", + "dictionary_size nonzero_limit annoy_n_trees \n", + "1000000 1 0 00:00:19.962977 \n", + " 1 00:00:00.097011 \n", + " 100 00:00:00.160870 \n", + " 100 0 00:00:22.868372 \n", + " 1 00:00:00.156238 \n", + " 100 00:00:01.297445 \n", + "2010000 1 0 00:01:55.303216 \n", + " 1 00:00:00.190463 \n", + " 100 00:00:00.339500 \n", + " 100 0 00:02:11.926861 \n", + " 1 00:00:00.301120 \n", + " 100 00:00:03.031527 \n", + "\n", + " production_speed \\\n", + "dictionary_size nonzero_limit annoy_n_trees \n", + "1000000 1 0 0.05 Kword pairs / s \n", + " 1 10.32 Kword pairs / s \n", + " 100 6.24 Kword pairs / s \n", + " 100 0 4.37 Kword pairs / s \n", + " 1 641.91 Kword pairs / s \n", + " 100 77.13 Kword pairs / s \n", + "2010000 1 0 0.01 Kword pairs / s \n", + " 1 5.25 Kword pairs / s \n", + " 100 2.96 Kword pairs / s \n", + " 100 0 0.76 Kword pairs / s \n", + " 1 332.38 Kword pairs / s \n", + " 100 33.42 Kword pairs / s \n", + "\n", + " processing_speed \n", + "dictionary_size nonzero_limit annoy_n_trees \n", + "1000000 1 0 50094.22 Kword pairs / s \n", + " 1 10320061.76 Kword pairs / s \n", + " 100 6236688.27 Kword pairs / s \n", + " 100 0 43729.34 Kword pairs / s \n", + " 1 6419086.99 Kword pairs / s \n", + " 100 771277.71 Kword pairs / s \n", + "2010000 1 0 17432.79 Kword pairs / s \n", + " 1 10561607.14 Kword pairs / s \n", + " 100 5954865.50 Kword pairs / s \n", + " 100 0 15236.46 Kword pairs / s \n", + " 1 6680879.02 Kword pairs / s \n", + " 100 671683.05 Kword pairs / s " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.mean()).loc[\n", + " [1000000, len(full_dictionary)], [1, 100], [0, 1, 100]].loc[\n", + " :, [\"constructor_duration\", \"production_duration\", \"production_speed\", \"processing_speed\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
constructor_durationproduction_durationproduction_speedprocessing_speed
dictionary_sizenonzero_limitannoy_n_trees
10000001000:00:00.00000200:00:00.1156440.00 Kword pairs / s286.27 Kword pairs / s
100:00:01.85409700:00:00.0035170.37 Kword pairs / s367959.55 Kword pairs / s
10000:00:04.70203500:00:00.0104440.35 Kword pairs / s350506.05 Kword pairs / s
100000:00:00.00000200:00:00.1048720.02 Kword pairs / s198.86 Kword pairs / s
100:00:01.16367800:00:00.00893936.14 Kword pairs / s361441.71 Kword pairs / s
10000:00:06.81856800:00:00.0369792.07 Kword pairs / s20741.69 Kword pairs / s
20100001000:00:00.00000100:00:00.6531770.00 Kword pairs / s97.50 Kword pairs / s
100:00:04.67720900:00:00.0056790.16 Kword pairs / s311832.91 Kword pairs / s
10000:01:38.56268400:00:00.0298870.22 Kword pairs / s434681.25 Kword pairs / s
100000:00:00.00000100:00:00.9796130.01 Kword pairs / s111.85 Kword pairs / s
100:00:03.20747400:00:00.00947910.18 Kword pairs / s204614.80 Kword pairs / s
10000:00:55.11959500:00:00.4195313.46 Kword pairs / s69543.35 Kword pairs / s
\n", + "
" + ], + "text/plain": [ + " constructor_duration \\\n", + "dictionary_size nonzero_limit annoy_n_trees \n", + "1000000 1 0 00:00:00.000002 \n", + " 1 00:00:01.854097 \n", + " 100 00:00:04.702035 \n", + " 100 0 00:00:00.000002 \n", + " 1 00:00:01.163678 \n", + " 100 00:00:06.818568 \n", + "2010000 1 0 00:00:00.000001 \n", + " 1 00:00:04.677209 \n", + " 100 00:01:38.562684 \n", + " 100 0 00:00:00.000001 \n", + " 1 00:00:03.207474 \n", + " 100 00:00:55.119595 \n", + "\n", + " production_duration \\\n", + "dictionary_size nonzero_limit annoy_n_trees \n", + "1000000 1 0 00:00:00.115644 \n", + " 1 00:00:00.003517 \n", + " 100 00:00:00.010444 \n", + " 100 0 00:00:00.104872 \n", + " 1 00:00:00.008939 \n", + " 100 00:00:00.036979 \n", + "2010000 1 0 00:00:00.653177 \n", + " 1 00:00:00.005679 \n", + " 100 00:00:00.029887 \n", + " 100 0 00:00:00.979613 \n", + " 1 00:00:00.009479 \n", + " 100 00:00:00.419531 \n", + "\n", + " production_speed \\\n", + "dictionary_size nonzero_limit annoy_n_trees \n", + "1000000 1 0 0.00 Kword pairs / s \n", + " 1 0.37 Kword pairs / s \n", + " 100 0.35 Kword pairs / s \n", + " 100 0 0.02 Kword pairs / s \n", + " 1 36.14 Kword pairs / s \n", + " 100 2.07 Kword pairs / s \n", + "2010000 1 0 0.00 Kword pairs / s \n", + " 1 0.16 Kword pairs / s \n", + " 100 0.22 Kword pairs / s \n", + " 100 0 0.01 Kword pairs / s \n", + " 1 10.18 Kword pairs / s \n", + " 100 3.46 Kword pairs / s \n", + "\n", + " processing_speed \n", + "dictionary_size nonzero_limit annoy_n_trees \n", + "1000000 1 0 286.27 Kword pairs / s \n", + " 1 367959.55 Kword pairs / s \n", + " 100 350506.05 Kword pairs / s \n", + " 100 0 198.86 Kword pairs / s \n", + " 1 361441.71 Kword pairs / s \n", + " 100 20741.69 Kword pairs / s \n", + "2010000 1 0 97.50 Kword pairs / s \n", + " 1 311832.91 Kword pairs / s \n", + " 100 434681.25 Kword pairs / s \n", + " 100 0 111.85 Kword pairs / s \n", + " 1 204614.80 Kword pairs / s \n", + " 100 69543.35 Kword pairs / s " + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.apply(lambda x: (x - x.mean()).std())).loc[\n", + " [1000000, len(full_dictionary)], [1, 100], [0, 1, 100]].loc[\n", + " :, [\"constructor_duration\", \"production_duration\", \"production_speed\", \"processing_speed\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Implement fast SCM between corpora\n", + "\n", + "In Gensim PR [#1827][], we added a base implementation of the soft cosine measure (SCM). The base implementation would compute SCM between single documents using the **softcossim** function. In the Gensim PR [#2016][], we intruduced the **SparseTermSimilarityMatrix.inner_product** method, which computes SCM not only between single documents, but also between a document and a corpus, and between two corpora.\n", + "\n", + "For the measurements, we use the [Google News word embeddings][word2vec-google-news-300] distributed with the C implementation of Word2Vec. From the word embeddings, we will derive a dictionary of 2.01m terms. 
As a corpus, we will use a random sample of 100K articles from the 4.92m English [Wikipedia articles][enwiki].\n", + "\n", + " [word2vec-google-news-300]: https://github.com/mmihaltz/word2vec-GoogleNews-vectors (word2vec-GoogleNews-vectors)\n", + " [enwiki]: https://github.com/RaRe-Technologies/gensim-data/releases/tag/wiki-english-20171001 (wiki-english-20171001)\n", + " [#1827]: https://github.com/RaRe-Technologies/gensim/pull/1827 (Implement Soft Cosine Measure - Pull Request #1827)\n", + " [#2016]: https://github.com/RaRe-Technologies/gensim/pull/2016 (Implement Levenshtein term similarity matrix and fast SCM between corpora - Pull Request #2016)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "full_model = api.load(\"word2vec-google-news-300\")\n", + "\n", + "try:\n", + " with open(\"matrix_speed.corpus\", \"rb\") as file:\n", + " full_corpus = pickle.load(file) \n", + "except IOError:\n", + " original_corpus = list(tqdm(api.load(\"wiki-english-20171001\"), desc=\"original_corpus\", total=4924894))\n", + " seed(RANDOM_SEED)\n", + " full_corpus = [\n", + " simple_preprocess(u'\\n'.join(article[\"section_texts\"]))\n", + " for article in tqdm(sample(original_corpus, 10**5), desc=\"full_corpus\", total=10**5)]\n", + " del original_corpus\n", + " with open(\"matrix_speed.corpus\", \"wb\") as file:\n", + " pickle.dump(full_corpus, file)\n", + "\n", + "try:\n", + " full_dictionary = Dictionary.load(\"matrix_speed.dictionary\")\n", + "except IOError:\n", + " full_dictionary = Dictionary([[term] for term in full_model.vocab.keys()])\n", + " full_dictionary.save(\"matrix_speed.dictionary\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SCM between two documents\n", + "First, we measure the speed at which the **inner_product** method produces term similarities between single documents." 
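Before measuring anything, it may help to see the call being timed in isolation. The following is a minimal sketch of computing SCM between two single documents with **inner_product**; it is not part of the benchmark, and it assumes the small `glove-wiki-gigaword-50` vectors rather than the Google News vectors used in the measurements, so that it runs quickly.

```python
# Minimal sketch of the operation being timed: SCM between two documents.
# Assumption: the small glove-wiki-gigaword-50 vectors stand in for the
# Google News vectors used in the actual measurements.
import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.models import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix

model = api.load("glove-wiki-gigaword-50")
documents = [
    "obama speaks to the media in illinois".split(),
    "the president greets the press in chicago".split(),
]
dictionary = Dictionary(documents)
similarity_index = WordEmbeddingSimilarityIndex(model)
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary)

vec1, vec2 = (dictionary.doc2bow(document) for document in documents)
# With normalized=True, the inner product is the soft cosine similarity.
print(similarity_matrix.inner_product(vec1, vec2, normalized=True))
```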
+ ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "def benchmark(configuration):\n", + " (matrix, dictionary, nonzero_limit), corpus, normalized, repetition = configuration\n", + " corpus_size = len(corpus)\n", + " corpus = [dictionary.doc2bow(doc) for doc in corpus]\n", + " corpus = [vec for vec in corpus if len(vec) > 0]\n", + " \n", + " start_time = time()\n", + " for vec1 in corpus:\n", + " for vec2 in corpus:\n", + " matrix.inner_product(vec1, vec2, normalized=normalized)\n", + " end_time = time()\n", + " duration = end_time - start_time\n", + " \n", + " return {\n", + " \"dictionary_size\": matrix.matrix.shape[0],\n", + " \"matrix_nonzero\": matrix.matrix.nnz,\n", + " \"nonzero_limit\": nonzero_limit,\n", + " \"normalized\": normalized,\n", + " \"corpus_size\": corpus_size,\n", + " \"corpus_actual_size\": len(corpus),\n", + " \"corpus_nonzero\": sum(len(vec) for vec in corpus),\n", + " \"mean_document_length\": np.mean([len(doc) for doc in corpus]),\n", + " \"repetition\": repetition,\n", + " \"duration\": duration, }" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "110675d5552847819754f0dc5b1c19e1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=2), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "744e400d597440f79b5923dafb1974fc", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=2), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0f84efc0c79a4628a9543736fc5f0c9a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=2), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8a185a8e530e4481b90056222f5f0a1c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=6), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/mnt/storage/home/novotny/.virtualenvs/gensim/lib/python3.4/site-packages/gensim/matutils.py:738: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. 
In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
+       "  if np.issubdtype(vec.dtype, np.int):\n"
+      ]
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "\n"
+      ]
+     }
+    ],
+    "source": [
+     "seed(RANDOM_SEED)\n",
+     "dictionary_sizes = [1000, 100000]\n",
+     "dictionaries = []\n",
+     "for size in tqdm(dictionary_sizes, desc=\"dictionaries\"):\n",
+     "    dictionary = Dictionary([sample(list(full_dictionary.values()), size)])\n",
+     "    dictionaries.append(dictionary)\n",
+     "min_dictionary = sorted((len(dictionary), dictionary) for dictionary in dictionaries)[0][1]\n",
+     "\n",
+     "corpus_sizes = [100, 1000]\n",
+     "corpora = []\n",
+     "for size in tqdm(corpus_sizes, desc=\"corpora\"):\n",
+     "    corpus = sample(full_corpus, size)\n",
+     "    corpora.append(corpus)\n",
+     "\n",
+     "models = []\n",
+     "for dictionary in tqdm(dictionaries, desc=\"models\"):\n",
+     "    if dictionary == full_dictionary:\n",
+     "        models.append(full_model)\n",
+     "        continue\n",
+     "    model = full_model.__class__(full_model.vector_size)\n",
+     "    model.vocab = {word: deepcopy(full_model.vocab[word]) for word in dictionary.values()}\n",
+     "    model.index2entity = []\n",
+     "    vector_indices = []\n",
+     "    for index, word in enumerate(full_model.index2entity):\n",
+     "        if word in model.vocab.keys():\n",
+     "            model.index2entity.append(word)\n",
+     "            model.vocab[word].index = len(vector_indices)\n",
+     "            vector_indices.append(index)\n",
+     "    model.vectors = full_model.vectors[vector_indices]\n",
+     "    models.append(model)\n",
+     "\n",
+     "nonzero_limits = [1, 10, 100]\n",
+     "matrices = []\n",
+     "for (model, dictionary), nonzero_limit in tqdm(\n",
+     "        list(product(zip(models, dictionaries), nonzero_limits)), desc=\"matrices\"):\n",
+     "    annoy = AnnoyIndexer(model, 1)\n",
+     "    index = WordEmbeddingSimilarityIndex(model, kwargs={\"indexer\": annoy})\n",
+     "    matrix = SparseTermSimilarityMatrix(index, dictionary, nonzero_limit=nonzero_limit)\n",
+     "    matrices.append((matrix, dictionary, nonzero_limit))\n",
+     "    del annoy\n",
+     "\n",
+     "normalization = (True, False)\n",
+     "repetitions = range(10)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 28,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "configurations = product(matrices, corpora, normalization, repetitions)\n",
+     "results = benchmark_results(benchmark, configurations, \"matrix_speed.inner-product_results.doc_doc\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "The following tables show how long it takes the **inner_product** method to process all pairs of document vectors in a corpus (the **duration** column), how many nonzero elements there are in a corpus matrix (the **corpus_nonzero** column), how many nonzero elements there are in a term similarity matrix (the **matrix_nonzero** column), and the mean document similarity production speed (the **speed** column) as we vary the dictionary size (the **dictionary_size** column), the size of the corpus (the **corpus_size** column), the maximum number of nonzero elements in a single column of the matrix (the **nonzero_limit** column), and whether the inner products are normalized (the **normalized** column). Ten independent measurements were taken. The top table shows the mean values and the bottom table shows the standard deviations.\n",
+     "\n",
+     "The time to compute a single inner product is proportional to the square of the number of unique terms shared by the two document vectors. In our scenario, as in the standard IR scenario, documents share only a few terms, so the **speed** stays roughly constant. 
Computing a normalized inner product (**normalized**${}={}$True) results in a constant speed decrease." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(results)\n", + "df[\"speed\"] = df.corpus_actual_size**2 / df.duration\n", + "del df[\"corpus_actual_size\"]\n", + "df = df.groupby([\"dictionary_size\", \"corpus_size\", \"nonzero_limit\", \"normalized\"])\n", + "\n", + "def display(df):\n", + " df[\"duration\"] = [timedelta(0, duration) for duration in df[\"duration\"]]\n", + " df[\"speed\"] = [\"%.02f Kdoc pairs / s\" % (speed / 1000) for speed in df[\"speed\"]]\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
durationcorpus_nonzeromatrix_nonzerospeed
dictionary_sizecorpus_sizenonzero_limitnormalized
10001001False00:00:00.0073833.01000.01.23 Kdoc pairs / s
True00:00:00.0090283.01000.01.01 Kdoc pairs / s
100False00:00:00.0076573.084944.01.19 Kdoc pairs / s
True00:00:00.0082383.084944.01.10 Kdoc pairs / s
10001False00:00:00.41436426.01000.01.39 Kdoc pairs / s
True00:00:00.47378926.01000.01.22 Kdoc pairs / s
100False00:00:00.43083326.084944.01.35 Kdoc pairs / s
True00:00:00.45347726.084944.01.27 Kdoc pairs / s
1000001001False00:00:05.236376423.0101868.01.29 Kdoc pairs / s
True00:00:05.623463423.0101868.01.20 Kdoc pairs / s
100False00:00:05.083829423.08202884.01.33 Kdoc pairs / s
True00:00:05.576003423.08202884.01.21 Kdoc pairs / s
10001False00:08:59.2853475162.0101868.01.26 Kdoc pairs / s
True00:09:57.6932195162.0101868.01.14 Kdoc pairs / s
100False00:09:23.2134505162.08202884.01.21 Kdoc pairs / s
True00:10:10.6124585162.08202884.01.12 Kdoc pairs / s
\n", + "
" + ], + "text/plain": [ + " duration \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 00:00:00.007383 \n", + " True 00:00:00.009028 \n", + " 100 False 00:00:00.007657 \n", + " True 00:00:00.008238 \n", + " 1000 1 False 00:00:00.414364 \n", + " True 00:00:00.473789 \n", + " 100 False 00:00:00.430833 \n", + " True 00:00:00.453477 \n", + "100000 100 1 False 00:00:05.236376 \n", + " True 00:00:05.623463 \n", + " 100 False 00:00:05.083829 \n", + " True 00:00:05.576003 \n", + " 1000 1 False 00:08:59.285347 \n", + " True 00:09:57.693219 \n", + " 100 False 00:09:23.213450 \n", + " True 00:10:10.612458 \n", + "\n", + " corpus_nonzero \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 3.0 \n", + " True 3.0 \n", + " 100 False 3.0 \n", + " True 3.0 \n", + " 1000 1 False 26.0 \n", + " True 26.0 \n", + " 100 False 26.0 \n", + " True 26.0 \n", + "100000 100 1 False 423.0 \n", + " True 423.0 \n", + " 100 False 423.0 \n", + " True 423.0 \n", + " 1000 1 False 5162.0 \n", + " True 5162.0 \n", + " 100 False 5162.0 \n", + " True 5162.0 \n", + "\n", + " matrix_nonzero \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 1000.0 \n", + " True 1000.0 \n", + " 100 False 84944.0 \n", + " True 84944.0 \n", + " 1000 1 False 1000.0 \n", + " True 1000.0 \n", + " 100 False 84944.0 \n", + " True 84944.0 \n", + "100000 100 1 False 101868.0 \n", + " True 101868.0 \n", + " 100 False 8202884.0 \n", + " True 8202884.0 \n", + " 1000 1 False 101868.0 \n", + " True 101868.0 \n", + " 100 False 8202884.0 \n", + " True 8202884.0 \n", + "\n", + " speed \n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 1.23 Kdoc pairs / s \n", + " True 1.01 Kdoc pairs / s \n", + " 100 False 1.19 Kdoc pairs / s \n", + " True 1.10 Kdoc pairs / s \n", + " 1000 1 False 1.39 Kdoc pairs / s \n", + " True 1.22 Kdoc pairs / s \n", + " 100 False 1.35 Kdoc pairs / s \n", + " True 1.27 Kdoc pairs / s \n", + "100000 100 1 False 1.29 Kdoc pairs / s \n", + " True 1.20 Kdoc pairs / s \n", + " 100 False 1.33 Kdoc pairs / s \n", + " True 1.21 Kdoc pairs / s \n", + " 1000 1 False 1.26 Kdoc pairs / s \n", + " True 1.14 Kdoc pairs / s \n", + " 100 False 1.21 Kdoc pairs / s \n", + " True 1.12 Kdoc pairs / s " + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.mean()).loc[\n", + " [1000, 100000], :, [1, 100], :].loc[\n", + " :, [\"duration\", \"corpus_nonzero\", \"matrix_nonzero\", \"speed\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
durationcorpus_nonzeromatrix_nonzerospeed
dictionary_sizecorpus_sizenonzero_limitnormalized
10001001False00:00:00.0008710.00.00.13 Kdoc pairs / s
True00:00:00.0013150.00.00.14 Kdoc pairs / s
100False00:00:00.0008930.00.00.12 Kdoc pairs / s
True00:00:00.0006310.00.00.08 Kdoc pairs / s
10001False00:00:00.0144600.00.00.05 Kdoc pairs / s
True00:00:00.0252500.00.00.07 Kdoc pairs / s
100False00:00:00.0390880.00.00.11 Kdoc pairs / s
True00:00:00.0236020.00.00.06 Kdoc pairs / s
1000001001False00:00:00.2763590.00.00.07 Kdoc pairs / s
True00:00:00.2788060.00.00.06 Kdoc pairs / s
100False00:00:00.2867810.00.00.07 Kdoc pairs / s
True00:00:00.3133970.00.00.06 Kdoc pairs / s
10001False00:00:14.3211010.00.00.03 Kdoc pairs / s
True00:00:23.5261040.00.00.05 Kdoc pairs / s
100False00:00:05.8995270.00.00.01 Kdoc pairs / s
True00:00:24.4544220.00.00.05 Kdoc pairs / s
\n", + "
" + ], + "text/plain": [ + " duration \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 00:00:00.000871 \n", + " True 00:00:00.001315 \n", + " 100 False 00:00:00.000893 \n", + " True 00:00:00.000631 \n", + " 1000 1 False 00:00:00.014460 \n", + " True 00:00:00.025250 \n", + " 100 False 00:00:00.039088 \n", + " True 00:00:00.023602 \n", + "100000 100 1 False 00:00:00.276359 \n", + " True 00:00:00.278806 \n", + " 100 False 00:00:00.286781 \n", + " True 00:00:00.313397 \n", + " 1000 1 False 00:00:14.321101 \n", + " True 00:00:23.526104 \n", + " 100 False 00:00:05.899527 \n", + " True 00:00:24.454422 \n", + "\n", + " corpus_nonzero \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 1000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + "100000 100 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 1000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + "\n", + " matrix_nonzero \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 1000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + "100000 100 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 1000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + "\n", + " speed \n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 0.13 Kdoc pairs / s \n", + " True 0.14 Kdoc pairs / s \n", + " 100 False 0.12 Kdoc pairs / s \n", + " True 0.08 Kdoc pairs / s \n", + " 1000 1 False 0.05 Kdoc pairs / s \n", + " True 0.07 Kdoc pairs / s \n", + " 100 False 0.11 Kdoc pairs / s \n", + " True 0.06 Kdoc pairs / s \n", + "100000 100 1 False 0.07 Kdoc pairs / s \n", + " True 0.06 Kdoc pairs / s \n", + " 100 False 0.07 Kdoc pairs / s \n", + " True 0.06 Kdoc pairs / s \n", + " 1000 1 False 0.03 Kdoc pairs / s \n", + " True 0.05 Kdoc pairs / s \n", + " 100 False 0.01 Kdoc pairs / s \n", + " True 0.05 Kdoc pairs / s " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.apply(lambda x: (x - x.mean()).std())).loc[\n", + " [1000, 100000], :, [1, 100], :].loc[\n", + " :, [\"duration\", \"corpus_nonzero\", \"matrix_nonzero\", \"speed\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SCM between a document and a corpus\n", + "Next, we measure the speed at which the **inner_product** method produces term similarities between documents and a corpus." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "def benchmark(configuration):\n", + " (matrix, dictionary, nonzero_limit), corpus, normalized, repetition = configuration\n", + " corpus_size = len(corpus)\n", + " corpus = [dictionary.doc2bow(doc) for doc in corpus if doc]\n", + " \n", + " start_time = time()\n", + " for vec in corpus:\n", + " matrix.inner_product(vec, corpus, normalized=normalized)\n", + " end_time = time()\n", + " duration = end_time - start_time\n", + " \n", + " return {\n", + " \"dictionary_size\": matrix.matrix.shape[0],\n", + " \"matrix_nonzero\": matrix.matrix.nnz,\n", + " \"nonzero_limit\": nonzero_limit,\n", + " \"normalized\": normalized,\n", + " \"corpus_size\": corpus_size,\n", + " \"corpus_actual_size\": len(corpus),\n", + " \"corpus_nonzero\": sum(len(vec) for vec in corpus),\n", + " \"mean_document_length\": np.mean([len(doc) for doc in corpus]),\n", + " \"repetition\": repetition,\n", + " \"duration\": duration, }" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "configurations = product(matrices, corpora, normalization, repetitions)\n", + "results = benchmark_results(benchmark, configurations, \"matrix_speed.inner-product_results.doc_corpus\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The **speed** is inversely proportional to **matrix_nonzero**. Computing a normalized inner product (**normalized**${}={}$True) results in a constant speed decrease." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(results)\n", + "df[\"speed\"] = df.corpus_actual_size**2 / df.duration\n", + "del df[\"corpus_actual_size\"]\n", + "df = df.groupby([\"dictionary_size\", \"corpus_size\", \"nonzero_limit\", \"normalized\"])\n", + "\n", + "def display(df):\n", + " df[\"duration\"] = [timedelta(0, duration) for duration in df[\"duration\"]]\n", + " df[\"speed\"] = [\"%.02f Kdoc pairs / s\" % (speed / 1000) for speed in df[\"speed\"]]\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
durationcorpus_nonzeromatrix_nonzerospeed
dictionary_sizecorpus_sizenonzero_limitnormalized
10001001False00:00:00.0093633.01000.01117.12 Kdoc pairs / s
True00:00:00.0109483.01000.0954.13 Kdoc pairs / s
100False00:00:00.0141283.084944.0728.91 Kdoc pairs / s
True00:00:00.0181643.084944.0551.78 Kdoc pairs / s
10001False00:00:00.07209126.01000.013872.12 Kdoc pairs / s
True00:00:00.07928426.01000.012615.36 Kdoc pairs / s
100False00:00:00.16248326.084944.06188.43 Kdoc pairs / s
True00:00:00.20308126.084944.04924.48 Kdoc pairs / s
1000001001False00:00:00.278253423.0101868.036.05 Kdoc pairs / s
True00:00:00.298519423.0101868.033.56 Kdoc pairs / s
100False00:00:36.326167423.08202884.00.28 Kdoc pairs / s
True00:00:36.928802423.08202884.00.27 Kdoc pairs / s
10001False00:00:07.4033015162.0101868.0135.08 Kdoc pairs / s
True00:00:07.7949435162.0101868.0128.29 Kdoc pairs / s
100False00:05:55.6747125162.08202884.02.81 Kdoc pairs / s
True00:06:05.5613985162.08202884.02.74 Kdoc pairs / s
\n", + "
" + ], + "text/plain": [ + " duration \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 00:00:00.009363 \n", + " True 00:00:00.010948 \n", + " 100 False 00:00:00.014128 \n", + " True 00:00:00.018164 \n", + " 1000 1 False 00:00:00.072091 \n", + " True 00:00:00.079284 \n", + " 100 False 00:00:00.162483 \n", + " True 00:00:00.203081 \n", + "100000 100 1 False 00:00:00.278253 \n", + " True 00:00:00.298519 \n", + " 100 False 00:00:36.326167 \n", + " True 00:00:36.928802 \n", + " 1000 1 False 00:00:07.403301 \n", + " True 00:00:07.794943 \n", + " 100 False 00:05:55.674712 \n", + " True 00:06:05.561398 \n", + "\n", + " corpus_nonzero \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 3.0 \n", + " True 3.0 \n", + " 100 False 3.0 \n", + " True 3.0 \n", + " 1000 1 False 26.0 \n", + " True 26.0 \n", + " 100 False 26.0 \n", + " True 26.0 \n", + "100000 100 1 False 423.0 \n", + " True 423.0 \n", + " 100 False 423.0 \n", + " True 423.0 \n", + " 1000 1 False 5162.0 \n", + " True 5162.0 \n", + " 100 False 5162.0 \n", + " True 5162.0 \n", + "\n", + " matrix_nonzero \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 1000.0 \n", + " True 1000.0 \n", + " 100 False 84944.0 \n", + " True 84944.0 \n", + " 1000 1 False 1000.0 \n", + " True 1000.0 \n", + " 100 False 84944.0 \n", + " True 84944.0 \n", + "100000 100 1 False 101868.0 \n", + " True 101868.0 \n", + " 100 False 8202884.0 \n", + " True 8202884.0 \n", + " 1000 1 False 101868.0 \n", + " True 101868.0 \n", + " 100 False 8202884.0 \n", + " True 8202884.0 \n", + "\n", + " speed \n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 1117.12 Kdoc pairs / s \n", + " True 954.13 Kdoc pairs / s \n", + " 100 False 728.91 Kdoc pairs / s \n", + " True 551.78 Kdoc pairs / s \n", + " 1000 1 False 13872.12 Kdoc pairs / s \n", + " True 12615.36 Kdoc pairs / s \n", + " 100 False 6188.43 Kdoc pairs / s \n", + " True 4924.48 Kdoc pairs / s \n", + "100000 100 1 False 36.05 Kdoc pairs / s \n", + " True 33.56 Kdoc pairs / s \n", + " 100 False 0.28 Kdoc pairs / s \n", + " True 0.27 Kdoc pairs / s \n", + " 1000 1 False 135.08 Kdoc pairs / s \n", + " True 128.29 Kdoc pairs / s \n", + " 100 False 2.81 Kdoc pairs / s \n", + " True 2.74 Kdoc pairs / s " + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.mean()).loc[\n", + " [1000, 100000], :, [1, 100], :].loc[\n", + " :, [\"duration\", \"corpus_nonzero\", \"matrix_nonzero\", \"speed\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
durationcorpus_nonzeromatrix_nonzerospeed
dictionary_sizecorpus_sizenonzero_limitnormalized
10001001False00:00:00.0021200.00.0242.09 Kdoc pairs / s
True00:00:00.0023870.00.0207.64 Kdoc pairs / s
100False00:00:00.0025310.00.0130.94 Kdoc pairs / s
True00:00:00.0009110.00.027.68 Kdoc pairs / s
10001False00:00:00.0005870.00.0112.92 Kdoc pairs / s
True00:00:00.0011910.00.0187.31 Kdoc pairs / s
100False00:00:00.0119440.00.0513.79 Kdoc pairs / s
True00:00:00.0017930.00.043.54 Kdoc pairs / s
1000001001False00:00:00.0161560.00.02.06 Kdoc pairs / s
True00:00:00.0134510.00.01.47 Kdoc pairs / s
100False00:00:01.3397870.00.00.01 Kdoc pairs / s
True00:00:01.6173400.00.00.01 Kdoc pairs / s
10001False00:00:00.0389610.00.00.71 Kdoc pairs / s
True00:00:00.0241540.00.00.40 Kdoc pairs / s
100False00:00:07.6048050.00.00.06 Kdoc pairs / s
True00:00:14.7995190.00.00.10 Kdoc pairs / s
\n", + "
" + ], + "text/plain": [ + " duration \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 00:00:00.002120 \n", + " True 00:00:00.002387 \n", + " 100 False 00:00:00.002531 \n", + " True 00:00:00.000911 \n", + " 1000 1 False 00:00:00.000587 \n", + " True 00:00:00.001191 \n", + " 100 False 00:00:00.011944 \n", + " True 00:00:00.001793 \n", + "100000 100 1 False 00:00:00.016156 \n", + " True 00:00:00.013451 \n", + " 100 False 00:00:01.339787 \n", + " True 00:00:01.617340 \n", + " 1000 1 False 00:00:00.038961 \n", + " True 00:00:00.024154 \n", + " 100 False 00:00:07.604805 \n", + " True 00:00:14.799519 \n", + "\n", + " corpus_nonzero \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 1000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + "100000 100 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 1000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + "\n", + " matrix_nonzero \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 1000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + "100000 100 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 1000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + "\n", + " speed \n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 242.09 Kdoc pairs / s \n", + " True 207.64 Kdoc pairs / s \n", + " 100 False 130.94 Kdoc pairs / s \n", + " True 27.68 Kdoc pairs / s \n", + " 1000 1 False 112.92 Kdoc pairs / s \n", + " True 187.31 Kdoc pairs / s \n", + " 100 False 513.79 Kdoc pairs / s \n", + " True 43.54 Kdoc pairs / s \n", + "100000 100 1 False 2.06 Kdoc pairs / s \n", + " True 1.47 Kdoc pairs / s \n", + " 100 False 0.01 Kdoc pairs / s \n", + " True 0.01 Kdoc pairs / s \n", + " 1000 1 False 0.71 Kdoc pairs / s \n", + " True 0.40 Kdoc pairs / s \n", + " 100 False 0.06 Kdoc pairs / s \n", + " True 0.10 Kdoc pairs / s " + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.apply(lambda x: (x - x.mean()).std())).loc[\n", + " [1000, 100000], :, [1, 100], :].loc[\n", + " :, [\"duration\", \"corpus_nonzero\", \"matrix_nonzero\", \"speed\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SCM between two corpora\n", + "Lastly, we measure the speed at which the **inner_product** method produces term similarities between entire corpora." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def benchmark(configuration):\n",
+    "    (matrix, dictionary, nonzero_limit), corpus, normalized, repetition = configuration\n",
+    "    corpus_size = len(corpus)\n",
+    "    corpus = [dictionary.doc2bow(doc) for doc in corpus]\n",
+    "    corpus = [vec for vec in corpus if len(vec) > 0]\n",
+    "    \n",
+    "    start_time = time()\n",
+    "    matrix.inner_product(corpus, corpus, normalized=normalized)\n",
+    "    end_time = time()\n",
+    "    duration = end_time - start_time\n",
+    "    \n",
+    "    return {\n",
+    "        \"dictionary_size\": matrix.matrix.shape[0],\n",
+    "        \"matrix_nonzero\": matrix.matrix.nnz,\n",
+    "        \"nonzero_limit\": nonzero_limit,\n",
+    "        \"normalized\": normalized,\n",
+    "        \"corpus_size\": corpus_size,\n",
+    "        \"corpus_actual_size\": len(corpus),\n",
+    "        \"corpus_nonzero\": sum(len(vec) for vec in corpus),\n",
+    "        \"mean_document_length\": np.mean([len(doc) for doc in corpus]),\n",
+    "        \"repetition\": repetition,\n",
+    "        \"duration\": duration, }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "84e1344be5d944fa98368e6b3994944a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(IntProgress(value=0, max=2), HTML(value='')))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/mnt/storage/home/novotny/.virtualenvs/gensim/lib/python3.4/site-packages/gensim/matutils.py:738: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
+      "  if np.issubdtype(vec.dtype, np.int):\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "nonzero_limits = [1000]\n",
+    "dense_matrices = []\n",
+    "for (model, dictionary), nonzero_limit in tqdm(\n",
+    "        list(product(zip(models, dictionaries), nonzero_limits)), desc=\"matrices\"):\n",
+    "    annoy = AnnoyIndexer(model, 1)\n",
+    "    index = WordEmbeddingSimilarityIndex(model, kwargs={\"indexer\": annoy})\n",
+    "    matrix = SparseTermSimilarityMatrix(index, dictionary, nonzero_limit=nonzero_limit)\n",
+    "    dense_matrices.append((matrix, dictionary, nonzero_limit))\n",
+    "    del annoy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "configurations = product(matrices + dense_matrices, corpora + [full_corpus], normalization, repetitions)\n",
+    "results = benchmark_results(benchmark, configurations, \"matrix_speed.inner-product_results.corpus_corpus\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.DataFrame(results)\n",
+    "df[\"speed\"] = df.corpus_actual_size**2 / df.duration\n",
+    "del df[\"corpus_actual_size\"]\n",
+    "df = df.groupby([\"dictionary_size\", \"corpus_size\", \"nonzero_limit\", \"normalized\"])\n",
+    "\n",
+    "def display(df):\n",
+    "    df[\"duration\"] = [timedelta(0, duration) for duration in df[\"duration\"]]\n",
+    "    df[\"speed\"] = [\"%.02f Kdoc pairs / s\" % (speed / 1000) for speed in df[\"speed\"]]\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
durationcorpus_nonzeromatrix_nonzerospeed
dictionary_sizecorpus_sizenonzero_limitnormalized
10001001False00:00:00.0014033.01000.06.69 Kdoc pairs / s
True00:00:00.0053133.01000.01.70 Kdoc pairs / s
10False00:00:00.0015653.08634.05.80 Kdoc pairs / s
True00:00:00.0053073.08634.01.70 Kdoc pairs / s
100False00:00:00.0031723.084944.03.05 Kdoc pairs / s
True00:00:00.0084613.084944.01.07 Kdoc pairs / s
1000False00:00:00.0213773.0838588.00.42 Kdoc pairs / s
True00:00:00.0552343.0838588.00.16 Kdoc pairs / s
10001False00:00:00.00137626.01000.0418.61 Kdoc pairs / s
True00:00:00.00501926.01000.0114.78 Kdoc pairs / s
10False00:00:00.00151126.08634.0381.50 Kdoc pairs / s
True00:00:00.00520826.08634.0110.60 Kdoc pairs / s
100False00:00:00.00353926.084944.0164.03 Kdoc pairs / s
True00:00:00.00850226.084944.067.81 Kdoc pairs / s
1000False00:00:00.02154826.0838588.026.73 Kdoc pairs / s
True00:00:00.05442526.0838588.010.59 Kdoc pairs / s
1000001False00:00:00.0199152914.01000.0391443.20 Kdoc pairs / s
True00:00:00.0261182914.01000.0298377.75 Kdoc pairs / s
10False00:00:00.0201522914.08634.0386722.55 Kdoc pairs / s
True00:00:00.0269982914.08634.0288567.14 Kdoc pairs / s
100False00:00:00.0283452914.084944.0274905.36 Kdoc pairs / s
True00:00:00.0410692914.084944.0189709.57 Kdoc pairs / s
1000False00:00:00.0899782914.0838588.086598.15 Kdoc pairs / s
True00:00:00.1856112914.0838588.041971.58 Kdoc pairs / s
1000001001False00:00:00.003345423.0101868.02013.92 Kdoc pairs / s
True00:00:00.008857423.0101868.0760.13 Kdoc pairs / s
10False00:00:00.032639423.0814154.0206.66 Kdoc pairs / s
True00:00:00.080591423.0814154.083.46 Kdoc pairs / s
100False00:00:00.488467423.08202884.013.77 Kdoc pairs / s
True00:00:01.454507423.08202884.04.62 Kdoc pairs / s
1000False00:00:04.973667423.089912542.01.35 Kdoc pairs / s
True00:00:15.035711423.089912542.00.45 Kdoc pairs / s
10001False00:00:00.0101415162.0101868.067139.73 Kdoc pairs / s
True00:00:00.0166855162.0101868.040798.02 Kdoc pairs / s
10False00:00:00.0413925162.0814154.016444.18 Kdoc pairs / s
True00:00:00.0916865162.0814154.07425.08 Kdoc pairs / s
100False00:00:00.5089165162.08202884.01338.94 Kdoc pairs / s
True00:00:01.4975565162.08202884.0454.49 Kdoc pairs / s
1000False00:00:05.1014895162.089912542.0133.44 Kdoc pairs / s
True00:00:15.3254155162.089912542.044.42 Kdoc pairs / s
1000001False00:00:37.145526525310.0101868.0192578.80 Kdoc pairs / s
True00:00:45.729004525310.0101868.0156431.36 Kdoc pairs / s
10False00:00:44.981806525310.0814154.0159029.88 Kdoc pairs / s
True00:00:54.245450525310.0814154.0131871.88 Kdoc pairs / s
100False00:01:15.925860525310.08202884.094216.21 Kdoc pairs / s
True00:01:29.232076525310.08202884.080177.08 Kdoc pairs / s
1000False00:03:17.140191525310.089912542.036286.25 Kdoc pairs / s
True00:04:05.865666525310.089912542.029097.14 Kdoc pairs / s
\n", + "
" + ], + "text/plain": [ + " duration \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 00:00:00.001403 \n", + " True 00:00:00.005313 \n", + " 10 False 00:00:00.001565 \n", + " True 00:00:00.005307 \n", + " 100 False 00:00:00.003172 \n", + " True 00:00:00.008461 \n", + " 1000 False 00:00:00.021377 \n", + " True 00:00:00.055234 \n", + " 1000 1 False 00:00:00.001376 \n", + " True 00:00:00.005019 \n", + " 10 False 00:00:00.001511 \n", + " True 00:00:00.005208 \n", + " 100 False 00:00:00.003539 \n", + " True 00:00:00.008502 \n", + " 1000 False 00:00:00.021548 \n", + " True 00:00:00.054425 \n", + " 100000 1 False 00:00:00.019915 \n", + " True 00:00:00.026118 \n", + " 10 False 00:00:00.020152 \n", + " True 00:00:00.026998 \n", + " 100 False 00:00:00.028345 \n", + " True 00:00:00.041069 \n", + " 1000 False 00:00:00.089978 \n", + " True 00:00:00.185611 \n", + "100000 100 1 False 00:00:00.003345 \n", + " True 00:00:00.008857 \n", + " 10 False 00:00:00.032639 \n", + " True 00:00:00.080591 \n", + " 100 False 00:00:00.488467 \n", + " True 00:00:01.454507 \n", + " 1000 False 00:00:04.973667 \n", + " True 00:00:15.035711 \n", + " 1000 1 False 00:00:00.010141 \n", + " True 00:00:00.016685 \n", + " 10 False 00:00:00.041392 \n", + " True 00:00:00.091686 \n", + " 100 False 00:00:00.508916 \n", + " True 00:00:01.497556 \n", + " 1000 False 00:00:05.101489 \n", + " True 00:00:15.325415 \n", + " 100000 1 False 00:00:37.145526 \n", + " True 00:00:45.729004 \n", + " 10 False 00:00:44.981806 \n", + " True 00:00:54.245450 \n", + " 100 False 00:01:15.925860 \n", + " True 00:01:29.232076 \n", + " 1000 False 00:03:17.140191 \n", + " True 00:04:05.865666 \n", + "\n", + " corpus_nonzero \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 3.0 \n", + " True 3.0 \n", + " 10 False 3.0 \n", + " True 3.0 \n", + " 100 False 3.0 \n", + " True 3.0 \n", + " 1000 False 3.0 \n", + " True 3.0 \n", + " 1000 1 False 26.0 \n", + " True 26.0 \n", + " 10 False 26.0 \n", + " True 26.0 \n", + " 100 False 26.0 \n", + " True 26.0 \n", + " 1000 False 26.0 \n", + " True 26.0 \n", + " 100000 1 False 2914.0 \n", + " True 2914.0 \n", + " 10 False 2914.0 \n", + " True 2914.0 \n", + " 100 False 2914.0 \n", + " True 2914.0 \n", + " 1000 False 2914.0 \n", + " True 2914.0 \n", + "100000 100 1 False 423.0 \n", + " True 423.0 \n", + " 10 False 423.0 \n", + " True 423.0 \n", + " 100 False 423.0 \n", + " True 423.0 \n", + " 1000 False 423.0 \n", + " True 423.0 \n", + " 1000 1 False 5162.0 \n", + " True 5162.0 \n", + " 10 False 5162.0 \n", + " True 5162.0 \n", + " 100 False 5162.0 \n", + " True 5162.0 \n", + " 1000 False 5162.0 \n", + " True 5162.0 \n", + " 100000 1 False 525310.0 \n", + " True 525310.0 \n", + " 10 False 525310.0 \n", + " True 525310.0 \n", + " 100 False 525310.0 \n", + " True 525310.0 \n", + " 1000 False 525310.0 \n", + " True 525310.0 \n", + "\n", + " matrix_nonzero \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 1000.0 \n", + " True 1000.0 \n", + " 10 False 8634.0 \n", + " True 8634.0 \n", + " 100 False 84944.0 \n", + " True 84944.0 \n", + " 1000 False 838588.0 \n", + " True 838588.0 \n", + " 1000 1 False 1000.0 \n", + " True 1000.0 \n", + " 10 False 8634.0 \n", + " True 8634.0 \n", + " 100 False 84944.0 \n", + " True 84944.0 \n", + " 1000 False 838588.0 \n", + " True 838588.0 \n", + " 100000 1 False 1000.0 \n", + " True 1000.0 \n", + " 10 False 8634.0 \n", + " True 8634.0 \n", + " 100 False 84944.0 \n", + " True 84944.0 
\n", + " 1000 False 838588.0 \n", + " True 838588.0 \n", + "100000 100 1 False 101868.0 \n", + " True 101868.0 \n", + " 10 False 814154.0 \n", + " True 814154.0 \n", + " 100 False 8202884.0 \n", + " True 8202884.0 \n", + " 1000 False 89912542.0 \n", + " True 89912542.0 \n", + " 1000 1 False 101868.0 \n", + " True 101868.0 \n", + " 10 False 814154.0 \n", + " True 814154.0 \n", + " 100 False 8202884.0 \n", + " True 8202884.0 \n", + " 1000 False 89912542.0 \n", + " True 89912542.0 \n", + " 100000 1 False 101868.0 \n", + " True 101868.0 \n", + " 10 False 814154.0 \n", + " True 814154.0 \n", + " 100 False 8202884.0 \n", + " True 8202884.0 \n", + " 1000 False 89912542.0 \n", + " True 89912542.0 \n", + "\n", + " speed \n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 6.69 Kdoc pairs / s \n", + " True 1.70 Kdoc pairs / s \n", + " 10 False 5.80 Kdoc pairs / s \n", + " True 1.70 Kdoc pairs / s \n", + " 100 False 3.05 Kdoc pairs / s \n", + " True 1.07 Kdoc pairs / s \n", + " 1000 False 0.42 Kdoc pairs / s \n", + " True 0.16 Kdoc pairs / s \n", + " 1000 1 False 418.61 Kdoc pairs / s \n", + " True 114.78 Kdoc pairs / s \n", + " 10 False 381.50 Kdoc pairs / s \n", + " True 110.60 Kdoc pairs / s \n", + " 100 False 164.03 Kdoc pairs / s \n", + " True 67.81 Kdoc pairs / s \n", + " 1000 False 26.73 Kdoc pairs / s \n", + " True 10.59 Kdoc pairs / s \n", + " 100000 1 False 391443.20 Kdoc pairs / s \n", + " True 298377.75 Kdoc pairs / s \n", + " 10 False 386722.55 Kdoc pairs / s \n", + " True 288567.14 Kdoc pairs / s \n", + " 100 False 274905.36 Kdoc pairs / s \n", + " True 189709.57 Kdoc pairs / s \n", + " 1000 False 86598.15 Kdoc pairs / s \n", + " True 41971.58 Kdoc pairs / s \n", + "100000 100 1 False 2013.92 Kdoc pairs / s \n", + " True 760.13 Kdoc pairs / s \n", + " 10 False 206.66 Kdoc pairs / s \n", + " True 83.46 Kdoc pairs / s \n", + " 100 False 13.77 Kdoc pairs / s \n", + " True 4.62 Kdoc pairs / s \n", + " 1000 False 1.35 Kdoc pairs / s \n", + " True 0.45 Kdoc pairs / s \n", + " 1000 1 False 67139.73 Kdoc pairs / s \n", + " True 40798.02 Kdoc pairs / s \n", + " 10 False 16444.18 Kdoc pairs / s \n", + " True 7425.08 Kdoc pairs / s \n", + " 100 False 1338.94 Kdoc pairs / s \n", + " True 454.49 Kdoc pairs / s \n", + " 1000 False 133.44 Kdoc pairs / s \n", + " True 44.42 Kdoc pairs / s \n", + " 100000 1 False 192578.80 Kdoc pairs / s \n", + " True 156431.36 Kdoc pairs / s \n", + " 10 False 159029.88 Kdoc pairs / s \n", + " True 131871.88 Kdoc pairs / s \n", + " 100 False 94216.21 Kdoc pairs / s \n", + " True 80177.08 Kdoc pairs / s \n", + " 1000 False 36286.25 Kdoc pairs / s \n", + " True 29097.14 Kdoc pairs / s " + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.mean()).loc[\n", + " [1000, 100000], :, [1, 10, 100, 1000], :].loc[\n", + " :, [\"duration\", \"corpus_nonzero\", \"matrix_nonzero\", \"speed\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
durationcorpus_nonzeromatrix_nonzerospeed
dictionary_sizecorpus_sizenonzero_limitnormalized
10001001False00:00:00.0002920.00.01.48 Kdoc pairs / s
True00:00:00.0002250.00.00.08 Kdoc pairs / s
100False00:00:00.0007470.00.01.02 Kdoc pairs / s
True00:00:00.0004880.00.00.07 Kdoc pairs / s
10001False00:00:00.0000270.00.08.10 Kdoc pairs / s
True00:00:00.0000690.00.01.56 Kdoc pairs / s
100False00:00:00.0003090.00.016.26 Kdoc pairs / s
True00:00:00.0002680.00.02.24 Kdoc pairs / s
1000001False00:00:00.0005760.00.011256.03 Kdoc pairs / s
True00:00:00.0005740.00.06512.19 Kdoc pairs / s
100False00:00:00.0005620.00.05233.50 Kdoc pairs / s
True00:00:00.0006090.00.02743.63 Kdoc pairs / s
1000001001False00:00:00.0001520.00.098.97 Kdoc pairs / s
True00:00:00.0003220.00.028.10 Kdoc pairs / s
100False00:00:00.0049970.00.00.14 Kdoc pairs / s
True00:00:00.0222060.00.00.07 Kdoc pairs / s
10001False00:00:00.0002100.00.01420.00 Kdoc pairs / s
True00:00:00.0001920.00.0467.23 Kdoc pairs / s
100False00:00:00.0190220.00.045.91 Kdoc pairs / s
True00:00:00.0044310.00.01.35 Kdoc pairs / s
1000001False00:00:00.0244660.00.0126.77 Kdoc pairs / s
True00:00:00.0624470.00.0213.64 Kdoc pairs / s
100False00:00:00.0876920.00.0108.55 Kdoc pairs / s
True00:00:01.0658890.00.0968.80 Kdoc pairs / s
\n", + "
" + ], + "text/plain": [ + " duration \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 00:00:00.000292 \n", + " True 00:00:00.000225 \n", + " 100 False 00:00:00.000747 \n", + " True 00:00:00.000488 \n", + " 1000 1 False 00:00:00.000027 \n", + " True 00:00:00.000069 \n", + " 100 False 00:00:00.000309 \n", + " True 00:00:00.000268 \n", + " 100000 1 False 00:00:00.000576 \n", + " True 00:00:00.000574 \n", + " 100 False 00:00:00.000562 \n", + " True 00:00:00.000609 \n", + "100000 100 1 False 00:00:00.000152 \n", + " True 00:00:00.000322 \n", + " 100 False 00:00:00.004997 \n", + " True 00:00:00.022206 \n", + " 1000 1 False 00:00:00.000210 \n", + " True 00:00:00.000192 \n", + " 100 False 00:00:00.019022 \n", + " True 00:00:00.004431 \n", + " 100000 1 False 00:00:00.024466 \n", + " True 00:00:00.062447 \n", + " 100 False 00:00:00.087692 \n", + " True 00:00:01.065889 \n", + "\n", + " corpus_nonzero \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 1000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 100000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + "100000 100 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 1000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 100000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + "\n", + " matrix_nonzero \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 1000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 100000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + "100000 100 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 1000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 100000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + "\n", + " speed \n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 1.48 Kdoc pairs / s \n", + " True 0.08 Kdoc pairs / s \n", + " 100 False 1.02 Kdoc pairs / s \n", + " True 0.07 Kdoc pairs / s \n", + " 1000 1 False 8.10 Kdoc pairs / s \n", + " True 1.56 Kdoc pairs / s \n", + " 100 False 16.26 Kdoc pairs / s \n", + " True 2.24 Kdoc pairs / s \n", + " 100000 1 False 11256.03 Kdoc pairs / s \n", + " True 6512.19 Kdoc pairs / s \n", + " 100 False 5233.50 Kdoc pairs / s \n", + " True 2743.63 Kdoc pairs / s \n", + "100000 100 1 False 98.97 Kdoc pairs / s \n", + " True 28.10 Kdoc pairs / s \n", + " 100 False 0.14 Kdoc pairs / s \n", + " True 0.07 Kdoc pairs / s \n", + " 1000 1 False 1420.00 Kdoc pairs / s \n", + " True 467.23 Kdoc pairs / s \n", + " 100 False 45.91 Kdoc pairs / s \n", + " True 1.35 Kdoc pairs / s \n", + " 100000 1 False 126.77 Kdoc pairs / s \n", + " True 213.64 Kdoc pairs / s \n", + " 100 False 108.55 Kdoc pairs / s \n", + " True 968.80 Kdoc pairs / s " + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.apply(lambda x: (x - x.mean()).std())).loc[\n", + " [1000, 100000], :, [1, 100], :].loc[\n", + " :, [\"duration\", \"corpus_nonzero\", \"matrix_nonzero\", \"speed\"]]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": 
"python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/notebooks/soft_cosine_tutorial.ipynb b/docs/notebooks/soft_cosine_tutorial.ipynb index e5d11dcd3f..957899c089 100644 --- a/docs/notebooks/soft_cosine_tutorial.ipynb +++ b/docs/notebooks/soft_cosine_tutorial.ipynb @@ -6,7 +6,7 @@ "source": [ "# Finding similar documents with Word2Vec and Soft Cosine Measure \n", "\n", - "Soft Cosine Measure (SCM) is a promising new tool in machine learning that allows us to submit a query and return the most relevant documents. In **part 1**, we will show how you can compute SCM between two documents using `softcossim`. In **part 2**, we will use `SoftCosineSimilarity` to retrieve documents most similar to a query and compare the performance against other similarity measures.\n", + "Soft Cosine Measure (SCM) [1, 4] is a promising new tool in machine learning that allows us to submit a query and return the most relevant documents. In **part 1**, we will show how you can compute SCM between two documents using the `inner_product` method. In **part 2**, we will use `SoftCosineSimilarity` to retrieve documents most similar to a query and compare the performance against other similarity measures.\n", "\n", "First, however, we go through the basics of what Soft Cosine Measure is.\n", "\n", @@ -22,7 +22,7 @@ "\n", "This method was perhaps first introduced in the article “Soft Measure and Soft Cosine Measure: Measure of Features in Vector Space Model” by Grigori Sidorov, Alexander Gelbukh, Helena Gomez-Adorno, and David Pinto ([link to PDF](http://www.scielo.org.mx/pdf/cys/v18n3/v18n3a7.pdf)).\n", "\n", - "In this tutorial, we will learn how to use Gensim's SCM functionality, which consists of the `softcossim` function for one-off computation, and the `SoftCosineSimilarity` class for corpus-based similarity queries.\n", + "In this tutorial, we will learn how to use Gensim's SCM functionality, which consists of the `inner_product` method for one-off computation, and the `SoftCosineSimilarity` class for corpus-based similarity queries.\n", "\n", "> **Note**:\n", ">\n", @@ -67,7 +67,7 @@ "source": [ "sentence_obama = 'Obama speaks to the media in Illinois'.lower().split()\n", "sentence_president = 'The president greets the press in Chicago'.lower().split()\n", - "sentence_orange = 'Oranges are my favorite fruit'.lower().split()" + "sentence_orange = 'Having a tough time finding an orange juice press machine?'.lower().split()" ] }, { @@ -84,19 +84,13 @@ "scrolled": true }, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package stopwords to /home/witiko/nltk_data...\n", - "[nltk_data] Package stopwords is already up-to-date!\n" - ] - }, { "name": "stderr", "output_type": "stream", "text": [ - "2018-02-05 10:47:42,975 : INFO : built Dictionary(11 unique tokens: ['president', 'fruit', 'greets', 'obama', 'illinois']...) 
from 3 documents (total 11 corpus positions)\n" + "2018-09-11 22:02:01,041 : INFO : 'pattern' package not found; tag filters are not available for English\n", + "2018-09-11 22:02:01,044 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2018-09-11 22:02:01,045 : INFO : built Dictionary(14 unique tokens: ['speaks', 'illinois', 'greets', 'juice', 'chicago']...) from 3 documents (total 15 corpus positions)\n" ] } ], @@ -116,7 +110,6 @@ "from gensim import corpora\n", "documents = [sentence_obama, sentence_president, sentence_orange]\n", "dictionary = corpora.Dictionary(documents)\n", - "corpus = [dictionary.doc2bow(document) for document in documents]\n", "\n", "# Convert the sentences into bag-of-words vectors.\n", "sentence_obama = dictionary.doc2bow(sentence_obama)\n", @@ -128,7 +121,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, as we mentioned earlier, we will be using some downloaded pre-trained embeddings. Note that the embeddings we have chosen here require a lot of memory. We will use the embeddings to construct a term similarity matrix that will be used by the `softcossim` function." + "Now, as we mentioned earlier, we will be using some downloaded pre-trained embeddings. Note that the embeddings we have chosen here require a lot of memory. We will use the embeddings to construct a term similarity matrix that will be used by the `inner_product` method." ] }, { @@ -140,31 +133,38 @@ "name": "stderr", "output_type": "stream", "text": [ - "2018-02-06 16:14:29,104 : INFO : constructed a term similarity matrix with 91.735537 % nonzero elements\n" + "2018-09-11 22:02:01,236 : INFO : loading projection weights from /home/novotny/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz\n", + "2018-09-11 22:02:26,984 : INFO : loaded (400000, 50) matrix from /home/novotny/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz\n", + "2018-09-11 22:02:26,985 : INFO : constructing a sparse term similarity matrix using \n", + "2018-09-11 22:02:26,986 : INFO : iterating over columns in dictionary order\n", + "2018-09-11 22:02:27,273 : INFO : constructed a sparse term similarity matrix with 11.224490% density\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 21.2 s, sys: 224 ms, total: 21.4 s\n", - "Wall time: 21.8 s\n" + "CPU times: user 27.8 s, sys: 2.43 s, total: 30.3 s\n", + "Wall time: 26.2 s\n" ] } ], "source": [ "%%time\n", "import gensim.downloader as api\n", + "from gensim.models import WordEmbeddingSimilarityIndex\n", + "from gensim.similarities import SparseTermSimilarityMatrix\n", "\n", "w2v_model = api.load(\"glove-wiki-gigaword-50\")\n", - "similarity_matrix = w2v_model.similarity_matrix(dictionary)" + "similarity_index = WordEmbeddingSimilarityIndex(w2v_model)\n", + "similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "So let's compute SCM using the `softcossim` function." + "Let's compute SCM using the `inner_product` method." 
] }, { @@ -176,14 +176,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "similarity = 0.5789\n" + "similarity = 0.3790\n" ] } ], "source": [ - "from gensim.matutils import softcossim\n", - "\n", - "similarity = softcossim(sentence_obama, sentence_president, similarity_matrix)\n", + "similarity = similarity_matrix.inner_product(sentence_obama, sentence_president, normalized=True)\n", "print('similarity = %.4f' % similarity)" ] }, @@ -203,12 +201,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "similarity = 0.1439\n" + "similarity = 0.1108\n" ] } ], "source": [ - "similarity = softcossim(sentence_obama, sentence_orange, similarity_matrix)\n", + "similarity = similarity_matrix.inner_product(sentence_obama, sentence_orange, normalized=True)\n", "print('similarity = %.4f' % similarity)" ] }, @@ -217,7 +215,7 @@ "metadata": {}, "source": [ "## Part 2: Similarity queries using `SoftCosineSimilarity`\n", - "You can use SCM to get the most similar documents to a query, using the SoftCosineSimilarity class. Its interface is similar to what is described in the [Similarity Queries](https://radimrehurek.com/gensim/tut3.html) Gensim tutorial.\n", + "You can use SCM to get the most similar documents to a query, using the `SoftCosineSimilarity` class. Its interface is similar to what is described in the [Similarity Queries](https://radimrehurek.com/gensim/tut3.html) Gensim tutorial.\n", "\n", "### Qatar Living unannotated dataset\n", "Contestants solving the community question answering task in the [SemEval 2016][semeval16] and [2017][semeval17] competitions had an unannotated dataset of 189,941 questions and 1,894,456 comments from the [Qatar Living][ql] discussion forums. As our first step, we will use the same dataset to build a corpus.\n", @@ -236,11 +234,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "[nltk_data] Downloading package stopwords to /home/witiko/nltk_data...\n", + "[nltk_data] Downloading package stopwords to\n", + "[nltk_data] /home/novotny/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "Number of documents: 3\n", - "CPU times: user 1min 59s, sys: 6.06 s, total: 2min 5s\n", - "Wall time: 2min 22s\n" + "CPU times: user 2min 37s, sys: 1.62 s, total: 2min 39s\n", + "Wall time: 2min 39s\n" ] } ], @@ -291,41 +290,60 @@ "cell_type": "code", "execution_count": 8, "metadata": { - "scrolled": true + "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2018-02-05 10:52:53,477 : INFO : built Dictionary(462807 unique tokens: ['reclarify', 'depeneded', 'autralia', 'cloudnight', 'openmoko']...) from 2274338 documents (total 40096354 corpus positions)\n", - "2018-02-05 10:56:50,633 : INFO : training on a 200481770 raw words (192577574 effective words) took 224.3s, 858402 effective words/s\n", - "2018-02-05 11:13:14,895 : INFO : constructed a term similarity matrix with 0.003564 % nonzero elements\n" + "2018-09-11 22:06:07,973 : INFO : built Dictionary(462807 unique tokens: ['pples', 'adib', 'strangers', 'kolayaalee', 'softpoint']...) 
from 2274338 documents (total 40096354 corpus positions)\n", + "2018-09-11 22:06:09,432 : INFO : collecting all words and their counts\n", + "2018-09-11 22:06:17,564 : INFO : collected 462807 word types from a corpus of 40096354 raw words and 2274338 sentences\n", + "2018-09-11 22:06:17,565 : INFO : Loading a fresh vocabulary\n", + "2018-09-11 22:06:18,002 : INFO : effective_min_count=5 retains 104360 unique words (22% of original 462807, drops 358447)\n", + "2018-09-11 22:06:18,003 : INFO : effective_min_count=5 leaves 39565168 word corpus (98% of original 40096354, drops 531186)\n", + "2018-09-11 22:06:18,454 : INFO : deleting the raw counts dictionary of 462807 items\n", + "2018-09-11 22:06:18,474 : INFO : sample=0.001 downsamples 22 most-common words\n", + "2018-09-11 22:06:18,475 : INFO : downsampling leaves estimated 38552993 word corpus (97.4% of prior 39565168)\n", + "2018-09-11 22:06:18,907 : INFO : estimated required memory for 104360 words and 300 dimensions: 302644000 bytes\n", + "2018-09-11 22:06:18,908 : INFO : resetting layer weights\n", + "2018-09-11 22:06:21,082 : INFO : training model with 32 workers on 104360 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5\n", + "2018-09-11 22:06:53,894 : INFO : EPOCH - 1 : training on 40096354 raw words (38515351 effective words) took 32.8s, 1174692 effective words/s\n", + "2018-09-11 22:07:27,121 : INFO : EPOCH - 2 : training on 40096354 raw words (38515107 effective words) took 33.2s, 1159858 effective words/s\n", + "2018-09-11 22:08:00,122 : INFO : EPOCH - 3 : training on 40096354 raw words (38514587 effective words) took 33.0s, 1167509 effective words/s\n", + "2018-09-11 22:08:32,976 : INFO : EPOCH - 4 : training on 40096354 raw words (38515500 effective words) took 32.8s, 1172993 effective words/s\n", + "2018-09-11 22:09:06,211 : INFO : EPOCH - 5 : training on 40096354 raw words (38515593 effective words) took 33.2s, 1159566 effective words/s\n", + "2018-09-11 22:09:06,212 : INFO : training on a 200481770 raw words (192576138 effective words) took 165.1s, 1166216 effective words/s\n", + "2018-09-11 22:09:06,637 : INFO : constructing a sparse term similarity matrix using \n", + "2018-09-11 22:09:06,657 : INFO : iterating over columns in tf-idf order\n", + "2018-09-11 22:25:34,416 : INFO : constructed a sparse term similarity matrix with 0.003654% density\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Number of unique words: 462807\n", - "CPU times: user 1h 2min 21s, sys: 12min 56s, total: 1h 15min 17s\n", - "Wall time: 21min 27s\n" + "CPU times: user 4h 38min 32s, sys: 4h 24min 33s, total: 9h 3min 5s\n", + "Wall time: 20min 43s\n" ] } ], "source": [ "%%time\n", + "from multiprocessing import cpu_count\n", + "\n", "from gensim.corpora import Dictionary\n", "from gensim.models import TfidfModel\n", "from gensim.models import Word2Vec\n", - "from multiprocessing import cpu_count\n", + "from gensim.models import WordEmbeddingSimilarityIndex\n", + "from gensim.similarities import SparseTermSimilarityMatrix\n", "\n", "dictionary = Dictionary(corpus)\n", "tfidf = TfidfModel(dictionary=dictionary)\n", "w2v_model = Word2Vec(corpus, workers=cpu_count(), min_count=5, size=300, seed=12345)\n", - "similarity_matrix = w2v_model.wv.similarity_matrix(dictionary, tfidf, nonzero_limit=100)\n", - "\n", - "print(\"Number of unique words: %d\" % len(dictionary))" + "similarity_index = WordEmbeddingSimilarityIndex(w2v_model.wv)\n", + "similarity_matrix = SparseTermSimilarityMatrix(similarity_index, 
dictionary, tfidf, nonzero_limit=100)" ] }, { @@ -423,7 +441,7 @@ " if dict_index in document] for document in documents]\n", " embeddings = np.array([w2v_model.wv[word] for word in words], dtype=np.float32)\n", " nbow = dict(((index, list(chain([None], zip(*document)))) for index, document in enumerate(documents)))\n", - " nbow[\"query\"] = (None, *zip(*query))\n", + " nbow[\"query\"] = tuple([None] + list(zip(*query)))\n", " distances = WMD(embeddings, nbow, vocabulary_min=1).nearest_neighbors(\"query\")\n", " similarities = [-distance for _, distance in sorted(distances)]\n", " return similarities\n", @@ -471,8 +489,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.49 s, sys: 1.28 s, total: 2.77 s\n", - "Wall time: 1min 42s\n" + "CPU times: user 2.14 s, sys: 5.08 s, total: 7.22 s\n", + "Wall time: 2min 51s\n" ] } ], @@ -507,23 +525,23 @@ "\n", "Dataset | Strategy | MAP score | Elapsed time (sec)\n", ":---|:---|:---|---:\n", - "2016-test|softcossim|77.29 ±10.35|0.20 ±0.06\n", + "2016-test|softcossim|77.15 ±10.83|4.48 ±0.56\n", "2016-test|**Winner (UH-PRHLT-primary)**|76.70 ±0.00|\n", - "2016-test|cossim|76.45 ±10.40|0.48 ±0.07\n", - "2016-test|wmd-gensim|76.07 ±11.52|8.36 ±2.05\n", + "2016-test|cossim|76.45 ±10.40|0.25 ±0.04\n", + "2016-test|wmd-gensim|76.15 ±11.51|13.79 ±1.39\n", "2016-test|**Baseline 1 (IR)**|74.75 ±0.00|\n", - "2016-test|wmd-relax|73.01 ±10.33|0.97 ±0.16\n", + "2016-test|wmd-relax|72.03 ±11.33|0.34 ±0.07\n", "2016-test|**Baseline 2 (random)**|46.98 ±0.00|\n", "\n", "\n", "Dataset | Strategy | MAP score | Elapsed time (sec)\n", ":---|:---|:---|---:\n", "2017-test|**Winner (SimBow-primary)**|47.22 ±0.00|\n", - "2017-test|softcossim|46.06 ±18.00|0.15 ±0.03\n", - "2017-test|cossim|44.38 ±14.71|0.43 ±0.07\n", - "2017-test|wmd-gensim|44.20 ±16.02|9.78 ±1.80\n", + "2017-test|wmd-relax|45.04 ±15.44|0.39 ±0.07\n", + "2017-test|cossim|44.38 ±14.71|0.29 ±0.05\n", + "2017-test|softcossim|44.25 ±15.68|4.89 ±0.80\n", + "2017-test|wmd-gensim|44.08 ±15.96|16.69 ±1.90\n", "2017-test|**Baseline 1 (IR)**|41.85 ±0.00|\n", - "2017-test|wmd-relax|41.24 ±14.87|1.00 ±0.26\n", "2017-test|**Baseline 2 (random)**|29.81 ±0.00|" ], "text/plain": [ @@ -565,7 +583,8 @@ "\n", "1. Grigori Sidorov et al. *Soft Similarity and Soft Cosine Measure: Similarity of Features in Vector Space Model*, 2014. ([link to PDF](http://www.scielo.org.mx/pdf/cys/v18n3/v18n3a7.pdf))\n", "2. Delphine Charlet and Geraldine Damnati, SimBow at SemEval-2017 Task 3: Soft-Cosine Semantic Similarity between Questions for Community Question Answering, 2017. ([link to PDF](http://www.aclweb.org/anthology/S17-2051))\n", - "3. Thomas Mikolov et al. Efficient Estimation of Word Representations in Vector Space, 2013. ([link to PDF](https://arxiv.org/pdf/1301.3781.pdf))" + "3. Thomas Mikolov et al. Efficient Estimation of Word Representations in Vector Space, 2013. ([link to PDF](https://arxiv.org/pdf/1301.3781.pdf))\n", + "4. Vít Novotný. *Implementation Notes for the Soft Cosine Measure*, 2018. 
([link to PDF](https://arxiv.org/pdf/1808.09407))" ] } ], @@ -585,7 +604,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.3" + "version": "3.4.2" } }, "nbformat": 4, diff --git a/gensim/matutils.py b/gensim/matutils.py index 74c0107cde..979b99f6d5 100644 --- a/gensim/matutils.py +++ b/gensim/matutils.py @@ -14,6 +14,7 @@ import math from gensim import utils +from gensim.utils import deprecated import numpy as np import scipy.sparse @@ -796,6 +797,9 @@ def cossim(vec1, vec2): return result +@deprecated( + "Function will be removed in 4.0.0, use " + "gensim.similarities.termsim.SparseTermSimilarityMatrix.inner_product instead") def softcossim(vec1, vec2, similarity_matrix): """Get Soft Cosine Measure between two vectors given a term similarity matrix. @@ -816,8 +820,10 @@ def softcossim(vec1, vec2, similarity_matrix): vec2 : list of (int, float) A document vector in the BoW format. similarity_matrix : {:class:`scipy.sparse.csc_matrix`, :class:`scipy.sparse.csr_matrix`} - A term similarity matrix, typically produced by - :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity_matrix`. + A term similarity matrix. If the matrix is :class:`scipy.sparse.csr_matrix`, it is going + to be transposed. If you rely on the fact that there is at most a constant number of + non-zero elements in a single column, it is your responsibility to ensure that the matrix + is symmetric. Returns ------- diff --git a/gensim/models/__init__.py b/gensim/models/__init__.py index 96ca698b27..a0ee690550 100644 --- a/gensim/models/__init__.py +++ b/gensim/models/__init__.py @@ -13,7 +13,7 @@ from .logentropy_model import LogEntropyModel # noqa:F401 from .word2vec import Word2Vec # noqa:F401 from .doc2vec import Doc2Vec # noqa:F401 -from .keyedvectors import KeyedVectors # noqa:F401 +from .keyedvectors import KeyedVectors, WordEmbeddingSimilarityIndex # noqa:F401 from .ldamulticore import LdaMulticore # noqa:F401 from .phrases import Phrases # noqa:F401 from .normmodel import NormModel # noqa:F401 diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 702cc6a468..1529668423 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -160,7 +160,6 @@ from __future__ import division # py3 "true division" -from collections import deque from itertools import chain import logging @@ -173,11 +172,12 @@ double, array, zeros, vstack, sqrt, newaxis, integer, \ ndarray, sum as np_sum, prod, argmax import numpy as np + from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc from gensim.corpora.dictionary import Dictionary from six import string_types, integer_types from six.moves import zip, range -from scipy import sparse, stats +from scipy import stats from gensim.utils import deprecated from gensim.models.utils_any2vec import ( _save_word2vec_format, @@ -186,6 +186,7 @@ _ft_hash, _ft_hash_broken ) +from gensim.similarities.termsim import TermSimilarityIndex, SparseTermSimilarityMatrix logger = logging.getLogger(__name__) @@ -606,6 +607,9 @@ def similar_by_vector(self, vector, topn=10, restrict_vocab=None): """ return self.most_similar(positive=[vector], topn=topn, restrict_vocab=restrict_vocab) + @deprecated( + "Method will be removed in 4.0.0, use " + "gensim.models.keyedvectors.WordEmbeddingSimilarityIndex instead") def similarity_matrix(self, dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100, dtype=REAL): """Construct a term similarity matrix for computing 
Soft Cosine Measure.
@@ -615,24 +619,21 @@ def similarity_matrix(self, dictionary, tfidf=None, threshold=0.0, exponent=2.0,
         Parameters
         ----------
         dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
-            A dictionary that specifies a mapping between words and the indices of rows and columns
-            of the resulting term similarity matrix.
-        tfidf : :class:`gensim.models.tfidfmodel.TfidfModel`, optional
-            A model that specifies the relative importance of the terms in the dictionary. The rows
-            of the term similarity matrix will be build in a decreasing order of importance of terms,
-            or in the order of term identifiers if None.
+            A dictionary that specifies the considered terms.
+        tfidf : :class:`gensim.models.tfidfmodel.TfidfModel` or None, optional
+            A model that specifies the relative importance of the terms in the dictionary. The
+            columns of the term similarity matrix will be built in a decreasing order of importance
+            of terms, or in the order of term identifiers if None.
         threshold : float, optional
-            Only pairs of words whose embeddings are more similar than `threshold` are considered
-            when building the sparse term similarity matrix.
+            Only embeddings more similar than `threshold` are considered when retrieving word
+            embeddings closest to a given word embedding.
         exponent : float, optional
-            The exponent applied to the similarity between two word embeddings when building the term similarity matrix.
+            Take the word embedding similarities larger than `threshold` to the power of `exponent`.
         nonzero_limit : int, optional
-            The maximum number of non-zero elements outside the diagonal in a single row or column
-            of the term similarity matrix. Setting `nonzero_limit` to a constant ensures that the
-            time complexity of computing the Soft Cosine Measure will be linear in the document
-            length rather than quadratic.
+            The maximum number of non-zero elements outside the diagonal in a single column of the
+            sparse term similarity matrix.
         dtype : numpy.dtype, optional
-            Data-type of the term similarity matrix.
+            Data-type of the sparse term similarity matrix.

         Returns
         -------
@@ -654,66 +655,10 @@ def similarity_matrix(self, dictionary, tfidf=None, threshold=0.0, exponent=2.0,
            `_.

         """
-        logger.info("constructing a term similarity matrix")
-        matrix_order = len(dictionary)
-        matrix_nonzero = [1] * matrix_order
-        matrix = sparse.identity(matrix_order, dtype=dtype, format="dok")
-        num_skipped = 0
-        # Decide the order of rows.
-        if tfidf is None:
-            word_indices = deque(sorted(dictionary.keys()))
-        else:
-            assert max(tfidf.idfs) < matrix_order
-            word_indices = deque([
-                index for index, _
-                in sorted(tfidf.idfs.items(), key=lambda x: (x[1], -x[0]), reverse=True)
-            ])
-
-        # Traverse rows.
-        for row_number, w1_index in enumerate(list(word_indices)):
-            word_indices.popleft()
-            if row_number % 1000 == 0:
-                logger.info(
-                    "PROGRESS: at %.02f%% rows (%d / %d, %d skipped, %.06f%% density)",
-                    100.0 * (row_number + 1) / matrix_order, row_number + 1, matrix_order,
-                    num_skipped, 100.0 * matrix.getnnz() / matrix_order**2)
-            w1 = dictionary[w1_index]
-            if w1 not in self.vocab:
-                num_skipped += 1
-                continue  # A word from the dictionary is not present in the word2vec model.
-
-            # Traverse upper triangle columns.
-            if matrix_order <= nonzero_limit + 1:  # Traverse all columns.
-                columns = (
-                    (w2_index, self.similarity(w1, dictionary[w2_index]))
-                    for w2_index in word_indices
-                    if dictionary[w2_index] in self.vocab)
-            else:  # Traverse only columns corresponding to the embeddings closest to w1.
- num_nonzero = matrix_nonzero[w1_index] - 1 - columns = ( - (dictionary.token2id[w2], similarity) - for _, (w2, similarity) - in zip( - range(nonzero_limit - num_nonzero), - self.most_similar(positive=[w1], topn=nonzero_limit - num_nonzero) - ) - if w2 in dictionary.token2id - ) - columns = sorted(columns, key=lambda x: x[0]) - - for w2_index, similarity in columns: - # Ensure that we don't exceed `nonzero_limit` by mirroring the upper triangle. - if similarity > threshold and matrix_nonzero[w2_index] <= nonzero_limit: - element = similarity**exponent - matrix[w1_index, w2_index] = element - matrix_nonzero[w1_index] += 1 - matrix[w2_index, w1_index] = element - matrix_nonzero[w2_index] += 1 - logger.info( - "constructed a term similarity matrix with %0.6f %% nonzero elements", - 100.0 * matrix.getnnz() / matrix_order**2 - ) - return matrix.tocsc() + index = WordEmbeddingSimilarityIndex(self, threshold=threshold, exponent=exponent) + similarity_matrix = SparseTermSimilarityMatrix( + index, dictionary, tfidf=tfidf, nonzero_limit=nonzero_limit, dtype=dtype) + return similarity_matrix.matrix def wmdistance(self, document1, document2): """Compute the Word Mover's Distance between two documents. @@ -1386,6 +1331,48 @@ def init_sims(self, replace=False): self.vectors_norm = _l2_norm(self.vectors, replace=replace) +class WordEmbeddingSimilarityIndex(TermSimilarityIndex): + """ + Computes cosine similarities between word embeddings and retrieves the closest word embeddings + by cosine similarity for a given word embedding. + + Parameters + ---------- + keyedvectors : :class:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors` + The word embeddings. + threshold : float, optional + Only embeddings more similar than `threshold` are considered when retrieving word embeddings + closest to a given word embedding. + exponent : float, optional + Take the word embedding similarities larger than `threshold` to the power of `exponent`. + kwargs : dict or None + A dict with keyword arguments that will be passed to the `keyedvectors.most_similar` method + when retrieving the word embeddings closest to a given word embedding. + + See Also + -------- + :class:`~gensim.similarities.termsim.SparseTermSimilarityMatrix` + Build a term similarity matrix and compute the Soft Cosine Measure. + + """ + def __init__(self, keyedvectors, threshold=0.0, exponent=2.0, kwargs=None): + assert isinstance(keyedvectors, WordEmbeddingsKeyedVectors) + self.keyedvectors = keyedvectors + self.threshold = threshold + self.exponent = exponent + self.kwargs = kwargs or {} + super(WordEmbeddingSimilarityIndex, self).__init__() + + def most_similar(self, t1, topn=10): + if t1 not in self.keyedvectors.vocab: + logger.debug('an out-of-dictionary term "%s"', t1) + else: + most_similar = self.keyedvectors.most_similar(positive=[t1], topn=topn, **self.kwargs) + for t2, similarity in most_similar: + if similarity > self.threshold: + yield (t2, similarity**self.exponent) + + class Word2VecKeyedVectors(WordEmbeddingsKeyedVectors): """Mapping between words and vectors for the :class:`~gensim.models.Word2Vec` model. Used to perform operations on the vectors such as vector lookup, distance, similarity etc. 
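The deprecated `similarity_matrix` method above now delegates to the new `WordEmbeddingSimilarityIndex` class. A minimal usage sketch of the new class follows (the corpus is gensim's bundled `common_texts`; the vector size and seed are illustrative, and the printed neighbours depend on the trained vectors):

from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex
from gensim.test.utils import common_texts

model = Word2Vec(common_texts, size=20, min_count=1, seed=42)  # tiny toy model
index = WordEmbeddingSimilarityIndex(model.wv, threshold=0.0, exponent=2.0)

# most_similar yields up to topn (term, similarity ** exponent) pairs for terms
# more similar than the threshold; out-of-vocabulary terms yield nothing.
for term, similarity in index.most_similar(u"graph", topn=3):
    print(term, similarity)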
diff --git a/gensim/similarities/__init__.py b/gensim/similarities/__init__.py
index 52cbad43e7..1becd76831 100644
--- a/gensim/similarities/__init__.py
+++ b/gensim/similarities/__init__.py
@@ -4,3 +4,5 @@
 # bring classes directly into package namespace, to save some typing
 from .docsim import Similarity, MatrixSimilarity, SparseMatrixSimilarity, SoftCosineSimilarity, WmdSimilarity  # noqa:F401
+from .termsim import TermSimilarityIndex, UniformTermSimilarityIndex, SparseTermSimilarityMatrix  # noqa:F401
+from .levenshtein import LevenshteinSimilarityIndex  # noqa:F401
diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py
index 91e2b96f10..bb7b4f402b 100755
--- a/gensim/similarities/docsim.py
+++ b/gensim/similarities/docsim.py
@@ -77,6 +77,7 @@ import scipy.sparse

 from gensim import interfaces, utils, matutils
+from .termsim import SparseTermSimilarityMatrix

 from six.moves import map, range, zip
@@ -272,8 +273,6 @@ class Similarity(interfaces.SimilarityABC):
         Index similarity (dense with cosine distance).
     :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`
         Index similarity (sparse with cosine distance).
-    :class:`~gensim.similarities.docsim.SoftCosineSimilarity`
-        Index similarity (with soft-cosine distance).
     :class:`~gensim.similarities.docsim.WmdSimilarity`
         Index similarity (with word-mover distance).

@@ -866,20 +865,18 @@ class SoftCosineSimilarity(interfaces.SimilarityABC):

     >>> from gensim.test.utils import common_texts
     >>> from gensim.corpora import Dictionary
-    >>> from gensim.models import Word2Vec
-    >>> from gensim.similarities import SoftCosineSimilarity
+    >>> from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex
+    >>> from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
     >>>
     >>> model = Word2Vec(common_texts, size=20, min_count=1)  # train word-vectors
+    >>> termsim_index = WordEmbeddingSimilarityIndex(model.wv)
     >>> dictionary = Dictionary(common_texts)
     >>> bow_corpus = [dictionary.doc2bow(document) for document in common_texts]
+    >>> similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)  # construct similarity matrix
+    >>> docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=10)
     >>>
-    >>> # Make a query.
-    >>> query = 'graph trees computer'.split()
-    >>> # calculate similarity between query and each doc from bow_corpus
-    >>> sims = index[dictionary.doc2bow(query)]
+    >>> query = 'graph trees computer'.split()  # make a query
+    >>> sims = docsim_index[dictionary.doc2bow(query)]  # calculate similarity of query to each doc from bow_corpus

     Check out `Tutorial Notebook
     <https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb>`_
@@ -893,9 +890,8 @@ def __init__(self, corpus, similarity_matrix, num_best=None, chunksize=256):
         ----------
         corpus: iterable of list of (int, float)
             A list of documents in the BoW format.
-        similarity_matrix : :class:`scipy.sparse.csc_matrix`
-            A term similarity matrix, typically produced by
-            :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity_matrix`.
+        similarity_matrix : :class:`gensim.similarities.SparseTermSimilarityMatrix`
+            A term similarity matrix.
         num_best : int, optional
             The number of results to retrieve for a query, if None - return similarities with all elements from corpus.
        chunksize: int, optional
@@ -903,14 +899,23 @@ def __init__(self, corpus, similarity_matrix, num_best=None, chunksize=256):

         See Also
         --------
-        :meth:`gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity_matrix`
-            A term similarity matrix produced from term embeddings.
-        :func:`gensim.matutils.softcossim`
-            The Soft Cosine Measure.
+        :class:`gensim.similarities.SparseTermSimilarityMatrix`
+            A sparse term similarity matrix built using a term similarity index.
+        :class:`gensim.similarities.LevenshteinSimilarityIndex`
+            A term similarity index that computes Levenshtein similarities between terms.
+        :class:`gensim.models.WordEmbeddingSimilarityIndex`
+            A term similarity index that computes cosine similarities between word embeddings.

         """
+        if scipy.sparse.issparse(similarity_matrix):
+            logger.warning(
+                "Support for passing an unencapsulated sparse matrix will be removed in 4.0.0, pass "
+                "a SparseTermSimilarityMatrix instance instead")
+            self.similarity_matrix = SparseTermSimilarityMatrix(similarity_matrix)
+        else:
+            self.similarity_matrix = similarity_matrix
+
+        self.corpus = corpus
-        self.similarity_matrix = similarity_matrix
         self.num_best = num_best
         self.chunksize = chunksize
@@ -943,31 +948,19 @@ def get_similarities(self, query):
             Similarity matrix.

         """
+        if not self.corpus:
+            return numpy.array([])
         is_corpus, query = utils.is_corpus(query)
-        if not is_corpus:
-            if isinstance(query, numpy.ndarray):
-                # Convert document indexes to actual documents.
-                query = [self.corpus[i] for i in query]
-            else:
-                query = [query]
-
-        result = []
-        for query_document in query:
-            # Compute similarity for each query.
-            qresult = [matutils.softcossim(query_document, corpus_document, self.similarity_matrix)
-                       for corpus_document in self.corpus]
-            qresult = numpy.array(qresult)
-
-            # Append single query result to list of all results.
-            result.append(qresult)
-
-        if is_corpus:
-            result = numpy.array(result)
-        else:
-            result = result[0]
-
-        return result
+        if not is_corpus and isinstance(query, numpy.ndarray):
+            query = [self.corpus[i] for i in query]  # convert document indexes to actual documents
+        result = self.similarity_matrix.inner_product(query, self.corpus, normalized=True)
+
+        if scipy.sparse.issparse(result):
+            return numpy.asarray(result.todense())
+        if numpy.isscalar(result):
+            return numpy.array(result)
+        return numpy.asarray(result)[0]

     def __str__(self):
         return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.similarity_matrix.shape[0])
diff --git a/gensim/similarities/levenshtein.py b/gensim/similarities/levenshtein.py
new file mode 100644
index 0000000000..e517c51217
--- /dev/null
+++ b/gensim/similarities/levenshtein.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2018 Vit Novotny
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+
+"""
+This module provides a namespace for functions that use the Levenshtein distance.
+"""
+
+from itertools import islice
+import logging
+from math import floor
+
+from gensim.similarities.termsim import TermSimilarityIndex
+
+logger = logging.getLogger(__name__)
+
+
+def levdist(t1, t2, max_distance=float("inf")):
+    """Get the Levenshtein distance between two terms.
+
+    Return the Levenshtein distance between two terms. The distance is a
+    number between <0.0, inf>, higher is less similar.
+
+    Parameters
+    ----------
+    t1 : {bytes, str, unicode}
+        The first compared term.
+    t2 : {bytes, str, unicode}
+        The second compared term.
+    max_distance : {int, float}, optional
+        If you don't care about distances larger than a known threshold, a more
+        efficient code path can be taken. For terms that are clearly "too far
+        apart", we will not compute the distance exactly, but we will return
+        `max(len(t1), len(t2))` more quickly, meaning "more than
+        `max_distance`".
+        Default: always compute distance exactly, no threshold clipping.
+
+    Returns
+    -------
+    int
+        The Levenshtein distance between `t1` and `t2`.
+
+    """
+    import Levenshtein
+
+    distance = Levenshtein.distance(t1, t2)
+    if distance > max_distance:
+        return max(len(t1), len(t2))
+    return distance
+
+
+def levsim(t1, t2, alpha=1.8, beta=5.0, min_similarity=0.0):
+    """Get the Levenshtein similarity between two terms.
+
+    Return the Levenshtein similarity between two terms. The similarity is a
+    number between <0.0, alpha>, higher is more similar.
+
+    Parameters
+    ----------
+    t1 : {bytes, str, unicode}
+        The first compared term.
+    t2 : {bytes, str, unicode}
+        The second compared term.
+    alpha : float, optional
+        The multiplicative factor alpha defined by Charlet and Damnati (2017).
+    beta : float, optional
+        The exponential factor beta defined by Charlet and Damnati (2017).
+    min_similarity : {int, float}, optional
+        If you don't care about similarities smaller than a known threshold, a
+        more efficient code path can be taken. For terms that are clearly "too
+        far apart", we will not compute the distance exactly, but we will
+        return zero more quickly, meaning "less than `min_similarity`".
+        Default: always compute similarity exactly, no threshold clipping.
+
+    Returns
+    -------
+    float
+        The Levenshtein similarity between `t1` and `t2`.
+
+    Notes
+    -----
+    This notion of Levenshtein similarity was first defined in section 2.2 of
+    `Delphine Charlet and Geraldine Damnati, "SimBow at SemEval-2017 Task 3:
+    Soft-Cosine Semantic Similarity between Questions for Community Question
+    Answering", 2017 <http://www.aclweb.org/anthology/S17-2051>`_.
+
+    """
+    assert alpha >= 0
+    assert beta >= 0
+
+    max_lengths = max(len(t1), len(t2))
+    if max_lengths == 0:
+        return 1.0
+
+    min_similarity = float(max(min(min_similarity, 1.0), 0.0))
+    max_distance = int(floor(max_lengths * (1 - (min_similarity / alpha) ** (1 / beta))))
+    distance = levdist(t1, t2, max_distance)
+    similarity = alpha * (1 - distance * 1.0 / max_lengths)**beta
+    return similarity
+
+
+class LevenshteinSimilarityIndex(TermSimilarityIndex):
+    """
+    Computes Levenshtein similarities between terms and retrieves most similar
+    terms for a given term.
+
+    Notes
+    -----
+    This is a naive implementation that iteratively computes pointwise Levenshtein similarities
+    between individual terms. Using this implementation to compute the similarity of all terms in
+    real-world dictionaries such as the English Wikipedia will take years.
+
+    Parameters
+    ----------
+    dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
+        A dictionary that specifies the considered terms.
+    alpha : float, optional
+        The multiplicative factor alpha defined by Charlet and Damnati (2017).
+    beta : float, optional
+        The exponential factor beta defined by Charlet and Damnati (2017).
+    threshold : float, optional
+        Only terms more similar than `threshold` are considered when retrieving
+        the most similar terms for a given term.
+
+    See Also
+    --------
+    :func:`gensim.similarities.levenshtein.levsim`
+        The Levenshtein similarity.
+    :class:`~gensim.similarities.termsim.SparseTermSimilarityMatrix`
+        Build a term similarity matrix and compute the Soft Cosine Measure.
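+
+    Examples
+    --------
+    A quick sanity check of the `levsim` formula above (a sketch; it assumes the
+    optional python-Levenshtein package is installed, since `levdist` imports it).
+    The Levenshtein distance between "kitten" and "sitting" is 3 and the longer
+    term has 7 characters, so the default alpha = 1.8 and beta = 5.0 give
+    1.8 * (1 - 3.0 / 7) ** 5, roughly 0.11:
+
+    >>> from gensim.similarities.levenshtein import levsim
+    >>> round(levsim(u"kitten", u"sitting"), 4)  # 1.8 * (1 - 3.0 / 7) ** 5
+    0.1097
+    >>> levsim(u"kitten", u"kitten")  # identical terms: alpha * (1 - 0) ** beta
+    1.8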
+ + """ + def __init__(self, dictionary, alpha=1.8, beta=5.0, threshold=0.0): + self.dictionary = dictionary + self.alpha = alpha + self.beta = beta + self.threshold = threshold + super(LevenshteinSimilarityIndex, self).__init__() + + def most_similar(self, t1, topn=10): + similarities = ( + (levsim(t1, t2, self.alpha, self.beta, self.threshold), t2) + for t2 in self.dictionary.values() + if t1 != t2 + ) + most_similar = ( + (t2, similarity) + for (similarity, t2) in sorted(similarities, reverse=True) + if similarity > 0 + ) + return islice(most_similar, topn) diff --git a/gensim/similarities/termsim.py b/gensim/similarities/termsim.py new file mode 100644 index 0000000000..6a0b6d12b5 --- /dev/null +++ b/gensim/similarities/termsim.py @@ -0,0 +1,394 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2018 Vit Novotny +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +This module provides classes that deal with term similarities. +""" + +from itertools import chain +import logging +from math import sqrt + +import numpy as np +from scipy import sparse + +from gensim.matutils import corpus2csc +from gensim.utils import SaveLoad, is_corpus + +logger = logging.getLogger(__name__) + + +class TermSimilarityIndex(SaveLoad): + """ + Retrieves most similar terms for a given term. + + See Also + -------- + :class:`~gensim.similarities.termsim.SparseTermSimilarityMatrix` + Build a term similarity matrix and compute the Soft Cosine Measure. + + """ + def most_similar(self, term, topn=10): + """Get most similar terms for a given term. + + Return most similar terms for a given term along with the similarities. + + Parameters + ---------- + term : str + Tne term for which we are retrieving `topn` most similar terms. + topn : int, optional + The maximum number of most similar terms to `term` that will be retrieved. + + Returns + ------- + iterable of (str, float) + Most similar terms along with their similarities to `term`. Only terms distinct from + `term` must be returned. + + """ + raise NotImplementedError + + +class UniformTermSimilarityIndex(TermSimilarityIndex): + """ + Retrieves most similar terms for a given term under the hypothesis that the similarities between + distinct terms are uniform. + + Parameters + ---------- + dictionary : :class:`~gensim.corpora.dictionary.Dictionary` + A dictionary that specifies the considered terms. + term_similarity : float, optional + The uniform similarity between distinct terms. + + See Also + -------- + :class:`~gensim.similarities.termsim.SparseTermSimilarityMatrix` + Build a term similarity matrix and compute the Soft Cosine Measure. + + Notes + ----- + This class is mainly intended for testing SparseTermSimilarityMatrix and other classes that + depend on the TermSimilarityIndex. + + """ + def __init__(self, dictionary, term_similarity=0.5): + self.dictionary = sorted(dictionary.items()) + self.term_similarity = term_similarity + + def most_similar(self, t1, topn=10): + for __, (t2_index, t2) in zip(range(topn), ( + (t2_index, t2) for t2_index, t2 in self.dictionary if t2 != t1)): + yield (t2, self.term_similarity) + + +def _shortest_uint_dtype(max_value): + """Get the shortest unsingned integer data-type required for representing values up to a given + maximum value. + + Returns the shortest unsingned integer data-type required for representing values up to a given + maximum value. + + Parameters + ---------- + max_value : int + The maximum value we wish to represent. 
+
+    Returns
+    -------
+    data-type
+        The shortest unsigned integer data-type required for representing values up to a given
+        maximum value.
+    """
+    if max_value < 2**8:
+        return np.uint8
+    elif max_value < 2**16:
+        return np.uint16
+    elif max_value < 2**32:
+        return np.uint32
+    return np.uint64
+
+
+class SparseTermSimilarityMatrix(SaveLoad):
+    """
+    Builds a sparse term similarity matrix using a term similarity index.
+
+    Notes
+    -----
+    Building a DOK matrix, and converting it to a CSC matrix carries a significant memory overhead.
+    Future work should switch to building arrays of rows, columns, and non-zero elements and
+    directly passing these arrays to the CSC matrix constructor without copying.
+
+    Examples
+    --------
+    >>> from gensim.test.utils import common_texts
+    >>> from gensim.corpora import Dictionary
+    >>> from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex
+    >>> from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
+    >>>
+    >>> model = Word2Vec(common_texts, size=20, min_count=1)  # train word-vectors
+    >>> termsim_index = WordEmbeddingSimilarityIndex(model.wv)
+    >>> dictionary = Dictionary(common_texts)
+    >>> bow_corpus = [dictionary.doc2bow(document) for document in common_texts]
+    >>> similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)  # construct similarity matrix
+    >>> docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=10)
+    >>>
+    >>> query = 'graph trees computer'.split()  # make a query
+    >>> sims = docsim_index[dictionary.doc2bow(query)]  # calculate similarity of query to each doc from bow_corpus
+
+    Check out `Tutorial Notebook
+    <https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb>`_
+    for more examples.
+
+    Parameters
+    ----------
+    source : :class:`~gensim.similarities.termsim.TermSimilarityIndex` or :class:`scipy.sparse.spmatrix`
+        The source of the term similarity. Either a term similarity index that will be used for
+        building the term similarity matrix, or an existing sparse term similarity matrix that will
+        be encapsulated and stored in the matrix attribute.
+    dictionary : :class:`~gensim.corpora.dictionary.Dictionary` or None, optional
+        A dictionary that specifies a mapping between terms and the indices of rows and columns
+        of the resulting term similarity matrix. The dictionary may only be `None` when `source` is
+        a :class:`scipy.sparse.spmatrix`.
+    tfidf : :class:`gensim.models.tfidfmodel.TfidfModel` or None, optional
+        A model that specifies the relative importance of the terms in the dictionary. The columns
+        of the term similarity matrix will be built in a decreasing order of importance of
+        terms, or in the order of term identifiers if None.
+    symmetric : bool, optional
+        Whether the symmetry of the term similarity matrix will be enforced when the matrix is
+        built from a term similarity index. This parameter has no effect when `source` is a
+        :class:`scipy.sparse.spmatrix`. Symmetry is a necessary precondition if you later wish to
+        derive a change-of-basis matrix from the term similarity matrix using Cholesky
+        factorization.
+    positive_definite : bool, optional
+        Whether the positive definiteness of the term similarity matrix will be enforced through
+        strict column diagonal dominance. Positive definiteness is a necessary precondition if you
+        later wish to derive a change-of-basis matrix from the term similarity matrix using Cholesky
+        factorization.
+    nonzero_limit : {int, None}, optional
+        The maximum number of non-zero elements outside the diagonal in a single column of the
+        sparse term similarity matrix. If None, then no limit will be imposed.
+    dtype : numpy.dtype, optional
+        Data-type of the sparse term similarity matrix.
+
+    Attributes
+    ----------
+    matrix : :class:`scipy.sparse.csc_matrix`
+        The encapsulated sparse term similarity matrix.
+    """
+    PROGRESS_MESSAGE_PERIOD = 1000  # how many columns are processed between progress messages
+
+    def __init__(self, source, dictionary=None, tfidf=None, symmetric=True, positive_definite=False, nonzero_limit=100,
+                 dtype=np.float32):
+        if sparse.issparse(source):
+            self.matrix = source.tocsc()  # encapsulate the passed sparse matrix
+            return
+
+        index = source
+        assert isinstance(index, TermSimilarityIndex)
+        assert dictionary is not None
+        matrix_order = len(dictionary)
+
+        logger.info("constructing a sparse term similarity matrix using %s", index)
+
+        if nonzero_limit is None:
+            nonzero_limit = matrix_order
+
+        if tfidf is None:
+            logger.info("iterating over columns in dictionary order")
+            columns = sorted(dictionary.keys())
+        else:
+            assert max(tfidf.idfs) == matrix_order - 1
+            logger.info("iterating over columns in tf-idf order")
+            columns = [
+                term_index for term_index, _
+                in sorted(
+                    tfidf.idfs.items(),
+                    key=lambda x: (lambda term_index, term_idf: (term_idf, -term_index))(*x), reverse=True)]
+
+        column_nonzero = np.array([1] * matrix_order, dtype=_shortest_uint_dtype(nonzero_limit))
+        column_sum = np.zeros(matrix_order, dtype=dtype)
+        matrix = sparse.identity(matrix_order, dtype=dtype, format="dok")
+
+        for column_number, t1_index in enumerate(columns):
+            if column_number % self.PROGRESS_MESSAGE_PERIOD == 0:
+                logger.info(
+                    "PROGRESS: at %.02f%% columns (%d / %d, %.06f%% density, "
+                    "%.06f%% projected density)",
+                    100.0 * (column_number + 1) / matrix_order, column_number + 1, matrix_order,
+                    100.0 * matrix.getnnz() / matrix_order**2,
+                    100.0 * np.clip(
+                        (1.0 * (matrix.getnnz() - matrix_order) / matrix_order**2)
+                        * (1.0 * matrix_order / (column_number + 1))
+                        + (1.0 / matrix_order),  # add density corresponding to the main diagonal
+                        0.0, 1.0))
+
+            t1 = dictionary[t1_index]
+            num_nonzero = column_nonzero[t1_index] - 1
+            num_rows = nonzero_limit - num_nonzero
+            most_similar = [
+                (dictionary.token2id[term], similarity)
+                for term, similarity in index.most_similar(t1, num_rows)
+                if term in dictionary.token2id]
+
+            if tfidf is None:
+                rows = sorted(most_similar)
+            else:
+                rows = sorted(
+                    most_similar,
+                    key=lambda x: (lambda term_index, _: (tfidf.idfs[term_index], -term_index))(*x), reverse=True)
+
+            # Fill the column, mirroring each value into the transposed cell when
+            # building a symmetric matrix.
+            for row_number, (t2_index, similarity) in zip(range(num_rows), rows):
+                if positive_definite and column_sum[t1_index] + similarity >= 1.0:
+                    break
+                if symmetric:
+                    if column_nonzero[t2_index] <= nonzero_limit \
+                            and (not positive_definite or column_sum[t2_index] + similarity < 1.0) \
+                            and (t1_index, t2_index) not in matrix:
+                        matrix[t1_index, t2_index] = similarity
+                        column_nonzero[t1_index] += 1
+                        column_sum[t1_index] += abs(similarity)
+                        matrix[t2_index, t1_index] = similarity
+                        column_nonzero[t2_index] += 1
+                        column_sum[t2_index] += abs(similarity)
+                else:
+                    matrix[t1_index, t2_index] = similarity
+                    column_sum[t1_index] += abs(similarity)
+
+        logger.info(
+            "constructed a sparse term similarity matrix with %0.06f%% density",
+            100.0 * matrix.getnnz() / matrix_order**2)
+
+        matrix = matrix.T
+        assert sparse.issparse(matrix)
+        self.__init__(matrix)
+
+    def inner_product(self, X, Y, normalized=False):
+        """Get the inner product(s) between real vectors / corpora X and Y.
+
+        Return the inner product(s) between real vectors / corpora X and Y expressed in a
+        non-orthogonal normalized basis, where the dot product between the basis vectors is given by
+        the sparse term similarity matrix.
+
+        Parameters
+        ----------
+        X : list of (int, float) or iterable of list of (int, float)
+            A query vector / corpus in the sparse bag-of-words format.
+        Y : list of (int, float) or iterable of list of (int, float)
+            A document vector / corpus in the sparse bag-of-words format.
+        normalized : bool, optional
+            Whether the inner product should be L2-normalized. The normalized inner product
+            corresponds to the Soft Cosine Measure (SCM). SCM is a number between <-1.0, 1.0>,
+            where higher is more similar.
+
+        Returns
+        -------
+        `self.matrix.dtype`, `scipy.sparse.csr_matrix`, or :class:`numpy.matrix`
+            The inner product(s) between `X` and `Y`.
+
+        References
+        ----------
+        The soft cosine measure was perhaps first described by [sidorovetal14]_.
+
+        .. [sidorovetal14] Grigori Sidorov et al., "Soft Similarity and Soft Cosine Measure: Similarity
+           of Features in Vector Space Model", 2014, http://www.cys.cic.ipn.mx/ojs/index.php/CyS/article/view/2043/1921.
+
+        """
+        if not X or not Y:
+            return self.matrix.dtype.type(0.0)
+
+        is_corpus_X, X = is_corpus(X)
+        is_corpus_Y, Y = is_corpus(Y)
+
+        if not is_corpus_X and not is_corpus_Y:
+            X = dict(X)
+            Y = dict(Y)
+            word_indices = np.array(sorted(set(chain(X, Y))))
+            dtype = self.matrix.dtype
+            X = np.array([X[i] if i in X else 0 for i in word_indices], dtype=dtype)
+            Y = np.array([Y[i] if i in Y else 0 for i in word_indices], dtype=dtype)
+            matrix = self.matrix[word_indices[:, None], word_indices].todense()
+
+            result = X.T.dot(matrix).dot(Y)
+
+            if normalized:
+                X_norm = X.T.dot(matrix).dot(X)[0, 0]
+                Y_norm = Y.T.dot(matrix).dot(Y)[0, 0]
+
+                assert \
+                    X_norm > 0.0 and Y_norm > 0.0, \
+                    u"sparse documents must not contain any explicit zero entries and the similarity matrix S " \
+                    u"must satisfy x^T * S * x > 0 for any nonzero bag-of-words vector x."
+
+                result /= sqrt(X_norm) * sqrt(Y_norm)
+                result = np.clip(result, -1.0, 1.0)
+
+            return result[0, 0]
+        elif not is_corpus_X or not is_corpus_Y:
+            if is_corpus_X and not is_corpus_Y:
+                is_corpus_X, X, is_corpus_Y, Y = is_corpus_Y, Y, is_corpus_X, X  # make Y the corpus
+                transposed = True
+            else:
+                transposed = False
+
+            dtype = self.matrix.dtype
+            expanded_X = corpus2csc([X], num_terms=self.matrix.shape[0], dtype=dtype).T.dot(self.matrix)
+            word_indices = np.array(sorted(expanded_X.nonzero()[1]))
+            del expanded_X
+
+            X = dict(X)
+            X = np.array([X[i] if i in X else 0 for i in word_indices], dtype=dtype)
+            Y = corpus2csc(Y, num_terms=self.matrix.shape[0], dtype=dtype)[word_indices, :].todense()
+            matrix = self.matrix[word_indices[:, None], word_indices].todense()
+            if normalized:
+                # use the following equality: np.diag(A.T.dot(B).dot(A)) == A.T.dot(B).multiply(A.T).sum(axis=1).T
+                X_norm = np.multiply(X.T.dot(matrix), X.T).sum(axis=1).T
+                Y_norm = np.multiply(Y.T.dot(matrix), Y.T).sum(axis=1).T
+
+                assert \
+                    X_norm.min() > 0.0 and Y_norm.min() >= 0.0, \
+                    u"sparse documents must not contain any explicit zero entries and the similarity matrix S " \
+                    u"must satisfy x^T * S * x > 0 for any nonzero bag-of-words vector x."
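+                # X_norm is a 1 x 1 matrix holding the squared norm of the query under the
+                # similarity matrix S, and Y_norm is a 1 x |corpus| matrix holding the squared
+                # norm of every corpus document; the rescaling below brings the query and each
+                # document to unit norm under S before the final X^T * S * Y product.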
+ + X = np.multiply(X, 1 / np.sqrt(X_norm)).T + Y = np.multiply(Y, 1 / np.sqrt(Y_norm)) + Y = np.nan_to_num(Y) # Account for division by zero when Y_norm.min() == 0.0 + + result = X.T.dot(matrix).dot(Y) + + if normalized: + result = np.clip(result, -1.0, 1.0) + + if transposed: + result = result.T + + return result + else: # if is_corpus_X and is_corpus_Y: + dtype = self.matrix.dtype + X = corpus2csc(X if is_corpus_X else [X], num_terms=self.matrix.shape[0], dtype=dtype) + Y = corpus2csc(Y if is_corpus_Y else [Y], num_terms=self.matrix.shape[0], dtype=dtype) + matrix = self.matrix + + if normalized: + # use the following equality: np.diag(A.T.dot(B).dot(A)) == A.T.dot(B).multiply(A.T).sum(axis=1).T + X_norm = X.T.dot(matrix).multiply(X.T).sum(axis=1).T + Y_norm = Y.T.dot(matrix).multiply(Y.T).sum(axis=1).T + + assert \ + X_norm.min() > 0.0 and Y_norm.min() >= 0.0, \ + u"sparse documents must not contain any explicit zero entries and the similarity matrix S " \ + u"must satisfy x^T * S * x > 0 for any nonzero bag-of-words vector x." + + X = X.multiply(sparse.csr_matrix(1 / np.sqrt(X_norm))) + Y = Y.multiply(sparse.csr_matrix(1 / np.sqrt(Y_norm))) + Y[Y == np.inf] = 0 # Account for division by zero when Y_norm.min() == 0.0 + + result = X.T.dot(matrix).dot(Y) + + if normalized: + result.data = np.clip(result.data, -1.0, 1.0) + + return result diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py index fc15dcd871..abe1bcdcfe 100644 --- a/gensim/test/test_keyedvectors.py +++ b/gensim/test/test_keyedvectors.py @@ -15,7 +15,7 @@ import numpy as np from gensim.corpora import Dictionary -from gensim.models import KeyedVectors as EuclideanKeyedVectors, TfidfModel +from gensim.models import KeyedVectors as EuclideanKeyedVectors, WordEmbeddingSimilarityIndex from gensim.test.utils import datapath import gensim.models.keyedvectors @@ -24,6 +24,51 @@ logger = logging.getLogger(__name__) +class TestWordEmbeddingSimilarityIndex(unittest.TestCase): + def setUp(self): + self.vectors = EuclideanKeyedVectors.load_word2vec_format( + datapath('euclidean_vectors.bin'), binary=True, datatype=np.float64) + + def test_most_similar(self): + """Test most_similar returns expected results.""" + + # check the handling of out-of-dictionary terms + index = WordEmbeddingSimilarityIndex(self.vectors) + self.assertLess(0, len(list(index.most_similar(u"holiday", topn=10)))) + self.assertEqual(0, len(list(index.most_similar(u"out-of-dictionary term", topn=10)))) + + # check that the topn works as expected + index = WordEmbeddingSimilarityIndex(self.vectors) + results = list(index.most_similar(u"holiday", topn=10)) + self.assertLess(0, len(results)) + self.assertGreaterEqual(10, len(results)) + results = list(index.most_similar(u"holiday", topn=20)) + self.assertLess(10, len(results)) + self.assertGreaterEqual(20, len(results)) + + # check that the term itself is not returned + index = WordEmbeddingSimilarityIndex(self.vectors) + terms = [term for term, similarity in index.most_similar(u"holiday", topn=len(self.vectors.vocab))] + self.assertFalse(u"holiday" in terms) + + # check that the threshold works as expected + index = WordEmbeddingSimilarityIndex(self.vectors, threshold=0.0) + results = list(index.most_similar(u"holiday", topn=10)) + self.assertLess(0, len(results)) + self.assertGreaterEqual(10, len(results)) + + index = WordEmbeddingSimilarityIndex(self.vectors, threshold=1.0) + results = list(index.most_similar(u"holiday", topn=10)) + self.assertEqual(0, len(results)) + + # check that the 
exponent works as expected + index = WordEmbeddingSimilarityIndex(self.vectors, exponent=1.0) + first_similarities = np.array([similarity for term, similarity in index.most_similar(u"holiday", topn=10)]) + index = WordEmbeddingSimilarityIndex(self.vectors, exponent=2.0) + second_similarities = np.array([similarity for term, similarity in index.most_similar(u"holiday", topn=10)]) + self.assertTrue(np.allclose(first_similarities**2.0, second_similarities)) + + class TestEuclideanKeyedVectors(unittest.TestCase): def setUp(self): self.vectors = EuclideanKeyedVectors.load_word2vec_format( @@ -32,60 +77,14 @@ def setUp(self): def test_similarity_matrix(self): """Test similarity_matrix returns expected results.""" - documents = [["government", "denied", "holiday"], - ["holiday", "slowing", "hollingworth"]] + documents = [[u"government", u"denied", u"holiday"], [u"holiday", u"slowing", u"hollingworth"]] dictionary = Dictionary(documents) - - # checking symmetry and the existence of ones on the diagonal similarity_matrix = self.vectors.similarity_matrix(dictionary).todense() - self.assertTrue((similarity_matrix.T == similarity_matrix).all()) + + # checking the existence of ones on the main diagonal self.assertTrue( (np.diag(similarity_matrix) == np.ones(similarity_matrix.shape[0])).all()) - # checking that thresholding works as expected - similarity_matrix = self.vectors.similarity_matrix(dictionary, threshold=0.45).todense() - self.assertEqual(18, np.sum(similarity_matrix == 0)) - - # checking that exponent works as expected - similarity_matrix = self.vectors.similarity_matrix(dictionary, exponent=1.0).todense() - self.assertAlmostEqual(9.5788956, np.sum(similarity_matrix), places=5) - - # checking that nonzero_limit works as expected - similarity_matrix = self.vectors.similarity_matrix(dictionary, nonzero_limit=4).todense() - self.assertEqual(4, np.sum(similarity_matrix == 0)) - - similarity_matrix = self.vectors.similarity_matrix(dictionary, nonzero_limit=3).todense() - self.assertEqual(20, np.sum(similarity_matrix == 0)) - - # check that processing rows in the order given by IDF has desired effect - - # The complete similarity matrix we would obtain with nonzero_limit would look as follows: - documents = [["honour", "understanding"], ["understanding", "mean", "knop"]] - dictionary = Dictionary(documents) - tfidf = TfidfModel(dictionary=dictionary) - - # All terms except for "understanding" have IDF of log2(2 / 1) = log2(2) = 1. - # The term "understanding" has IDF of log2(2 / 2) = log2(1) = 0. - # - # If we do not pass the tfidf parameter to the similarity_matrix - # method, then we process rows in the order from 1 to 4. If we do pass - # the tfidf parameter to the similarity_matrix method, then we first - # process the rows 1, 3, 4 that correspond to terms with IDF of 1.0 and - # then the row 2 that corresponds to the term "understanding" with IDF - # of 0. Since the method is greedy, we will end up with two different - # similarity matrices. 
- - similarity_matrix = self.vectors.similarity_matrix( - dictionary, nonzero_limit=2).todense() - self.assertTrue(np.all(np.isclose(similarity_matrix, np.array([ - [1, 0.9348248, 0, 0], [0.9348248, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]])))) - - similarity_matrix = self.vectors.similarity_matrix( - dictionary, tfidf, nonzero_limit=2).todense() - self.assertTrue(np.all(np.isclose(similarity_matrix, np.array([ - [1, 0.9348248, 0, 0.9112908], [0.9348248, 1, 0.90007025, 0], [0, 0.90007025, 1, 0], - [0.9112908, 0, 0, 1]])))) - def test_most_similar(self): """Test most_similar returns expected results.""" expected = [ diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py index e1f876e216..7aafbd34d7 100644 --- a/gensim/test/test_similarities.py +++ b/gensim/test/test_similarities.py @@ -11,12 +11,14 @@ import logging import unittest +import math import os import numpy import scipy from smart_open import smart_open +from gensim.corpora import Dictionary from gensim.models import word2vec from gensim.models import doc2vec from gensim.models import KeyedVectors @@ -25,6 +27,10 @@ from gensim.models import Word2Vec, FastText from gensim.test.utils import (datapath, get_tmpfile, common_texts as texts, common_dictionary as dictionary, common_corpus as corpus) +from gensim.similarities import UniformTermSimilarityIndex +from gensim.similarities import SparseTermSimilarityMatrix +from gensim.similarities import LevenshteinSimilarityIndex +from gensim.similarities.levenshtein import levdist, levsim try: from pyemd import emd # noqa:F401 @@ -371,7 +377,7 @@ def setUp(self): similarity_matrix = scipy.sparse.identity(12, format="lil") similarity_matrix[dictionary.token2id["user"], dictionary.token2id["human"]] = 0.5 similarity_matrix[dictionary.token2id["human"], dictionary.token2id["user"]] = 0.5 - self.similarity_matrix = similarity_matrix.tocsc() + self.similarity_matrix = SparseTermSimilarityMatrix(similarity_matrix) def factoryMethod(self): # Override factoryMethod. @@ -393,8 +399,6 @@ def testFull(self, num_best=None): self.assertAlmostEqual(1.0, sims[0]) # Similarity of a document with itself is 1.0. self.assertTrue(numpy.alltrue(sims[1:] >= 0.0)) self.assertTrue(numpy.alltrue(sims[1:] < 1.0)) - expected = 2.1889350195476758 - self.assertAlmostEqual(expected, numpy.sum(sims)) # Corpora for query in ( @@ -425,8 +429,8 @@ def testNonIncreasing(self): sims = index[query] sims2 = numpy.asarray(sims)[:, 1] # Just the similarities themselves. - # The difference of adjacent elements should be negative. - cond = sum(numpy.diff(sims2) < 0) == len(sims2) - 1 + # The difference of adjacent elements should be less than or equal to zero. 
+ cond = sum(numpy.diff(sims2) <= 0) == len(sims2) - 1 self.assertTrue(cond) def testChunking(self): @@ -684,6 +688,385 @@ def testSaveLoad(self): self.assertEqual(self.index.num_trees, self.index2.num_trees) +class TestUniformTermSimilarityIndex(unittest.TestCase): + def setUp(self): + self.documents = [[u"government", u"denied", u"holiday"], [u"holiday", u"slowing", u"hollingworth"]] + self.dictionary = Dictionary(self.documents) + + def test_most_similar(self): + """Test most_similar returns expected results.""" + + # check that the topn works as expected + index = UniformTermSimilarityIndex(self.dictionary) + results = list(index.most_similar(u"holiday", topn=1)) + self.assertLess(0, len(results)) + self.assertGreaterEqual(1, len(results)) + results = list(index.most_similar(u"holiday", topn=4)) + self.assertLess(1, len(results)) + self.assertGreaterEqual(4, len(results)) + + # check that the term itself is not returned + index = UniformTermSimilarityIndex(self.dictionary) + terms = [term for term, similarity in index.most_similar(u"holiday", topn=len(self.dictionary))] + self.assertFalse(u"holiday" in terms) + + # check that the term_similarity works as expected + index = UniformTermSimilarityIndex(self.dictionary, term_similarity=0.2) + similarities = numpy.array([ + similarity for term, similarity in index.most_similar(u"holiday", topn=len(self.dictionary))]) + self.assertTrue(numpy.all(similarities == 0.2)) + + +class TestSparseTermSimilarityMatrix(unittest.TestCase): + def setUp(self): + self.documents = [ + [u"government", u"denied", u"holiday"], + [u"government", u"denied", u"holiday", u"slowing", u"hollingworth"]] + self.dictionary = Dictionary(self.documents) + self.tfidf = TfidfModel(dictionary=self.dictionary) + self.index = UniformTermSimilarityIndex(self.dictionary, term_similarity=0.5) + + def test_type(self): + """Test the type of the produced matrix.""" + matrix = SparseTermSimilarityMatrix(self.index, self.dictionary).matrix + self.assertTrue(isinstance(matrix, scipy.sparse.csc_matrix)) + + def test_diagonal(self): + """Test the existence of ones on the main diagonal.""" + matrix = SparseTermSimilarityMatrix(self.index, self.dictionary).matrix.todense() + self.assertTrue(numpy.all(numpy.diag(matrix) == numpy.ones(matrix.shape[0]))) + + def test_order(self): + """Test the matrix order.""" + matrix = SparseTermSimilarityMatrix(self.index, self.dictionary).matrix.todense() + self.assertEqual(matrix.shape[0], len(self.dictionary)) + self.assertEqual(matrix.shape[1], len(self.dictionary)) + + def test_dtype(self): + """Test the dtype parameter of the matrix constructor.""" + matrix = SparseTermSimilarityMatrix(self.index, self.dictionary, dtype=numpy.float32).matrix.todense() + self.assertEqual(numpy.float32, matrix.dtype) + + matrix = SparseTermSimilarityMatrix(self.index, self.dictionary, dtype=numpy.float64).matrix.todense() + self.assertEqual(numpy.float64, matrix.dtype) + + def test_nonzero_limit(self): + """Test the nonzero_limit parameter of the matrix constructor.""" + matrix = SparseTermSimilarityMatrix(self.index, self.dictionary, nonzero_limit=100).matrix.todense() + self.assertGreaterEqual(101, numpy.max(numpy.sum(matrix != 0, axis=0))) + + matrix = SparseTermSimilarityMatrix(self.index, self.dictionary, nonzero_limit=4).matrix.todense() + self.assertGreaterEqual(5, numpy.max(numpy.sum(matrix != 0, axis=0))) + + matrix = SparseTermSimilarityMatrix(self.index, self.dictionary, nonzero_limit=1).matrix.todense() + self.assertGreaterEqual(2, 
+
+        matrix = SparseTermSimilarityMatrix(self.index, self.dictionary, nonzero_limit=0).matrix.todense()
+        self.assertEqual(1, numpy.max(numpy.sum(matrix != 0, axis=0)))
+        self.assertTrue(numpy.all(matrix == numpy.eye(matrix.shape[0])))
+
+    def test_symmetric(self):
+        """Test the symmetric parameter of the matrix constructor."""
+        matrix = SparseTermSimilarityMatrix(self.index, self.dictionary).matrix.todense()
+        self.assertTrue(numpy.all(matrix == matrix.T))
+
+        matrix = SparseTermSimilarityMatrix(
+            self.index, self.dictionary, nonzero_limit=1).matrix.todense()
+        expected_matrix = numpy.array([
+            [1.0, 0.5, 0.0, 0.0, 0.0],
+            [0.5, 1.0, 0.0, 0.0, 0.0],
+            [0.0, 0.0, 1.0, 0.0, 0.0],
+            [0.0, 0.0, 0.0, 1.0, 0.0],
+            [0.0, 0.0, 0.0, 0.0, 1.0]])
+        self.assertTrue(numpy.all(expected_matrix == matrix))
+
+        matrix = SparseTermSimilarityMatrix(
+            self.index, self.dictionary, nonzero_limit=1, symmetric=False).matrix.todense()
+        expected_matrix = numpy.array([
+            [1.0, 0.5, 0.5, 0.5, 0.5],
+            [0.5, 1.0, 0.0, 0.0, 0.0],
+            [0.0, 0.0, 1.0, 0.0, 0.0],
+            [0.0, 0.0, 0.0, 1.0, 0.0],
+            [0.0, 0.0, 0.0, 0.0, 1.0]])
+        self.assertTrue(numpy.all(expected_matrix == matrix))
+
+    def test_positive_definite(self):
+        """Test the positive_definite parameter of the matrix constructor."""
+        matrix = SparseTermSimilarityMatrix(
+            self.index, self.dictionary, nonzero_limit=2).matrix.todense()
+        expected_matrix = numpy.array([
+            [1.0, 0.5, 0.5, 0.0, 0.0],
+            [0.5, 1.0, 0.0, 0.5, 0.0],
+            [0.5, 0.0, 1.0, 0.0, 0.0],
+            [0.0, 0.5, 0.0, 1.0, 0.0],
+            [0.0, 0.0, 0.0, 0.0, 1.0]])
+        self.assertTrue(numpy.all(expected_matrix == matrix))
+
+        matrix = SparseTermSimilarityMatrix(
+            self.index, self.dictionary, nonzero_limit=2, positive_definite=True).matrix.todense()
+        expected_matrix = numpy.array([
+            [1.0, 0.5, 0.0, 0.0, 0.0],
+            [0.5, 1.0, 0.0, 0.0, 0.0],
+            [0.0, 0.0, 1.0, 0.0, 0.0],
+            [0.0, 0.0, 0.0, 1.0, 0.0],
+            [0.0, 0.0, 0.0, 0.0, 1.0]])
+        self.assertTrue(numpy.all(expected_matrix == matrix))
+
+    def test_tfidf(self):
+        """Test the tfidf parameter of the matrix constructor."""
+        matrix = SparseTermSimilarityMatrix(
+            self.index, self.dictionary, nonzero_limit=1).matrix.todense()
+        expected_matrix = numpy.array([
+            [1.0, 0.5, 0.0, 0.0, 0.0],
+            [0.5, 1.0, 0.0, 0.0, 0.0],
+            [0.0, 0.0, 1.0, 0.0, 0.0],
+            [0.0, 0.0, 0.0, 1.0, 0.0],
+            [0.0, 0.0, 0.0, 0.0, 1.0]])
+        self.assertTrue(numpy.all(expected_matrix == matrix))
+
+        matrix = SparseTermSimilarityMatrix(
+            self.index, self.dictionary, nonzero_limit=1, tfidf=self.tfidf).matrix.todense()
+        expected_matrix = numpy.array([
+            [1.0, 0.0, 0.0, 0.5, 0.0],
+            [0.0, 1.0, 0.0, 0.0, 0.0],
+            [0.0, 0.0, 1.0, 0.0, 0.0],
+            [0.5, 0.0, 0.0, 1.0, 0.0],
+            [0.0, 0.0, 0.0, 0.0, 1.0]])
+        self.assertTrue(numpy.all(expected_matrix == matrix))
+
+    def test_encapsulation(self):
+        """Test the matrix encapsulation."""
+
+        # check that a sparse matrix will be converted to a CSC format
+        expected_matrix = numpy.array([
+            [1.0, 2.0, 3.0],
+            [0.0, 1.0, 4.0],
+            [0.0, 0.0, 1.0]])
+
+        matrix = SparseTermSimilarityMatrix(scipy.sparse.csc_matrix(expected_matrix)).matrix
+        self.assertTrue(isinstance(matrix, scipy.sparse.csc_matrix))
+        self.assertTrue(numpy.all(matrix.todense() == expected_matrix))
+
+        matrix = SparseTermSimilarityMatrix(scipy.sparse.csr_matrix(expected_matrix)).matrix
+        self.assertTrue(isinstance(matrix, scipy.sparse.csc_matrix))
+        self.assertTrue(numpy.all(matrix.todense() == expected_matrix))
+
+    def test_inner_product(self):
+        """Test the inner product."""
+
+        matrix = SparseTermSimilarityMatrix(
+            UniformTermSimilarityIndex(self.dictionary, term_similarity=0.5), self.dictionary)
+
+        # check that zero vectors work as expected
+        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
+        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
+
+        self.assertEqual(0.0, matrix.inner_product([], vec2))
+        self.assertEqual(0.0, matrix.inner_product(vec1, []))
+        self.assertEqual(0.0, matrix.inner_product([], []))
+
+        self.assertEqual(0.0, matrix.inner_product([], vec2, normalized=True))
+        self.assertEqual(0.0, matrix.inner_product(vec1, [], normalized=True))
+        self.assertEqual(0.0, matrix.inner_product([], [], normalized=True))
+
+        # check that real-world vectors work as expected
+        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
+        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
+        expected_result = 0.0
+        expected_result += 2 * 1.0 * 1  # government * s_{ij} * government
+        expected_result += 2 * 0.5 * 1  # government * s_{ij} * holiday
+        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * government
+        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * holiday
+        result = matrix.inner_product(vec1, vec2)
+        self.assertAlmostEqual(expected_result, result, places=5)
+
+        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
+        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
+        expected_result = matrix.inner_product(vec1, vec2)
+        expected_result /= math.sqrt(matrix.inner_product(vec1, vec1))
+        expected_result /= math.sqrt(matrix.inner_product(vec2, vec2))
+        result = matrix.inner_product(vec1, vec2, normalized=True)
+        self.assertAlmostEqual(expected_result, result, places=5)
+
+        # check that real-world (vector, corpus) pairs work as expected
+        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
+        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
+        expected_result = 0.0
+        expected_result += 2 * 1.0 * 1  # government * s_{ij} * government
+        expected_result += 2 * 0.5 * 1  # government * s_{ij} * holiday
+        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * government
+        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * holiday
+        expected_result = numpy.full((1, 2), expected_result)
+        result = matrix.inner_product(vec1, [vec2] * 2)
+        self.assertTrue(isinstance(result, numpy.ndarray))
+        self.assertTrue(numpy.allclose(expected_result, result))
+
+        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
+        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
+        expected_result = matrix.inner_product(vec1, vec2)
+        expected_result /= math.sqrt(matrix.inner_product(vec1, vec1))
+        expected_result /= math.sqrt(matrix.inner_product(vec2, vec2))
+        expected_result = numpy.full((1, 2), expected_result)
+        result = matrix.inner_product(vec1, [vec2] * 2, normalized=True)
+        self.assertTrue(isinstance(result, numpy.ndarray))
+        self.assertTrue(numpy.allclose(expected_result, result))
+
+        # check that real-world (corpus, vector) pairs work as expected
+        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
+        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
+        expected_result = 0.0
+        expected_result += 2 * 1.0 * 1  # government * s_{ij} * government
+        expected_result += 2 * 0.5 * 1  # government * s_{ij} * holiday
+        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * government
+        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * holiday
+        expected_result = numpy.full((3, 1), expected_result)
+        result = matrix.inner_product([vec1] * 3, vec2)
+        self.assertTrue(isinstance(result, numpy.ndarray))
+        self.assertTrue(numpy.allclose(expected_result, result))
+
+        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
+        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
+        expected_result = matrix.inner_product(vec1, vec2)
+        expected_result /= math.sqrt(matrix.inner_product(vec1, vec1))
+        expected_result /= math.sqrt(matrix.inner_product(vec2, vec2))
+        expected_result = numpy.full((3, 1), expected_result)
+        result = matrix.inner_product([vec1] * 3, vec2, normalized=True)
+        self.assertTrue(isinstance(result, numpy.ndarray))
+        self.assertTrue(numpy.allclose(expected_result, result))
+
+        # check that real-world corpora work as expected
+        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
+        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
+        expected_result = 0.0
+        expected_result += 2 * 1.0 * 1  # government * s_{ij} * government
+        expected_result += 2 * 0.5 * 1  # government * s_{ij} * holiday
+        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * government
+        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * holiday
+        expected_result = numpy.full((3, 2), expected_result)
+        result = matrix.inner_product([vec1] * 3, [vec2] * 2)
+        self.assertTrue(isinstance(result, scipy.sparse.csr_matrix))
+        self.assertTrue(numpy.allclose(expected_result, result.todense()))
+
+        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
+        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
+        expected_result = matrix.inner_product(vec1, vec2)
+        expected_result /= math.sqrt(matrix.inner_product(vec1, vec1))
+        expected_result /= math.sqrt(matrix.inner_product(vec2, vec2))
+        expected_result = numpy.full((3, 2), expected_result)
+        result = matrix.inner_product([vec1] * 3, [vec2] * 2, normalized=True)
+        self.assertTrue(isinstance(result, scipy.sparse.csr_matrix))
+        self.assertTrue(numpy.allclose(expected_result, result.todense()))
+
+
+class TestLevenshteinDistance(unittest.TestCase):
+    def test_max_distance(self):
+        t1 = "holiday"
+        t2 = "day"
+        max_distance = max(len(t1), len(t2))
+
+        self.assertEqual(4, levdist(t1, t2))
+        self.assertEqual(4, levdist(t1, t2, 4))
+        self.assertEqual(max_distance, levdist(t1, t2, 2))
+        self.assertEqual(max_distance, levdist(t1, t2, -2))
+
+
+class TestLevenshteinSimilarity(unittest.TestCase):
+    def test_empty_strings(self):
+        t1 = ""
+        t2 = ""
+
+        self.assertEqual(1.0, levsim(t1, t2))
+
+    def test_negative_hyperparameters(self):
+        t1 = "holiday"
+        t2 = "day"
+        alpha = 2.0
+        beta = 2.0
+
+        with self.assertRaises(AssertionError):
+            levsim(t1, t2, -alpha, beta)
+
+        with self.assertRaises(AssertionError):
+            levsim(t1, t2, alpha, -beta)
+
+        with self.assertRaises(AssertionError):
+            levsim(t1, t2, -alpha, -beta)
+
+    def test_min_similarity(self):
+        t1 = "holiday"
+        t2 = "day"
+        alpha = 2.0
+        beta = 2.0
+        similarity = alpha * (1 - 4.0 / 7)**beta
+        assert 0.1 < similarity < 0.5  # sanity-check the thresholds used below
+
+        self.assertAlmostEqual(similarity, levsim(t1, t2, alpha, beta))
+
+        self.assertAlmostEqual(similarity, levsim(t1, t2, alpha, beta, -2))
+        self.assertAlmostEqual(similarity, levsim(t1, t2, alpha, beta, -2.0))
+
+        self.assertAlmostEqual(similarity, levsim(t1, t2, alpha, beta, 0))
+        self.assertAlmostEqual(similarity, levsim(t1, t2, alpha, beta, 0.0))
+
+        self.assertAlmostEqual(similarity, levsim(t1, t2, alpha, beta, 0.1))
+        self.assertEqual(0.0, levsim(t1, t2, alpha, beta, 0.5))
+        self.assertEqual(0.0, levsim(t1, t2, alpha, beta, 1.0))
+
+        self.assertEqual(0.0, levsim(t1, t2, alpha, beta, 2))
+        self.assertEqual(0.0, levsim(t1, t2, alpha, beta, 2.0))
+
+
+class TestLevenshteinSimilarityIndex(unittest.TestCase):
+    def setUp(self):
+        self.documents = [[u"government", u"denied", u"holiday"], [u"holiday", u"slowing", u"hollingworth"]]
+        self.dictionary = Dictionary(self.documents)
+
+    def test_most_similar(self):
+        """Test most_similar returns expected results."""
+
+        index = LevenshteinSimilarityIndex(self.dictionary)
+        results = list(index.most_similar(u"holiday", topn=1))
+        self.assertLess(0, len(results))
+        self.assertGreaterEqual(1, len(results))
+        results = list(index.most_similar(u"holiday", topn=4))
+        self.assertLess(1, len(results))
+        self.assertGreaterEqual(4, len(results))
+
+        # check the order of the results
+        results = index.most_similar(u"holiday", topn=4)
+        terms, _ = tuple(zip(*results))
+        self.assertEqual((u"hollingworth", u"slowing", u"denied", u"government"), terms)
+
+        # check that the term itself is not returned
+        index = LevenshteinSimilarityIndex(self.dictionary)
+        terms = [term for term, similarity in index.most_similar(u"holiday", topn=len(self.dictionary))]
+        self.assertFalse(u"holiday" in terms)
+
+        # check that the threshold works as expected
+        index = LevenshteinSimilarityIndex(self.dictionary, threshold=0.0)
+        results = list(index.most_similar(u"holiday", topn=10))
+        self.assertLess(0, len(results))
+        self.assertGreaterEqual(10, len(results))
+
+        index = LevenshteinSimilarityIndex(self.dictionary, threshold=1.0)
+        results = list(index.most_similar(u"holiday", topn=10))
+        self.assertEqual(0, len(results))
+
+        # check that the alpha works as expected
+        index = LevenshteinSimilarityIndex(self.dictionary, alpha=1.0)
+        first_similarities = numpy.array([similarity for term, similarity in index.most_similar(u"holiday", topn=10)])
+        index = LevenshteinSimilarityIndex(self.dictionary, alpha=2.0)
+        second_similarities = numpy.array([similarity for term, similarity in index.most_similar(u"holiday", topn=10)])
+        self.assertTrue(numpy.allclose(2.0 * first_similarities, second_similarities))
+
+        # check that the beta works as expected
+        index = LevenshteinSimilarityIndex(self.dictionary, alpha=1.0, beta=1.0)
+        first_similarities = numpy.array([similarity for term, similarity in index.most_similar(u"holiday", topn=10)])
+        index = LevenshteinSimilarityIndex(self.dictionary, alpha=1.0, beta=2.0)
+        second_similarities = numpy.array([similarity for term, similarity in index.most_similar(u"holiday", topn=10)])
+        self.assertTrue(numpy.allclose(first_similarities ** 2.0, second_similarities))
+
+
 if __name__ == '__main__':
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
     unittest.main()
diff --git a/setup.py b/setup.py
index f9db98f8fc..79f1bdc98b 100644
--- a/setup.py
+++ b/setup.py
@@ -232,6 +232,7 @@ def finalize_options(self):
     'testfixtures',
     'scikit-learn',
     'Morfessor==2.0.2a4',
+    'python-Levenshtein >= 0.10.2',
     'visdom >= 0.1.8',
 ]
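
Note for reviewers (not part of the patch): a minimal sketch of how the pieces added above fit together, using the same toy corpus as the tests. It assumes gensim with this patch applied and the optional python-Levenshtein dependency installed.

    from gensim.corpora import Dictionary
    from gensim.similarities import LevenshteinSimilarityIndex, SparseTermSimilarityMatrix

    documents = [[u"government", u"denied", u"holiday"], [u"holiday", u"slowing", u"hollingworth"]]
    dictionary = Dictionary(documents)

    # Build a sparse term similarity matrix from the Levenshtein similarities
    # between the terms of the dictionary.
    index = LevenshteinSimilarityIndex(dictionary)
    matrix = SparseTermSimilarityMatrix(index, dictionary)

    # The soft cosine measure between two bag-of-words vectors x and y is the
    # inner product <x, Sy> normalized by sqrt(<x, Sx>) * sqrt(<y, Sy>).
    vec1 = dictionary.doc2bow([u"government", u"denied"])
    vec2 = dictionary.doc2bow([u"holiday", u"slowing"])
    print(matrix.inner_product(vec1, vec2, normalized=True))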