From f3cf463c0f0e28c97c9a3b319a58a7e099092041 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Mon, 14 Jan 2019 15:33:59 +0100 Subject: [PATCH] Implement Levenshtein term similarity matrix and fast SCM between corpora (#2016) * Wrap docstring for WordEmbeddingsKeyedVectors.similarity_matrix * Add the gensim.models.levenshtein module * Add projected density to term similarity matrix logs * Add tests for the gensim.models.levenshtein.similarity_matrix function * Separate similarity_matrix methods into director and builder classes. * Add symmetric parameter to SparseTermSimilarityMatrix * Add corpus support to SparseTermSimilarityMatrix.inner_product * Replace scipy.sparse.dok_matrix.has_key with the in operator * Fix handling of unicode in Python 3 in levsim * Remove temporary method similarity of LevenshteinSimilarityIndex * Move models.term_similarity, and levenshtein to similarities * Make python-Levenshtein a conditional import * Add default values to gensim.similarities.levenshtein.levsim arguments * Remove extraneous addition operators from @deprecated annotations * Remove @deprecated annotation from tests * Merge test_term_similarity, and test_levenshtein with test_similarities * Reword TermSimilarityIndex docstring * Consume no more than topn similarities produced by a TermSimilarityIndex * Use short uints (<64b) for dok_matrix keys and num_nonzero array * Write to matrix_nonzero only when building a symmetric matrix * Ensure UniformTermSimilarityIndex does not yield only topn - 1 values * Document _shortest_uint_dtype * Add soft cosine measure benchmark, part 1 * Add soft cosine measure benchmark, part 2 * Make similarity_matrix support non-contiguous dictionaries Closes #2041 * Support fast inner product between a document and a corpus * Support fast inner product between a document and a corpus (python 2.7) * Add faster sparse matrix slicing * Make Soft Cosine Measure support non-contiguous dictionaries * Remove gensim::similarities::levenshtein::similarity_matrix facade * Implement SoftCosineSimilarity using the inner_product method * Fix flake8 warnings * Make Soft Cosine Measure support non-contiguous dictionaries (cont) * Remove parallelization in gensim::similarities::levenshtein * Document future work * Update Soft Cosine Measure benchmark after commits 093d569, and c316b95 * Update SCM tutorial after PR 2016 * Add example to gensim::similarities::termsim::SparseTermSimilarityMatrix * Add max_distance kwarg to gensim::similarities::levenshtein::levsim * Replace max_distance kwarg in levsim with min_similarity, add tests * Remove conditional expression from levsim * Use less confusing wording in docstring for min_similarity / max_distance * Defer thresholding in LevenshteinSimilarityIndex.most_similar to levsim * Allow None value of nonzero_limit parameter in SparseTermSimilarityMatrix * Add positive_definite parameter to SparseTermSimilarityMatrix * Split test_building test into a number of atomic unit tests * Presort dictionary keys in UniformTermSimilarityIndex constructor * Make documentation of SparseTermSimilarityMatrix more accurate * Make SparseTermSimilarityMatrix expect negative similarities * Avoid expensive array copying in dot_product * Update SCM tutorial, and benchmark after PR 2016 * Remove fluff from stderr in the SCM tutorial notebook * Add a paper reference to the SCM tutorial notebook * Directly import Levenshtein package in levdist * Use embedded URI instead of indirect hyperlink target in documentation * Assume that max of lens is
always an integer * Make LevenshteinSimilarityIndex.most_similar easier to read * Make LevenshteinSimilarityIndex.most_similar easier to read * Add an ordering test for LevenshteinSimilarityIndex.most_similar * Make WordEmbeddingSimilarityIndex.most_similar easier to read --- docs/notebooks/soft_cosine_benchmark.ipynb | 4605 ++++++++++++++++++++ docs/notebooks/soft_cosine_tutorial.ipynb | 125 +- gensim/matutils.py | 10 +- gensim/models/__init__.py | 2 +- gensim/models/keyedvectors.py | 139 +- gensim/similarities/__init__.py | 2 + gensim/similarities/docsim.py | 77 +- gensim/similarities/levenshtein.py | 153 + gensim/similarities/termsim.py | 394 ++ gensim/test/test_keyedvectors.py | 99 +- gensim/test/test_similarities.py | 393 +- setup.py | 1 + 12 files changed, 5771 insertions(+), 229 deletions(-) create mode 100644 docs/notebooks/soft_cosine_benchmark.ipynb create mode 100644 gensim/similarities/levenshtein.py create mode 100644 gensim/similarities/termsim.py diff --git a/docs/notebooks/soft_cosine_benchmark.ipynb b/docs/notebooks/soft_cosine_benchmark.ipynb new file mode 100644 index 0000000000..9421b84c17 --- /dev/null +++ b/docs/notebooks/soft_cosine_benchmark.ipynb @@ -0,0 +1,4605 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Benchmark: Implement Levenshtein term similarity matrix and fast SCM between corpora ([RaRe-Technologies/gensim PR #2016][#2016])\n", + "\n", + " [#2016]: https://github.com/RaRe-Technologies/gensim/pull/2016 (Implement Levenshtein term similarity matrix and fast SCM between corpora - Pull Request #2016)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "d429fedf094e00c4bb5c27589d5befb53b2e4b13\r\n" + ] + } + ], + "source": [ + "!git rev-parse HEAD" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from copy import deepcopy\n", + "from datetime import timedelta\n", + "from itertools import product\n", + "import logging\n", + "from math import floor, ceil, log10\n", + "import pickle\n", + "from random import sample, seed, shuffle\n", + "from time import time\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "from tqdm import tqdm_notebook\n", + "\n", + "def tqdm(iterable, total=None, desc=None):\n", + " if total is None:\n", + " total = len(iterable)\n", + " for num_done, element in enumerate(tqdm_notebook(iterable, total=total)):\n", + " logger.info(\"%s: %d / %d\", desc, num_done, total)\n", + " yield element\n", + "\n", + "from gensim.corpora import Dictionary\n", + "import gensim.downloader as api\n", + "from gensim.similarities.index import AnnoyIndexer\n", + "from gensim.similarities import SparseTermSimilarityMatrix\n", + "from gensim.similarities import UniformTermSimilarityIndex\n", + "from gensim.similarities import LevenshteinSimilarityIndex\n", + "from gensim.models import WordEmbeddingSimilarityIndex\n", + "from gensim.utils import simple_preprocess\n", + "\n", + "RANDOM_SEED = 12345\n", + "\n", + "logger = logging.getLogger()\n", + "fhandler = logging.FileHandler(filename='matrix_speed.log', mode='a')\n", + "formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')\n", + "fhandler.setFormatter(formatter)\n", + "logger.addHandler(fhandler)\n", + "logger.setLevel(logging.INFO)\n", + "\n", + "pd.set_option('display.max_rows', None, 'display.max_seq_items', None)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Repeatedly run a benchmark callable given various configurations and\n", + "get a list of results.\n", + "\n", + "Return a list of results of repeatedly running a benchmark callable.\n", + "\n", + "Parameters\n", + "----------\n", + "benchmark : callable tuple -> dict\n", + " A benchmark callable that accepts a configuration and returns results.\n", + "configurations : iterable of tuple\n", + " An iterable of configurations that are used for calling the benchmark function.\n", + "results_filename : str\n", + " A filename of a file that will be used to persistently store the results using\n", + " pickle. If the file exists, then the function will load the stored results\n", + " instead of calling the benchmark callable.\n", + "\n", + "Returns\n", + "-------\n", + "iterable of tuple\n", + " The return values of the individual invocations of the benchmark callable.\n", + "\n", + "\"\"\"\n", + "def benchmark_results(benchmark, configurations, results_filename):\n", + " try:\n", + " with open(results_filename, \"rb\") as file:\n", + " results = pickle.load(file)\n", + " except IOError:\n", + " configurations = list(configurations)\n", + " shuffle(configurations)\n", + " results = list(tqdm(\n", + " (benchmark(configuration) for configuration in configurations),\n", + " total=len(configurations), desc=\"benchmark\"))\n", + " with open(results_filename, \"wb\") as file:\n", + " pickle.dump(results, file)\n", + " return results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Implement Levenshtein term similarity matrix\n", + "\n", + "In Gensim PR [#1827][], we added a base implementation of the soft cosine measure (SCM). The base implementation would create term similarity matrices using a single complex procedure. In the Gensim PR [#2016][], we split the procedure into:\n", + "\n", + "- **TermSimilarityIndex** builder classes that produce the $k$ most similar terms for a given term $t$ that are distinct from $t$ along with the term similarities, and\n", + "- the **SparseTermSimilarityMatrix** director class that constructs term similarity matrices and consumes term similarities produced by **TermSimilarityIndex** instances.\n", + "\n", + "One of the benefits of this separation is that we can easily measure the speed at which a **TermSimilarityIndex** builder class produces term similarities and compare this speed with the speed at which the **SparseTermSimilarityMatrix** director class consumes term similarities. This allows us to see which of the classes are a bottleneck that slows down the construction of term similarity matrices.\n", + "\n", + "In this notebook, we measure all the currently available builder and director classes. For the measurements, we use the [Google News word embeddings][word2vec-google-news-300] distributed with the C implementation of Word2Vec. 
From the word embeddings, we will derive a dictionary of 2.01M terms.\n", + "\n", + " [word2vec-google-news-300]: https://github.com/mmihaltz/word2vec-GoogleNews-vectors (word2vec-GoogleNews-vectors)\n", + " [#1827]: https://github.com/RaRe-Technologies/gensim/pull/1827 (Implement Soft Cosine Measure - Pull Request #1827)\n", + " [#2016]: https://github.com/RaRe-Technologies/gensim/pull/2016 (Implement Levenshtein term similarity matrix and fast SCM between corpora - Pull Request #2016)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "full_model = api.load(\"word2vec-google-news-300\")\n", + "\n", + "try:\n", + " full_dictionary = Dictionary.load(\"matrix_speed.dictionary\")\n", + "except IOError:\n", + " full_dictionary = Dictionary([[term] for term in full_model.vocab.keys()])\n", + " full_dictionary.save(\"matrix_speed.dictionary\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Director class benchmark\n", + "#### SparseTermSimilarityMatrix\n", + "First, we measure the speed at which the **SparseTermSimilarityMatrix** director class consumes term similarities." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "def benchmark(configuration):\n", + " dictionary, nonzero_limit, symmetric, positive_definite, repetition = configuration\n", + " index = UniformTermSimilarityIndex(dictionary)\n", + " \n", + " start_time = time()\n", + " matrix = SparseTermSimilarityMatrix(\n", + " index, dictionary, nonzero_limit=nonzero_limit, symmetric=symmetric,\n", + " positive_definite=positive_definite, dtype=np.float16).matrix\n", + " end_time = time()\n", + " \n", + " duration = end_time - start_time\n", + " return {\n", + " \"dictionary_size\": len(dictionary),\n", + " \"nonzero_limit\": nonzero_limit,\n", + " \"matrix_nonzero\": matrix.nnz,\n", + " \"repetition\": repetition,\n", + " \"symmetric\": symmetric,\n", + " \"positive_definite\": positive_definite,\n", + " \"duration\": duration, }" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4aef903a70e24247ad3c889237ed4c48", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=4), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "dictionary_sizes = [10**k for k in range(3, int(ceil(log10(len(full_dictionary)))))]\n", + "seed(RANDOM_SEED)\n", + "dictionaries = []\n", + "for size in tqdm(dictionary_sizes, desc=\"dictionaries\"):\n", + " dictionary = Dictionary([sample(list(full_dictionary.values()), size)])\n", + " dictionaries.append(dictionary)\n", + "dictionaries.append(full_dictionary)\n", + "nonzero_limits = [1, 10, 100]\n", + "symmetry = (True, False)\n", + "positive_definiteness = (True, False)\n", + "repetitions = range(10)\n", + "\n", + "configurations = product(dictionaries, nonzero_limits, symmetry, positive_definiteness, repetitions)\n", + "results = benchmark_results(benchmark, configurations, \"matrix_speed.director_results\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following tables show how long it takes to construct a term similarity matrix (the **duration** column), how many nonzero elements there are in the matrix 
(the **matrix_nonzero** column) and the mean term similarity consumption speed (the **consumption_speed** column) as we vary the dictionary size (the **dictionary_size** column), the maximum number of nonzero elements outside the diagonal in every column of the matrix (the **nonzero_limit** column), the matrix symmetry constraint (the **symmetric** column), and the matrix positive definiteness constraint (the **positive_definite** column). Ten independent measurements were taken. The top table shows the mean values and the bottom table shows the standard deviations.\n", + "\n", + "We can see that the symmetry and positive definiteness constraints severely limit the number of nonzero elements in the resulting matrix. This in turn increases the consumption speed, since we end up throwing away most of the elements that we consume. The effects of the dictionary size on the mean term similarity consumption speed are minor to none." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(results)\n", + "df[\"consumption_speed\"] = df.dictionary_size * df.nonzero_limit / df.duration\n", + "df = df.groupby([\"dictionary_size\", \"nonzero_limit\", \"symmetric\", \"positive_definite\"])\n", + "\n", + "def display(df):\n", + "    df[\"duration\"] = [timedelta(0, duration) for duration in df[\"duration\"]]\n", + "    df[\"matrix_nonzero\"] = [int(nonzero) for nonzero in df[\"matrix_nonzero\"]]\n", + "    df[\"consumption_speed\"] = [\"%.02f Kword pairs / s\" % (speed / 1000) for speed in df[\"consumption_speed\"]]\n", + "    return df" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
durationmatrix_nonzeroconsumption_speed
dictionary_sizenonzero_limitsymmetricpositive_definite
100001FalseFalse00:00:00.4355332000022.96 Kword pairs / s
True00:00:00.4926062000020.30 Kword pairs / s
TrueFalse00:00:00.1855631000253.90 Kword pairs / s
True00:00:00.2404711000241.59 Kword pairs / s
10FalseFalse00:00:02.68783611000037.21 Kword pairs / s
True00:00:00.61549220000162.49 Kword pairs / s
TrueFalse00:00:00.50118810118199.53 Kword pairs / s
True00:00:01.3805861001072.44 Kword pairs / s
100FalseFalse00:00:25.262807101000039.58 Kword pairs / s
True00:00:01.13252420000883.02 Kword pairs / s
TrueFalse00:00:03.59566620198278.13 Kword pairs / s
True00:00:11.8189121010084.61 Kword pairs / s
20100001FalseFalse00:01:31.786585402000021.90 Kword pairs / s
True00:01:40.954580402000019.91 Kword pairs / s
TrueFalse00:00:39.050064201000251.48 Kword pairs / s
True00:00:49.238437201000240.82 Kword pairs / s
10FalseFalse00:09:35.4703732211000034.93 Kword pairs / s
True00:02:02.9203344020000163.52 Kword pairs / s
TrueFalse00:01:39.5766932010118201.88 Kword pairs / s
True00:04:35.646501201001072.92 Kword pairs / s
100FalseFalse01:42:01.74756820301000032.88 Kword pairs / s
True00:03:36.4207784020000928.75 Kword pairs / s
TrueFalse00:10:58.4340602020198305.30 Kword pairs / s
True00:39:40.319479201010084.44 Kword pairs / s
\n", + "
" + ], + "text/plain": [ + " duration \\\n", + "dictionary_size nonzero_limit symmetric positive_definite \n", + "10000 1 False False 00:00:00.435533 \n", + " True 00:00:00.492606 \n", + " True False 00:00:00.185563 \n", + " True 00:00:00.240471 \n", + " 10 False False 00:00:02.687836 \n", + " True 00:00:00.615492 \n", + " True False 00:00:00.501188 \n", + " True 00:00:01.380586 \n", + " 100 False False 00:00:25.262807 \n", + " True 00:00:01.132524 \n", + " True False 00:00:03.595666 \n", + " True 00:00:11.818912 \n", + "2010000 1 False False 00:01:31.786585 \n", + " True 00:01:40.954580 \n", + " True False 00:00:39.050064 \n", + " True 00:00:49.238437 \n", + " 10 False False 00:09:35.470373 \n", + " True 00:02:02.920334 \n", + " True False 00:01:39.576693 \n", + " True 00:04:35.646501 \n", + " 100 False False 01:42:01.747568 \n", + " True 00:03:36.420778 \n", + " True False 00:10:58.434060 \n", + " True 00:39:40.319479 \n", + "\n", + " matrix_nonzero \\\n", + "dictionary_size nonzero_limit symmetric positive_definite \n", + "10000 1 False False 20000 \n", + " True 20000 \n", + " True False 10002 \n", + " True 10002 \n", + " 10 False False 110000 \n", + " True 20000 \n", + " True False 10118 \n", + " True 10010 \n", + " 100 False False 1010000 \n", + " True 20000 \n", + " True False 20198 \n", + " True 10100 \n", + "2010000 1 False False 4020000 \n", + " True 4020000 \n", + " True False 2010002 \n", + " True 2010002 \n", + " 10 False False 22110000 \n", + " True 4020000 \n", + " True False 2010118 \n", + " True 2010010 \n", + " 100 False False 203010000 \n", + " True 4020000 \n", + " True False 2020198 \n", + " True 2010100 \n", + "\n", + " consumption_speed \n", + "dictionary_size nonzero_limit symmetric positive_definite \n", + "10000 1 False False 22.96 Kword pairs / s \n", + " True 20.30 Kword pairs / s \n", + " True False 53.90 Kword pairs / s \n", + " True 41.59 Kword pairs / s \n", + " 10 False False 37.21 Kword pairs / s \n", + " True 162.49 Kword pairs / s \n", + " True False 199.53 Kword pairs / s \n", + " True 72.44 Kword pairs / s \n", + " 100 False False 39.58 Kword pairs / s \n", + " True 883.02 Kword pairs / s \n", + " True False 278.13 Kword pairs / s \n", + " True 84.61 Kword pairs / s \n", + "2010000 1 False False 21.90 Kword pairs / s \n", + " True 19.91 Kword pairs / s \n", + " True False 51.48 Kword pairs / s \n", + " True 40.82 Kword pairs / s \n", + " 10 False False 34.93 Kword pairs / s \n", + " True 163.52 Kword pairs / s \n", + " True False 201.88 Kword pairs / s \n", + " True 72.92 Kword pairs / s \n", + " 100 False False 32.88 Kword pairs / s \n", + " True 928.75 Kword pairs / s \n", + " True False 305.30 Kword pairs / s \n", + " True 84.44 Kword pairs / s " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.mean()).loc[\n", + " [10000, len(full_dictionary)], :, :].loc[\n", + " :, [\"duration\", \"matrix_nonzero\", \"consumption_speed\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
durationmatrix_nonzeroconsumption_speed
dictionary_sizenonzero_limitsymmetricpositive_definite
100001FalseFalse00:00:00.00533400.28 Kword pairs / s
True00:00:00.00407200.17 Kword pairs / s
TrueFalse00:00:00.00312400.90 Kword pairs / s
True00:00:00.00179700.31 Kword pairs / s
10FalseFalse00:00:00.01198600.17 Kword pairs / s
True00:00:00.00597201.59 Kword pairs / s
TrueFalse00:00:00.00286901.15 Kword pairs / s
True00:00:00.01141100.60 Kword pairs / s
100FalseFalse00:00:00.11111800.17 Kword pairs / s
True00:00:00.00761105.94 Kword pairs / s
TrueFalse00:00:00.03087502.38 Kword pairs / s
True00:00:00.05019800.36 Kword pairs / s
20100001FalseFalse00:00:00.76730500.18 Kword pairs / s
True00:00:00.17243200.03 Kword pairs / s
TrueFalse00:00:00.34623900.46 Kword pairs / s
True00:00:00.17707500.15 Kword pairs / s
10FalseFalse00:00:05.15665500.31 Kword pairs / s
True00:00:00.63167600.83 Kword pairs / s
TrueFalse00:00:01.21606702.41 Kword pairs / s
True00:00:00.54777300.14 Kword pairs / s
100FalseFalse00:04:10.37103501.24 Kword pairs / s
True00:00:00.63441602.73 Kword pairs / s
TrueFalse00:00:06.58676703.05 Kword pairs / s
True00:00:09.03093200.32 Kword pairs / s
\n", + "
" + ], + "text/plain": [ + " duration \\\n", + "dictionary_size nonzero_limit symmetric positive_definite \n", + "10000 1 False False 00:00:00.005334 \n", + " True 00:00:00.004072 \n", + " True False 00:00:00.003124 \n", + " True 00:00:00.001797 \n", + " 10 False False 00:00:00.011986 \n", + " True 00:00:00.005972 \n", + " True False 00:00:00.002869 \n", + " True 00:00:00.011411 \n", + " 100 False False 00:00:00.111118 \n", + " True 00:00:00.007611 \n", + " True False 00:00:00.030875 \n", + " True 00:00:00.050198 \n", + "2010000 1 False False 00:00:00.767305 \n", + " True 00:00:00.172432 \n", + " True False 00:00:00.346239 \n", + " True 00:00:00.177075 \n", + " 10 False False 00:00:05.156655 \n", + " True 00:00:00.631676 \n", + " True False 00:00:01.216067 \n", + " True 00:00:00.547773 \n", + " 100 False False 00:04:10.371035 \n", + " True 00:00:00.634416 \n", + " True False 00:00:06.586767 \n", + " True 00:00:09.030932 \n", + "\n", + " matrix_nonzero \\\n", + "dictionary_size nonzero_limit symmetric positive_definite \n", + "10000 1 False False 0 \n", + " True 0 \n", + " True False 0 \n", + " True 0 \n", + " 10 False False 0 \n", + " True 0 \n", + " True False 0 \n", + " True 0 \n", + " 100 False False 0 \n", + " True 0 \n", + " True False 0 \n", + " True 0 \n", + "2010000 1 False False 0 \n", + " True 0 \n", + " True False 0 \n", + " True 0 \n", + " 10 False False 0 \n", + " True 0 \n", + " True False 0 \n", + " True 0 \n", + " 100 False False 0 \n", + " True 0 \n", + " True False 0 \n", + " True 0 \n", + "\n", + " consumption_speed \n", + "dictionary_size nonzero_limit symmetric positive_definite \n", + "10000 1 False False 0.28 Kword pairs / s \n", + " True 0.17 Kword pairs / s \n", + " True False 0.90 Kword pairs / s \n", + " True 0.31 Kword pairs / s \n", + " 10 False False 0.17 Kword pairs / s \n", + " True 1.59 Kword pairs / s \n", + " True False 1.15 Kword pairs / s \n", + " True 0.60 Kword pairs / s \n", + " 100 False False 0.17 Kword pairs / s \n", + " True 5.94 Kword pairs / s \n", + " True False 2.38 Kword pairs / s \n", + " True 0.36 Kword pairs / s \n", + "2010000 1 False False 0.18 Kword pairs / s \n", + " True 0.03 Kword pairs / s \n", + " True False 0.46 Kword pairs / s \n", + " True 0.15 Kword pairs / s \n", + " 10 False False 0.31 Kword pairs / s \n", + " True 0.83 Kword pairs / s \n", + " True False 2.41 Kword pairs / s \n", + " True 0.14 Kword pairs / s \n", + " 100 False False 1.24 Kword pairs / s \n", + " True 2.73 Kword pairs / s \n", + " True False 3.05 Kword pairs / s \n", + " True 0.32 Kword pairs / s " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.apply(lambda x: (x - x.mean()).std())).loc[\n", + " [10000, len(full_dictionary)], :, :].loc[\n", + " :, [\"duration\", \"matrix_nonzero\", \"consumption_speed\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Builder class benchmark\n", + "#### UniformTermSimilarityIndex\n", + "First, we measure the speed at which the **UniformTermSimilarityIndex** builder class produces term similarities. **UniformTermSimilarityIndex** is a dummy class that just generates a sequence of constants. It produces much more term similarities per second than the **SparseTermSimilarityMatrix** is capable of consuming and its results will serve as an upper limit." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def benchmark(configuration):\n", + "    dictionary, nonzero_limit, repetition = configuration\n", + "    \n", + "    start_time = time()\n", + "    index = UniformTermSimilarityIndex(dictionary)\n", + "    end_time = time()\n", + "    constructor_duration = end_time - start_time\n", + "    \n", + "    start_time = time()\n", + "    for term in dictionary.values():\n", + "        for _j, _k in zip(index.most_similar(term, topn=nonzero_limit), range(nonzero_limit)):\n", + "            pass\n", + "    end_time = time()\n", + "    production_duration = end_time - start_time\n", + "    \n", + "    return {\n", + "        \"dictionary_size\": len(dictionary),\n", + "        \"nonzero_limit\": nonzero_limit,\n", + "        \"repetition\": repetition,\n", + "        \"constructor_duration\": constructor_duration,\n", + "        \"production_duration\": production_duration, }" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "nonzero_limits = [1, 10, 100, 1000]\n", + "\n", + "configurations = product(dictionaries, nonzero_limits, repetitions)\n", + "results = benchmark_results(benchmark, configurations, \"matrix_speed.builder_results.uniform\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following tables show how long it takes to retrieve the most similar terms for all terms in a dictionary (the **production_duration** column) and the mean term similarity production speed (the **production_speed** column) as we vary the dictionary size (the **dictionary_size** column), and the maximum number of most similar terms that will be retrieved (the **nonzero_limit** column). Ten independent measurements were taken. The top table shows the mean values and the bottom table shows the standard deviations.\n", + "\n", + "The **production_speed** is proportional to **nonzero_limit**." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(results)\n", + "df[\"processing_speed\"] = df.dictionary_size ** 2 / df.production_duration\n", + "df[\"production_speed\"] = df.dictionary_size * df.nonzero_limit / df.production_duration\n", + "df = df.groupby([\"dictionary_size\", \"nonzero_limit\"])\n", + "\n", + "def display(df):\n", + "    df[\"constructor_duration\"] = [timedelta(0, duration) for duration in df[\"constructor_duration\"]]\n", + "    df[\"production_duration\"] = [timedelta(0, duration) for duration in df[\"production_duration\"]]\n", + "    df[\"processing_speed\"] = [\"%.02f Kword pairs / s\" % (speed / 1000) for speed in df[\"processing_speed\"]]\n", + "    df[\"production_speed\"] = [\"%.02f Kword pairs / s\" % (speed / 1000) for speed in df[\"production_speed\"]]\n", + "    return df" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
production_durationproduction_speed
dictionary_sizenonzero_limit
1000100:00:00.002973336.41 Kword pairs / s
1000:00:00.0053721861.64 Kword pairs / s
10000:00:00.0267523738.79 Kword pairs / s
100000:00:00.2902653449.16 Kword pairs / s
2010000100:00:06.318446318.12 Kword pairs / s
1000:00:10.7836111863.96 Kword pairs / s
10000:00:53.1086443785.04 Kword pairs / s
100000:09:45.1037413437.36 Kword pairs / s
\n", + "
" + ], + "text/plain": [ + " production_duration production_speed\n", + "dictionary_size nonzero_limit \n", + "1000 1 00:00:00.002973 336.41 Kword pairs / s\n", + " 10 00:00:00.005372 1861.64 Kword pairs / s\n", + " 100 00:00:00.026752 3738.79 Kword pairs / s\n", + " 1000 00:00:00.290265 3449.16 Kword pairs / s\n", + "2010000 1 00:00:06.318446 318.12 Kword pairs / s\n", + " 10 00:00:10.783611 1863.96 Kword pairs / s\n", + " 100 00:00:53.108644 3785.04 Kword pairs / s\n", + " 1000 00:09:45.103741 3437.36 Kword pairs / s" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.mean()).loc[\n", + " [1000, len(full_dictionary)], :, :].loc[\n", + " :, [\"production_duration\", \"production_speed\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
production_durationproduction_speed
dictionary_sizenonzero_limit
1000100:00:00.0000171.93 Kword pairs / s
1000:00:00.00006221.50 Kword pairs / s
10000:00:00.00040856.66 Kword pairs / s
100000:00:00.010500123.82 Kword pairs / s
2010000100:00:00.0234951.18 Kword pairs / s
1000:00:00.0355876.16 Kword pairs / s
10000:00:00.53576537.76 Kword pairs / s
100000:00:15.03781689.56 Kword pairs / s
\n", + "
" + ], + "text/plain": [ + " production_duration production_speed\n", + "dictionary_size nonzero_limit \n", + "1000 1 00:00:00.000017 1.93 Kword pairs / s\n", + " 10 00:00:00.000062 21.50 Kword pairs / s\n", + " 100 00:00:00.000408 56.66 Kword pairs / s\n", + " 1000 00:00:00.010500 123.82 Kword pairs / s\n", + "2010000 1 00:00:00.023495 1.18 Kword pairs / s\n", + " 10 00:00:00.035587 6.16 Kword pairs / s\n", + " 100 00:00:00.535765 37.76 Kword pairs / s\n", + " 1000 00:00:15.037816 89.56 Kword pairs / s" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.apply(lambda x: (x - x.mean()).std())).loc[\n", + " [1000, len(full_dictionary)], :, :].loc[\n", + " :, [\"production_duration\", \"production_speed\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### LevenshteinSimilarityIndex\n", + "Next, we measure the speed at which the **LevenshteinSimilarityIndex** builder class produces term similarities. **LevenshteinSimilarityIndex** is currently just a naïve implementation that produces much fewer term similarities per second than the **SparseTermSimilarityMatrix** class is capable of consuming." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def benchmark(configuration):\n", + " dictionary, nonzero_limit, query_terms, repetition = configuration\n", + " \n", + " start_time = time()\n", + " index = LevenshteinSimilarityIndex(dictionary)\n", + " end_time = time()\n", + " constructor_duration = end_time - start_time\n", + " \n", + " start_time = time()\n", + " for term in query_terms:\n", + " for _j, _k in zip(index.most_similar(term, topn=nonzero_limit), range(nonzero_limit)):\n", + " pass\n", + " end_time = time()\n", + " production_duration = end_time - start_time\n", + " \n", + " return {\n", + " \"dictionary_size\": len(dictionary),\n", + " \"mean_query_term_length\": np.mean([len(term) for term in query_terms]),\n", + " \"nonzero_limit\": nonzero_limit,\n", + " \"repetition\": repetition,\n", + " \"constructor_duration\": constructor_duration,\n", + " \"production_duration\": production_duration, }" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "nonzero_limits = [1, 10, 100]\n", + "seed(RANDOM_SEED)\n", + "min_dictionary = sorted((len(dictionary), dictionary) for dictionary in dictionaries)[0][1]\n", + "query_terms = sample(list(min_dictionary.values()), 10)\n", + "\n", + "configurations = product(dictionaries, nonzero_limits, [query_terms], repetitions)\n", + "results = benchmark_results(benchmark, configurations, \"matrix_speed.builder_results.levenshtein\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following tables show how long it takes to retrieve the most similar terms for ten randomly sampled terms from a dictionary (the **production_duration** column), the mean term similarity production speed (the **production_speed** column) and the mean term similarity processing speed (the **processing_speed** column) as we vary the dictionary size (the **dictionary_size** column), and the maximum number of most similar terms that will be retrieved (the **nonzero_limit** column). Ten independendent measurements were taken. The top table shows the mean values and the bottom table shows the standard deviations.\n", + "\n", + "The **production_speed** is proportional to **nonzero_limit / dictionary_size**. 
The **processing_speed** is constant." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(results)\n", + "df[\"processing_speed\"] = df.dictionary_size * len(query_terms) / df.production_duration\n", + "df[\"production_speed\"] = df.nonzero_limit * len(query_terms) / df.production_duration\n", + "df = df.groupby([\"dictionary_size\", \"nonzero_limit\"])\n", + "\n", + "def display(df):\n", + " df[\"constructor_duration\"] = [timedelta(0, duration) for duration in df[\"constructor_duration\"]]\n", + " df[\"production_duration\"] = [timedelta(0, duration) for duration in df[\"production_duration\"]]\n", + " df[\"processing_speed\"] = [\"%.02f Kword pairs / s\" % (speed / 1000) for speed in df[\"processing_speed\"]]\n", + " df[\"production_speed\"] = [\"%.02f word pairs / s\" % speed for speed in df[\"production_speed\"]]\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
production_durationproduction_speedprocessing_speed
dictionary_sizenonzero_limit
1000100:00:00.055994178.61 word pairs / s178.61 Kword pairs / s
1000:00:00.0560971782.70 word pairs / s178.27 Kword pairs / s
10000:00:00.05621217791.65 word pairs / s177.92 Kword pairs / s
1000000100:01:20.6180700.12 word pairs / s124.05 Kword pairs / s
1000:01:20.0482381.25 word pairs / s124.92 Kword pairs / s
10000:01:20.06499912.49 word pairs / s124.90 Kword pairs / s
2010000100:02:44.0693990.06 word pairs / s122.51 Kword pairs / s
1000:02:43.9146010.61 word pairs / s122.63 Kword pairs / s
10000:02:43.8924086.10 word pairs / s122.64 Kword pairs / s
\n", + "
" + ], + "text/plain": [ + " production_duration production_speed \\\n", + "dictionary_size nonzero_limit \n", + "1000 1 00:00:00.055994 178.61 word pairs / s \n", + " 10 00:00:00.056097 1782.70 word pairs / s \n", + " 100 00:00:00.056212 17791.65 word pairs / s \n", + "1000000 1 00:01:20.618070 0.12 word pairs / s \n", + " 10 00:01:20.048238 1.25 word pairs / s \n", + " 100 00:01:20.064999 12.49 word pairs / s \n", + "2010000 1 00:02:44.069399 0.06 word pairs / s \n", + " 10 00:02:43.914601 0.61 word pairs / s \n", + " 100 00:02:43.892408 6.10 word pairs / s \n", + "\n", + " processing_speed \n", + "dictionary_size nonzero_limit \n", + "1000 1 178.61 Kword pairs / s \n", + " 10 178.27 Kword pairs / s \n", + " 100 177.92 Kword pairs / s \n", + "1000000 1 124.05 Kword pairs / s \n", + " 10 124.92 Kword pairs / s \n", + " 100 124.90 Kword pairs / s \n", + "2010000 1 122.51 Kword pairs / s \n", + " 10 122.63 Kword pairs / s \n", + " 100 122.64 Kword pairs / s " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.mean()).loc[\n", + " [1000, 1000000, len(full_dictionary)], :].loc[\n", + " :, [\"production_duration\", \"production_speed\", \"processing_speed\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
production_durationproduction_speedprocessing_speed
dictionary_sizenonzero_limit
1000100:00:00.0006732.16 word pairs / s2.16 Kword pairs / s
1000:00:00.00040913.06 word pairs / s1.31 Kword pairs / s
10000:00:00.000621196.80 word pairs / s1.97 Kword pairs / s
1000000100:00:00.8106610.00 word pairs / s1.23 Kword pairs / s
1000:00:00.1100130.00 word pairs / s0.17 Kword pairs / s
10000:00:00.1649590.03 word pairs / s0.26 Kword pairs / s
2010000100:00:01.1592730.00 word pairs / s0.85 Kword pairs / s
1000:00:00.4290110.00 word pairs / s0.32 Kword pairs / s
10000:00:00.4336870.02 word pairs / s0.32 Kword pairs / s
\n", + "
" + ], + "text/plain": [ + " production_duration production_speed \\\n", + "dictionary_size nonzero_limit \n", + "1000 1 00:00:00.000673 2.16 word pairs / s \n", + " 10 00:00:00.000409 13.06 word pairs / s \n", + " 100 00:00:00.000621 196.80 word pairs / s \n", + "1000000 1 00:00:00.810661 0.00 word pairs / s \n", + " 10 00:00:00.110013 0.00 word pairs / s \n", + " 100 00:00:00.164959 0.03 word pairs / s \n", + "2010000 1 00:00:01.159273 0.00 word pairs / s \n", + " 10 00:00:00.429011 0.00 word pairs / s \n", + " 100 00:00:00.433687 0.02 word pairs / s \n", + "\n", + " processing_speed \n", + "dictionary_size nonzero_limit \n", + "1000 1 2.16 Kword pairs / s \n", + " 10 1.31 Kword pairs / s \n", + " 100 1.97 Kword pairs / s \n", + "1000000 1 1.23 Kword pairs / s \n", + " 10 0.17 Kword pairs / s \n", + " 100 0.26 Kword pairs / s \n", + "2010000 1 0.85 Kword pairs / s \n", + " 10 0.32 Kword pairs / s \n", + " 100 0.32 Kword pairs / s " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.apply(lambda x: (x - x.mean()).std())).loc[\n", + " [1000, 1000000, len(full_dictionary)], :].loc[\n", + " :, [\"production_duration\", \"production_speed\", \"processing_speed\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### WordEmbeddingSimilarityIndex\n", + "Lastly, we measure the speed at which the **WordEmbeddingSimilarityIndex** builder class constructs an instance and produces term similarities. Gensim currently supports slow and precise nearest neighbor search, and also approximate nearest neighbor search using [ANNOY][]. We evaluate both options.\n", + "\n", + " [ANNOY]: https://github.com/spotify/annoy (Approximate Nearest Neighbors in C++/Python optimized for memory usage and loading/saving to disk)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "def benchmark(configuration):\n", + " (model, dictionary), nonzero_limit, annoy_n_trees, query_terms, repetition = configuration\n", + " use_annoy = annoy_n_trees > 0\n", + " model.init_sims()\n", + " \n", + " start_time = time()\n", + " if use_annoy:\n", + " annoy = AnnoyIndexer(model, annoy_n_trees)\n", + " kwargs = {\"indexer\": annoy}\n", + " else:\n", + " kwargs = {}\n", + " index = WordEmbeddingSimilarityIndex(model, kwargs=kwargs)\n", + " end_time = time()\n", + " constructor_duration = end_time - start_time\n", + " \n", + " start_time = time()\n", + " for term in query_terms:\n", + " for _j, _k in zip(index.most_similar(term, topn=nonzero_limit), range(nonzero_limit)):\n", + " pass\n", + " end_time = time()\n", + " production_duration = end_time - start_time\n", + " \n", + " return {\n", + " \"dictionary_size\": len(dictionary),\n", + " \"mean_query_term_length\": np.mean([len(term) for term in query_terms]),\n", + " \"nonzero_limit\": nonzero_limit,\n", + " \"use_annoy\": use_annoy,\n", + " \"annoy_n_trees\": annoy_n_trees,\n", + " \"repetition\": repetition,\n", + " \"constructor_duration\": constructor_duration,\n", + " \"production_duration\": production_duration, }" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "842bb1a60f814110a8f20eb44a973397", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=5), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "models = []\n", + "for dictionary in tqdm(dictionaries, desc=\"models\"):\n", + " if dictionary == full_dictionary:\n", + " models.append(full_model)\n", + " continue\n", + " model = full_model.__class__(full_model.vector_size)\n", + " model.vocab = {word: deepcopy(full_model.vocab[word]) for word in dictionary.values()}\n", + " model.index2entity = []\n", + " vector_indices = []\n", + " for index, word in enumerate(full_model.index2entity):\n", + " if word in model.vocab.keys():\n", + " model.index2entity.append(word)\n", + " model.vocab[word].index = len(vector_indices)\n", + " vector_indices.append(index)\n", + " model.vectors = full_model.vectors[vector_indices]\n", + " models.append(model)\n", + "annoy_n_trees = [0] + [10**k for k in range(3)]\n", + "seed(RANDOM_SEED)\n", + "query_terms = sample(list(min_dictionary.values()), 1000)\n", + "\n", + "configurations = product(zip(models, dictionaries), nonzero_limits, annoy_n_trees, [query_terms], repetitions)\n", + "results = benchmark_results(benchmark, configurations, \"matrix_speed.builder_results.wordembeddings\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following tables show how long it takes to construct an ANNOY index and the builder class instance (the **constructor_duration** column), how long it takes to retrieve the most similar terms for 1,000 randomly sampled terms from a dictionary (the **production_duration** column), the mean term similarity production speed (the **production_speed** column) and the mean term similarity processing speed (the **processing_speed** column) as we vary the dictionary size (the **dictionary_size** column), the maximum number of most similar terms that will be retrieved (the **nonzero_limit** column), and the number of constructed ANNOY trees (the **annoy_n_trees** column). Ten independendent measurements were taken. The top table shows the mean values and the bottom table shows the standard deviations.\n", + "\n", + "If we do not use ANNOY (**annoy_n_trees**${}=0$), then **production_speed** is proportional to **nonzero_limit / dictionary_size**. \n", + "If we do use ANNOY (**annoy_n_trees**${}>0$), then **production_speed** is proportional to **nonzero_limit / (annoy_n_trees)**${}^{1/2}$." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(results)\n", + "df[\"processing_speed\"] = df.dictionary_size * len(query_terms) / df.production_duration\n", + "df[\"production_speed\"] = df.nonzero_limit * len(query_terms) / df.production_duration\n", + "df = df.groupby([\"dictionary_size\", \"nonzero_limit\", \"annoy_n_trees\"])\n", + "\n", + "def display(df):\n", + " df[\"constructor_duration\"] = [timedelta(0, duration) for duration in df[\"constructor_duration\"]]\n", + " df[\"production_duration\"] = [timedelta(0, duration) for duration in df[\"production_duration\"]]\n", + " df[\"processing_speed\"] = [\"%.02f Kword pairs / s\" % (speed / 1000) for speed in df[\"processing_speed\"]]\n", + " df[\"production_speed\"] = [\"%.02f Kword pairs / s\" % (speed / 1000) for speed in df[\"production_speed\"]]\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
constructor_durationproduction_durationproduction_speedprocessing_speed
dictionary_sizenonzero_limitannoy_n_trees
10000001000:00:00.00000700:00:19.9629770.05 Kword pairs / s50094.22 Kword pairs / s
100:00:30.26879700:00:00.09701110.32 Kword pairs / s10320061.76 Kword pairs / s
10000:06:23.41598200:00:00.1608706.24 Kword pairs / s6236688.27 Kword pairs / s
100000:00:00.00000800:00:22.8683724.37 Kword pairs / s43729.34 Kword pairs / s
100:00:31.15487600:00:00.156238641.91 Kword pairs / s6419086.99 Kword pairs / s
10000:06:23.29057200:00:01.29744577.13 Kword pairs / s771277.71 Kword pairs / s
20100001000:00:00.00000700:01:55.3032160.01 Kword pairs / s17432.79 Kword pairs / s
100:01:34.00419600:00:00.1904635.25 Kword pairs / s10561607.14 Kword pairs / s
10000:23:29.79600600:00:00.3395002.96 Kword pairs / s5954865.50 Kword pairs / s
100000:00:00.00000700:02:11.9268610.76 Kword pairs / s15236.46 Kword pairs / s
100:01:35.81341400:00:00.301120332.38 Kword pairs / s6680879.02 Kword pairs / s
10000:23:05.15539900:00:03.03152733.42 Kword pairs / s671683.05 Kword pairs / s
\n", + "
" + ], + "text/plain": [ + " constructor_duration \\\n", + "dictionary_size nonzero_limit annoy_n_trees \n", + "1000000 1 0 00:00:00.000007 \n", + " 1 00:00:30.268797 \n", + " 100 00:06:23.415982 \n", + " 100 0 00:00:00.000008 \n", + " 1 00:00:31.154876 \n", + " 100 00:06:23.290572 \n", + "2010000 1 0 00:00:00.000007 \n", + " 1 00:01:34.004196 \n", + " 100 00:23:29.796006 \n", + " 100 0 00:00:00.000007 \n", + " 1 00:01:35.813414 \n", + " 100 00:23:05.155399 \n", + "\n", + " production_duration \\\n", + "dictionary_size nonzero_limit annoy_n_trees \n", + "1000000 1 0 00:00:19.962977 \n", + " 1 00:00:00.097011 \n", + " 100 00:00:00.160870 \n", + " 100 0 00:00:22.868372 \n", + " 1 00:00:00.156238 \n", + " 100 00:00:01.297445 \n", + "2010000 1 0 00:01:55.303216 \n", + " 1 00:00:00.190463 \n", + " 100 00:00:00.339500 \n", + " 100 0 00:02:11.926861 \n", + " 1 00:00:00.301120 \n", + " 100 00:00:03.031527 \n", + "\n", + " production_speed \\\n", + "dictionary_size nonzero_limit annoy_n_trees \n", + "1000000 1 0 0.05 Kword pairs / s \n", + " 1 10.32 Kword pairs / s \n", + " 100 6.24 Kword pairs / s \n", + " 100 0 4.37 Kword pairs / s \n", + " 1 641.91 Kword pairs / s \n", + " 100 77.13 Kword pairs / s \n", + "2010000 1 0 0.01 Kword pairs / s \n", + " 1 5.25 Kword pairs / s \n", + " 100 2.96 Kword pairs / s \n", + " 100 0 0.76 Kword pairs / s \n", + " 1 332.38 Kword pairs / s \n", + " 100 33.42 Kword pairs / s \n", + "\n", + " processing_speed \n", + "dictionary_size nonzero_limit annoy_n_trees \n", + "1000000 1 0 50094.22 Kword pairs / s \n", + " 1 10320061.76 Kword pairs / s \n", + " 100 6236688.27 Kword pairs / s \n", + " 100 0 43729.34 Kword pairs / s \n", + " 1 6419086.99 Kword pairs / s \n", + " 100 771277.71 Kword pairs / s \n", + "2010000 1 0 17432.79 Kword pairs / s \n", + " 1 10561607.14 Kword pairs / s \n", + " 100 5954865.50 Kword pairs / s \n", + " 100 0 15236.46 Kword pairs / s \n", + " 1 6680879.02 Kword pairs / s \n", + " 100 671683.05 Kword pairs / s " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.mean()).loc[\n", + " [1000000, len(full_dictionary)], [1, 100], [0, 1, 100]].loc[\n", + " :, [\"constructor_duration\", \"production_duration\", \"production_speed\", \"processing_speed\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
constructor_durationproduction_durationproduction_speedprocessing_speed
dictionary_sizenonzero_limitannoy_n_trees
10000001000:00:00.00000200:00:00.1156440.00 Kword pairs / s286.27 Kword pairs / s
100:00:01.85409700:00:00.0035170.37 Kword pairs / s367959.55 Kword pairs / s
10000:00:04.70203500:00:00.0104440.35 Kword pairs / s350506.05 Kword pairs / s
100000:00:00.00000200:00:00.1048720.02 Kword pairs / s198.86 Kword pairs / s
100:00:01.16367800:00:00.00893936.14 Kword pairs / s361441.71 Kword pairs / s
10000:00:06.81856800:00:00.0369792.07 Kword pairs / s20741.69 Kword pairs / s
20100001000:00:00.00000100:00:00.6531770.00 Kword pairs / s97.50 Kword pairs / s
100:00:04.67720900:00:00.0056790.16 Kword pairs / s311832.91 Kword pairs / s
10000:01:38.56268400:00:00.0298870.22 Kword pairs / s434681.25 Kword pairs / s
100000:00:00.00000100:00:00.9796130.01 Kword pairs / s111.85 Kword pairs / s
100:00:03.20747400:00:00.00947910.18 Kword pairs / s204614.80 Kword pairs / s
10000:00:55.11959500:00:00.4195313.46 Kword pairs / s69543.35 Kword pairs / s
\n", + "
" + ], + "text/plain": [ + " constructor_duration \\\n", + "dictionary_size nonzero_limit annoy_n_trees \n", + "1000000 1 0 00:00:00.000002 \n", + " 1 00:00:01.854097 \n", + " 100 00:00:04.702035 \n", + " 100 0 00:00:00.000002 \n", + " 1 00:00:01.163678 \n", + " 100 00:00:06.818568 \n", + "2010000 1 0 00:00:00.000001 \n", + " 1 00:00:04.677209 \n", + " 100 00:01:38.562684 \n", + " 100 0 00:00:00.000001 \n", + " 1 00:00:03.207474 \n", + " 100 00:00:55.119595 \n", + "\n", + " production_duration \\\n", + "dictionary_size nonzero_limit annoy_n_trees \n", + "1000000 1 0 00:00:00.115644 \n", + " 1 00:00:00.003517 \n", + " 100 00:00:00.010444 \n", + " 100 0 00:00:00.104872 \n", + " 1 00:00:00.008939 \n", + " 100 00:00:00.036979 \n", + "2010000 1 0 00:00:00.653177 \n", + " 1 00:00:00.005679 \n", + " 100 00:00:00.029887 \n", + " 100 0 00:00:00.979613 \n", + " 1 00:00:00.009479 \n", + " 100 00:00:00.419531 \n", + "\n", + " production_speed \\\n", + "dictionary_size nonzero_limit annoy_n_trees \n", + "1000000 1 0 0.00 Kword pairs / s \n", + " 1 0.37 Kword pairs / s \n", + " 100 0.35 Kword pairs / s \n", + " 100 0 0.02 Kword pairs / s \n", + " 1 36.14 Kword pairs / s \n", + " 100 2.07 Kword pairs / s \n", + "2010000 1 0 0.00 Kword pairs / s \n", + " 1 0.16 Kword pairs / s \n", + " 100 0.22 Kword pairs / s \n", + " 100 0 0.01 Kword pairs / s \n", + " 1 10.18 Kword pairs / s \n", + " 100 3.46 Kword pairs / s \n", + "\n", + " processing_speed \n", + "dictionary_size nonzero_limit annoy_n_trees \n", + "1000000 1 0 286.27 Kword pairs / s \n", + " 1 367959.55 Kword pairs / s \n", + " 100 350506.05 Kword pairs / s \n", + " 100 0 198.86 Kword pairs / s \n", + " 1 361441.71 Kword pairs / s \n", + " 100 20741.69 Kword pairs / s \n", + "2010000 1 0 97.50 Kword pairs / s \n", + " 1 311832.91 Kword pairs / s \n", + " 100 434681.25 Kword pairs / s \n", + " 100 0 111.85 Kword pairs / s \n", + " 1 204614.80 Kword pairs / s \n", + " 100 69543.35 Kword pairs / s " + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.apply(lambda x: (x - x.mean()).std())).loc[\n", + " [1000000, len(full_dictionary)], [1, 100], [0, 1, 100]].loc[\n", + " :, [\"constructor_duration\", \"production_duration\", \"production_speed\", \"processing_speed\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Implement fast SCM between corpora\n", + "\n", + "In Gensim PR [#1827][], we added a base implementation of the soft cosine measure (SCM). The base implementation would compute SCM between single documents using the **softcossim** function. In the Gensim PR [#2016][], we intruduced the **SparseTermSimilarityMatrix.inner_product** method, which computes SCM not only between single documents, but also between a document and a corpus, and between two corpora.\n", + "\n", + "For the measurements, we use the [Google News word embeddings][word2vec-google-news-300] distributed with the C implementation of Word2Vec. From the word embeddings, we will derive a dictionary of 2.01m terms. 
As a corpus, we will use a random sample of 100K articles from the 4.92m English [Wikipedia articles][enwiki].\n", + "\n", + " [word2vec-google-news-300]: https://github.com/mmihaltz/word2vec-GoogleNews-vectors (word2vec-GoogleNews-vectors)\n", + " [enwiki]: https://github.com/RaRe-Technologies/gensim-data/releases/tag/wiki-english-20171001 (wiki-english-20171001)\n", + " [#1827]: https://github.com/RaRe-Technologies/gensim/pull/1827 (Implement Soft Cosine Measure - Pull Request #1827)\n", + " [#2016]: https://github.com/RaRe-Technologies/gensim/pull/2016 (Implement Levenshtein term similarity matrix and fast SCM between corpora - Pull Request #2016)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "full_model = api.load(\"word2vec-google-news-300\")\n", + "\n", + "try:\n", + " with open(\"matrix_speed.corpus\", \"rb\") as file:\n", + " full_corpus = pickle.load(file) \n", + "except IOError:\n", + " original_corpus = list(tqdm(api.load(\"wiki-english-20171001\"), desc=\"original_corpus\", total=4924894))\n", + " seed(RANDOM_SEED)\n", + " full_corpus = [\n", + " simple_preprocess(u'\\n'.join(article[\"section_texts\"]))\n", + " for article in tqdm(sample(original_corpus, 10**5), desc=\"full_corpus\", total=10**5)]\n", + " del original_corpus\n", + " with open(\"matrix_speed.corpus\", \"wb\") as file:\n", + " pickle.dump(full_corpus, file)\n", + "\n", + "try:\n", + " full_dictionary = Dictionary.load(\"matrix_speed.dictionary\")\n", + "except IOError:\n", + " full_dictionary = Dictionary([[term] for term in full_model.vocab.keys()])\n", + " full_dictionary.save(\"matrix_speed.dictionary\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SCM between two documents\n", + "First, we measure the speed at which the **inner_product** method produces term similarities between single documents." 
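Before measuring anything, it may help to see the call being timed in isolation. The following is a minimal sketch of computing SCM between two single documents with **inner_product**; it is not part of the benchmark, and it assumes the small `glove-wiki-gigaword-50` vectors rather than the Google News vectors used in the measurements, so that it runs quickly.

```python
# Minimal sketch of the operation being timed: SCM between two documents.
# Assumption: the small glove-wiki-gigaword-50 vectors stand in for the
# Google News vectors used in the actual measurements.
import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.models import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix

model = api.load("glove-wiki-gigaword-50")
documents = [
    "obama speaks to the media in illinois".split(),
    "the president greets the press in chicago".split(),
]
dictionary = Dictionary(documents)
similarity_index = WordEmbeddingSimilarityIndex(model)
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary)

vec1, vec2 = (dictionary.doc2bow(document) for document in documents)
# With normalized=True, the inner product is the soft cosine similarity.
print(similarity_matrix.inner_product(vec1, vec2, normalized=True))
```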
+ ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "def benchmark(configuration):\n", + " (matrix, dictionary, nonzero_limit), corpus, normalized, repetition = configuration\n", + " corpus_size = len(corpus)\n", + " corpus = [dictionary.doc2bow(doc) for doc in corpus]\n", + " corpus = [vec for vec in corpus if len(vec) > 0]\n", + " \n", + " start_time = time()\n", + " for vec1 in corpus:\n", + " for vec2 in corpus:\n", + " matrix.inner_product(vec1, vec2, normalized=normalized)\n", + " end_time = time()\n", + " duration = end_time - start_time\n", + " \n", + " return {\n", + " \"dictionary_size\": matrix.matrix.shape[0],\n", + " \"matrix_nonzero\": matrix.matrix.nnz,\n", + " \"nonzero_limit\": nonzero_limit,\n", + " \"normalized\": normalized,\n", + " \"corpus_size\": corpus_size,\n", + " \"corpus_actual_size\": len(corpus),\n", + " \"corpus_nonzero\": sum(len(vec) for vec in corpus),\n", + " \"mean_document_length\": np.mean([len(doc) for doc in corpus]),\n", + " \"repetition\": repetition,\n", + " \"duration\": duration, }" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "110675d5552847819754f0dc5b1c19e1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=2), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "744e400d597440f79b5923dafb1974fc", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=2), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0f84efc0c79a4628a9543736fc5f0c9a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=2), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8a185a8e530e4481b90056222f5f0a1c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=6), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/mnt/storage/home/novotny/.virtualenvs/gensim/lib/python3.4/site-packages/gensim/matutils.py:738: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. 
In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
+       "  if np.issubdtype(vec.dtype, np.int):\n"
+      ]
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "\n"
+      ]
+     }
+    ],
+    "source": [
+     "seed(RANDOM_SEED)\n",
+     "dictionary_sizes = [1000, 100000]\n",
+     "dictionaries = []\n",
+     "for size in tqdm(dictionary_sizes, desc=\"dictionaries\"):\n",
+     "    dictionary = Dictionary([sample(list(full_dictionary.values()), size)])\n",
+     "    dictionaries.append(dictionary)\n",
+     "min_dictionary = sorted((len(dictionary), dictionary) for dictionary in dictionaries)[0][1]\n",
+     "\n",
+     "corpus_sizes = [100, 1000]\n",
+     "corpora = []\n",
+     "for size in tqdm(corpus_sizes, desc=\"corpora\"):\n",
+     "    corpus = sample(full_corpus, size)\n",
+     "    corpora.append(corpus)\n",
+     "\n",
+     "models = []\n",
+     "for dictionary in tqdm(dictionaries, desc=\"models\"):\n",
+     "    if dictionary == full_dictionary:\n",
+     "        models.append(full_model)\n",
+     "        continue\n",
+     "    model = full_model.__class__(full_model.vector_size)\n",
+     "    model.vocab = {word: deepcopy(full_model.vocab[word]) for word in dictionary.values()}\n",
+     "    model.index2entity = []\n",
+     "    vector_indices = []\n",
+     "    for index, word in enumerate(full_model.index2entity):\n",
+     "        if word in model.vocab.keys():\n",
+     "            model.index2entity.append(word)\n",
+     "            model.vocab[word].index = len(vector_indices)\n",
+     "            vector_indices.append(index)\n",
+     "    model.vectors = full_model.vectors[vector_indices]\n",
+     "    models.append(model)\n",
+     "\n",
+     "nonzero_limits = [1, 10, 100]\n",
+     "matrices = []\n",
+     "for (model, dictionary), nonzero_limit in tqdm(\n",
+     "        list(product(zip(models, dictionaries), nonzero_limits)), desc=\"matrices\"):\n",
+     "    annoy = AnnoyIndexer(model, 1)\n",
+     "    index = WordEmbeddingSimilarityIndex(model, kwargs={\"indexer\": annoy})\n",
+     "    matrix = SparseTermSimilarityMatrix(index, dictionary, nonzero_limit=nonzero_limit)\n",
+     "    matrices.append((matrix, dictionary, nonzero_limit))\n",
+     "    del annoy\n",
+     "\n",
+     "normalization = (True, False)\n",
+     "repetitions = range(10)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 28,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "configurations = product(matrices, corpora, normalization, repetitions)\n",
+     "results = benchmark_results(benchmark, configurations, \"matrix_speed.inner-product_results.doc_doc\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "The following tables show how long it takes the **inner_product** method to process all pairs of document vectors in a corpus (the **duration** column), how many nonzero elements there are in a corpus matrix (the **corpus_nonzero** column), how many nonzero elements there are in a term similarity matrix (the **matrix_nonzero** column), and the mean document similarity production speed (the **speed** column) as we vary the dictionary size (the **dictionary_size** column), the size of the corpus (the **corpus_size** column), the maximum number of nonzero elements in a single column of the matrix (the **nonzero_limit** column), and whether the inner products are normalized (the **normalized** column). Ten independent measurements were taken. The top table shows the mean values and the bottom table shows the standard deviations.\n",
+     "\n",
+     "The time to compute a single inner product is proportional to the square of the number of unique terms shared by the two document vectors. In our scenario, as in the standard IR scenario, documents share only a few terms, so the **speed** stays roughly constant. 
Computing a normalized inner product (**normalized**${}={}$True) results in a constant speed decrease." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(results)\n", + "df[\"speed\"] = df.corpus_actual_size**2 / df.duration\n", + "del df[\"corpus_actual_size\"]\n", + "df = df.groupby([\"dictionary_size\", \"corpus_size\", \"nonzero_limit\", \"normalized\"])\n", + "\n", + "def display(df):\n", + " df[\"duration\"] = [timedelta(0, duration) for duration in df[\"duration\"]]\n", + " df[\"speed\"] = [\"%.02f Kdoc pairs / s\" % (speed / 1000) for speed in df[\"speed\"]]\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
durationcorpus_nonzeromatrix_nonzerospeed
dictionary_sizecorpus_sizenonzero_limitnormalized
10001001False00:00:00.0073833.01000.01.23 Kdoc pairs / s
True00:00:00.0090283.01000.01.01 Kdoc pairs / s
100False00:00:00.0076573.084944.01.19 Kdoc pairs / s
True00:00:00.0082383.084944.01.10 Kdoc pairs / s
10001False00:00:00.41436426.01000.01.39 Kdoc pairs / s
True00:00:00.47378926.01000.01.22 Kdoc pairs / s
100False00:00:00.43083326.084944.01.35 Kdoc pairs / s
True00:00:00.45347726.084944.01.27 Kdoc pairs / s
1000001001False00:00:05.236376423.0101868.01.29 Kdoc pairs / s
True00:00:05.623463423.0101868.01.20 Kdoc pairs / s
100False00:00:05.083829423.08202884.01.33 Kdoc pairs / s
True00:00:05.576003423.08202884.01.21 Kdoc pairs / s
10001False00:08:59.2853475162.0101868.01.26 Kdoc pairs / s
True00:09:57.6932195162.0101868.01.14 Kdoc pairs / s
100False00:09:23.2134505162.08202884.01.21 Kdoc pairs / s
True00:10:10.6124585162.08202884.01.12 Kdoc pairs / s
\n", + "
" + ], + "text/plain": [ + " duration \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 00:00:00.007383 \n", + " True 00:00:00.009028 \n", + " 100 False 00:00:00.007657 \n", + " True 00:00:00.008238 \n", + " 1000 1 False 00:00:00.414364 \n", + " True 00:00:00.473789 \n", + " 100 False 00:00:00.430833 \n", + " True 00:00:00.453477 \n", + "100000 100 1 False 00:00:05.236376 \n", + " True 00:00:05.623463 \n", + " 100 False 00:00:05.083829 \n", + " True 00:00:05.576003 \n", + " 1000 1 False 00:08:59.285347 \n", + " True 00:09:57.693219 \n", + " 100 False 00:09:23.213450 \n", + " True 00:10:10.612458 \n", + "\n", + " corpus_nonzero \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 3.0 \n", + " True 3.0 \n", + " 100 False 3.0 \n", + " True 3.0 \n", + " 1000 1 False 26.0 \n", + " True 26.0 \n", + " 100 False 26.0 \n", + " True 26.0 \n", + "100000 100 1 False 423.0 \n", + " True 423.0 \n", + " 100 False 423.0 \n", + " True 423.0 \n", + " 1000 1 False 5162.0 \n", + " True 5162.0 \n", + " 100 False 5162.0 \n", + " True 5162.0 \n", + "\n", + " matrix_nonzero \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 1000.0 \n", + " True 1000.0 \n", + " 100 False 84944.0 \n", + " True 84944.0 \n", + " 1000 1 False 1000.0 \n", + " True 1000.0 \n", + " 100 False 84944.0 \n", + " True 84944.0 \n", + "100000 100 1 False 101868.0 \n", + " True 101868.0 \n", + " 100 False 8202884.0 \n", + " True 8202884.0 \n", + " 1000 1 False 101868.0 \n", + " True 101868.0 \n", + " 100 False 8202884.0 \n", + " True 8202884.0 \n", + "\n", + " speed \n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 1.23 Kdoc pairs / s \n", + " True 1.01 Kdoc pairs / s \n", + " 100 False 1.19 Kdoc pairs / s \n", + " True 1.10 Kdoc pairs / s \n", + " 1000 1 False 1.39 Kdoc pairs / s \n", + " True 1.22 Kdoc pairs / s \n", + " 100 False 1.35 Kdoc pairs / s \n", + " True 1.27 Kdoc pairs / s \n", + "100000 100 1 False 1.29 Kdoc pairs / s \n", + " True 1.20 Kdoc pairs / s \n", + " 100 False 1.33 Kdoc pairs / s \n", + " True 1.21 Kdoc pairs / s \n", + " 1000 1 False 1.26 Kdoc pairs / s \n", + " True 1.14 Kdoc pairs / s \n", + " 100 False 1.21 Kdoc pairs / s \n", + " True 1.12 Kdoc pairs / s " + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.mean()).loc[\n", + " [1000, 100000], :, [1, 100], :].loc[\n", + " :, [\"duration\", \"corpus_nonzero\", \"matrix_nonzero\", \"speed\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
durationcorpus_nonzeromatrix_nonzerospeed
dictionary_sizecorpus_sizenonzero_limitnormalized
10001001False00:00:00.0008710.00.00.13 Kdoc pairs / s
True00:00:00.0013150.00.00.14 Kdoc pairs / s
100False00:00:00.0008930.00.00.12 Kdoc pairs / s
True00:00:00.0006310.00.00.08 Kdoc pairs / s
10001False00:00:00.0144600.00.00.05 Kdoc pairs / s
True00:00:00.0252500.00.00.07 Kdoc pairs / s
100False00:00:00.0390880.00.00.11 Kdoc pairs / s
True00:00:00.0236020.00.00.06 Kdoc pairs / s
1000001001False00:00:00.2763590.00.00.07 Kdoc pairs / s
True00:00:00.2788060.00.00.06 Kdoc pairs / s
100False00:00:00.2867810.00.00.07 Kdoc pairs / s
True00:00:00.3133970.00.00.06 Kdoc pairs / s
10001False00:00:14.3211010.00.00.03 Kdoc pairs / s
True00:00:23.5261040.00.00.05 Kdoc pairs / s
100False00:00:05.8995270.00.00.01 Kdoc pairs / s
True00:00:24.4544220.00.00.05 Kdoc pairs / s
\n", + "
" + ], + "text/plain": [ + " duration \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 00:00:00.000871 \n", + " True 00:00:00.001315 \n", + " 100 False 00:00:00.000893 \n", + " True 00:00:00.000631 \n", + " 1000 1 False 00:00:00.014460 \n", + " True 00:00:00.025250 \n", + " 100 False 00:00:00.039088 \n", + " True 00:00:00.023602 \n", + "100000 100 1 False 00:00:00.276359 \n", + " True 00:00:00.278806 \n", + " 100 False 00:00:00.286781 \n", + " True 00:00:00.313397 \n", + " 1000 1 False 00:00:14.321101 \n", + " True 00:00:23.526104 \n", + " 100 False 00:00:05.899527 \n", + " True 00:00:24.454422 \n", + "\n", + " corpus_nonzero \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 1000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + "100000 100 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 1000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + "\n", + " matrix_nonzero \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 1000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + "100000 100 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 1000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + "\n", + " speed \n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 0.13 Kdoc pairs / s \n", + " True 0.14 Kdoc pairs / s \n", + " 100 False 0.12 Kdoc pairs / s \n", + " True 0.08 Kdoc pairs / s \n", + " 1000 1 False 0.05 Kdoc pairs / s \n", + " True 0.07 Kdoc pairs / s \n", + " 100 False 0.11 Kdoc pairs / s \n", + " True 0.06 Kdoc pairs / s \n", + "100000 100 1 False 0.07 Kdoc pairs / s \n", + " True 0.06 Kdoc pairs / s \n", + " 100 False 0.07 Kdoc pairs / s \n", + " True 0.06 Kdoc pairs / s \n", + " 1000 1 False 0.03 Kdoc pairs / s \n", + " True 0.05 Kdoc pairs / s \n", + " 100 False 0.01 Kdoc pairs / s \n", + " True 0.05 Kdoc pairs / s " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.apply(lambda x: (x - x.mean()).std())).loc[\n", + " [1000, 100000], :, [1, 100], :].loc[\n", + " :, [\"duration\", \"corpus_nonzero\", \"matrix_nonzero\", \"speed\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SCM between a document and a corpus\n", + "Next, we measure the speed at which the **inner_product** method produces term similarities between documents and a corpus." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "def benchmark(configuration):\n", + " (matrix, dictionary, nonzero_limit), corpus, normalized, repetition = configuration\n", + " corpus_size = len(corpus)\n", + " corpus = [dictionary.doc2bow(doc) for doc in corpus if doc]\n", + " \n", + " start_time = time()\n", + " for vec in corpus:\n", + " matrix.inner_product(vec, corpus, normalized=normalized)\n", + " end_time = time()\n", + " duration = end_time - start_time\n", + " \n", + " return {\n", + " \"dictionary_size\": matrix.matrix.shape[0],\n", + " \"matrix_nonzero\": matrix.matrix.nnz,\n", + " \"nonzero_limit\": nonzero_limit,\n", + " \"normalized\": normalized,\n", + " \"corpus_size\": corpus_size,\n", + " \"corpus_actual_size\": len(corpus),\n", + " \"corpus_nonzero\": sum(len(vec) for vec in corpus),\n", + " \"mean_document_length\": np.mean([len(doc) for doc in corpus]),\n", + " \"repetition\": repetition,\n", + " \"duration\": duration, }" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "configurations = product(matrices, corpora, normalization, repetitions)\n", + "results = benchmark_results(benchmark, configurations, \"matrix_speed.inner-product_results.doc_corpus\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The **speed** is inversely proportional to **matrix_nonzero**. Computing a normalized inner product (**normalized**${}={}$True) results in a constant speed decrease." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(results)\n", + "df[\"speed\"] = df.corpus_actual_size**2 / df.duration\n", + "del df[\"corpus_actual_size\"]\n", + "df = df.groupby([\"dictionary_size\", \"corpus_size\", \"nonzero_limit\", \"normalized\"])\n", + "\n", + "def display(df):\n", + " df[\"duration\"] = [timedelta(0, duration) for duration in df[\"duration\"]]\n", + " df[\"speed\"] = [\"%.02f Kdoc pairs / s\" % (speed / 1000) for speed in df[\"speed\"]]\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
durationcorpus_nonzeromatrix_nonzerospeed
dictionary_sizecorpus_sizenonzero_limitnormalized
10001001False00:00:00.0093633.01000.01117.12 Kdoc pairs / s
True00:00:00.0109483.01000.0954.13 Kdoc pairs / s
100False00:00:00.0141283.084944.0728.91 Kdoc pairs / s
True00:00:00.0181643.084944.0551.78 Kdoc pairs / s
10001False00:00:00.07209126.01000.013872.12 Kdoc pairs / s
True00:00:00.07928426.01000.012615.36 Kdoc pairs / s
100False00:00:00.16248326.084944.06188.43 Kdoc pairs / s
True00:00:00.20308126.084944.04924.48 Kdoc pairs / s
1000001001False00:00:00.278253423.0101868.036.05 Kdoc pairs / s
True00:00:00.298519423.0101868.033.56 Kdoc pairs / s
100False00:00:36.326167423.08202884.00.28 Kdoc pairs / s
True00:00:36.928802423.08202884.00.27 Kdoc pairs / s
10001False00:00:07.4033015162.0101868.0135.08 Kdoc pairs / s
True00:00:07.7949435162.0101868.0128.29 Kdoc pairs / s
100False00:05:55.6747125162.08202884.02.81 Kdoc pairs / s
True00:06:05.5613985162.08202884.02.74 Kdoc pairs / s
\n", + "
" + ], + "text/plain": [ + " duration \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 00:00:00.009363 \n", + " True 00:00:00.010948 \n", + " 100 False 00:00:00.014128 \n", + " True 00:00:00.018164 \n", + " 1000 1 False 00:00:00.072091 \n", + " True 00:00:00.079284 \n", + " 100 False 00:00:00.162483 \n", + " True 00:00:00.203081 \n", + "100000 100 1 False 00:00:00.278253 \n", + " True 00:00:00.298519 \n", + " 100 False 00:00:36.326167 \n", + " True 00:00:36.928802 \n", + " 1000 1 False 00:00:07.403301 \n", + " True 00:00:07.794943 \n", + " 100 False 00:05:55.674712 \n", + " True 00:06:05.561398 \n", + "\n", + " corpus_nonzero \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 3.0 \n", + " True 3.0 \n", + " 100 False 3.0 \n", + " True 3.0 \n", + " 1000 1 False 26.0 \n", + " True 26.0 \n", + " 100 False 26.0 \n", + " True 26.0 \n", + "100000 100 1 False 423.0 \n", + " True 423.0 \n", + " 100 False 423.0 \n", + " True 423.0 \n", + " 1000 1 False 5162.0 \n", + " True 5162.0 \n", + " 100 False 5162.0 \n", + " True 5162.0 \n", + "\n", + " matrix_nonzero \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 1000.0 \n", + " True 1000.0 \n", + " 100 False 84944.0 \n", + " True 84944.0 \n", + " 1000 1 False 1000.0 \n", + " True 1000.0 \n", + " 100 False 84944.0 \n", + " True 84944.0 \n", + "100000 100 1 False 101868.0 \n", + " True 101868.0 \n", + " 100 False 8202884.0 \n", + " True 8202884.0 \n", + " 1000 1 False 101868.0 \n", + " True 101868.0 \n", + " 100 False 8202884.0 \n", + " True 8202884.0 \n", + "\n", + " speed \n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 1117.12 Kdoc pairs / s \n", + " True 954.13 Kdoc pairs / s \n", + " 100 False 728.91 Kdoc pairs / s \n", + " True 551.78 Kdoc pairs / s \n", + " 1000 1 False 13872.12 Kdoc pairs / s \n", + " True 12615.36 Kdoc pairs / s \n", + " 100 False 6188.43 Kdoc pairs / s \n", + " True 4924.48 Kdoc pairs / s \n", + "100000 100 1 False 36.05 Kdoc pairs / s \n", + " True 33.56 Kdoc pairs / s \n", + " 100 False 0.28 Kdoc pairs / s \n", + " True 0.27 Kdoc pairs / s \n", + " 1000 1 False 135.08 Kdoc pairs / s \n", + " True 128.29 Kdoc pairs / s \n", + " 100 False 2.81 Kdoc pairs / s \n", + " True 2.74 Kdoc pairs / s " + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.mean()).loc[\n", + " [1000, 100000], :, [1, 100], :].loc[\n", + " :, [\"duration\", \"corpus_nonzero\", \"matrix_nonzero\", \"speed\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
durationcorpus_nonzeromatrix_nonzerospeed
dictionary_sizecorpus_sizenonzero_limitnormalized
10001001False00:00:00.0021200.00.0242.09 Kdoc pairs / s
True00:00:00.0023870.00.0207.64 Kdoc pairs / s
100False00:00:00.0025310.00.0130.94 Kdoc pairs / s
True00:00:00.0009110.00.027.68 Kdoc pairs / s
10001False00:00:00.0005870.00.0112.92 Kdoc pairs / s
True00:00:00.0011910.00.0187.31 Kdoc pairs / s
100False00:00:00.0119440.00.0513.79 Kdoc pairs / s
True00:00:00.0017930.00.043.54 Kdoc pairs / s
1000001001False00:00:00.0161560.00.02.06 Kdoc pairs / s
True00:00:00.0134510.00.01.47 Kdoc pairs / s
100False00:00:01.3397870.00.00.01 Kdoc pairs / s
True00:00:01.6173400.00.00.01 Kdoc pairs / s
10001False00:00:00.0389610.00.00.71 Kdoc pairs / s
True00:00:00.0241540.00.00.40 Kdoc pairs / s
100False00:00:07.6048050.00.00.06 Kdoc pairs / s
True00:00:14.7995190.00.00.10 Kdoc pairs / s
\n", + "
" + ], + "text/plain": [ + " duration \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 00:00:00.002120 \n", + " True 00:00:00.002387 \n", + " 100 False 00:00:00.002531 \n", + " True 00:00:00.000911 \n", + " 1000 1 False 00:00:00.000587 \n", + " True 00:00:00.001191 \n", + " 100 False 00:00:00.011944 \n", + " True 00:00:00.001793 \n", + "100000 100 1 False 00:00:00.016156 \n", + " True 00:00:00.013451 \n", + " 100 False 00:00:01.339787 \n", + " True 00:00:01.617340 \n", + " 1000 1 False 00:00:00.038961 \n", + " True 00:00:00.024154 \n", + " 100 False 00:00:07.604805 \n", + " True 00:00:14.799519 \n", + "\n", + " corpus_nonzero \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 1000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + "100000 100 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 1000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + "\n", + " matrix_nonzero \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 1000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + "100000 100 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 1000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + "\n", + " speed \n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 242.09 Kdoc pairs / s \n", + " True 207.64 Kdoc pairs / s \n", + " 100 False 130.94 Kdoc pairs / s \n", + " True 27.68 Kdoc pairs / s \n", + " 1000 1 False 112.92 Kdoc pairs / s \n", + " True 187.31 Kdoc pairs / s \n", + " 100 False 513.79 Kdoc pairs / s \n", + " True 43.54 Kdoc pairs / s \n", + "100000 100 1 False 2.06 Kdoc pairs / s \n", + " True 1.47 Kdoc pairs / s \n", + " 100 False 0.01 Kdoc pairs / s \n", + " True 0.01 Kdoc pairs / s \n", + " 1000 1 False 0.71 Kdoc pairs / s \n", + " True 0.40 Kdoc pairs / s \n", + " 100 False 0.06 Kdoc pairs / s \n", + " True 0.10 Kdoc pairs / s " + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.apply(lambda x: (x - x.mean()).std())).loc[\n", + " [1000, 100000], :, [1, 100], :].loc[\n", + " :, [\"duration\", \"corpus_nonzero\", \"matrix_nonzero\", \"speed\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SCM between two corpora\n", + "Lastly, we measure the speed at which the **inner_product** method produces term similarities between entire corpora." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def benchmark(configuration):\n",
+    "    (matrix, dictionary, nonzero_limit), corpus, normalized, repetition = configuration\n",
+    "    corpus_size = len(corpus)\n",
+    "    corpus = [dictionary.doc2bow(doc) for doc in corpus]\n",
+    "    corpus = [vec for vec in corpus if len(vec) > 0]\n",
+    "    \n",
+    "    start_time = time()\n",
+    "    matrix.inner_product(corpus, corpus, normalized=normalized)\n",
+    "    end_time = time()\n",
+    "    duration = end_time - start_time\n",
+    "    \n",
+    "    return {\n",
+    "        \"dictionary_size\": matrix.matrix.shape[0],\n",
+    "        \"matrix_nonzero\": matrix.matrix.nnz,\n",
+    "        \"nonzero_limit\": nonzero_limit,\n",
+    "        \"normalized\": normalized,\n",
+    "        \"corpus_size\": corpus_size,\n",
+    "        \"corpus_actual_size\": len(corpus),\n",
+    "        \"corpus_nonzero\": sum(len(vec) for vec in corpus),\n",
+    "        \"mean_document_length\": np.mean([len(doc) for doc in corpus]),\n",
+    "        \"repetition\": repetition,\n",
+    "        \"duration\": duration, }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "84e1344be5d944fa98368e6b3994944a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(IntProgress(value=0, max=2), HTML(value='')))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/mnt/storage/home/novotny/.virtualenvs/gensim/lib/python3.4/site-packages/gensim/matutils.py:738: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
+      "  if np.issubdtype(vec.dtype, np.int):\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "nonzero_limits = [1000]\n",
+    "dense_matrices = []\n",
+    "for (model, dictionary), nonzero_limit in tqdm(\n",
+    "        list(product(zip(models, dictionaries), nonzero_limits)), desc=\"matrices\"):\n",
+    "    annoy = AnnoyIndexer(model, 1)\n",
+    "    index = WordEmbeddingSimilarityIndex(model, kwargs={\"indexer\": annoy})\n",
+    "    matrix = SparseTermSimilarityMatrix(index, dictionary, nonzero_limit=nonzero_limit)\n",
+    "    dense_matrices.append((matrix, dictionary, nonzero_limit))\n",
+    "    del annoy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "configurations = product(matrices + dense_matrices, corpora + [full_corpus], normalization, repetitions)\n",
+    "results = benchmark_results(benchmark, configurations, \"matrix_speed.inner-product_results.corpus_corpus\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.DataFrame(results)\n",
+    "df[\"speed\"] = df.corpus_actual_size**2 / df.duration\n",
+    "del df[\"corpus_actual_size\"]\n",
+    "df = df.groupby([\"dictionary_size\", \"corpus_size\", \"nonzero_limit\", \"normalized\"])\n",
+    "\n",
+    "def display(df):\n",
+    "    df[\"duration\"] = [timedelta(0, duration) for duration in df[\"duration\"]]\n",
+    "    df[\"speed\"] = [\"%.02f Kdoc pairs / s\" % (speed / 1000) for speed in df[\"speed\"]]\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
durationcorpus_nonzeromatrix_nonzerospeed
dictionary_sizecorpus_sizenonzero_limitnormalized
10001001False00:00:00.0014033.01000.06.69 Kdoc pairs / s
True00:00:00.0053133.01000.01.70 Kdoc pairs / s
10False00:00:00.0015653.08634.05.80 Kdoc pairs / s
True00:00:00.0053073.08634.01.70 Kdoc pairs / s
100False00:00:00.0031723.084944.03.05 Kdoc pairs / s
True00:00:00.0084613.084944.01.07 Kdoc pairs / s
1000False00:00:00.0213773.0838588.00.42 Kdoc pairs / s
True00:00:00.0552343.0838588.00.16 Kdoc pairs / s
10001False00:00:00.00137626.01000.0418.61 Kdoc pairs / s
True00:00:00.00501926.01000.0114.78 Kdoc pairs / s
10False00:00:00.00151126.08634.0381.50 Kdoc pairs / s
True00:00:00.00520826.08634.0110.60 Kdoc pairs / s
100False00:00:00.00353926.084944.0164.03 Kdoc pairs / s
True00:00:00.00850226.084944.067.81 Kdoc pairs / s
1000False00:00:00.02154826.0838588.026.73 Kdoc pairs / s
True00:00:00.05442526.0838588.010.59 Kdoc pairs / s
1000001False00:00:00.0199152914.01000.0391443.20 Kdoc pairs / s
True00:00:00.0261182914.01000.0298377.75 Kdoc pairs / s
10False00:00:00.0201522914.08634.0386722.55 Kdoc pairs / s
True00:00:00.0269982914.08634.0288567.14 Kdoc pairs / s
100False00:00:00.0283452914.084944.0274905.36 Kdoc pairs / s
True00:00:00.0410692914.084944.0189709.57 Kdoc pairs / s
1000False00:00:00.0899782914.0838588.086598.15 Kdoc pairs / s
True00:00:00.1856112914.0838588.041971.58 Kdoc pairs / s
1000001001False00:00:00.003345423.0101868.02013.92 Kdoc pairs / s
True00:00:00.008857423.0101868.0760.13 Kdoc pairs / s
10False00:00:00.032639423.0814154.0206.66 Kdoc pairs / s
True00:00:00.080591423.0814154.083.46 Kdoc pairs / s
100False00:00:00.488467423.08202884.013.77 Kdoc pairs / s
True00:00:01.454507423.08202884.04.62 Kdoc pairs / s
1000False00:00:04.973667423.089912542.01.35 Kdoc pairs / s
True00:00:15.035711423.089912542.00.45 Kdoc pairs / s
10001False00:00:00.0101415162.0101868.067139.73 Kdoc pairs / s
True00:00:00.0166855162.0101868.040798.02 Kdoc pairs / s
10False00:00:00.0413925162.0814154.016444.18 Kdoc pairs / s
True00:00:00.0916865162.0814154.07425.08 Kdoc pairs / s
100False00:00:00.5089165162.08202884.01338.94 Kdoc pairs / s
True00:00:01.4975565162.08202884.0454.49 Kdoc pairs / s
1000False00:00:05.1014895162.089912542.0133.44 Kdoc pairs / s
True00:00:15.3254155162.089912542.044.42 Kdoc pairs / s
1000001False00:00:37.145526525310.0101868.0192578.80 Kdoc pairs / s
True00:00:45.729004525310.0101868.0156431.36 Kdoc pairs / s
10False00:00:44.981806525310.0814154.0159029.88 Kdoc pairs / s
True00:00:54.245450525310.0814154.0131871.88 Kdoc pairs / s
100False00:01:15.925860525310.08202884.094216.21 Kdoc pairs / s
True00:01:29.232076525310.08202884.080177.08 Kdoc pairs / s
1000False00:03:17.140191525310.089912542.036286.25 Kdoc pairs / s
True00:04:05.865666525310.089912542.029097.14 Kdoc pairs / s
\n", + "
" + ], + "text/plain": [ + " duration \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 00:00:00.001403 \n", + " True 00:00:00.005313 \n", + " 10 False 00:00:00.001565 \n", + " True 00:00:00.005307 \n", + " 100 False 00:00:00.003172 \n", + " True 00:00:00.008461 \n", + " 1000 False 00:00:00.021377 \n", + " True 00:00:00.055234 \n", + " 1000 1 False 00:00:00.001376 \n", + " True 00:00:00.005019 \n", + " 10 False 00:00:00.001511 \n", + " True 00:00:00.005208 \n", + " 100 False 00:00:00.003539 \n", + " True 00:00:00.008502 \n", + " 1000 False 00:00:00.021548 \n", + " True 00:00:00.054425 \n", + " 100000 1 False 00:00:00.019915 \n", + " True 00:00:00.026118 \n", + " 10 False 00:00:00.020152 \n", + " True 00:00:00.026998 \n", + " 100 False 00:00:00.028345 \n", + " True 00:00:00.041069 \n", + " 1000 False 00:00:00.089978 \n", + " True 00:00:00.185611 \n", + "100000 100 1 False 00:00:00.003345 \n", + " True 00:00:00.008857 \n", + " 10 False 00:00:00.032639 \n", + " True 00:00:00.080591 \n", + " 100 False 00:00:00.488467 \n", + " True 00:00:01.454507 \n", + " 1000 False 00:00:04.973667 \n", + " True 00:00:15.035711 \n", + " 1000 1 False 00:00:00.010141 \n", + " True 00:00:00.016685 \n", + " 10 False 00:00:00.041392 \n", + " True 00:00:00.091686 \n", + " 100 False 00:00:00.508916 \n", + " True 00:00:01.497556 \n", + " 1000 False 00:00:05.101489 \n", + " True 00:00:15.325415 \n", + " 100000 1 False 00:00:37.145526 \n", + " True 00:00:45.729004 \n", + " 10 False 00:00:44.981806 \n", + " True 00:00:54.245450 \n", + " 100 False 00:01:15.925860 \n", + " True 00:01:29.232076 \n", + " 1000 False 00:03:17.140191 \n", + " True 00:04:05.865666 \n", + "\n", + " corpus_nonzero \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 3.0 \n", + " True 3.0 \n", + " 10 False 3.0 \n", + " True 3.0 \n", + " 100 False 3.0 \n", + " True 3.0 \n", + " 1000 False 3.0 \n", + " True 3.0 \n", + " 1000 1 False 26.0 \n", + " True 26.0 \n", + " 10 False 26.0 \n", + " True 26.0 \n", + " 100 False 26.0 \n", + " True 26.0 \n", + " 1000 False 26.0 \n", + " True 26.0 \n", + " 100000 1 False 2914.0 \n", + " True 2914.0 \n", + " 10 False 2914.0 \n", + " True 2914.0 \n", + " 100 False 2914.0 \n", + " True 2914.0 \n", + " 1000 False 2914.0 \n", + " True 2914.0 \n", + "100000 100 1 False 423.0 \n", + " True 423.0 \n", + " 10 False 423.0 \n", + " True 423.0 \n", + " 100 False 423.0 \n", + " True 423.0 \n", + " 1000 False 423.0 \n", + " True 423.0 \n", + " 1000 1 False 5162.0 \n", + " True 5162.0 \n", + " 10 False 5162.0 \n", + " True 5162.0 \n", + " 100 False 5162.0 \n", + " True 5162.0 \n", + " 1000 False 5162.0 \n", + " True 5162.0 \n", + " 100000 1 False 525310.0 \n", + " True 525310.0 \n", + " 10 False 525310.0 \n", + " True 525310.0 \n", + " 100 False 525310.0 \n", + " True 525310.0 \n", + " 1000 False 525310.0 \n", + " True 525310.0 \n", + "\n", + " matrix_nonzero \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 1000.0 \n", + " True 1000.0 \n", + " 10 False 8634.0 \n", + " True 8634.0 \n", + " 100 False 84944.0 \n", + " True 84944.0 \n", + " 1000 False 838588.0 \n", + " True 838588.0 \n", + " 1000 1 False 1000.0 \n", + " True 1000.0 \n", + " 10 False 8634.0 \n", + " True 8634.0 \n", + " 100 False 84944.0 \n", + " True 84944.0 \n", + " 1000 False 838588.0 \n", + " True 838588.0 \n", + " 100000 1 False 1000.0 \n", + " True 1000.0 \n", + " 10 False 8634.0 \n", + " True 8634.0 \n", + " 100 False 84944.0 \n", + " True 84944.0 
\n", + " 1000 False 838588.0 \n", + " True 838588.0 \n", + "100000 100 1 False 101868.0 \n", + " True 101868.0 \n", + " 10 False 814154.0 \n", + " True 814154.0 \n", + " 100 False 8202884.0 \n", + " True 8202884.0 \n", + " 1000 False 89912542.0 \n", + " True 89912542.0 \n", + " 1000 1 False 101868.0 \n", + " True 101868.0 \n", + " 10 False 814154.0 \n", + " True 814154.0 \n", + " 100 False 8202884.0 \n", + " True 8202884.0 \n", + " 1000 False 89912542.0 \n", + " True 89912542.0 \n", + " 100000 1 False 101868.0 \n", + " True 101868.0 \n", + " 10 False 814154.0 \n", + " True 814154.0 \n", + " 100 False 8202884.0 \n", + " True 8202884.0 \n", + " 1000 False 89912542.0 \n", + " True 89912542.0 \n", + "\n", + " speed \n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 6.69 Kdoc pairs / s \n", + " True 1.70 Kdoc pairs / s \n", + " 10 False 5.80 Kdoc pairs / s \n", + " True 1.70 Kdoc pairs / s \n", + " 100 False 3.05 Kdoc pairs / s \n", + " True 1.07 Kdoc pairs / s \n", + " 1000 False 0.42 Kdoc pairs / s \n", + " True 0.16 Kdoc pairs / s \n", + " 1000 1 False 418.61 Kdoc pairs / s \n", + " True 114.78 Kdoc pairs / s \n", + " 10 False 381.50 Kdoc pairs / s \n", + " True 110.60 Kdoc pairs / s \n", + " 100 False 164.03 Kdoc pairs / s \n", + " True 67.81 Kdoc pairs / s \n", + " 1000 False 26.73 Kdoc pairs / s \n", + " True 10.59 Kdoc pairs / s \n", + " 100000 1 False 391443.20 Kdoc pairs / s \n", + " True 298377.75 Kdoc pairs / s \n", + " 10 False 386722.55 Kdoc pairs / s \n", + " True 288567.14 Kdoc pairs / s \n", + " 100 False 274905.36 Kdoc pairs / s \n", + " True 189709.57 Kdoc pairs / s \n", + " 1000 False 86598.15 Kdoc pairs / s \n", + " True 41971.58 Kdoc pairs / s \n", + "100000 100 1 False 2013.92 Kdoc pairs / s \n", + " True 760.13 Kdoc pairs / s \n", + " 10 False 206.66 Kdoc pairs / s \n", + " True 83.46 Kdoc pairs / s \n", + " 100 False 13.77 Kdoc pairs / s \n", + " True 4.62 Kdoc pairs / s \n", + " 1000 False 1.35 Kdoc pairs / s \n", + " True 0.45 Kdoc pairs / s \n", + " 1000 1 False 67139.73 Kdoc pairs / s \n", + " True 40798.02 Kdoc pairs / s \n", + " 10 False 16444.18 Kdoc pairs / s \n", + " True 7425.08 Kdoc pairs / s \n", + " 100 False 1338.94 Kdoc pairs / s \n", + " True 454.49 Kdoc pairs / s \n", + " 1000 False 133.44 Kdoc pairs / s \n", + " True 44.42 Kdoc pairs / s \n", + " 100000 1 False 192578.80 Kdoc pairs / s \n", + " True 156431.36 Kdoc pairs / s \n", + " 10 False 159029.88 Kdoc pairs / s \n", + " True 131871.88 Kdoc pairs / s \n", + " 100 False 94216.21 Kdoc pairs / s \n", + " True 80177.08 Kdoc pairs / s \n", + " 1000 False 36286.25 Kdoc pairs / s \n", + " True 29097.14 Kdoc pairs / s " + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.mean()).loc[\n", + " [1000, 100000], :, [1, 10, 100, 1000], :].loc[\n", + " :, [\"duration\", \"corpus_nonzero\", \"matrix_nonzero\", \"speed\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
durationcorpus_nonzeromatrix_nonzerospeed
dictionary_sizecorpus_sizenonzero_limitnormalized
10001001False00:00:00.0002920.00.01.48 Kdoc pairs / s
True00:00:00.0002250.00.00.08 Kdoc pairs / s
100False00:00:00.0007470.00.01.02 Kdoc pairs / s
True00:00:00.0004880.00.00.07 Kdoc pairs / s
10001False00:00:00.0000270.00.08.10 Kdoc pairs / s
True00:00:00.0000690.00.01.56 Kdoc pairs / s
100False00:00:00.0003090.00.016.26 Kdoc pairs / s
True00:00:00.0002680.00.02.24 Kdoc pairs / s
1000001False00:00:00.0005760.00.011256.03 Kdoc pairs / s
True00:00:00.0005740.00.06512.19 Kdoc pairs / s
100False00:00:00.0005620.00.05233.50 Kdoc pairs / s
True00:00:00.0006090.00.02743.63 Kdoc pairs / s
1000001001False00:00:00.0001520.00.098.97 Kdoc pairs / s
True00:00:00.0003220.00.028.10 Kdoc pairs / s
100False00:00:00.0049970.00.00.14 Kdoc pairs / s
True00:00:00.0222060.00.00.07 Kdoc pairs / s
10001False00:00:00.0002100.00.01420.00 Kdoc pairs / s
True00:00:00.0001920.00.0467.23 Kdoc pairs / s
100False00:00:00.0190220.00.045.91 Kdoc pairs / s
True00:00:00.0044310.00.01.35 Kdoc pairs / s
1000001False00:00:00.0244660.00.0126.77 Kdoc pairs / s
True00:00:00.0624470.00.0213.64 Kdoc pairs / s
100False00:00:00.0876920.00.0108.55 Kdoc pairs / s
True00:00:01.0658890.00.0968.80 Kdoc pairs / s
\n", + "
" + ], + "text/plain": [ + " duration \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 00:00:00.000292 \n", + " True 00:00:00.000225 \n", + " 100 False 00:00:00.000747 \n", + " True 00:00:00.000488 \n", + " 1000 1 False 00:00:00.000027 \n", + " True 00:00:00.000069 \n", + " 100 False 00:00:00.000309 \n", + " True 00:00:00.000268 \n", + " 100000 1 False 00:00:00.000576 \n", + " True 00:00:00.000574 \n", + " 100 False 00:00:00.000562 \n", + " True 00:00:00.000609 \n", + "100000 100 1 False 00:00:00.000152 \n", + " True 00:00:00.000322 \n", + " 100 False 00:00:00.004997 \n", + " True 00:00:00.022206 \n", + " 1000 1 False 00:00:00.000210 \n", + " True 00:00:00.000192 \n", + " 100 False 00:00:00.019022 \n", + " True 00:00:00.004431 \n", + " 100000 1 False 00:00:00.024466 \n", + " True 00:00:00.062447 \n", + " 100 False 00:00:00.087692 \n", + " True 00:00:01.065889 \n", + "\n", + " corpus_nonzero \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 1000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 100000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + "100000 100 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 1000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 100000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + "\n", + " matrix_nonzero \\\n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 1000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 100000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + "100000 100 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 1000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + " 100000 1 False 0.0 \n", + " True 0.0 \n", + " 100 False 0.0 \n", + " True 0.0 \n", + "\n", + " speed \n", + "dictionary_size corpus_size nonzero_limit normalized \n", + "1000 100 1 False 1.48 Kdoc pairs / s \n", + " True 0.08 Kdoc pairs / s \n", + " 100 False 1.02 Kdoc pairs / s \n", + " True 0.07 Kdoc pairs / s \n", + " 1000 1 False 8.10 Kdoc pairs / s \n", + " True 1.56 Kdoc pairs / s \n", + " 100 False 16.26 Kdoc pairs / s \n", + " True 2.24 Kdoc pairs / s \n", + " 100000 1 False 11256.03 Kdoc pairs / s \n", + " True 6512.19 Kdoc pairs / s \n", + " 100 False 5233.50 Kdoc pairs / s \n", + " True 2743.63 Kdoc pairs / s \n", + "100000 100 1 False 98.97 Kdoc pairs / s \n", + " True 28.10 Kdoc pairs / s \n", + " 100 False 0.14 Kdoc pairs / s \n", + " True 0.07 Kdoc pairs / s \n", + " 1000 1 False 1420.00 Kdoc pairs / s \n", + " True 467.23 Kdoc pairs / s \n", + " 100 False 45.91 Kdoc pairs / s \n", + " True 1.35 Kdoc pairs / s \n", + " 100000 1 False 126.77 Kdoc pairs / s \n", + " True 213.64 Kdoc pairs / s \n", + " 100 False 108.55 Kdoc pairs / s \n", + " True 968.80 Kdoc pairs / s " + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "display(df.apply(lambda x: (x - x.mean()).std())).loc[\n", + " [1000, 100000], :, [1, 100], :].loc[\n", + " :, [\"duration\", \"corpus_nonzero\", \"matrix_nonzero\", \"speed\"]]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": 
"python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/notebooks/soft_cosine_tutorial.ipynb b/docs/notebooks/soft_cosine_tutorial.ipynb index e5d11dcd3f..957899c089 100644 --- a/docs/notebooks/soft_cosine_tutorial.ipynb +++ b/docs/notebooks/soft_cosine_tutorial.ipynb @@ -6,7 +6,7 @@ "source": [ "# Finding similar documents with Word2Vec and Soft Cosine Measure \n", "\n", - "Soft Cosine Measure (SCM) is a promising new tool in machine learning that allows us to submit a query and return the most relevant documents. In **part 1**, we will show how you can compute SCM between two documents using `softcossim`. In **part 2**, we will use `SoftCosineSimilarity` to retrieve documents most similar to a query and compare the performance against other similarity measures.\n", + "Soft Cosine Measure (SCM) [1, 4] is a promising new tool in machine learning that allows us to submit a query and return the most relevant documents. In **part 1**, we will show how you can compute SCM between two documents using the `inner_product` method. In **part 2**, we will use `SoftCosineSimilarity` to retrieve documents most similar to a query and compare the performance against other similarity measures.\n", "\n", "First, however, we go through the basics of what Soft Cosine Measure is.\n", "\n", @@ -22,7 +22,7 @@ "\n", "This method was perhaps first introduced in the article “Soft Measure and Soft Cosine Measure: Measure of Features in Vector Space Model” by Grigori Sidorov, Alexander Gelbukh, Helena Gomez-Adorno, and David Pinto ([link to PDF](http://www.scielo.org.mx/pdf/cys/v18n3/v18n3a7.pdf)).\n", "\n", - "In this tutorial, we will learn how to use Gensim's SCM functionality, which consists of the `softcossim` function for one-off computation, and the `SoftCosineSimilarity` class for corpus-based similarity queries.\n", + "In this tutorial, we will learn how to use Gensim's SCM functionality, which consists of the `inner_product` method for one-off computation, and the `SoftCosineSimilarity` class for corpus-based similarity queries.\n", "\n", "> **Note**:\n", ">\n", @@ -67,7 +67,7 @@ "source": [ "sentence_obama = 'Obama speaks to the media in Illinois'.lower().split()\n", "sentence_president = 'The president greets the press in Chicago'.lower().split()\n", - "sentence_orange = 'Oranges are my favorite fruit'.lower().split()" + "sentence_orange = 'Having a tough time finding an orange juice press machine?'.lower().split()" ] }, { @@ -84,19 +84,13 @@ "scrolled": true }, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package stopwords to /home/witiko/nltk_data...\n", - "[nltk_data] Package stopwords is already up-to-date!\n" - ] - }, { "name": "stderr", "output_type": "stream", "text": [ - "2018-02-05 10:47:42,975 : INFO : built Dictionary(11 unique tokens: ['president', 'fruit', 'greets', 'obama', 'illinois']...) 
from 3 documents (total 11 corpus positions)\n" + "2018-09-11 22:02:01,041 : INFO : 'pattern' package not found; tag filters are not available for English\n", + "2018-09-11 22:02:01,044 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2018-09-11 22:02:01,045 : INFO : built Dictionary(14 unique tokens: ['speaks', 'illinois', 'greets', 'juice', 'chicago']...) from 3 documents (total 15 corpus positions)\n" ] } ], @@ -116,7 +110,6 @@ "from gensim import corpora\n", "documents = [sentence_obama, sentence_president, sentence_orange]\n", "dictionary = corpora.Dictionary(documents)\n", - "corpus = [dictionary.doc2bow(document) for document in documents]\n", "\n", "# Convert the sentences into bag-of-words vectors.\n", "sentence_obama = dictionary.doc2bow(sentence_obama)\n", @@ -128,7 +121,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, as we mentioned earlier, we will be using some downloaded pre-trained embeddings. Note that the embeddings we have chosen here require a lot of memory. We will use the embeddings to construct a term similarity matrix that will be used by the `softcossim` function." + "Now, as we mentioned earlier, we will be using some downloaded pre-trained embeddings. Note that the embeddings we have chosen here require a lot of memory. We will use the embeddings to construct a term similarity matrix that will be used by the `inner_product` method." ] }, { @@ -140,31 +133,38 @@ "name": "stderr", "output_type": "stream", "text": [ - "2018-02-06 16:14:29,104 : INFO : constructed a term similarity matrix with 91.735537 % nonzero elements\n" + "2018-09-11 22:02:01,236 : INFO : loading projection weights from /home/novotny/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz\n", + "2018-09-11 22:02:26,984 : INFO : loaded (400000, 50) matrix from /home/novotny/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz\n", + "2018-09-11 22:02:26,985 : INFO : constructing a sparse term similarity matrix using \n", + "2018-09-11 22:02:26,986 : INFO : iterating over columns in dictionary order\n", + "2018-09-11 22:02:27,273 : INFO : constructed a sparse term similarity matrix with 11.224490% density\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 21.2 s, sys: 224 ms, total: 21.4 s\n", - "Wall time: 21.8 s\n" + "CPU times: user 27.8 s, sys: 2.43 s, total: 30.3 s\n", + "Wall time: 26.2 s\n" ] } ], "source": [ "%%time\n", "import gensim.downloader as api\n", + "from gensim.models import WordEmbeddingSimilarityIndex\n", + "from gensim.similarities import SparseTermSimilarityMatrix\n", "\n", "w2v_model = api.load(\"glove-wiki-gigaword-50\")\n", - "similarity_matrix = w2v_model.similarity_matrix(dictionary)" + "similarity_index = WordEmbeddingSimilarityIndex(w2v_model)\n", + "similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "So let's compute SCM using the `softcossim` function." + "Let's compute SCM using the `inner_product` method." 
] }, { @@ -176,14 +176,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "similarity = 0.5789\n" + "similarity = 0.3790\n" ] } ], "source": [ - "from gensim.matutils import softcossim\n", - "\n", - "similarity = softcossim(sentence_obama, sentence_president, similarity_matrix)\n", + "similarity = similarity_matrix.inner_product(sentence_obama, sentence_president, normalized=True)\n", "print('similarity = %.4f' % similarity)" ] }, @@ -203,12 +201,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "similarity = 0.1439\n" + "similarity = 0.1108\n" ] } ], "source": [ - "similarity = softcossim(sentence_obama, sentence_orange, similarity_matrix)\n", + "similarity = similarity_matrix.inner_product(sentence_obama, sentence_orange, normalized=True)\n", "print('similarity = %.4f' % similarity)" ] }, @@ -217,7 +215,7 @@ "metadata": {}, "source": [ "## Part 2: Similarity queries using `SoftCosineSimilarity`\n", - "You can use SCM to get the most similar documents to a query, using the SoftCosineSimilarity class. Its interface is similar to what is described in the [Similarity Queries](https://radimrehurek.com/gensim/tut3.html) Gensim tutorial.\n", + "You can use SCM to get the most similar documents to a query, using the `SoftCosineSimilarity` class. Its interface is similar to what is described in the [Similarity Queries](https://radimrehurek.com/gensim/tut3.html) Gensim tutorial.\n", "\n", "### Qatar Living unannotated dataset\n", "Contestants solving the community question answering task in the [SemEval 2016][semeval16] and [2017][semeval17] competitions had an unannotated dataset of 189,941 questions and 1,894,456 comments from the [Qatar Living][ql] discussion forums. As our first step, we will use the same dataset to build a corpus.\n", @@ -236,11 +234,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "[nltk_data] Downloading package stopwords to /home/witiko/nltk_data...\n", + "[nltk_data] Downloading package stopwords to\n", + "[nltk_data] /home/novotny/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "Number of documents: 3\n", - "CPU times: user 1min 59s, sys: 6.06 s, total: 2min 5s\n", - "Wall time: 2min 22s\n" + "CPU times: user 2min 37s, sys: 1.62 s, total: 2min 39s\n", + "Wall time: 2min 39s\n" ] } ], @@ -291,41 +290,60 @@ "cell_type": "code", "execution_count": 8, "metadata": { - "scrolled": true + "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2018-02-05 10:52:53,477 : INFO : built Dictionary(462807 unique tokens: ['reclarify', 'depeneded', 'autralia', 'cloudnight', 'openmoko']...) from 2274338 documents (total 40096354 corpus positions)\n", - "2018-02-05 10:56:50,633 : INFO : training on a 200481770 raw words (192577574 effective words) took 224.3s, 858402 effective words/s\n", - "2018-02-05 11:13:14,895 : INFO : constructed a term similarity matrix with 0.003564 % nonzero elements\n" + "2018-09-11 22:06:07,973 : INFO : built Dictionary(462807 unique tokens: ['pples', 'adib', 'strangers', 'kolayaalee', 'softpoint']...) 
from 2274338 documents (total 40096354 corpus positions)\n", + "2018-09-11 22:06:09,432 : INFO : collecting all words and their counts\n", + "2018-09-11 22:06:17,564 : INFO : collected 462807 word types from a corpus of 40096354 raw words and 2274338 sentences\n", + "2018-09-11 22:06:17,565 : INFO : Loading a fresh vocabulary\n", + "2018-09-11 22:06:18,002 : INFO : effective_min_count=5 retains 104360 unique words (22% of original 462807, drops 358447)\n", + "2018-09-11 22:06:18,003 : INFO : effective_min_count=5 leaves 39565168 word corpus (98% of original 40096354, drops 531186)\n", + "2018-09-11 22:06:18,454 : INFO : deleting the raw counts dictionary of 462807 items\n", + "2018-09-11 22:06:18,474 : INFO : sample=0.001 downsamples 22 most-common words\n", + "2018-09-11 22:06:18,475 : INFO : downsampling leaves estimated 38552993 word corpus (97.4% of prior 39565168)\n", + "2018-09-11 22:06:18,907 : INFO : estimated required memory for 104360 words and 300 dimensions: 302644000 bytes\n", + "2018-09-11 22:06:18,908 : INFO : resetting layer weights\n", + "2018-09-11 22:06:21,082 : INFO : training model with 32 workers on 104360 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5\n", + "2018-09-11 22:06:53,894 : INFO : EPOCH - 1 : training on 40096354 raw words (38515351 effective words) took 32.8s, 1174692 effective words/s\n", + "2018-09-11 22:07:27,121 : INFO : EPOCH - 2 : training on 40096354 raw words (38515107 effective words) took 33.2s, 1159858 effective words/s\n", + "2018-09-11 22:08:00,122 : INFO : EPOCH - 3 : training on 40096354 raw words (38514587 effective words) took 33.0s, 1167509 effective words/s\n", + "2018-09-11 22:08:32,976 : INFO : EPOCH - 4 : training on 40096354 raw words (38515500 effective words) took 32.8s, 1172993 effective words/s\n", + "2018-09-11 22:09:06,211 : INFO : EPOCH - 5 : training on 40096354 raw words (38515593 effective words) took 33.2s, 1159566 effective words/s\n", + "2018-09-11 22:09:06,212 : INFO : training on a 200481770 raw words (192576138 effective words) took 165.1s, 1166216 effective words/s\n", + "2018-09-11 22:09:06,637 : INFO : constructing a sparse term similarity matrix using \n", + "2018-09-11 22:09:06,657 : INFO : iterating over columns in tf-idf order\n", + "2018-09-11 22:25:34,416 : INFO : constructed a sparse term similarity matrix with 0.003654% density\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Number of unique words: 462807\n", - "CPU times: user 1h 2min 21s, sys: 12min 56s, total: 1h 15min 17s\n", - "Wall time: 21min 27s\n" + "CPU times: user 4h 38min 32s, sys: 4h 24min 33s, total: 9h 3min 5s\n", + "Wall time: 20min 43s\n" ] } ], "source": [ "%%time\n", + "from multiprocessing import cpu_count\n", + "\n", "from gensim.corpora import Dictionary\n", "from gensim.models import TfidfModel\n", "from gensim.models import Word2Vec\n", - "from multiprocessing import cpu_count\n", + "from gensim.models import WordEmbeddingSimilarityIndex\n", + "from gensim.similarities import SparseTermSimilarityMatrix\n", "\n", "dictionary = Dictionary(corpus)\n", "tfidf = TfidfModel(dictionary=dictionary)\n", "w2v_model = Word2Vec(corpus, workers=cpu_count(), min_count=5, size=300, seed=12345)\n", - "similarity_matrix = w2v_model.wv.similarity_matrix(dictionary, tfidf, nonzero_limit=100)\n", - "\n", - "print(\"Number of unique words: %d\" % len(dictionary))" + "similarity_index = WordEmbeddingSimilarityIndex(w2v_model.wv)\n", + "similarity_matrix = SparseTermSimilarityMatrix(similarity_index, 
dictionary, tfidf, nonzero_limit=100)" ] }, { @@ -423,7 +441,7 @@ " if dict_index in document] for document in documents]\n", " embeddings = np.array([w2v_model.wv[word] for word in words], dtype=np.float32)\n", " nbow = dict(((index, list(chain([None], zip(*document)))) for index, document in enumerate(documents)))\n", - " nbow[\"query\"] = (None, *zip(*query))\n", + " nbow[\"query\"] = tuple([None] + list(zip(*query)))\n", " distances = WMD(embeddings, nbow, vocabulary_min=1).nearest_neighbors(\"query\")\n", " similarities = [-distance for _, distance in sorted(distances)]\n", " return similarities\n", @@ -471,8 +489,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.49 s, sys: 1.28 s, total: 2.77 s\n", - "Wall time: 1min 42s\n" + "CPU times: user 2.14 s, sys: 5.08 s, total: 7.22 s\n", + "Wall time: 2min 51s\n" ] } ], @@ -507,23 +525,23 @@ "\n", "Dataset | Strategy | MAP score | Elapsed time (sec)\n", ":---|:---|:---|---:\n", - "2016-test|softcossim|77.29 ±10.35|0.20 ±0.06\n", + "2016-test|softcossim|77.15 ±10.83|4.48 ±0.56\n", "2016-test|**Winner (UH-PRHLT-primary)**|76.70 ±0.00|\n", - "2016-test|cossim|76.45 ±10.40|0.48 ±0.07\n", - "2016-test|wmd-gensim|76.07 ±11.52|8.36 ±2.05\n", + "2016-test|cossim|76.45 ±10.40|0.25 ±0.04\n", + "2016-test|wmd-gensim|76.15 ±11.51|13.79 ±1.39\n", "2016-test|**Baseline 1 (IR)**|74.75 ±0.00|\n", - "2016-test|wmd-relax|73.01 ±10.33|0.97 ±0.16\n", + "2016-test|wmd-relax|72.03 ±11.33|0.34 ±0.07\n", "2016-test|**Baseline 2 (random)**|46.98 ±0.00|\n", "\n", "\n", "Dataset | Strategy | MAP score | Elapsed time (sec)\n", ":---|:---|:---|---:\n", "2017-test|**Winner (SimBow-primary)**|47.22 ±0.00|\n", - "2017-test|softcossim|46.06 ±18.00|0.15 ±0.03\n", - "2017-test|cossim|44.38 ±14.71|0.43 ±0.07\n", - "2017-test|wmd-gensim|44.20 ±16.02|9.78 ±1.80\n", + "2017-test|wmd-relax|45.04 ±15.44|0.39 ±0.07\n", + "2017-test|cossim|44.38 ±14.71|0.29 ±0.05\n", + "2017-test|softcossim|44.25 ±15.68|4.89 ±0.80\n", + "2017-test|wmd-gensim|44.08 ±15.96|16.69 ±1.90\n", "2017-test|**Baseline 1 (IR)**|41.85 ±0.00|\n", - "2017-test|wmd-relax|41.24 ±14.87|1.00 ±0.26\n", "2017-test|**Baseline 2 (random)**|29.81 ±0.00|" ], "text/plain": [ @@ -565,7 +583,8 @@ "\n", "1. Grigori Sidorov et al. *Soft Similarity and Soft Cosine Measure: Similarity of Features in Vector Space Model*, 2014. ([link to PDF](http://www.scielo.org.mx/pdf/cys/v18n3/v18n3a7.pdf))\n", "2. Delphine Charlet and Geraldine Damnati, SimBow at SemEval-2017 Task 3: Soft-Cosine Semantic Similarity between Questions for Community Question Answering, 2017. ([link to PDF](http://www.aclweb.org/anthology/S17-2051))\n", - "3. Thomas Mikolov et al. Efficient Estimation of Word Representations in Vector Space, 2013. ([link to PDF](https://arxiv.org/pdf/1301.3781.pdf))" + "3. Thomas Mikolov et al. Efficient Estimation of Word Representations in Vector Space, 2013. ([link to PDF](https://arxiv.org/pdf/1301.3781.pdf))\n", + "4. Vít Novotný. *Implementation Notes for the Soft Cosine Measure*, 2018. 
([link to PDF](https://arxiv.org/pdf/1808.09407))" ] } ], @@ -585,7 +604,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.3" + "version": "3.4.2" } }, "nbformat": 4, diff --git a/gensim/matutils.py b/gensim/matutils.py index 74c0107cde..979b99f6d5 100644 --- a/gensim/matutils.py +++ b/gensim/matutils.py @@ -14,6 +14,7 @@ import math from gensim import utils +from gensim.utils import deprecated import numpy as np import scipy.sparse @@ -796,6 +797,9 @@ def cossim(vec1, vec2): return result +@deprecated( + "Function will be removed in 4.0.0, use " + "gensim.similarities.termsim.SparseTermSimilarityMatrix.inner_product instead") def softcossim(vec1, vec2, similarity_matrix): """Get Soft Cosine Measure between two vectors given a term similarity matrix. @@ -816,8 +820,10 @@ def softcossim(vec1, vec2, similarity_matrix): vec2 : list of (int, float) A document vector in the BoW format. similarity_matrix : {:class:`scipy.sparse.csc_matrix`, :class:`scipy.sparse.csr_matrix`} - A term similarity matrix, typically produced by - :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity_matrix`. + A term similarity matrix. If the matrix is :class:`scipy.sparse.csr_matrix`, it is going + to be transposed. If you rely on the fact that there is at most a constant number of + non-zero elements in a single column, it is your responsibility to ensure that the matrix + is symmetric. Returns ------- diff --git a/gensim/models/__init__.py b/gensim/models/__init__.py index 96ca698b27..a0ee690550 100644 --- a/gensim/models/__init__.py +++ b/gensim/models/__init__.py @@ -13,7 +13,7 @@ from .logentropy_model import LogEntropyModel # noqa:F401 from .word2vec import Word2Vec # noqa:F401 from .doc2vec import Doc2Vec # noqa:F401 -from .keyedvectors import KeyedVectors # noqa:F401 +from .keyedvectors import KeyedVectors, WordEmbeddingSimilarityIndex # noqa:F401 from .ldamulticore import LdaMulticore # noqa:F401 from .phrases import Phrases # noqa:F401 from .normmodel import NormModel # noqa:F401 diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 702cc6a468..1529668423 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -160,7 +160,6 @@ from __future__ import division # py3 "true division" -from collections import deque from itertools import chain import logging @@ -173,11 +172,12 @@ double, array, zeros, vstack, sqrt, newaxis, integer, \ ndarray, sum as np_sum, prod, argmax import numpy as np + from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc from gensim.corpora.dictionary import Dictionary from six import string_types, integer_types from six.moves import zip, range -from scipy import sparse, stats +from scipy import stats from gensim.utils import deprecated from gensim.models.utils_any2vec import ( _save_word2vec_format, @@ -186,6 +186,7 @@ _ft_hash, _ft_hash_broken ) +from gensim.similarities.termsim import TermSimilarityIndex, SparseTermSimilarityMatrix logger = logging.getLogger(__name__) @@ -606,6 +607,9 @@ def similar_by_vector(self, vector, topn=10, restrict_vocab=None): """ return self.most_similar(positive=[vector], topn=topn, restrict_vocab=restrict_vocab) + @deprecated( + "Method will be removed in 4.0.0, use " + "gensim.models.keyedvectors.WordEmbeddingSimilarityIndex instead") def similarity_matrix(self, dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100, dtype=REAL): """Construct a term similarity matrix for computing 
Soft Cosine Measure.
@@ -615,24 +619,21 @@ def similarity_matrix(self, dictionary, tfidf=None, threshold=0.0, exponent=2.0,
         Parameters
         ----------
         dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
-            A dictionary that specifies a mapping between words and the indices of rows and columns
-            of the resulting term similarity matrix.
-        tfidf : :class:`gensim.models.tfidfmodel.TfidfModel`, optional
-            A model that specifies the relative importance of the terms in the dictionary. The rows
-            of the term similarity matrix will be build in a decreasing order of importance of terms,
-            or in the order of term identifiers if None.
+            A dictionary that specifies the considered terms.
+        tfidf : :class:`gensim.models.tfidfmodel.TfidfModel` or None, optional
+            A model that specifies the relative importance of the terms in the dictionary. The
+            columns of the term similarity matrix will be built in a decreasing order of importance
+            of terms, or in the order of term identifiers if None.
         threshold : float, optional
-            Only pairs of words whose embeddings are more similar than `threshold` are considered
-            when building the sparse term similarity matrix.
+            Only embeddings more similar than `threshold` are considered when retrieving word
+            embeddings closest to a given word embedding.
         exponent : float, optional
-            The exponent applied to the similarity between two word embeddings when building the term similarity matrix.
+            Take the word embedding similarities larger than `threshold` to the power of `exponent`.
         nonzero_limit : int, optional
-            The maximum number of non-zero elements outside the diagonal in a single row or column
-            of the term similarity matrix. Setting `nonzero_limit` to a constant ensures that the
-            time complexity of computing the Soft Cosine Measure will be linear in the document
-            length rather than quadratic.
+            The maximum number of non-zero elements outside the diagonal in a single column of the
+            sparse term similarity matrix.
         dtype : numpy.dtype, optional
-            Data-type of the term similarity matrix.
+            Data-type of the sparse term similarity matrix.

         Returns
         -------
@@ -654,66 +655,10 @@ def similarity_matrix(self, dictionary, tfidf=None, threshold=0.0, exponent=2.0,
            `_.

         """
-        logger.info("constructing a term similarity matrix")
-        matrix_order = len(dictionary)
-        matrix_nonzero = [1] * matrix_order
-        matrix = sparse.identity(matrix_order, dtype=dtype, format="dok")
-        num_skipped = 0
-        # Decide the order of rows.
-        if tfidf is None:
-            word_indices = deque(sorted(dictionary.keys()))
-        else:
-            assert max(tfidf.idfs) < matrix_order
-            word_indices = deque([
-                index for index, _
-                in sorted(tfidf.idfs.items(), key=lambda x: (x[1], -x[0]), reverse=True)
-            ])
-
-        # Traverse rows.
-        for row_number, w1_index in enumerate(list(word_indices)):
-            word_indices.popleft()
-            if row_number % 1000 == 0:
-                logger.info(
-                    "PROGRESS: at %.02f%% rows (%d / %d, %d skipped, %.06f%% density)",
-                    100.0 * (row_number + 1) / matrix_order, row_number + 1, matrix_order,
-                    num_skipped, 100.0 * matrix.getnnz() / matrix_order**2)
-            w1 = dictionary[w1_index]
-            if w1 not in self.vocab:
-                num_skipped += 1
-                continue  # A word from the dictionary is not present in the word2vec model.
-
-            # Traverse upper triangle columns.
-            if matrix_order <= nonzero_limit + 1:  # Traverse all columns.
-                columns = (
-                    (w2_index, self.similarity(w1, dictionary[w2_index]))
-                    for w2_index in word_indices
-                    if dictionary[w2_index] in self.vocab)
-            else:  # Traverse only columns corresponding to the embeddings closest to w1.
- num_nonzero = matrix_nonzero[w1_index] - 1 - columns = ( - (dictionary.token2id[w2], similarity) - for _, (w2, similarity) - in zip( - range(nonzero_limit - num_nonzero), - self.most_similar(positive=[w1], topn=nonzero_limit - num_nonzero) - ) - if w2 in dictionary.token2id - ) - columns = sorted(columns, key=lambda x: x[0]) - - for w2_index, similarity in columns: - # Ensure that we don't exceed `nonzero_limit` by mirroring the upper triangle. - if similarity > threshold and matrix_nonzero[w2_index] <= nonzero_limit: - element = similarity**exponent - matrix[w1_index, w2_index] = element - matrix_nonzero[w1_index] += 1 - matrix[w2_index, w1_index] = element - matrix_nonzero[w2_index] += 1 - logger.info( - "constructed a term similarity matrix with %0.6f %% nonzero elements", - 100.0 * matrix.getnnz() / matrix_order**2 - ) - return matrix.tocsc() + index = WordEmbeddingSimilarityIndex(self, threshold=threshold, exponent=exponent) + similarity_matrix = SparseTermSimilarityMatrix( + index, dictionary, tfidf=tfidf, nonzero_limit=nonzero_limit, dtype=dtype) + return similarity_matrix.matrix def wmdistance(self, document1, document2): """Compute the Word Mover's Distance between two documents. @@ -1386,6 +1331,48 @@ def init_sims(self, replace=False): self.vectors_norm = _l2_norm(self.vectors, replace=replace) +class WordEmbeddingSimilarityIndex(TermSimilarityIndex): + """ + Computes cosine similarities between word embeddings and retrieves the closest word embeddings + by cosine similarity for a given word embedding. + + Parameters + ---------- + keyedvectors : :class:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors` + The word embeddings. + threshold : float, optional + Only embeddings more similar than `threshold` are considered when retrieving word embeddings + closest to a given word embedding. + exponent : float, optional + Take the word embedding similarities larger than `threshold` to the power of `exponent`. + kwargs : dict or None + A dict with keyword arguments that will be passed to the `keyedvectors.most_similar` method + when retrieving the word embeddings closest to a given word embedding. + + See Also + -------- + :class:`~gensim.similarities.termsim.SparseTermSimilarityMatrix` + Build a term similarity matrix and compute the Soft Cosine Measure. + + """ + def __init__(self, keyedvectors, threshold=0.0, exponent=2.0, kwargs=None): + assert isinstance(keyedvectors, WordEmbeddingsKeyedVectors) + self.keyedvectors = keyedvectors + self.threshold = threshold + self.exponent = exponent + self.kwargs = kwargs or {} + super(WordEmbeddingSimilarityIndex, self).__init__() + + def most_similar(self, t1, topn=10): + if t1 not in self.keyedvectors.vocab: + logger.debug('an out-of-dictionary term "%s"', t1) + else: + most_similar = self.keyedvectors.most_similar(positive=[t1], topn=topn, **self.kwargs) + for t2, similarity in most_similar: + if similarity > self.threshold: + yield (t2, similarity**self.exponent) + + class Word2VecKeyedVectors(WordEmbeddingsKeyedVectors): """Mapping between words and vectors for the :class:`~gensim.models.Word2Vec` model. Used to perform operations on the vectors such as vector lookup, distance, similarity etc. 
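The deprecated `similarity_matrix` method above now delegates to the new `WordEmbeddingSimilarityIndex` class. A minimal usage sketch of the new class follows (the corpus is gensim's bundled `common_texts`; the vector size and seed are illustrative, and the printed neighbours depend on the trained vectors):

from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex
from gensim.test.utils import common_texts

model = Word2Vec(common_texts, size=20, min_count=1, seed=42)  # tiny toy model
index = WordEmbeddingSimilarityIndex(model.wv, threshold=0.0, exponent=2.0)

# most_similar yields up to topn (term, similarity ** exponent) pairs for terms
# more similar than the threshold; out-of-vocabulary terms yield nothing.
for term, similarity in index.most_similar(u"graph", topn=3):
    print(term, similarity)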
diff --git a/gensim/similarities/__init__.py b/gensim/similarities/__init__.py
index 52cbad43e7..1becd76831 100644
--- a/gensim/similarities/__init__.py
+++ b/gensim/similarities/__init__.py
@@ -4,3 +4,5 @@
 # bring classes directly into package namespace, to save some typing
 from .docsim import Similarity, MatrixSimilarity, SparseMatrixSimilarity, SoftCosineSimilarity, WmdSimilarity  # noqa:F401
+from .termsim import TermSimilarityIndex, UniformTermSimilarityIndex, SparseTermSimilarityMatrix  # noqa:F401
+from .levenshtein import LevenshteinSimilarityIndex  # noqa:F401
diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py
index 91e2b96f10..bb7b4f402b 100755
--- a/gensim/similarities/docsim.py
+++ b/gensim/similarities/docsim.py
@@ -77,6 +77,7 @@ import scipy.sparse

 from gensim import interfaces, utils, matutils
+from .termsim import SparseTermSimilarityMatrix

 from six.moves import map, range, zip
@@ -272,8 +273,6 @@ class Similarity(interfaces.SimilarityABC):
         Index similarity (dense with cosine distance).
     :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`
         Index similarity (sparse with cosine distance).
-    :class:`~gensim.similarities.docsim.SoftCosineSimilarity`
-        Index similarity (with soft-cosine distance).
     :class:`~gensim.similarities.docsim.WmdSimilarity`
         Index similarity (with word-mover distance).

@@ -866,20 +865,18 @@ class SoftCosineSimilarity(interfaces.SimilarityABC):

     >>> from gensim.test.utils import common_texts
     >>> from gensim.corpora import Dictionary
-    >>> from gensim.models import Word2Vec
-    >>> from gensim.similarities import SoftCosineSimilarity
+    >>> from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex
+    >>> from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
     >>>
     >>> model = Word2Vec(common_texts, size=20, min_count=1)  # train word-vectors
+    >>> termsim_index = WordEmbeddingSimilarityIndex(model.wv)
     >>> dictionary = Dictionary(common_texts)
     >>> bow_corpus = [dictionary.doc2bow(document) for document in common_texts]
+    >>> similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)  # construct similarity matrix
+    >>> docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=10)
     >>>
-    >>> # Make a query.
-    >>> query = 'graph trees computer'.split()
-    >>> # calculate similarity between query and each doc from bow_corpus
-    >>> sims = index[dictionary.doc2bow(query)]
+    >>> query = 'graph trees computer'.split()  # make a query
+    >>> sims = docsim_index[dictionary.doc2bow(query)]  # calculate similarity of query to each doc from bow_corpus

     Check out `Tutorial Notebook
     <https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb>`_
@@ -893,9 +890,8 @@ def __init__(self, corpus, similarity_matrix, num_best=None, chunksize=256):
         ----------
         corpus: iterable of list of (int, float)
             A list of documents in the BoW format.
-        similarity_matrix : :class:`scipy.sparse.csc_matrix`
-            A term similarity matrix, typically produced by
-            :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity_matrix`.
+        similarity_matrix : :class:`gensim.similarities.SparseTermSimilarityMatrix`
+            A term similarity matrix.
         num_best : int, optional
             The number of results to retrieve for a query, if None - return similarities with all elements from corpus.
        chunksize: int, optional
@@ -903,14 +899,23 @@ def __init__(self, corpus, similarity_matrix, num_best=None, chunksize=256):

         See Also
         --------
-        :meth:`gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity_matrix`
-            A term similarity matrix produced from term embeddings.
-        :func:`gensim.matutils.softcossim`
-            The Soft Cosine Measure.
+        :class:`gensim.similarities.SparseTermSimilarityMatrix`
+            A sparse term similarity matrix built using a term similarity index.
+        :class:`gensim.similarities.LevenshteinSimilarityIndex`
+            A term similarity index that computes Levenshtein similarities between terms.
+        :class:`gensim.models.WordEmbeddingSimilarityIndex`
+            A term similarity index that computes cosine similarities between word embeddings.

         """
+        if scipy.sparse.issparse(similarity_matrix):
+            logger.warning(
+                "Support for passing an unencapsulated sparse matrix will be removed in 4.0.0, pass "
+                "a SparseTermSimilarityMatrix instance instead")
+            self.similarity_matrix = SparseTermSimilarityMatrix(similarity_matrix)
+        else:
+            self.similarity_matrix = similarity_matrix
+
+        self.corpus = corpus
-        self.similarity_matrix = similarity_matrix
         self.num_best = num_best
         self.chunksize = chunksize
@@ -943,31 +948,19 @@ def get_similarities(self, query):
             Similarity matrix.

         """
+        if not self.corpus:
+            return numpy.array([])
         is_corpus, query = utils.is_corpus(query)
-        if not is_corpus:
-            if isinstance(query, numpy.ndarray):
-                # Convert document indexes to actual documents.
-                query = [self.corpus[i] for i in query]
-            else:
-                query = [query]
-
-        result = []
-        for query_document in query:
-            # Compute similarity for each query.
-            qresult = [matutils.softcossim(query_document, corpus_document, self.similarity_matrix)
-                       for corpus_document in self.corpus]
-            qresult = numpy.array(qresult)
-
-            # Append single query result to list of all results.
-            result.append(qresult)
-
-        if is_corpus:
-            result = numpy.array(result)
-        else:
-            result = result[0]
-
-        return result
+        if not is_corpus and isinstance(query, numpy.ndarray):
+            query = [self.corpus[i] for i in query]  # convert document indexes to actual documents
+        result = self.similarity_matrix.inner_product(query, self.corpus, normalized=True)
+
+        if scipy.sparse.issparse(result):
+            return numpy.asarray(result.todense())
+        if numpy.isscalar(result):
+            return numpy.array(result)
+        return numpy.asarray(result)[0]

     def __str__(self):
         return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.similarity_matrix.shape[0])
diff --git a/gensim/similarities/levenshtein.py b/gensim/similarities/levenshtein.py
new file mode 100644
index 0000000000..e517c51217
--- /dev/null
+++ b/gensim/similarities/levenshtein.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2018 Vit Novotny
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+
+"""
+This module provides a namespace for functions that use the Levenshtein distance.
+"""
+
+from itertools import islice
+import logging
+from math import floor
+
+from gensim.similarities.termsim import TermSimilarityIndex
+
+logger = logging.getLogger(__name__)
+
+
+def levdist(t1, t2, max_distance=float("inf")):
+    """Get the Levenshtein distance between two terms.
+
+    Return the Levenshtein distance between two terms. The distance is a
+    number between <0.0, inf>, higher is less similar.
+
+    Parameters
+    ----------
+    t1 : {bytes, str, unicode}
+        The first compared term.
+    t2 : {bytes, str, unicode}
+        The second compared term.
+    max_distance : {int, float}, optional
+        If you don't care about distances larger than a known threshold, a more
+        efficient code path can be taken. For terms that are clearly "too far
+        apart", we will not compute the distance exactly, but we will return
+        `max(len(t1), len(t2))` more quickly, meaning "more than
+        `max_distance`".
+        Default: always compute distance exactly, no threshold clipping.
+
+    Returns
+    -------
+    int
+        The Levenshtein distance between `t1` and `t2`.
+
+    """
+    import Levenshtein
+
+    distance = Levenshtein.distance(t1, t2)
+    if distance > max_distance:
+        return max(len(t1), len(t2))
+    return distance
+
+
+def levsim(t1, t2, alpha=1.8, beta=5.0, min_similarity=0.0):
+    """Get the Levenshtein similarity between two terms.
+
+    Return the Levenshtein similarity between two terms. The similarity is a
+    number between <0.0, alpha>, higher is more similar.
+
+    Parameters
+    ----------
+    t1 : {bytes, str, unicode}
+        The first compared term.
+    t2 : {bytes, str, unicode}
+        The second compared term.
+    alpha : float, optional
+        The multiplicative factor alpha defined by Charlet and Damnati (2017).
+    beta : float, optional
+        The exponential factor beta defined by Charlet and Damnati (2017).
+    min_similarity : {int, float}, optional
+        If you don't care about similarities smaller than a known threshold, a
+        more efficient code path can be taken. For terms that are clearly "too
+        far apart", we will not compute the distance exactly, but we will
+        return zero more quickly, meaning "less than `min_similarity`".
+        Default: always compute similarity exactly, no threshold clipping.
+
+    Returns
+    -------
+    float
+        The Levenshtein similarity between `t1` and `t2`.
+
+    Notes
+    -----
+    This notion of Levenshtein similarity was first defined in section 2.2 of
+    `Delphine Charlet and Geraldine Damnati, "SimBow at SemEval-2017 Task 3:
+    Soft-Cosine Semantic Similarity between Questions for Community Question
+    Answering", 2017 <http://www.aclweb.org/anthology/S17-2051>`_.
+
+    """
+    assert alpha >= 0
+    assert beta >= 0
+
+    max_lengths = max(len(t1), len(t2))
+    if max_lengths == 0:
+        return 1.0
+
+    min_similarity = float(max(min(min_similarity, 1.0), 0.0))
+    max_distance = int(floor(max_lengths * (1 - (min_similarity / alpha) ** (1 / beta))))
+    distance = levdist(t1, t2, max_distance)
+    similarity = alpha * (1 - distance * 1.0 / max_lengths)**beta
+    return similarity
+
+
+class LevenshteinSimilarityIndex(TermSimilarityIndex):
+    """
+    Computes Levenshtein similarities between terms and retrieves most similar
+    terms for a given term.
+
+    Notes
+    -----
+    This is a naive implementation that iteratively computes pointwise Levenshtein similarities
+    between individual terms. Using this implementation to compute the similarity of all terms in
+    real-world dictionaries such as the English Wikipedia will take years.
+
+    Parameters
+    ----------
+    dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
+        A dictionary that specifies the considered terms.
+    alpha : float, optional
+        The multiplicative factor alpha defined by Charlet and Damnati (2017).
+    beta : float, optional
+        The exponential factor beta defined by Charlet and Damnati (2017).
+    threshold : float, optional
+        Only terms more similar than `threshold` are considered when retrieving
+        the most similar terms for a given term.
+
+    See Also
+    --------
+    :func:`gensim.similarities.levenshtein.levsim`
+        The Levenshtein similarity.
+    :class:`~gensim.similarities.termsim.SparseTermSimilarityMatrix`
+        Build a term similarity matrix and compute the Soft Cosine Measure.
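+
+    Examples
+    --------
+    A quick sanity check of the `levsim` formula above (a sketch; it assumes the
+    optional python-Levenshtein package is installed, since `levdist` imports it).
+    The Levenshtein distance between "kitten" and "sitting" is 3 and the longer
+    term has 7 characters, so the default alpha = 1.8 and beta = 5.0 give
+    1.8 * (1 - 3.0 / 7) ** 5, roughly 0.11:
+
+    >>> from gensim.similarities.levenshtein import levsim
+    >>> round(levsim(u"kitten", u"sitting"), 4)  # 1.8 * (1 - 3.0 / 7) ** 5
+    0.1097
+    >>> levsim(u"kitten", u"kitten")  # identical terms: alpha * (1 - 0) ** beta
+    1.8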
+ + """ + def __init__(self, dictionary, alpha=1.8, beta=5.0, threshold=0.0): + self.dictionary = dictionary + self.alpha = alpha + self.beta = beta + self.threshold = threshold + super(LevenshteinSimilarityIndex, self).__init__() + + def most_similar(self, t1, topn=10): + similarities = ( + (levsim(t1, t2, self.alpha, self.beta, self.threshold), t2) + for t2 in self.dictionary.values() + if t1 != t2 + ) + most_similar = ( + (t2, similarity) + for (similarity, t2) in sorted(similarities, reverse=True) + if similarity > 0 + ) + return islice(most_similar, topn) diff --git a/gensim/similarities/termsim.py b/gensim/similarities/termsim.py new file mode 100644 index 0000000000..6a0b6d12b5 --- /dev/null +++ b/gensim/similarities/termsim.py @@ -0,0 +1,394 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2018 Vit Novotny +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +This module provides classes that deal with term similarities. +""" + +from itertools import chain +import logging +from math import sqrt + +import numpy as np +from scipy import sparse + +from gensim.matutils import corpus2csc +from gensim.utils import SaveLoad, is_corpus + +logger = logging.getLogger(__name__) + + +class TermSimilarityIndex(SaveLoad): + """ + Retrieves most similar terms for a given term. + + See Also + -------- + :class:`~gensim.similarities.termsim.SparseTermSimilarityMatrix` + Build a term similarity matrix and compute the Soft Cosine Measure. + + """ + def most_similar(self, term, topn=10): + """Get most similar terms for a given term. + + Return most similar terms for a given term along with the similarities. + + Parameters + ---------- + term : str + Tne term for which we are retrieving `topn` most similar terms. + topn : int, optional + The maximum number of most similar terms to `term` that will be retrieved. + + Returns + ------- + iterable of (str, float) + Most similar terms along with their similarities to `term`. Only terms distinct from + `term` must be returned. + + """ + raise NotImplementedError + + +class UniformTermSimilarityIndex(TermSimilarityIndex): + """ + Retrieves most similar terms for a given term under the hypothesis that the similarities between + distinct terms are uniform. + + Parameters + ---------- + dictionary : :class:`~gensim.corpora.dictionary.Dictionary` + A dictionary that specifies the considered terms. + term_similarity : float, optional + The uniform similarity between distinct terms. + + See Also + -------- + :class:`~gensim.similarities.termsim.SparseTermSimilarityMatrix` + Build a term similarity matrix and compute the Soft Cosine Measure. + + Notes + ----- + This class is mainly intended for testing SparseTermSimilarityMatrix and other classes that + depend on the TermSimilarityIndex. + + """ + def __init__(self, dictionary, term_similarity=0.5): + self.dictionary = sorted(dictionary.items()) + self.term_similarity = term_similarity + + def most_similar(self, t1, topn=10): + for __, (t2_index, t2) in zip(range(topn), ( + (t2_index, t2) for t2_index, t2 in self.dictionary if t2 != t1)): + yield (t2, self.term_similarity) + + +def _shortest_uint_dtype(max_value): + """Get the shortest unsingned integer data-type required for representing values up to a given + maximum value. + + Returns the shortest unsingned integer data-type required for representing values up to a given + maximum value. + + Parameters + ---------- + max_value : int + The maximum value we wish to represent. 
+
+    Returns
+    -------
+    data-type
+        The shortest unsigned integer data-type required for representing values up to a given
+        maximum value.
+    """
+    if max_value < 2**8:
+        return np.uint8
+    elif max_value < 2**16:
+        return np.uint16
+    elif max_value < 2**32:
+        return np.uint32
+    return np.uint64
+
+
+class SparseTermSimilarityMatrix(SaveLoad):
+    """
+    Builds a sparse term similarity matrix using a term similarity index.
+
+    Notes
+    -----
+    Building a DOK matrix, and converting it to a CSC matrix carries a significant memory overhead.
+    Future work should switch to building arrays of rows, columns, and non-zero elements and
+    directly passing these arrays to the CSC matrix constructor without copying.
+
+    Examples
+    --------
+    >>> from gensim.test.utils import common_texts
+    >>> from gensim.corpora import Dictionary
+    >>> from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex
+    >>> from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
+    >>>
+    >>> model = Word2Vec(common_texts, size=20, min_count=1)  # train word-vectors
+    >>> termsim_index = WordEmbeddingSimilarityIndex(model.wv)
+    >>> dictionary = Dictionary(common_texts)
+    >>> bow_corpus = [dictionary.doc2bow(document) for document in common_texts]
+    >>> similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)  # construct similarity matrix
+    >>> docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=10)
+    >>>
+    >>> query = 'graph trees computer'.split()  # make a query
+    >>> sims = docsim_index[dictionary.doc2bow(query)]  # calculate similarity of query to each doc from bow_corpus
+
+    Check out `Tutorial Notebook
+    <https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb>`_
+    for more examples.
+
+    Parameters
+    ----------
+    source : :class:`~gensim.similarities.termsim.TermSimilarityIndex` or :class:`scipy.sparse.spmatrix`
+        The source of the term similarity. Either a term similarity index that will be used for
+        building the term similarity matrix, or an existing sparse term similarity matrix that will
+        be encapsulated and stored in the matrix attribute.
+    dictionary : :class:`~gensim.corpora.dictionary.Dictionary` or None, optional
+        A dictionary that specifies a mapping between terms and the indices of rows and columns
+        of the resulting term similarity matrix. The dictionary may only be `None` when `source` is
+        a :class:`scipy.sparse.spmatrix`.
+    tfidf : :class:`gensim.models.tfidfmodel.TfidfModel` or None, optional
+        A model that specifies the relative importance of the terms in the dictionary. The columns
+        of the term similarity matrix will be built in a decreasing order of importance of
+        terms, or in the order of term identifiers if None.
+    symmetric : bool, optional
+        Whether the symmetry of the term similarity matrix will be enforced when the matrix is
+        built from a term similarity index. This parameter has no effect when `source` is a
+        :class:`scipy.sparse.spmatrix`. Symmetry is a necessary precondition if you later wish to
+        derive a change-of-basis matrix from the term similarity matrix using Cholesky
+        factorization.
+    positive_definite : bool, optional
+        Whether the positive definiteness of the term similarity matrix will be enforced through
+        strict column diagonal dominance. Positive definiteness is a necessary precondition if you
+        later wish to derive a change-of-basis matrix from the term similarity matrix using Cholesky
+        factorization.
+    nonzero_limit : {int, None}, optional
+        The maximum number of non-zero elements outside the diagonal in a single column of the
+        sparse term similarity matrix. If None, then no limit will be imposed.
+    dtype : numpy.dtype, optional
+        Data-type of the sparse term similarity matrix.
+
+    Attributes
+    ----------
+    matrix : :class:`scipy.sparse.csc_matrix`
+        The encapsulated sparse term similarity matrix.
+    """
+    PROGRESS_MESSAGE_PERIOD = 1000  # how many columns are processed between progress messages
+
+    def __init__(self, source, dictionary=None, tfidf=None, symmetric=True, positive_definite=False, nonzero_limit=100,
+                 dtype=np.float32):
+        if sparse.issparse(source):
+            self.matrix = source.tocsc()  # encapsulate the passed sparse matrix
+            return
+
+        index = source
+        assert isinstance(index, TermSimilarityIndex)
+        assert dictionary is not None
+        matrix_order = len(dictionary)
+
+        logger.info("constructing a sparse term similarity matrix using %s", index)
+
+        if nonzero_limit is None:
+            nonzero_limit = matrix_order
+
+        if tfidf is None:
+            logger.info("iterating over columns in dictionary order")
+            columns = sorted(dictionary.keys())
+        else:
+            assert max(tfidf.idfs) == matrix_order - 1
+            logger.info("iterating over columns in tf-idf order")
+            columns = [
+                term_index for term_index, _
+                in sorted(
+                    tfidf.idfs.items(),
+                    key=lambda x: (lambda term_index, term_idf: (term_idf, -term_index))(*x), reverse=True)]
+
+        column_nonzero = np.array([1] * matrix_order, dtype=_shortest_uint_dtype(nonzero_limit))
+        column_sum = np.zeros(matrix_order, dtype=dtype)
+        matrix = sparse.identity(matrix_order, dtype=dtype, format="dok")
+
+        for column_number, t1_index in enumerate(columns):
+            if column_number % self.PROGRESS_MESSAGE_PERIOD == 0:
+                logger.info(
+                    "PROGRESS: at %.02f%% columns (%d / %d, %.06f%% density, "
+                    "%.06f%% projected density)",
+                    100.0 * (column_number + 1) / matrix_order, column_number + 1, matrix_order,
+                    100.0 * matrix.getnnz() / matrix_order**2,
+                    100.0 * np.clip(
+                        (1.0 * (matrix.getnnz() - matrix_order) / matrix_order**2)
+                        * (1.0 * matrix_order / (column_number + 1))
+                        + (1.0 / matrix_order),  # add density corresponding to the main diagonal
+                        0.0, 1.0))
+
+            t1 = dictionary[t1_index]
+            num_nonzero = column_nonzero[t1_index] - 1
+            num_rows = nonzero_limit - num_nonzero
+            most_similar = [
+                (dictionary.token2id[term], similarity)
+                for term, similarity in index.most_similar(t1, num_rows)
+                if term in dictionary.token2id]
+
+            if tfidf is None:
+                rows = sorted(most_similar)
+            else:
+                rows = sorted(
+                    most_similar,
+                    key=lambda x: (lambda term_index, _: (tfidf.idfs[term_index], -term_index))(*x), reverse=True)
+
+            # Fill the column, mirroring each value into the transposed cell when
+            # building a symmetric matrix.
+            for row_number, (t2_index, similarity) in zip(range(num_rows), rows):
+                if positive_definite and column_sum[t1_index] + similarity >= 1.0:
+                    break
+                if symmetric:
+                    if column_nonzero[t2_index] <= nonzero_limit \
+                            and (not positive_definite or column_sum[t2_index] + similarity < 1.0) \
+                            and (t1_index, t2_index) not in matrix:
+                        matrix[t1_index, t2_index] = similarity
+                        column_nonzero[t1_index] += 1
+                        column_sum[t1_index] += abs(similarity)
+                        matrix[t2_index, t1_index] = similarity
+                        column_nonzero[t2_index] += 1
+                        column_sum[t2_index] += abs(similarity)
+                else:
+                    matrix[t1_index, t2_index] = similarity
+                    column_sum[t1_index] += abs(similarity)
+
+        logger.info(
+            "constructed a sparse term similarity matrix with %0.06f%% density",
+            100.0 * matrix.getnnz() / matrix_order**2)
+
+        matrix = matrix.T
+        assert sparse.issparse(matrix)
+        self.__init__(matrix)
+
+    def inner_product(self, X, Y, normalized=False):
+        """Get the inner product(s) between real vectors / corpora X and Y.
+
+        Return the inner product(s) between real vectors / corpora X and Y expressed in a
+        non-orthogonal normalized basis, where the dot product between the basis vectors is given by
+        the sparse term similarity matrix.
+
+        Parameters
+        ----------
+        X : list of (int, float) or iterable of list of (int, float)
+            A query vector / corpus in the sparse bag-of-words format.
+        Y : list of (int, float) or iterable of list of (int, float)
+            A document vector / corpus in the sparse bag-of-words format.
+        normalized : bool, optional
+            Whether the inner product should be L2-normalized. The normalized inner product
+            corresponds to the Soft Cosine Measure (SCM). SCM is a number between <-1.0, 1.0>,
+            where higher is more similar.
+
+        Returns
+        -------
+        `self.matrix.dtype`, `scipy.sparse.csr_matrix`, or :class:`numpy.matrix`
+            The inner product(s) between `X` and `Y`.
+
+        References
+        ----------
+        The soft cosine measure was perhaps first described by [sidorovetal14]_.
+
+        .. [sidorovetal14] Grigori Sidorov et al., "Soft Similarity and Soft Cosine Measure: Similarity
+           of Features in Vector Space Model", 2014, http://www.cys.cic.ipn.mx/ojs/index.php/CyS/article/view/2043/1921.
+
+        """
+        if not X or not Y:
+            return self.matrix.dtype.type(0.0)
+
+        is_corpus_X, X = is_corpus(X)
+        is_corpus_Y, Y = is_corpus(Y)
+
+        if not is_corpus_X and not is_corpus_Y:
+            X = dict(X)
+            Y = dict(Y)
+            word_indices = np.array(sorted(set(chain(X, Y))))
+            dtype = self.matrix.dtype
+            X = np.array([X[i] if i in X else 0 for i in word_indices], dtype=dtype)
+            Y = np.array([Y[i] if i in Y else 0 for i in word_indices], dtype=dtype)
+            matrix = self.matrix[word_indices[:, None], word_indices].todense()
+
+            result = X.T.dot(matrix).dot(Y)
+
+            if normalized:
+                X_norm = X.T.dot(matrix).dot(X)[0, 0]
+                Y_norm = Y.T.dot(matrix).dot(Y)[0, 0]
+
+                assert \
+                    X_norm > 0.0 and Y_norm > 0.0, \
+                    u"sparse documents must not contain any explicit zero entries and the similarity matrix S " \
+                    u"must satisfy x^T * S * x > 0 for any nonzero bag-of-words vector x."
+
+                result /= sqrt(X_norm) * sqrt(Y_norm)
+                result = np.clip(result, -1.0, 1.0)
+
+            return result[0, 0]
+        elif not is_corpus_X or not is_corpus_Y:
+            if is_corpus_X and not is_corpus_Y:
+                is_corpus_X, X, is_corpus_Y, Y = is_corpus_Y, Y, is_corpus_X, X  # make Y the corpus
+                transposed = True
+            else:
+                transposed = False
+
+            dtype = self.matrix.dtype
+            expanded_X = corpus2csc([X], num_terms=self.matrix.shape[0], dtype=dtype).T.dot(self.matrix)
+            word_indices = np.array(sorted(expanded_X.nonzero()[1]))
+            del expanded_X
+
+            X = dict(X)
+            X = np.array([X[i] if i in X else 0 for i in word_indices], dtype=dtype)
+            Y = corpus2csc(Y, num_terms=self.matrix.shape[0], dtype=dtype)[word_indices, :].todense()
+            matrix = self.matrix[word_indices[:, None], word_indices].todense()
+            if normalized:
+                # use the following equality: np.diag(A.T.dot(B).dot(A)) == A.T.dot(B).multiply(A.T).sum(axis=1).T
+                X_norm = np.multiply(X.T.dot(matrix), X.T).sum(axis=1).T
+                Y_norm = np.multiply(Y.T.dot(matrix), Y.T).sum(axis=1).T
+
+                assert \
+                    X_norm.min() > 0.0 and Y_norm.min() >= 0.0, \
+                    u"sparse documents must not contain any explicit zero entries and the similarity matrix S " \
+                    u"must satisfy x^T * S * x > 0 for any nonzero bag-of-words vector x."
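+                # X_norm is a 1 x 1 matrix holding the squared norm of the query under the
+                # similarity matrix S, and Y_norm is a 1 x |corpus| matrix holding the squared
+                # norm of every corpus document; the rescaling below brings the query and each
+                # document to unit norm under S before the final X^T * S * Y product.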
+ + X = np.multiply(X, 1 / np.sqrt(X_norm)).T + Y = np.multiply(Y, 1 / np.sqrt(Y_norm)) + Y = np.nan_to_num(Y) # Account for division by zero when Y_norm.min() == 0.0 + + result = X.T.dot(matrix).dot(Y) + + if normalized: + result = np.clip(result, -1.0, 1.0) + + if transposed: + result = result.T + + return result + else: # if is_corpus_X and is_corpus_Y: + dtype = self.matrix.dtype + X = corpus2csc(X if is_corpus_X else [X], num_terms=self.matrix.shape[0], dtype=dtype) + Y = corpus2csc(Y if is_corpus_Y else [Y], num_terms=self.matrix.shape[0], dtype=dtype) + matrix = self.matrix + + if normalized: + # use the following equality: np.diag(A.T.dot(B).dot(A)) == A.T.dot(B).multiply(A.T).sum(axis=1).T + X_norm = X.T.dot(matrix).multiply(X.T).sum(axis=1).T + Y_norm = Y.T.dot(matrix).multiply(Y.T).sum(axis=1).T + + assert \ + X_norm.min() > 0.0 and Y_norm.min() >= 0.0, \ + u"sparse documents must not contain any explicit zero entries and the similarity matrix S " \ + u"must satisfy x^T * S * x > 0 for any nonzero bag-of-words vector x." + + X = X.multiply(sparse.csr_matrix(1 / np.sqrt(X_norm))) + Y = Y.multiply(sparse.csr_matrix(1 / np.sqrt(Y_norm))) + Y[Y == np.inf] = 0 # Account for division by zero when Y_norm.min() == 0.0 + + result = X.T.dot(matrix).dot(Y) + + if normalized: + result.data = np.clip(result.data, -1.0, 1.0) + + return result diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py index fc15dcd871..abe1bcdcfe 100644 --- a/gensim/test/test_keyedvectors.py +++ b/gensim/test/test_keyedvectors.py @@ -15,7 +15,7 @@ import numpy as np from gensim.corpora import Dictionary -from gensim.models import KeyedVectors as EuclideanKeyedVectors, TfidfModel +from gensim.models import KeyedVectors as EuclideanKeyedVectors, WordEmbeddingSimilarityIndex from gensim.test.utils import datapath import gensim.models.keyedvectors @@ -24,6 +24,51 @@ logger = logging.getLogger(__name__) +class TestWordEmbeddingSimilarityIndex(unittest.TestCase): + def setUp(self): + self.vectors = EuclideanKeyedVectors.load_word2vec_format( + datapath('euclidean_vectors.bin'), binary=True, datatype=np.float64) + + def test_most_similar(self): + """Test most_similar returns expected results.""" + + # check the handling of out-of-dictionary terms + index = WordEmbeddingSimilarityIndex(self.vectors) + self.assertLess(0, len(list(index.most_similar(u"holiday", topn=10)))) + self.assertEqual(0, len(list(index.most_similar(u"out-of-dictionary term", topn=10)))) + + # check that the topn works as expected + index = WordEmbeddingSimilarityIndex(self.vectors) + results = list(index.most_similar(u"holiday", topn=10)) + self.assertLess(0, len(results)) + self.assertGreaterEqual(10, len(results)) + results = list(index.most_similar(u"holiday", topn=20)) + self.assertLess(10, len(results)) + self.assertGreaterEqual(20, len(results)) + + # check that the term itself is not returned + index = WordEmbeddingSimilarityIndex(self.vectors) + terms = [term for term, similarity in index.most_similar(u"holiday", topn=len(self.vectors.vocab))] + self.assertFalse(u"holiday" in terms) + + # check that the threshold works as expected + index = WordEmbeddingSimilarityIndex(self.vectors, threshold=0.0) + results = list(index.most_similar(u"holiday", topn=10)) + self.assertLess(0, len(results)) + self.assertGreaterEqual(10, len(results)) + + index = WordEmbeddingSimilarityIndex(self.vectors, threshold=1.0) + results = list(index.most_similar(u"holiday", topn=10)) + self.assertEqual(0, len(results)) + + # check that the 
exponent works as expected + index = WordEmbeddingSimilarityIndex(self.vectors, exponent=1.0) + first_similarities = np.array([similarity for term, similarity in index.most_similar(u"holiday", topn=10)]) + index = WordEmbeddingSimilarityIndex(self.vectors, exponent=2.0) + second_similarities = np.array([similarity for term, similarity in index.most_similar(u"holiday", topn=10)]) + self.assertTrue(np.allclose(first_similarities**2.0, second_similarities)) + + class TestEuclideanKeyedVectors(unittest.TestCase): def setUp(self): self.vectors = EuclideanKeyedVectors.load_word2vec_format( @@ -32,60 +77,14 @@ def setUp(self): def test_similarity_matrix(self): """Test similarity_matrix returns expected results.""" - documents = [["government", "denied", "holiday"], - ["holiday", "slowing", "hollingworth"]] + documents = [[u"government", u"denied", u"holiday"], [u"holiday", u"slowing", u"hollingworth"]] dictionary = Dictionary(documents) - - # checking symmetry and the existence of ones on the diagonal similarity_matrix = self.vectors.similarity_matrix(dictionary).todense() - self.assertTrue((similarity_matrix.T == similarity_matrix).all()) + + # checking the existence of ones on the main diagonal self.assertTrue( (np.diag(similarity_matrix) == np.ones(similarity_matrix.shape[0])).all()) - # checking that thresholding works as expected - similarity_matrix = self.vectors.similarity_matrix(dictionary, threshold=0.45).todense() - self.assertEqual(18, np.sum(similarity_matrix == 0)) - - # checking that exponent works as expected - similarity_matrix = self.vectors.similarity_matrix(dictionary, exponent=1.0).todense() - self.assertAlmostEqual(9.5788956, np.sum(similarity_matrix), places=5) - - # checking that nonzero_limit works as expected - similarity_matrix = self.vectors.similarity_matrix(dictionary, nonzero_limit=4).todense() - self.assertEqual(4, np.sum(similarity_matrix == 0)) - - similarity_matrix = self.vectors.similarity_matrix(dictionary, nonzero_limit=3).todense() - self.assertEqual(20, np.sum(similarity_matrix == 0)) - - # check that processing rows in the order given by IDF has desired effect - - # The complete similarity matrix we would obtain with nonzero_limit would look as follows: - documents = [["honour", "understanding"], ["understanding", "mean", "knop"]] - dictionary = Dictionary(documents) - tfidf = TfidfModel(dictionary=dictionary) - - # All terms except for "understanding" have IDF of log2(2 / 1) = log2(2) = 1. - # The term "understanding" has IDF of log2(2 / 2) = log2(1) = 0. - # - # If we do not pass the tfidf parameter to the similarity_matrix - # method, then we process rows in the order from 1 to 4. If we do pass - # the tfidf parameter to the similarity_matrix method, then we first - # process the rows 1, 3, 4 that correspond to terms with IDF of 1.0 and - # then the row 2 that corresponds to the term "understanding" with IDF - # of 0. Since the method is greedy, we will end up with two different - # similarity matrices. 
- - similarity_matrix = self.vectors.similarity_matrix( - dictionary, nonzero_limit=2).todense() - self.assertTrue(np.all(np.isclose(similarity_matrix, np.array([ - [1, 0.9348248, 0, 0], [0.9348248, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]])))) - - similarity_matrix = self.vectors.similarity_matrix( - dictionary, tfidf, nonzero_limit=2).todense() - self.assertTrue(np.all(np.isclose(similarity_matrix, np.array([ - [1, 0.9348248, 0, 0.9112908], [0.9348248, 1, 0.90007025, 0], [0, 0.90007025, 1, 0], - [0.9112908, 0, 0, 1]])))) - def test_most_similar(self): """Test most_similar returns expected results.""" expected = [ diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py index e1f876e216..7aafbd34d7 100644 --- a/gensim/test/test_similarities.py +++ b/gensim/test/test_similarities.py @@ -11,12 +11,14 @@ import logging import unittest +import math import os import numpy import scipy from smart_open import smart_open +from gensim.corpora import Dictionary from gensim.models import word2vec from gensim.models import doc2vec from gensim.models import KeyedVectors @@ -25,6 +27,10 @@ from gensim.models import Word2Vec, FastText from gensim.test.utils import (datapath, get_tmpfile, common_texts as texts, common_dictionary as dictionary, common_corpus as corpus) +from gensim.similarities import UniformTermSimilarityIndex +from gensim.similarities import SparseTermSimilarityMatrix +from gensim.similarities import LevenshteinSimilarityIndex +from gensim.similarities.levenshtein import levdist, levsim try: from pyemd import emd # noqa:F401 @@ -371,7 +377,7 @@ def setUp(self): similarity_matrix = scipy.sparse.identity(12, format="lil") similarity_matrix[dictionary.token2id["user"], dictionary.token2id["human"]] = 0.5 similarity_matrix[dictionary.token2id["human"], dictionary.token2id["user"]] = 0.5 - self.similarity_matrix = similarity_matrix.tocsc() + self.similarity_matrix = SparseTermSimilarityMatrix(similarity_matrix) def factoryMethod(self): # Override factoryMethod. @@ -393,8 +399,6 @@ def testFull(self, num_best=None): self.assertAlmostEqual(1.0, sims[0]) # Similarity of a document with itself is 1.0. self.assertTrue(numpy.alltrue(sims[1:] >= 0.0)) self.assertTrue(numpy.alltrue(sims[1:] < 1.0)) - expected = 2.1889350195476758 - self.assertAlmostEqual(expected, numpy.sum(sims)) # Corpora for query in ( @@ -425,8 +429,8 @@ def testNonIncreasing(self): sims = index[query] sims2 = numpy.asarray(sims)[:, 1] # Just the similarities themselves. - # The difference of adjacent elements should be negative. - cond = sum(numpy.diff(sims2) < 0) == len(sims2) - 1 + # The difference of adjacent elements should be less than or equal to zero. 
+ cond = sum(numpy.diff(sims2) <= 0) == len(sims2) - 1 self.assertTrue(cond) def testChunking(self): @@ -684,6 +688,385 @@ def testSaveLoad(self): self.assertEqual(self.index.num_trees, self.index2.num_trees) +class TestUniformTermSimilarityIndex(unittest.TestCase): + def setUp(self): + self.documents = [[u"government", u"denied", u"holiday"], [u"holiday", u"slowing", u"hollingworth"]] + self.dictionary = Dictionary(self.documents) + + def test_most_similar(self): + """Test most_similar returns expected results.""" + + # check that the topn works as expected + index = UniformTermSimilarityIndex(self.dictionary) + results = list(index.most_similar(u"holiday", topn=1)) + self.assertLess(0, len(results)) + self.assertGreaterEqual(1, len(results)) + results = list(index.most_similar(u"holiday", topn=4)) + self.assertLess(1, len(results)) + self.assertGreaterEqual(4, len(results)) + + # check that the term itself is not returned + index = UniformTermSimilarityIndex(self.dictionary) + terms = [term for term, similarity in index.most_similar(u"holiday", topn=len(self.dictionary))] + self.assertFalse(u"holiday" in terms) + + # check that the term_similarity works as expected + index = UniformTermSimilarityIndex(self.dictionary, term_similarity=0.2) + similarities = numpy.array([ + similarity for term, similarity in index.most_similar(u"holiday", topn=len(self.dictionary))]) + self.assertTrue(numpy.all(similarities == 0.2)) + + +class TestSparseTermSimilarityMatrix(unittest.TestCase): + def setUp(self): + self.documents = [ + [u"government", u"denied", u"holiday"], + [u"government", u"denied", u"holiday", u"slowing", u"hollingworth"]] + self.dictionary = Dictionary(self.documents) + self.tfidf = TfidfModel(dictionary=self.dictionary) + self.index = UniformTermSimilarityIndex(self.dictionary, term_similarity=0.5) + + def test_type(self): + """Test the type of the produced matrix.""" + matrix = SparseTermSimilarityMatrix(self.index, self.dictionary).matrix + self.assertTrue(isinstance(matrix, scipy.sparse.csc_matrix)) + + def test_diagonal(self): + """Test the existence of ones on the main diagonal.""" + matrix = SparseTermSimilarityMatrix(self.index, self.dictionary).matrix.todense() + self.assertTrue(numpy.all(numpy.diag(matrix) == numpy.ones(matrix.shape[0]))) + + def test_order(self): + """Test the matrix order.""" + matrix = SparseTermSimilarityMatrix(self.index, self.dictionary).matrix.todense() + self.assertEqual(matrix.shape[0], len(self.dictionary)) + self.assertEqual(matrix.shape[1], len(self.dictionary)) + + def test_dtype(self): + """Test the dtype parameter of the matrix constructor.""" + matrix = SparseTermSimilarityMatrix(self.index, self.dictionary, dtype=numpy.float32).matrix.todense() + self.assertEqual(numpy.float32, matrix.dtype) + + matrix = SparseTermSimilarityMatrix(self.index, self.dictionary, dtype=numpy.float64).matrix.todense() + self.assertEqual(numpy.float64, matrix.dtype) + + def test_nonzero_limit(self): + """Test the nonzero_limit parameter of the matrix constructor.""" + matrix = SparseTermSimilarityMatrix(self.index, self.dictionary, nonzero_limit=100).matrix.todense() + self.assertGreaterEqual(101, numpy.max(numpy.sum(matrix != 0, axis=0))) + + matrix = SparseTermSimilarityMatrix(self.index, self.dictionary, nonzero_limit=4).matrix.todense() + self.assertGreaterEqual(5, numpy.max(numpy.sum(matrix != 0, axis=0))) + + matrix = SparseTermSimilarityMatrix(self.index, self.dictionary, nonzero_limit=1).matrix.todense() + self.assertGreaterEqual(2, 
+
+        matrix = SparseTermSimilarityMatrix(self.index, self.dictionary, nonzero_limit=0).matrix.todense()
+        self.assertEqual(1, numpy.max(numpy.sum(matrix != 0, axis=0)))
+        self.assertTrue(numpy.all(matrix == numpy.eye(matrix.shape[0])))
+
+    def test_symmetric(self):
+        """Test the symmetric parameter of the matrix constructor."""
+        matrix = SparseTermSimilarityMatrix(self.index, self.dictionary).matrix.todense()
+        self.assertTrue(numpy.all(matrix == matrix.T))
+
+        matrix = SparseTermSimilarityMatrix(
+            self.index, self.dictionary, nonzero_limit=1).matrix.todense()
+        expected_matrix = numpy.array([
+            [1.0, 0.5, 0.0, 0.0, 0.0],
+            [0.5, 1.0, 0.0, 0.0, 0.0],
+            [0.0, 0.0, 1.0, 0.0, 0.0],
+            [0.0, 0.0, 0.0, 1.0, 0.0],
+            [0.0, 0.0, 0.0, 0.0, 1.0]])
+        self.assertTrue(numpy.all(expected_matrix == matrix))
+
+        matrix = SparseTermSimilarityMatrix(
+            self.index, self.dictionary, nonzero_limit=1, symmetric=False).matrix.todense()
+        expected_matrix = numpy.array([
+            [1.0, 0.5, 0.5, 0.5, 0.5],
+            [0.5, 1.0, 0.0, 0.0, 0.0],
+            [0.0, 0.0, 1.0, 0.0, 0.0],
+            [0.0, 0.0, 0.0, 1.0, 0.0],
+            [0.0, 0.0, 0.0, 0.0, 1.0]])
+        self.assertTrue(numpy.all(expected_matrix == matrix))
+
+    def test_positive_definite(self):
+        """Test the positive_definite parameter of the matrix constructor."""
+        matrix = SparseTermSimilarityMatrix(
+            self.index, self.dictionary, nonzero_limit=2).matrix.todense()
+        expected_matrix = numpy.array([
+            [1.0, 0.5, 0.5, 0.0, 0.0],
+            [0.5, 1.0, 0.0, 0.5, 0.0],
+            [0.5, 0.0, 1.0, 0.0, 0.0],
+            [0.0, 0.5, 0.0, 1.0, 0.0],
+            [0.0, 0.0, 0.0, 0.0, 1.0]])
+        self.assertTrue(numpy.all(expected_matrix == matrix))
+
+        matrix = SparseTermSimilarityMatrix(
+            self.index, self.dictionary, nonzero_limit=2, positive_definite=True).matrix.todense()
+        expected_matrix = numpy.array([
+            [1.0, 0.5, 0.0, 0.0, 0.0],
+            [0.5, 1.0, 0.0, 0.0, 0.0],
+            [0.0, 0.0, 1.0, 0.0, 0.0],
+            [0.0, 0.0, 0.0, 1.0, 0.0],
+            [0.0, 0.0, 0.0, 0.0, 1.0]])
+        self.assertTrue(numpy.all(expected_matrix == matrix))
+
+    def test_tfidf(self):
+        """Test the tfidf parameter of the matrix constructor."""
+        matrix = SparseTermSimilarityMatrix(
+            self.index, self.dictionary, nonzero_limit=1).matrix.todense()
+        expected_matrix = numpy.array([
+            [1.0, 0.5, 0.0, 0.0, 0.0],
+            [0.5, 1.0, 0.0, 0.0, 0.0],
+            [0.0, 0.0, 1.0, 0.0, 0.0],
+            [0.0, 0.0, 0.0, 1.0, 0.0],
+            [0.0, 0.0, 0.0, 0.0, 1.0]])
+        self.assertTrue(numpy.all(expected_matrix == matrix))
+
+        matrix = SparseTermSimilarityMatrix(
+            self.index, self.dictionary, nonzero_limit=1, tfidf=self.tfidf).matrix.todense()
+        expected_matrix = numpy.array([
+            [1.0, 0.0, 0.0, 0.5, 0.0],
+            [0.0, 1.0, 0.0, 0.0, 0.0],
+            [0.0, 0.0, 1.0, 0.0, 0.0],
+            [0.5, 0.0, 0.0, 1.0, 0.0],
+            [0.0, 0.0, 0.0, 0.0, 1.0]])
+        self.assertTrue(numpy.all(expected_matrix == matrix))
+
+    def test_encapsulation(self):
+        """Test the matrix encapsulation."""
+
+        # check that a sparse matrix will be converted to a CSC format
+        expected_matrix = numpy.array([
+            [1.0, 2.0, 3.0],
+            [0.0, 1.0, 4.0],
+            [0.0, 0.0, 1.0]])
+
+        matrix = SparseTermSimilarityMatrix(scipy.sparse.csc_matrix(expected_matrix)).matrix
+        self.assertTrue(isinstance(matrix, scipy.sparse.csc_matrix))
+        self.assertTrue(numpy.all(matrix.todense() == expected_matrix))
+
+        matrix = SparseTermSimilarityMatrix(scipy.sparse.csr_matrix(expected_matrix)).matrix
+        self.assertTrue(isinstance(matrix, scipy.sparse.csc_matrix))
+        self.assertTrue(numpy.all(matrix.todense() == expected_matrix))
+
+    def test_inner_product(self):
+        """Test the inner product."""
+
+        matrix = SparseTermSimilarityMatrix(
+            UniformTermSimilarityIndex(self.dictionary, term_similarity=0.5), self.dictionary)
+
+        # check that zero vectors work as expected
+        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
+        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
+
+        self.assertEqual(0.0, matrix.inner_product([], vec2))
+        self.assertEqual(0.0, matrix.inner_product(vec1, []))
+        self.assertEqual(0.0, matrix.inner_product([], []))
+
+        self.assertEqual(0.0, matrix.inner_product([], vec2, normalized=True))
+        self.assertEqual(0.0, matrix.inner_product(vec1, [], normalized=True))
+        self.assertEqual(0.0, matrix.inner_product([], [], normalized=True))
+
+        # check that real-world vectors work as expected
+        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
+        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
+        expected_result = 0.0
+        expected_result += 2 * 1.0 * 1  # government * s_{ij} * government
+        expected_result += 2 * 0.5 * 1  # government * s_{ij} * holiday
+        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * government
+        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * holiday
+        result = matrix.inner_product(vec1, vec2)
+        self.assertAlmostEqual(expected_result, result, places=5)
+
+        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
+        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
+        expected_result = matrix.inner_product(vec1, vec2)
+        expected_result /= math.sqrt(matrix.inner_product(vec1, vec1))
+        expected_result /= math.sqrt(matrix.inner_product(vec2, vec2))
+        result = matrix.inner_product(vec1, vec2, normalized=True)
+        self.assertAlmostEqual(expected_result, result, places=5)
+
+        # check that real-world (vector, corpus) pairs work as expected
+        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
+        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
+        expected_result = 0.0
+        expected_result += 2 * 1.0 * 1  # government * s_{ij} * government
+        expected_result += 2 * 0.5 * 1  # government * s_{ij} * holiday
+        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * government
+        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * holiday
+        expected_result = numpy.full((1, 2), expected_result)
+        result = matrix.inner_product(vec1, [vec2] * 2)
+        self.assertTrue(isinstance(result, numpy.ndarray))
+        self.assertTrue(numpy.allclose(expected_result, result))
+
+        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
+        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
+        expected_result = matrix.inner_product(vec1, vec2)
+        expected_result /= math.sqrt(matrix.inner_product(vec1, vec1))
+        expected_result /= math.sqrt(matrix.inner_product(vec2, vec2))
+        expected_result = numpy.full((1, 2), expected_result)
+        result = matrix.inner_product(vec1, [vec2] * 2, normalized=True)
+        self.assertTrue(isinstance(result, numpy.ndarray))
+        self.assertTrue(numpy.allclose(expected_result, result))
+
+        # check that real-world (corpus, vector) pairs work as expected
+        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
+        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
+        expected_result = 0.0
+        expected_result += 2 * 1.0 * 1  # government * s_{ij} * government
+        expected_result += 2 * 0.5 * 1  # government * s_{ij} * holiday
+        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * government
+        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * holiday
+        expected_result = numpy.full((3, 1), expected_result)
+        result = matrix.inner_product([vec1] * 3, vec2)
+        self.assertTrue(isinstance(result, numpy.ndarray))
+        self.assertTrue(numpy.allclose(expected_result, result))
+
+        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
+        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
+        expected_result = matrix.inner_product(vec1, vec2)
+        expected_result /= math.sqrt(matrix.inner_product(vec1, vec1))
+        expected_result /= math.sqrt(matrix.inner_product(vec2, vec2))
+        expected_result = numpy.full((3, 1), expected_result)
+        result = matrix.inner_product([vec1] * 3, vec2, normalized=True)
+        self.assertTrue(isinstance(result, numpy.ndarray))
+        self.assertTrue(numpy.allclose(expected_result, result))
+
+        # check that real-world corpora work as expected
+        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
+        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
+        expected_result = 0.0
+        expected_result += 2 * 1.0 * 1  # government * s_{ij} * government
+        expected_result += 2 * 0.5 * 1  # government * s_{ij} * holiday
+        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * government
+        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * holiday
+        expected_result = numpy.full((3, 2), expected_result)
+        result = matrix.inner_product([vec1] * 3, [vec2] * 2)
+        self.assertTrue(isinstance(result, scipy.sparse.csr_matrix))
+        self.assertTrue(numpy.allclose(expected_result, result.todense()))
+
+        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
+        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
+        expected_result = matrix.inner_product(vec1, vec2)
+        expected_result /= math.sqrt(matrix.inner_product(vec1, vec1))
+        expected_result /= math.sqrt(matrix.inner_product(vec2, vec2))
+        expected_result = numpy.full((3, 2), expected_result)
+        result = matrix.inner_product([vec1] * 3, [vec2] * 2, normalized=True)
+        self.assertTrue(isinstance(result, scipy.sparse.csr_matrix))
+        self.assertTrue(numpy.allclose(expected_result, result.todense()))
+
+
+class TestLevenshteinDistance(unittest.TestCase):
+    def test_max_distance(self):
+        t1 = "holiday"
+        t2 = "day"
+        max_distance = max(len(t1), len(t2))
+
+        self.assertEqual(4, levdist(t1, t2))
+        self.assertEqual(4, levdist(t1, t2, 4))
+        self.assertEqual(max_distance, levdist(t1, t2, 2))
+        self.assertEqual(max_distance, levdist(t1, t2, -2))
+
+
+class TestLevenshteinSimilarity(unittest.TestCase):
+    def test_empty_strings(self):
+        t1 = ""
+        t2 = ""
+
+        self.assertEqual(1.0, levsim(t1, t2))
+
+    def test_negative_hyperparameters(self):
+        t1 = "holiday"
+        t2 = "day"
+        alpha = 2.0
+        beta = 2.0
+
+        with self.assertRaises(AssertionError):
+            levsim(t1, t2, -alpha, beta)
+
+        with self.assertRaises(AssertionError):
+            levsim(t1, t2, alpha, -beta)
+
+        with self.assertRaises(AssertionError):
+            levsim(t1, t2, -alpha, -beta)
+
+    def test_min_similarity(self):
+        t1 = "holiday"
+        t2 = "day"
+        alpha = 2.0
+        beta = 2.0
+        similarity = alpha * (1 - 4.0 / 7)**beta
+        assert 0.1 < similarity < 0.5  # sanity-check the thresholds used below
+
+        self.assertAlmostEqual(similarity, levsim(t1, t2, alpha, beta))
+
+        self.assertAlmostEqual(similarity, levsim(t1, t2, alpha, beta, -2))
+        self.assertAlmostEqual(similarity, levsim(t1, t2, alpha, beta, -2.0))
+
+        self.assertAlmostEqual(similarity, levsim(t1, t2, alpha, beta, 0))
+        self.assertAlmostEqual(similarity, levsim(t1, t2, alpha, beta, 0.0))
+
+        self.assertAlmostEqual(similarity, levsim(t1, t2, alpha, beta, 0.1))
+        self.assertEqual(0.0, levsim(t1, t2, alpha, beta, 0.5))
+        self.assertEqual(0.0, levsim(t1, t2, alpha, beta, 1.0))
+
+        self.assertEqual(0.0, levsim(t1, t2, alpha, beta, 2))
+        self.assertEqual(0.0, levsim(t1, t2, alpha, beta, 2.0))
+
+
+class TestLevenshteinSimilarityIndex(unittest.TestCase):
+    def setUp(self):
+        self.documents = [[u"government", u"denied", u"holiday"], [u"holiday", u"slowing", u"hollingworth"]]
+        self.dictionary = Dictionary(self.documents)
+
+    def test_most_similar(self):
+        """Test most_similar returns expected results."""
+
+        index = LevenshteinSimilarityIndex(self.dictionary)
+        results = list(index.most_similar(u"holiday", topn=1))
+        self.assertLess(0, len(results))
+        self.assertGreaterEqual(1, len(results))
+        results = list(index.most_similar(u"holiday", topn=4))
+        self.assertLess(1, len(results))
+        self.assertGreaterEqual(4, len(results))
+
+        # check the order of the results
+        results = index.most_similar(u"holiday", topn=4)
+        terms, _ = tuple(zip(*results))
+        self.assertEqual((u"hollingworth", u"slowing", u"denied", u"government"), terms)
+
+        # check that the term itself is not returned
+        index = LevenshteinSimilarityIndex(self.dictionary)
+        terms = [term for term, similarity in index.most_similar(u"holiday", topn=len(self.dictionary))]
+        self.assertFalse(u"holiday" in terms)
+
+        # check that the threshold works as expected
+        index = LevenshteinSimilarityIndex(self.dictionary, threshold=0.0)
+        results = list(index.most_similar(u"holiday", topn=10))
+        self.assertLess(0, len(results))
+        self.assertGreaterEqual(10, len(results))
+
+        index = LevenshteinSimilarityIndex(self.dictionary, threshold=1.0)
+        results = list(index.most_similar(u"holiday", topn=10))
+        self.assertEqual(0, len(results))
+
+        # check that the alpha works as expected
+        index = LevenshteinSimilarityIndex(self.dictionary, alpha=1.0)
+        first_similarities = numpy.array([similarity for term, similarity in index.most_similar(u"holiday", topn=10)])
+        index = LevenshteinSimilarityIndex(self.dictionary, alpha=2.0)
+        second_similarities = numpy.array([similarity for term, similarity in index.most_similar(u"holiday", topn=10)])
+        self.assertTrue(numpy.allclose(2.0 * first_similarities, second_similarities))
+
+        # check that the beta works as expected
+        index = LevenshteinSimilarityIndex(self.dictionary, alpha=1.0, beta=1.0)
+        first_similarities = numpy.array([similarity for term, similarity in index.most_similar(u"holiday", topn=10)])
+        index = LevenshteinSimilarityIndex(self.dictionary, alpha=1.0, beta=2.0)
+        second_similarities = numpy.array([similarity for term, similarity in index.most_similar(u"holiday", topn=10)])
+        self.assertTrue(numpy.allclose(first_similarities ** 2.0, second_similarities))
+
+
 if __name__ == '__main__':
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
     unittest.main()
diff --git a/setup.py b/setup.py
index f9db98f8fc..79f1bdc98b 100644
--- a/setup.py
+++ b/setup.py
@@ -232,6 +232,7 @@ def finalize_options(self):
     'testfixtures',
     'scikit-learn',
     'Morfessor==2.0.2a4',
+    'python-Levenshtein >= 0.10.2',
     'visdom >= 0.1.8',
 ]
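
Note for reviewers (not part of the patch): a minimal sketch of how the pieces added above fit together, using the same toy corpus as the tests. It assumes gensim with this patch applied and the optional python-Levenshtein dependency installed.

    from gensim.corpora import Dictionary
    from gensim.similarities import LevenshteinSimilarityIndex, SparseTermSimilarityMatrix

    documents = [[u"government", u"denied", u"holiday"], [u"holiday", u"slowing", u"hollingworth"]]
    dictionary = Dictionary(documents)

    # Build a sparse term similarity matrix from the Levenshtein similarities
    # between the terms of the dictionary.
    index = LevenshteinSimilarityIndex(dictionary)
    matrix = SparseTermSimilarityMatrix(index, dictionary)

    # The soft cosine measure between two bag-of-words vectors x and y is the
    # inner product <x, Sy> normalized by sqrt(<x, Sx>) * sqrt(<y, Sy>).
    vec1 = dictionary.doc2bow([u"government", u"denied"])
    vec2 = dictionary.doc2bow([u"holiday", u"slowing"])
    print(matrix.inner_product(vec1, vec2, normalized=True))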