Implement Soft Cosine Measure (piskvorky#1827)

* Implement Soft Cosine Similarity * Added numpy-style documentation for Soft Cosine Similarity * Added unit tests for Soft Cosine Similarity * Make WmdSimilarity and SoftCosineSimilarity handle empty queries * Rename Soft Cosine Similarity to Soft Cosine Measure * Add links to Soft Cosine Measure papers * Remove unused variables and parameters for Soft Cosine Measure * Replace explicit timers with magic %time in Soft Cosine Measure notebook * Rename var in term similarity matrix construction to reflect symmetry * Update SoftCosineSimilarity class example to define all variables * Make the code in Soft Cosine Measure notebook more compact * Use hanging indents in EuclideanKeyedVectors.similarity_matrix * Simplified expressions in WmdSimilarity and SoftCosineSimilarity * Extract the sparse2coo function to the global scope * Fix __str__ of SoftCosineSimilarity * Use hanging indents in SoftCossim.__init__ * Fix formatting of the matutils module * Make similarity matrix info messages appear at fixed frequency * Construct term similarity matrix rows for important terms first * Optimize softcossim for an estimated 100-fold constant speed increase * Remove unused import in gensim.similarities.docsim * Fix imports in gensim.models.keyedvectors * replace reference to anonymous link * Update "See Also" references to new *2vec implementation * Fix formatting error in gensim.models.keyedvectors * Update Soft Cosine Measure tutorial notebook * Update Soft Cosine Measure tutorial notebook * Use smaller glove-wiki-gigaword-50 model in Soft Cosine Measure notebook * Use gensim-data to load SemEval datasets in Soft Cosine Measure notebook * Use backwards-compatible syntax in Soft Cosine Similarity notebook * Remove unnecessary package requirements in Soft Cosine Measure notebook * Fix Soft Cosine Measure notebook to use true gensim-data dataset names * fix docs[1] * fix docs[2] * fix docs[3] * small fixes * small fixes[2]
sj29-innovate · Feb 21, 2018 · 804a6cc · 804a6cc
1 parent 420b989
commit 804a6cc
Show file tree

Hide file tree

Showing 9 changed files with 1,041 additions and 48 deletions.
diff --git a/docs/notebooks/soft_cosine_tutorial.ipynb b/docs/notebooks/soft_cosine_tutorial.ipynb
diff --git a/docs/notebooks/soft_cosine_tutorial.png b/docs/notebooks/soft_cosine_tutorial.png
diff --git a/gensim/matutils.py b/gensim/matutils.py
@@ -9,6 +9,7 @@
 from __future__ import with_statement
 
 
+from itertools import chain
 import logging
 import math
 
@@ -755,6 +756,77 @@ def cossim(vec1, vec2):
     return result
 
 
+def softcossim(vec1, vec2, similarity_matrix):
+    """Get Soft Cosine Measure between two vectors given a term similarity matrix.
+
+    Return Soft Cosine Measure between two sparse vectors given a sparse term similarity matrix
+    in the :class:`scipy.sparse.csc_matrix` format. The similarity is a number between <-1.0, 1.0>,
+    higher is more similar.
+
+    Parameters
+    ----------
+    vec1 : list of (int, float)
+        A query vector in the BoW format.
+    vec2 : list of (int, float)
+        A document vector in the BoW format.
+    similarity_matrix : {:class:`scipy.sparse.csc_matrix`, :class:`scipy.sparse.csr_matrix`}
+        A term similarity matrix, typically produced by
+        :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity_matrix`.
+
+    Returns
+    -------
+    `similarity_matrix.dtype`
+        The Soft Cosine Measure between `vec1` and `vec2`.
+
+    Raises
+    ------
+    ValueError
+        When the term similarity matrix is in an unknown format.
+
+    See Also
+    --------
+    :meth:`gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity_matrix`
+        A term similarity matrix produced from term embeddings.
+    :class:`gensim.similarities.docsim.SoftCosineSimilarity`
+        A class for performing corpus-based similarity queries with Soft Cosine Measure.
+
+    References
+    ----------
+    Soft Cosine Measure was perhaps first defined by [sidorovetal14]_.
+
+    .. [sidorovetal14] Grigori Sidorov et al., "Soft Similarity and Soft Cosine Measure: Similarity
+       of Features in Vector Space Model", 2014, http://www.cys.cic.ipn.mx/ojs/index.php/CyS/article/view/2043/1921.
+
+    """
+    if not isinstance(similarity_matrix, scipy.sparse.csc_matrix):
+        if isinstance(similarity_matrix, scipy.sparse.csr_matrix):
+            similarity_matrix = similarity_matrix.T
+        else:
+            raise ValueError('unknown similarity matrix format')
+
+    if not vec1 or not vec2:
+        return 0.0
+
+    vec1 = dict(vec1)
+    vec2 = dict(vec2)
+    word_indices = sorted(set(chain(vec1, vec2)))
+    dtype = similarity_matrix.dtype
+    vec1 = np.array([vec1[i] if i in vec1 else 0 for i in word_indices], dtype=dtype)
+    vec2 = np.array([vec2[i] if i in vec2 else 0 for i in word_indices], dtype=dtype)
+    dense_matrix = similarity_matrix[[[i] for i in word_indices], word_indices].todense()
+    vec1len = vec1.T.dot(dense_matrix).dot(vec1)[0, 0]
+    vec2len = vec2.T.dot(dense_matrix).dot(vec2)[0, 0]
+
+    assert \
+        vec1len > 0.0 and vec2len > 0.0, \
+        u"sparse documents must not contain any explicit zero entries and the similarity matrix S " \
+        u"must satisfy x^T * S * x > 0 for any nonzero bag-of-words vector x."
+
+    result = vec1.T.dot(dense_matrix).dot(vec2)[0, 0]
+    result /= math.sqrt(vec1len) * math.sqrt(vec2len)  # rescale by vector lengths
+    return np.clip(result, -1.0, 1.0)
+
+
 def isbow(vec):
     """Checks if vector passed is in BoW format.
 

diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py
@@ -80,7 +80,7 @@
 from gensim.corpora.dictionary import Dictionary
 from six import string_types, integer_types
 from six.moves import xrange, zip
-from scipy import stats
+from scipy import sparse, stats
 from gensim.utils import deprecated
 from gensim.models.utils_any2vec import _save_word2vec_format, _load_word2vec_format, _compute_ngrams
 
@@ -191,8 +191,8 @@ def rank(self, entity1, entity2):
 
 
 class WordEmbeddingsKeyedVectors(BaseKeyedVectors):
-    """Class containing common methods for operations over word vectors.
-    """
+    """Class containing common methods for operations over word vectors."""
+
     def __init__(self, vector_size):
         super(WordEmbeddingsKeyedVectors, self).__init__(vector_size=vector_size)
         self.vectors_norm = None
@@ -432,6 +432,113 @@ def similar_by_vector(self, vector, topn=10, restrict_vocab=None):
         """
         return self.most_similar(positive=[vector], topn=topn, restrict_vocab=restrict_vocab)
 
+    def similarity_matrix(self, dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100, dtype=REAL):
+        """Constructs a term similarity matrix for computing Soft Cosine Measure.
+
+        Constructs a a sparse term similarity matrix in the :class:`scipy.sparse.csc_matrix` format for computing
+        Soft Cosine Measure between documents.
+
+        Parameters
+        ----------
+        dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
+            A dictionary that specifies a mapping between words and the indices of rows and columns
+            of the resulting term similarity matrix.
+        tfidf : :class:`gensim.models.tfidfmodel.TfidfModel`, optional
+            A model that specifies the relative importance of the terms in the dictionary. The rows
+            of the term similarity matrix will be build in an increasing order of importance of terms,
+            or in the order of term identifiers if None.
+        threshold : float, optional
+            Only pairs of words whose embeddings are more similar than `threshold` are considered
+            when building the sparse term similarity matrix.
+        exponent : float, optional
+            The exponent applied to the similarity between two word embeddings when building the term similarity matrix.
+        nonzero_limit : int, optional
+            The maximum number of non-zero elements outside the diagonal in a single row or column
+            of the term similarity matrix. Setting `nonzero_limit` to a constant ensures that the
+            time complexity of computing the Soft Cosine Measure will be linear in the document
+            length rather than quadratic.
+        dtype : numpy.dtype, optional
+            Data-type of the term similarity matrix.
+
+        Returns
+        -------
+        :class:`scipy.sparse.csc_matrix`
+            Term similarity matrix.
+
+        See Also
+        --------
+        :func:`gensim.matutils.softcossim`
+            The Soft Cosine Measure.
+        :class:`gensim.similarities.docsim.SoftCosineSimilarity`
+            A class for performing corpus-based similarity queries with Soft Cosine Measure.
+
+
+        Notes
+        -----
+        The constructed matrix corresponds to the matrix Mrel defined in section 2.1 of
+        `Delphine Charlet and Geraldine Damnati, "SimBow at SemEval-2017 Task 3: Soft-Cosine Semantic Similarity
+        between Questions for Community Question Answering", 2017
+        <http://www.aclweb.org/anthology/S/S17/S17-2051.pdf>`__.
+
+        """
+        logger.info("constructing a term similarity matrix")
+        matrix_order = len(dictionary)
+        matrix_nonzero = [1] * matrix_order
+        matrix = sparse.identity(matrix_order, dtype=dtype, format="dok")
+        num_skipped = 0
+        # Decide the order of rows.
+        if tfidf is None:
+            word_indices = range(matrix_order)
+        else:
+            assert max(tfidf.idfs) < matrix_order
+            word_indices = [
+                index for index, _ in sorted(tfidf.idfs.items(), key=lambda x: x[1], reverse=True)
+            ]
+
+        # Traverse rows.
+        for row_number, w1_index in enumerate(word_indices):
+            if row_number % 1000 == 0:
+                logger.info(
+                    "PROGRESS: at %.02f%% rows (%d / %d, %d skipped, %.06f%% density)",
+                    100.0 * (row_number + 1) / matrix_order, row_number + 1, matrix_order,
+                    num_skipped, 100.0 * matrix.getnnz() / matrix_order**2)
+            w1 = dictionary[w1_index]
+            if w1 not in self.vocab:
+                num_skipped += 1
+                continue  # A word from the dictionary is not present in the word2vec model.
+            # Traverse upper triangle columns.
+            if matrix_order <= nonzero_limit + 1:  # Traverse all columns.
+                columns = (
+                    (w2_index, self.similarity(w1, dictionary[w2_index]))
+                    for w2_index in range(w1_index + 1, matrix_order)
+                    if w1_index != w2_index and dictionary[w2_index] in self.vocab)
+            else:  # Traverse only columns corresponding to the embeddings closest to w1.
+                num_nonzero = matrix_nonzero[w1_index] - 1
+                columns = (
+                    (dictionary.token2id[w2], similarity)
+                    for _, (w2, similarity)
+                    in zip(
+                        range(nonzero_limit - num_nonzero),
+                        self.most_similar(positive=[w1], topn=nonzero_limit - num_nonzero)
+                    )
+                    if w2 in dictionary.token2id
+                )
+                columns = sorted(columns, key=lambda x: x[0])
+
+            for w2_index, similarity in columns:
+                # Ensure that we don't exceed `nonzero_limit` by mirroring the upper triangle.
+                if similarity > threshold and matrix_nonzero[w2_index] <= nonzero_limit:
+                    element = similarity**exponent
+                    matrix[w1_index, w2_index] = element
+                    matrix_nonzero[w1_index] += 1
+                    matrix[w2_index, w1_index] = element
+                    matrix_nonzero[w2_index] += 1
+        logger.info(
+            "constructed a term similarity matrix with %0.6f %% nonzero elements",
+            100.0 * matrix.getnnz() / matrix_order**2
+        )
+        return matrix.tocsc()
+
     def wmdistance(self, document1, document2):
         """
         Compute the Word Mover's Distance between two documents. When using this

diff --git a/gensim/similarities/__init__.py b/gensim/similarities/__init__.py
@@ -3,4 +3,4 @@
 """
 
 # bring classes directly into package namespace, to save some typing
-from .docsim import Similarity, MatrixSimilarity, SparseMatrixSimilarity, WmdSimilarity  # noqa:F401
+from .docsim import Similarity, MatrixSimilarity, SparseMatrixSimilarity, SoftCosineSimilarity, WmdSimilarity  # noqa:F401
diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py
@@ -563,6 +563,108 @@ def __str__(self):
         return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.index.shape[1])
 
 
+class SoftCosineSimilarity(interfaces.SimilarityABC):
+    """Document similarity (like MatrixSimilarity) that uses Soft Cosine Measure as a similarity measure."""
+
+    def __init__(self, corpus, similarity_matrix, num_best=None, chunksize=256):
+        """
+
+        Parameters
+        ----------
+        corpus: iterable of list of (int, float)
+            A list of documents in the BoW format.
+        similarity_matrix : :class:`scipy.sparse.csc_matrix`
+            A term similarity matrix, typically produced by
+            :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity_matrix`.
+        num_best : int, optional
+            The number of results to retrieve for a query, if None - return similarities with all elements from corpus.
+        chunksize: int, optional
+            Size of one corpus chunk.
+
+
+        See Also
+        --------
+        :meth:`gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity_matrix`
+            A term similarity matrix produced from term embeddings.
+        :func:`gensim.matutils.softcossim`
+            The Soft Cosine Measure.
+
+        Examples
+        --------
+        >>> from gensim.corpora import Dictionary
+        >>> import gensim.downloader as api
+        >>> from gensim.models import Word2Vec
+        >>> from gensim.similarities import SoftCosineSimilarity
+        >>> from gensim.utils import simple_preprocess
+        >>>
+        >>> # Prepare the model
+        >>> corpus = api.load("text8")
+        >>> model = Word2Vec(corpus, workers=3, size=100)
+        >>> dictionary = Dictionary(corpus)
+        >>> bow_corpus = [dictionary.doc2bow(document) for document in corpus]
+        >>> similarity_matrix = model.wv.similarity_matrix(dictionary)
+        >>> index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=10)
+        >>>
+        >>> # Make a query.
+        >>> query = 'Yummy! Great view of the Bellagio Fountain show.'
+        >>> # calculate similarity between query and each doc from bow_corpus
+        >>> sims = index[dictionary.doc2bow(simple_preprocess(query))]
+
+        See `Tutorial Notebook
+        <https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb>`_
+        for more examples.
+
+        """
+        self.corpus = corpus
+        self.similarity_matrix = similarity_matrix
+        self.num_best = num_best
+        self.chunksize = chunksize
+
+        # Normalization of features is undesirable, since soft cosine similarity requires special
+        # normalization using the similarity matrix. Therefore, we would just be normalizing twice,
+        # increasing the numerical error.
+        self.normalize = False
+
+        # index is simply an array from 0 to size of corpus.
+        self.index = numpy.arange(len(corpus))
+
+    def __len__(self):
+        return len(self.corpus)
+
+    def get_similarities(self, query):
+        """
+        **Do not use this function directly; use the self[query] syntax instead.**
+        """
+        if isinstance(query, numpy.ndarray):
+            # Convert document indexes to actual documents.
+            query = [self.corpus[i] for i in query]
+
+        if not query or not isinstance(query[0], list):
+            query = [query]
+
+        n_queries = len(query)
+        result = []
+        for qidx in range(n_queries):
+            # Compute similarity for each query.
+            qresult = [matutils.softcossim(document, query[qidx], self.similarity_matrix)
+                       for document in self.corpus]
+            qresult = numpy.array(qresult)
+
+            # Append single query result to list of all results.
+            result.append(qresult)
+
+        if len(result) == 1:
+            # Only one query.
+            result = result[0]
+        else:
+            result = numpy.array(result)
+
+        return result
+
+    def __str__(self):
+        return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.similarity_matrix.shape[0])
+
+
 class WmdSimilarity(interfaces.SimilarityABC):
     """
     Document similarity (like MatrixSimilarity) that uses the negative of WMD
@@ -605,7 +707,7 @@ def __init__(self, corpus, w2v_model, num_best=None, normalize_w2v_and_replace=T
         self.normalize = False
 
         # index is simply an array from 0 to size of corpus.
-        self.index = numpy.array(range(len(corpus)))
+        self.index = numpy.arange(len(corpus))
 
         if normalize_w2v_and_replace:
             # Normalize vectors in word2vec class to length 1.
@@ -622,7 +724,7 @@ def get_similarities(self, query):
             # Convert document indexes to actual documents.
             query = [self.corpus[i] for i in query]
 
-        if not isinstance(query[0], list):
+        if not query or not isinstance(query[0], list):
             query = [query]
 
         n_queries = len(query)

diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py
@@ -14,6 +14,7 @@
 
 import numpy as np
 
+from gensim.corpora import Dictionary
 from gensim.models import KeyedVectors as EuclideanKeyedVectors
 from gensim.test.utils import datapath
 
@@ -26,6 +27,33 @@ def setUp(self):
         self.vectors = EuclideanKeyedVectors.load_word2vec_format(
             datapath('euclidean_vectors.bin'), binary=True, datatype=np.float64)
 
+    def similarity_matrix(self):
+        """Test similarity_matrix returns expected results."""
+
+        corpus = [["government", "denied", "holiday"], ["holiday", "slowing", "hollingworth"]]
+        dictionary = Dictionary(corpus)
+        corpus = [dictionary.doc2bow(document) for document in corpus]
+
+        # checking symmetry and the existence of ones on the diagonal
+        similarity_matrix = self.similarity_matrix(corpus, dictionary).todense()
+        self.assertTrue((similarity_matrix.T == similarity_matrix).all())
+        self.assertTrue((np.diag(similarity_matrix) == similarity_matrix).all())
+
+        # checking that thresholding works as expected
+        similarity_matrix = self.similarity_matrix(corpus, dictionary, threshold=0.45).todense()
+        self.assertEquals(18, np.sum(similarity_matrix == 0))
+
+        # checking that exponent works as expected
+        similarity_matrix = self.similarity_matrix(corpus, dictionary, exponent=1.0).todense()
+        self.assertAlmostEqual(9.5788956, np.sum(similarity_matrix))
+
+        # checking that nonzero_limit works as expected
+        similarity_matrix = self.similarity_matrix(corpus, dictionary, nonzero_limit=4).todense()
+        self.assertEquals(4, np.sum(similarity_matrix == 0))
+
+        similarity_matrix = self.similarity_matrix(corpus, dictionary, nonzero_limit=3).todense()
+        self.assertEquals(20, np.sum(similarity_matrix == 0))
+
     def test_most_similar(self):
         """Test most_similar returns expected results."""
         expected = [