Skip to content

Commit

Permalink
Added unit tests for Soft Cosine Similarity
Browse files Browse the repository at this point in the history
  • Loading branch information
Witiko committed Jan 7, 2018
1 parent fe53356 commit 9c382c1
Show file tree
Hide file tree
Showing 5 changed files with 167 additions and 4 deletions.
15 changes: 15 additions & 0 deletions gensim/matutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,11 @@ def softcossim(vec1, vec2, similarity_matrix):
similarity_matrix.dtype
The Soft Cosine Similarity between `vec1` and `vec2`.
Raises
------
ValueError
When the term similarity matrix is in an unknown format.
See Also
--------
gensim.models.keyedvectors.EuclideanKeyedVectors.similarity_matrix
Expand All @@ -511,6 +516,12 @@ def softdot(vec1, vec2):
vec2 = vec2.tocsc()
return (vec1.T).dot(similarity_matrix).dot(vec2)[0, 0]

if not isinstance(similarity_matrix, scipy.sparse.csc_matrix):
if isinstance(similarity_matrix, scipy.sparse.csr_matrix):
similarity_matrix = similarity_matrix.T
else:
raise ValueError('unknown similarity matrix format')

if not vec1 or not vec2:
return 0.0
vec1 = sparse2coo(vec1)
Expand All @@ -522,6 +533,10 @@ def softdot(vec1, vec2):
" bag-of-words vector x."
result = softdot(vec1, vec2)
result /= math.sqrt(vec1len) * math.sqrt(vec2len) # rescale by vector lengths
if result > 1.0:
return 1.0
if result < -1.0:
return -1.0
return result


Expand Down
5 changes: 2 additions & 3 deletions gensim/similarities/docsim.py
Original file line number Diff line number Diff line change
Expand Up @@ -608,7 +608,6 @@ class SoftCosineSimilarity(interfaces.SimilarityABC):

def __init__(self, corpus, similarity_matrix, num_best=None, chunksize=256):
self.corpus = corpus
self.similarity_matrix = similarity_matrix
self.num_best = num_best
self.chunksize = chunksize

Expand All @@ -627,7 +626,7 @@ def __init__(self, corpus, similarity_matrix, num_best=None, chunksize=256):
with warnings.catch_warnings():
warnings.simplefilter("ignore", scipy.sparse.SparseEfficiencyWarning)
identity_matrix[nonzero_columns] = similarity_matrix.T[nonzero_columns]
self.similarity_matrix = identity_matrix.T
self._similarity_matrix = identity_matrix.T

def __len__(self):
return len(self.corpus)
Expand All @@ -647,7 +646,7 @@ def get_similarities(self, query):
result = []
for qidx in range(n_queries):
# Compute similarity for each query.
qresult = [matutils.softcossim(document, query[qidx], self.similarity_matrix)
qresult = [matutils.softcossim(document, query[qidx], self._similarity_matrix)
for document in self.corpus]
qresult = numpy.array(qresult)

Expand Down
32 changes: 32 additions & 0 deletions gensim/test/test_keyedvectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import numpy as np

from gensim.corpora import Dictionary
from gensim.models.keyedvectors import EuclideanKeyedVectors
from gensim.test.utils import datapath

Expand All @@ -26,6 +27,37 @@ def setUp(self):
self.vectors = EuclideanKeyedVectors.load_word2vec_format(
datapath('euclidean_vectors.bin'), binary=True, datatype=np.float64)

def test_similarity_matrix(self):
    """Test similarity_matrix returns expected results.

    Builds a two-document bag-of-words corpus and checks the term
    similarity matrix produced by ``self.vectors.similarity_matrix`` for
    symmetry, a unit diagonal, and the effect of the ``threshold``,
    ``exponent`` and ``nonzero_limit`` keyword arguments.

    The method carries the ``test_`` prefix so that unittest collects it,
    and it queries ``self.vectors`` (created in ``setUp``) rather than
    calling itself.
    """
    documents = [["government", "denied", "holiday"], ["holiday", "slowing", "hollingworth"]]
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(document) for document in documents]

    # checking symmetry and the existence of ones on the diagonal
    similarity_matrix = self.vectors.similarity_matrix(corpus, dictionary).todense()
    self.assertTrue((similarity_matrix.T == similarity_matrix).all())
    # Compare the diagonal against 1.0; comparing it against the whole
    # matrix would broadcast it over every row.
    self.assertTrue((np.diag(similarity_matrix) == 1.0).all())

    # checking that thresholding works as expected
    similarity_matrix = self.vectors.similarity_matrix(corpus, dictionary, threshold=0.45).todense()
    expected = 18
    self.assertEqual(expected, np.sum(similarity_matrix == 0))

    # checking that exponent works as expected
    similarity_matrix = self.vectors.similarity_matrix(corpus, dictionary, exponent=1.0).todense()
    expected = 9.5788956
    self.assertAlmostEqual(expected, np.sum(similarity_matrix))

    # checking that nonzero_limit works as expected
    similarity_matrix = self.vectors.similarity_matrix(corpus, dictionary, nonzero_limit=4).todense()
    expected = 4
    self.assertEqual(expected, np.sum(similarity_matrix == 0))

    similarity_matrix = self.vectors.similarity_matrix(corpus, dictionary, nonzero_limit=3).todense()
    expected = 20
    self.assertEqual(expected, np.sum(similarity_matrix == 0))

def test_most_similar(self):
"""Test most_similar returns expected results."""
expected = [
Expand Down
85 changes: 85 additions & 0 deletions gensim/test/test_similarities.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import numpy
import scipy

from gensim.corpora import Dictionary
from gensim.models import word2vec
from gensim.models import doc2vec
from gensim.models import KeyedVectors
Expand Down Expand Up @@ -159,6 +160,8 @@ def testPersistency(self):
index = self.cls(None, corpus, num_features=len(dictionary), shardsize=5)
elif self.cls == similarities.WmdSimilarity:
index = self.cls(texts, self.w2v_model)
elif self.cls == similarities.SoftCosineSimilarity:
index = self.cls(self.corpus, self.similarity_matrix)
else:
index = self.cls(corpus, num_features=len(dictionary))
index.save(fname)
Expand All @@ -184,6 +187,8 @@ def testPersistencyCompressed(self):
index = self.cls(None, corpus, num_features=len(dictionary), shardsize=5)
elif self.cls == similarities.WmdSimilarity:
index = self.cls(texts, self.w2v_model)
elif self.cls == similarities.SoftCosineSimilarity:
index = self.cls(self.corpus, self.similarity_matrix)
else:
index = self.cls(corpus, num_features=len(dictionary))
index.save(fname)
Expand All @@ -209,6 +214,8 @@ def testLarge(self):
index = self.cls(None, corpus, num_features=len(dictionary), shardsize=5)
elif self.cls == similarities.WmdSimilarity:
index = self.cls(texts, self.w2v_model)
elif self.cls == similarities.SoftCosineSimilarity:
index = self.cls(self.corpus, self.similarity_matrix)
else:
index = self.cls(corpus, num_features=len(dictionary))
# store all arrays separately
Expand Down Expand Up @@ -236,6 +243,8 @@ def testLargeCompressed(self):
index = self.cls(None, corpus, num_features=len(dictionary), shardsize=5)
elif self.cls == similarities.WmdSimilarity:
index = self.cls(texts, self.w2v_model)
elif self.cls == similarities.SoftCosineSimilarity:
index = self.cls(self.corpus, self.similarity_matrix)
else:
index = self.cls(corpus, num_features=len(dictionary))
# store all arrays separately
Expand Down Expand Up @@ -263,6 +272,8 @@ def testMmap(self):
index = self.cls(None, corpus, num_features=len(dictionary), shardsize=5)
elif self.cls == similarities.WmdSimilarity:
index = self.cls(texts, self.w2v_model)
elif self.cls == similarities.SoftCosineSimilarity:
index = self.cls(self.corpus, self.similarity_matrix)
else:
index = self.cls(corpus, num_features=len(dictionary))
# store all arrays separately
Expand Down Expand Up @@ -291,6 +302,8 @@ def testMmapCompressed(self):
index = self.cls(None, corpus, num_features=len(dictionary), shardsize=5)
elif self.cls == similarities.WmdSimilarity:
index = self.cls(texts, self.w2v_model)
elif self.cls == similarities.SoftCosineSimilarity:
index = self.cls(self.corpus, self.similarity_matrix)
else:
index = self.cls(corpus, num_features=len(dictionary))
# store all arrays separately
Expand Down Expand Up @@ -382,6 +395,78 @@ def testIter(self):
self.assertTrue(numpy.alltrue(sims <= 1.0))


class TestSoftCosineSimilarity(unittest.TestCase, _TestSimilarityABC):
    """Tests for the ``similarities.SoftCosineSimilarity`` index.

    The fixture is an identity term similarity matrix over the dictionary
    built from ``texts``, with the pair "user"/"human" additionally given a
    similarity of 0.5 in both directions.
    """

    def setUp(self):
        """Build the dictionary, bag-of-words corpus and similarity matrix."""
        self.cls = similarities.SoftCosineSimilarity
        self.dictionary = Dictionary(texts)
        # Use the dictionary built above consistently instead of depending
        # on a module-level `dictionary` happening to match it.
        token2id = self.dictionary.token2id
        self.corpus = [self.dictionary.doc2bow(document) for document in texts]
        # Size the matrix from the dictionary rather than a hard-coded 12.
        similarity_matrix = scipy.sparse.identity(len(self.dictionary), format="lil")
        similarity_matrix[token2id["user"], token2id["human"]] = 0.5
        similarity_matrix[token2id["human"], token2id["user"]] = 0.5
        self.similarity_matrix = similarity_matrix.tocsc()

    def testFull(self, num_best=None):
        """Query the index with the first text and check the similarities."""
        # Override testFull.

        index = self.cls(self.corpus, self.similarity_matrix, num_best=num_best)
        query = self.dictionary.doc2bow(texts[0])
        sims = index[query]

        if num_best is not None:
            # Sparse array.
            for _, sim in sims:
                self.assertTrue(numpy.all(sim <= 1.0))
                self.assertTrue(numpy.all(sim >= 0.0))
        else:
            self.assertTrue(sims[0] == 1.0)  # Similarity of a document with itself is 1.0.
            self.assertTrue(numpy.all(sims[1:] >= 0.0))
            self.assertTrue(numpy.all(sims[1:] < 1.0))
            expected = 2.1889350195476758
            self.assertAlmostEqual(expected, numpy.sum(sims))

    def testNonIncreasing(self):
        """Check that similarities are non-increasing when `num_best` is not
        `None`.

        The assertion is stricter than the name suggests: every adjacent
        difference must be negative, so ties would fail the test.
        """
        # NOTE: this could be implemented for other similarities as well (i.e.
        # in _TestSimilarityABC).

        index = self.cls(self.corpus, self.similarity_matrix, num_best=5)
        query = self.dictionary.doc2bow(texts[0])
        sims = index[query]
        sims2 = numpy.asarray(sims)[:, 1]  # Just the similarities themselves.

        # The difference of adjacent elements should be negative.
        cond = sum(numpy.diff(sims2) < 0) == len(sims2) - 1
        self.assertTrue(cond)

    def testChunking(self):
        """Query with several documents at once, with and without num_best."""
        # Override testChunking.

        index = self.cls(self.corpus, self.similarity_matrix)
        query = [self.dictionary.doc2bow(document) for document in texts[:3]]
        sims = index[query]

        for i in range(3):
            self.assertTrue(numpy.all(sims[i, i] == 1.0))  # Similarity of a document with itself is 1.0.

        # test the same thing but with num_best
        index.num_best = 5
        sims = index[query]
        for i, chunk in enumerate(sims):
            # The best match for query i is expected to be document i
            # itself, with similarity 1.0.
            expected = i
            self.assertEqual(expected, chunk[0][0])
            expected = 1.0
            self.assertEqual(expected, chunk[0][1])

    def testIter(self):
        """Iterate over the index and check every similarity lies in [0, 1]."""
        # Override testIter.

        index = self.cls(self.corpus, self.similarity_matrix)
        for sims in index:
            self.assertTrue(numpy.all(sims >= 0.0))
            self.assertTrue(numpy.all(sims <= 1.0))


class TestSparseMatrixSimilarity(unittest.TestCase, _TestSimilarityABC):
def setUp(self):
self.cls = similarities.SparseMatrixSimilarity
Expand Down
34 changes: 33 additions & 1 deletion gensim/test/test_similarity_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import unittest

from gensim import matutils
from scipy.sparse import csr_matrix
from scipy.sparse import csr_matrix, csc_matrix
import numpy as np
import math
from gensim.corpora.mmcorpus import MmCorpus
Expand Down Expand Up @@ -236,6 +236,38 @@ def test_distributions(self):
self.assertAlmostEqual(expected, result)


class TestSoftCosineSimilarity(unittest.TestCase):
    """Tests for ``matutils.softcossim``."""

    def test_inputs(self):
        """Exercise empty vectors and the accepted/rejected matrix formats."""
        first, second = [], []

        # checking empty inputs
        term_matrix = csc_matrix((0, 0))
        expected = 0.0
        result = matutils.softcossim(first, second, term_matrix)
        self.assertEqual(expected, result)

        # checking CSR term similarity matrix format
        term_matrix = csr_matrix((0, 0))
        expected = 0.0
        result = matutils.softcossim(first, second, term_matrix)
        self.assertEqual(expected, result)

        # checking unknown term similarity matrix format
        unsupported_matrix = np.matrix([])
        with self.assertRaises(ValueError):
            matutils.softcossim(first, second, unsupported_matrix)

    def test_distributions(self):
        """Compare two bag-of-words vectors that share one of two terms."""
        # checking bag of words as inputs
        hello_world = [(0, 1.0), (2, 1.0)]  # hello world
        hi_world = [(1, 1.0), (2, 1.0)]  # hi world
        # Terms 0 and 1 are related with similarity 0.5; term 2 only
        # matches itself.
        term_matrix = csc_matrix([[1, 0.5, 0], [0.5, 1, 0], [0, 0, 1]])
        expected = 0.75
        result = matutils.softcossim(hello_world, hi_world, term_matrix)
        self.assertAlmostEqual(expected, result)


if __name__ == '__main__':
    # When run as a script, log at DEBUG level with timestamps and then
    # hand control to the unittest runner.
    log_format = '%(asctime)s : %(levelname)s : %(message)s'
    logging.basicConfig(format=log_format, level=logging.DEBUG)
    unittest.main()

0 comments on commit 9c382c1

Please sign in to comment.