Skip to content

Commit

Permalink
Support fast inner product between a document and a corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
Witiko committed May 20, 2018
1 parent da6e6dd commit 093d569
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 1 deletion.
41 changes: 40 additions & 1 deletion gensim/similarities/termsim.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,46 @@ def inner_product(self, X, Y, normalized=False):
result = np.clip(result, -1.0, 1.0)

return result[0, 0]
else:
elif not is_corpus_X or not is_corpus_Y:
if is_corpus_X and not is_corpus_Y:
is_corpus_X, X, is_corpus_Y, Y = is_corpus_Y, Y, is_corpus_X, X # make Y the corpus
transposed = True
else:
transposed = False

dtype = self.matrix.dtype
expanded_X = corpus2csc([X], num_terms=self.matrix.shape[0], dtype=dtype).T.dot(self.matrix)
word_indices = sorted(expanded_X.nonzero()[1])
del expanded_X

X = dict(X)
X = np.array([X[i] if i in X else 0 for i in word_indices], dtype=dtype)
Y = corpus2csc(Y, num_terms=self.matrix.shape[0], dtype=dtype)[word_indices, :].todense()
matrix = self.matrix[[[i] for i in word_indices], word_indices].todense()

if normalized:
# use the following equality: np.diag(A.T.dot(B).dot(A)) == A.T.dot(B).multiply(A.T).sum(axis=1).T
X_norm = np.multiply(X.T.dot(matrix), X.T).sum(axis=1).T
Y_norm = np.multiply(Y.T.dot(matrix), Y.T).sum(axis=1).T

assert \
X_norm.min() >= 0.0 and Y_norm.min() >= 0.0, \
u"sparse documents must not contain any explicit zero entries and the similarity matrix S " \
u"must satisfy x^T * S * x > 0 for any nonzero bag-of-words vector x."

X = np.multiply(X, 1 / np.sqrt(X_norm)).T
Y = np.multiply(Y, 1 / np.sqrt(Y_norm))

result = X.T.dot(matrix).dot(Y)

if normalized:
result = np.clip(result.data, -1.0, 1.0)

if transposed:
result = result.T

return result
else: # if is_corpus_X and is_corpus_Y:
dtype = self.matrix.dtype
X = corpus2csc(X if is_corpus_X else [X], num_terms=self.matrix.shape[0], dtype=dtype)
Y = corpus2csc(Y if is_corpus_Y else [Y], num_terms=self.matrix.shape[0], dtype=dtype)
Expand Down
46 changes: 46 additions & 0 deletions gensim/test/test_similarities.py
Original file line number Diff line number Diff line change
Expand Up @@ -871,6 +871,52 @@ def test_inner_product(self):
result = matrix.inner_product(vec1, vec2, normalized=True)
self.assertAlmostEqual(expected_result, result, places=5)

# check that real-world (vector, corpus) pairs work as expected
vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
expected_result = 0.0
expected_result += 2 * 1.0 * 1 # government * s_{ij} * government
expected_result += 2 * 0.5 * 1 # government * s_{ij} * holiday
expected_result += 1 * 0.5 * 1 # denied * s_{ij} * government
expected_result += 1 * 0.5 * 1 # denied * s_{ij} * holiday
expected_result = numpy.full((1, 2), expected_result)
result = matrix.inner_product(vec1, [vec2] * 2)
self.assertTrue(isinstance(result, numpy.ndarray))
self.assertTrue(numpy.allclose(expected_result, result))

vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
expected_result = matrix.inner_product(vec1, vec2)
expected_result /= math.sqrt(matrix.inner_product(vec1, vec1))
expected_result /= math.sqrt(matrix.inner_product(vec2, vec2))
expected_result = numpy.full((1, 2), expected_result)
result = matrix.inner_product(vec1, [vec2] * 2, normalized=True)
self.assertTrue(isinstance(result, numpy.ndarray))
self.assertTrue(numpy.allclose(expected_result, result))

# check that real-world (corpus, vector) pairs work as expected
vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
expected_result = 0.0
expected_result += 2 * 1.0 * 1 # government * s_{ij} * government
expected_result += 2 * 0.5 * 1 # government * s_{ij} * holiday
expected_result += 1 * 0.5 * 1 # denied * s_{ij} * government
expected_result += 1 * 0.5 * 1 # denied * s_{ij} * holiday
expected_result = numpy.full((3, 1), expected_result)
result = matrix.inner_product([vec1] * 3, vec2)
self.assertTrue(isinstance(result, numpy.ndarray))
self.assertTrue(numpy.allclose(expected_result, result))

vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
expected_result = matrix.inner_product(vec1, vec2)
expected_result /= math.sqrt(matrix.inner_product(vec1, vec1))
expected_result /= math.sqrt(matrix.inner_product(vec2, vec2))
expected_result = numpy.full((3, 1), expected_result)
result = matrix.inner_product([vec1] * 3, vec2, normalized=True)
self.assertTrue(isinstance(result, numpy.ndarray))
self.assertTrue(numpy.allclose(expected_result, result))

# check that real-world corpora work as expected
vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
Expand Down

0 comments on commit 093d569

Please sign in to comment.