Skip to content

Commit

Permalink
Make similarity_matrix support non-contiguous dictionaries
Browse files (browse the repository at this point in the history)
Closes #2041
  • Loading branch information
Witiko committed May 13, 2018
1 parent 8b81091 commit d4053b2
Showing 1 changed file with 8 additions and 6 deletions.
14 changes: 8 additions & 6 deletions gensim/models/keyedvectors.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -58,6 +58,7 @@
"""
from __future__ import division # py3 "true division"

+from collections import deque
import logging

try:
Expand Down Expand Up @@ -554,16 +555,17 @@ def similarity_matrix(self, dictionary, tfidf=None, threshold=0.0, exponent=2.0,
num_skipped = 0
# Decide the order of rows.
if tfidf is None:
-word_indices = range(matrix_order)
+word_indices = deque(sorted(dictionary.keys()))
else:
assert max(tfidf.idfs) < matrix_order
-word_indices = [
+word_indices = deque([
index for index, _
in sorted(tfidf.idfs.items(), key=lambda x: (x[1], -x[0]), reverse=True)
-]
+])

# Traverse rows.
-for row_number, w1_index in enumerate(word_indices):
+for row_number, w1_index in enumerate(list(word_indices)):
+word_indices.popleft()
if row_number % 1000 == 0:
logger.info(
"PROGRESS: at %.02f%% rows (%d / %d, %d skipped, %.06f%% density)",
Expand All @@ -578,8 +580,8 @@ def similarity_matrix(self, dictionary, tfidf=None, threshold=0.0, exponent=2.0,
if matrix_order <= nonzero_limit + 1: # Traverse all columns.
columns = (
(w2_index, self.similarity(w1, dictionary[w2_index]))
-for w2_index in range(w1_index + 1, matrix_order)
-if w1_index != w2_index and dictionary[w2_index] in self.vocab)
+for w2_index in word_indices
+if dictionary[w2_index] in self.vocab)
else: # Traverse only columns corresponding to the embeddings closest to w1.
num_nonzero = matrix_nonzero[w1_index] - 1
columns = (
Expand Down

0 comments on commit d4053b2

Please sign in to comment.