Order dictionary by decreasing cfs in KeyedVectors.vectors_for_all

piskvorky · May 28, 2021 · 6a8c688 · 6a8c688
1 parent 13a7ecd
commit 6a8c688
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 2 deletions.
diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py
@@ -1724,7 +1724,8 @@ def vectors_for_all(self, keys: Union[Iterable, Dictionary]) -> 'KeyedVectors':
 
         """
         if isinstance(keys, Dictionary):
-            vocabulary = [key for key in keys.token2id if key in self]
+            term_ids = sorted(keys.cfs.items(), key=lambda x: (-x[1], x[0]))  # sort by decreasing frequency
+            vocabulary = [term_id for term_id, freq in term_ids if term_id in self]
         else:
             vocabulary = (key for key in keys if key in self)
             vocabulary = list(OrderedDict.fromkeys(vocabulary))  # deduplicate keys

diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py
@@ -64,7 +64,7 @@ def test_vectors_for_all_dictionary(self):
         words = Dictionary([[
             'conflict',
             'administration',
-            'terrorism',
+            'terrorism', 'terrorism',
             'an out-of-vocabulary word',
             'another out-of-vocabulary word',
         ]])
@@ -78,6 +78,11 @@ def test_vectors_for_all_dictionary(self):
         predicted = vectors_for_all['conflict']
         self.assertTrue(np.allclose(expected, predicted))
 
+        # terrorism has the highest frequency, so it should be indexed first
+        expected = 'terrorism'
+        predicted = vectors_for_all.index_to_key[0]
+        self.assertEqual(expected, predicted)
+
     def test_most_similar_topn(self):
         """Test most_similar returns correct results when `topn` is specified."""
         self.assertEqual(len(self.vectors.most_similar('war', topn=5)), 5)