New KeyedVectors.vectors_for_all method for vectorizing all words in …

…a dictionary (#3157) * Add KeyedVectors.vectors_for_all * Add examples for KeyedVectors.vectors_for_all * Support Dictionary in KeyedVectors.vectors_for_all * Don't sort keys in KeyedVectors.vectors_for_all, just deduplicate * Use docstrings in imperative mode (PEP8) Co-authored-by: Radim Řehůřek <me@radimrehurek.com> * Guard against KeyError in KeyedVectors.vectors_for_all * Unit-test dictionary parameter of KeyedVectors.vectors_for_all * Order dictionary by decreasing cfs in KeyedVectors.vectors_for_all * Add allow_inference parameter to KeyedVectors.vectors_for_all * Add copy_vecattrs parameter to KeyedVectors.vectors_for_all * Move copy_vecattrs tests for KeyedVectors.vectors_for_all * Fix translation of term ids to terms in KeyedVectors.vectors_for_all * Fix a typo in KeyedVectors.vectors_for_all unit test * Do not make assumptions about fake counts in _add_word_to_kv * Document that KeyedVectors.vectors_for_all allows arbitrary keys * Add notes about the behavior of KeyedVectors.vectors_for_all * Properly reference Dictionary in KeyedVectors.vectors_for_all docstring * Make deduplication in KeyedVectors.vectors_for_all a oneliner * Remove an unnecessary temporary variable in KeyedVectors.vectors_for_all * Make deduplication in KeyedVectors.vectors_for_all a oneliner (cont.) * Add Dictionary.most_common * Remove test_vectors_for_all_dictionary unit test * Remove a trailing bracket in an example * Fix unit tests for Dictionary.most_common * Update an example for SparseTermSimilarityMatrix * Remove Gensim downloader from KeyedVectors.vectors_for_all example * Remove include_counts parameter from Dictionary.most_common * Shorten the KeyedVectors.vectors_for_all example * Remove include_counts parameter from Dictionary.most_common (cont.) 
* Use pytest assertion syntax in unit tests * Remove an unnecessary comment in KeyedVectors.vectors_for_all * Remove an unnecessary comment in KeyedVectors.vectors_for_all Co-authored-by: Michael Penkov <m@penkov.dev> * Remove an unnecessary variable in KeyedVectors.vectors_for_all * Make the creation of new vocab in KeyedVectors.vectors_for_all explicit * Make AnnoyIndexer use the correct word-vectors in example * Apply suggestions from code review * Apply suggestions from code review * Update CHANGELOG.md Co-authored-by: Radim Řehůřek <me@radimrehurek.com> Co-authored-by: Michael Penkov <m@penkov.dev>
piskvorky · Jun 29, 2021 · a93067d · a93067d
1 parent 2a41200
commit a93067d
Show file tree

Hide file tree

Showing 7 changed files with 226 additions and 12 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -19,6 +19,7 @@ Changes
 * [#3115](https://github.com/RaRe-Technologies/gensim/pull/3115): Make LSI dispatcher CLI param for number of jobs optional, by [@robguinness](https://github.com/robguinness)
 * [#3128](https://github.com/RaRe-Technologies/gensim/pull/3128): Materialize and copy the corpus passed to SoftCosineSimilarity, by [@Witiko](https://github.com/Witiko)
 * [#3131](https://github.com/RaRe-Technologies/gensim/pull/3131): Added import to Nmf docs, and to models/__init__.py, by [@properGrammar](https://github.com/properGrammar)
+* [#3157](https://github.com/RaRe-Technologies/gensim/pull/3157): New KeyedVectors.vectors_for_all method for vectorizing all words in a dictionary, by [@Witiko](https://github.com/Witiko)
 * [#3163](https://github.com/RaRe-Technologies/gensim/pull/3163): Optimize word mover distance (WMD) computation, by [@flowlight0](https://github.com/flowlight0)
 * [#2965](https://github.com/RaRe-Technologies/gensim/pull/2965): Remove strip_punctuation2 alias of strip_punctuation, by [@sciatro](https://github.com/sciatro)
 * [#3169](https://github.com/RaRe-Technologies/gensim/pull/3169): Implement `shrink_windows` argument for Word2Vec., by [@M-Demay](https://github.com/M-Demay)

diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py
@@ -10,6 +10,7 @@
 from collections.abc import Mapping
 import logging
 import itertools
+from typing import Optional, List, Tuple
 
 from gensim import utils
 
@@ -689,6 +690,30 @@ def load_from_text(fname):
                 result.dfs[wordid] = int(docfreq)
         return result
 
+    def most_common(self, n: Optional[int] = None) -> List[Tuple[str, int]]:
+        """Return the n most common words and their counts, from the most common to the least.
+
+        Counts are read from `self.cfs`. Words with equal counts are ordered by increasing id.
+
+        Parameters
+        ----------
+        n : int or None, optional
+            The number of most common words to be returned. If `None`, all words in the dictionary
+            will be returned. Default is `None`.
+
+        Returns
+        -------
+        most_common : list of (str, int)
+            The n most common words and their counts from the most common to the least.
+
+        """
+        most_common = [
+            (self[word], count)  # `word` is a token id here; `self[word]` looks up its string
+            for word, count
+            in sorted(self.cfs.items(), key=lambda x: (-x[1], x[0]))[:n]  # count desc, then id asc
+        ]
+        return most_common
+
     @staticmethod
     def from_corpus(corpus, id2word=None):
         """Create :class:`~gensim.corpora.dictionary.Dictionary` from an existing corpus.

diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py
@@ -171,6 +171,7 @@
 import itertools
 import warnings
 from numbers import Integral
+from typing import Iterable
 
 from numpy import (
     dot, float32 as REAL, double, array, zeros, vstack,
@@ -1689,6 +1690,70 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut
             msg=f"merged {overlap_count} vectors into {self.vectors.shape} matrix from {fname}",
         )
 
+    def vectors_for_all(self, keys: Iterable, allow_inference: bool = True,
+                        copy_vecattrs: bool = False) -> 'KeyedVectors':
+        """Produce vectors for all given keys as a new :class:`KeyedVectors` object.
+
+        Notes
+        -----
+        The keys will always be deduplicated. For optimal performance, you should not pass entire
+        corpora to the method. Instead, you should construct a dictionary of unique words in your
+        corpus:
+
+        >>> from collections import Counter
+        >>> import itertools
+        >>>
+        >>> from gensim.models import FastText
+        >>> from gensim.test.utils import datapath, common_texts
+        >>>
+        >>> model_corpus_file = datapath('lee_background.cor')  # train word vectors on some corpus
+        >>> model = FastText(corpus_file=model_corpus_file, vector_size=20, min_count=1)
+        >>> corpus = common_texts  # infer word vectors for words from another corpus
+        >>> word_counts = Counter(itertools.chain.from_iterable(corpus))  # count words in your corpus
+        >>> words_by_freq = (k for k, v in word_counts.most_common())
+        >>> word_vectors = model.wv.vectors_for_all(words_by_freq)  # create word-vectors for words in your corpus
+
+        Parameters
+        ----------
+        keys : iterable
+            The keys that will be vectorized. Keys for which no vector is available are skipped.
+        allow_inference : bool, optional
+            In subclasses such as :class:`~gensim.models.fasttext.FastTextKeyedVectors`,
+            vectors for out-of-vocabulary keys (words) may be inferred. Default is True.
+        copy_vecattrs : bool, optional
+            Additional attributes set via the :meth:`KeyedVectors.set_vecattr` method
+            will be preserved in the produced :class:`KeyedVectors` object. Default is False.
+            To ensure that *all* the produced vectors will have vector attributes assigned,
+            you should set `allow_inference=False`.
+
+        Returns
+        -------
+        keyedvectors : :class:`~gensim.models.keyedvectors.KeyedVectors`
+            Vectors for the given keys, deduplicated and without the keys that were skipped.
+
+        """
+        # Deduplicate the keys; keep those found in `self` (or only in `self.key_to_index` without inference).
+        # We keep the original key order, to improve cache locality, for performance.
+        vocab, seen = [], set()
+        for key in keys:
+            if key not in seen:
+                seen.add(key)
+                if key in (self if allow_inference else self.key_to_index):
+                    vocab.append(key)
+
+        kv = KeyedVectors(self.vector_size, len(vocab), dtype=self.vectors.dtype)
+
+        for key in vocab:  # produce and index vectors for all the given keys
+            weights = self[key]
+            _add_word_to_kv(kv, None, key, weights, len(vocab))
+            if copy_vecattrs:
+                for attr in self.expandos:
+                    try:
+                        kv.set_vecattr(key, attr, self.get_vecattr(key, attr))
+                    except KeyError:
+                        pass  # NOTE(review): presumably no stored attribute for this key (e.g. inferred vector)
+        return kv
+
     def _upconvert_old_d2vkv(self):
         """Convert a deserialized older Doc2VecKeyedVectors instance to latest generic KeyedVectors"""
         self.vocab = self.doctags

diff --git a/gensim/similarities/termsim.py b/gensim/similarities/termsim.py
@@ -102,6 +102,28 @@ class WordEmbeddingSimilarityIndex(TermSimilarityIndex):
     Computes cosine similarities between word embeddings and retrieves most
     similar terms for a given term.
 
+    Notes
+    -----
+    By fitting the word embeddings to a vocabulary that you will be using, you
+    can eliminate all out-of-vocabulary (OOV) words that you would otherwise
+    receive from the `most_similar` method. In subword models such as fastText,
+    this procedure will also infer word-vectors for words from your vocabulary
+    that previously had no word-vector.
+
+    >>> from gensim.test.utils import common_texts, datapath
+    >>> from gensim.corpora import Dictionary
+    >>> from gensim.models import FastText
+    >>> from gensim.models.word2vec import LineSentence
+    >>> from gensim.similarities import WordEmbeddingSimilarityIndex
+    >>>
+    >>> model = FastText(common_texts, vector_size=20, min_count=1)  # train word-vectors on a corpus
+    >>> different_corpus = LineSentence(datapath('lee_background.cor'))
+    >>> dictionary = Dictionary(different_corpus)  # construct a vocabulary on a different corpus
+    >>> words = [word for word, count in dictionary.most_common()]
+    >>> word_vectors = model.wv.vectors_for_all(words)  # remove OOV word-vectors and infer word-vectors for new words
+    >>> assert len(dictionary) == len(word_vectors)  # all words from our vocabulary received their word-vectors
+    >>> termsim_index = WordEmbeddingSimilarityIndex(word_vectors)
+
     Parameters
     ----------
     keyedvectors : :class:`~gensim.models.keyedvectors.KeyedVectors`
@@ -404,25 +426,29 @@ class SparseTermSimilarityMatrix(SaveLoad):
 
     Examples
     --------
-    >>> from gensim.test.utils import common_texts
+    >>> from gensim.test.utils import common_texts as corpus, datapath
     >>> from gensim.corpora import Dictionary
     >>> from gensim.models import Word2Vec
     >>> from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex
     >>> from gensim.similarities.index import AnnoyIndexer
-    >>> from scikits.sparse.cholmod import cholesky
     >>>
-    >>> model = Word2Vec(common_texts, vector_size=20, min_count=1)  # train word-vectors
-    >>> annoy = AnnoyIndexer(model, num_trees=2)  # use annoy for faster word similarity lookups
-    >>> termsim_index = WordEmbeddingSimilarityIndex(model.wv, kwargs={'indexer': annoy})
-    >>> dictionary = Dictionary(common_texts)
-    >>> bow_corpus = [dictionary.doc2bow(document) for document in common_texts]
-    >>> similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary, symmetric=True, dominant=True)
-    >>> docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=10)
+    >>> model_corpus_file = datapath('lee_background.cor')
+    >>> model = Word2Vec(corpus_file=model_corpus_file, vector_size=20, min_count=1)  # train word-vectors
     >>>
-    >>> query = 'graph trees computer'.split()  # make a query
-    >>> sims = docsim_index[dictionary.doc2bow(query)]  # calculate similarity of query to each doc from bow_corpus
+    >>> dictionary = Dictionary(corpus)
+    >>> tfidf = TfidfModel(dictionary=dictionary)
+    >>> words = [word for word, count in dictionary.most_common()]
+    >>> word_vectors = model.wv.vectors_for_all(words, allow_inference=False)  # produce vectors for words in corpus
+    >>>
+    >>> indexer = AnnoyIndexer(word_vectors, num_trees=2)  # use Annoy for faster word similarity lookups
+    >>> termsim_index = WordEmbeddingSimilarityIndex(word_vectors, kwargs={'indexer': indexer})
+    >>> similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary, tfidf)  # compute word similarities
     >>>
-    >>> word_embeddings = cholesky(similarity_matrix.matrix).L()  # obtain word embeddings from similarity matrix
+    >>> tfidf_corpus = tfidf[[dictionary.doc2bow(document) for document in corpus]]
+    >>> docsim_index = SoftCosineSimilarity(tfidf_corpus, similarity_matrix, num_best=10)  # index tfidf_corpus
+    >>>
+    >>> query = 'graph trees computer'.split()  # make a query
+    >>> sims = docsim_index[dictionary.doc2bow(query)]  # find the ten closest documents from tfidf_corpus
 
     Check out `the Gallery <https://radimrehurek.com/gensim/auto_examples/tutorials/run_scm.html>`_
     for more examples.

diff --git a/gensim/test/test_corpora_dictionary.py b/gensim/test/test_corpora_dictionary.py
@@ -359,6 +359,18 @@ def test_patch_with_special_tokens(self):
         self.assertNotIn((1, 1), d.doc2bow(corpus_with_special_tokens[0]))
         self.assertIn((1, 1), d.doc2bow(corpus_with_special_tokens[1]))
 
+    def test_most_common_with_n(self):
+        texts = [['human', 'human', 'human', 'computer', 'computer', 'interface', 'interface']]
+        d = Dictionary(texts)
+        expected = [('human', 3), ('computer', 2)]  # 'interface' also has count 2 but is cut off by n=2
+        assert d.most_common(n=2) == expected
+
+    def test_most_common_without_n(self):
+        texts = [['human', 'human', 'human', 'computer', 'computer', 'interface', 'interface']]
+        d = Dictionary(texts)
+        expected = [('human', 3), ('computer', 2), ('interface', 2)]  # n=None returns every word
+        assert d.most_common(n=None) == expected
+
 
 # endclass TestDictionary
 

diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py
@@ -714,6 +714,54 @@ def obsolete_testLoadOldModel(self):
         self.assertEqual(model.wv.vectors_vocab.shape, (12, 100))
         self.assertEqual(model.wv.vectors_ngrams.shape, (2000000, 100))
 
+    def test_vectors_for_all_with_inference(self):
+        """Test vectors_for_all can infer new vectors."""
+        words = [
+            'responding',
+            'approached',
+            'chairman',
+            'an out-of-vocabulary word',
+            'another out-of-vocabulary word',
+        ]
+        vectors_for_all = self.test_model.wv.vectors_for_all(words)
+
+        expected = 5  # every key gets a vector, including the two out-of-vocabulary phrases
+        predicted = len(vectors_for_all)
+        assert expected == predicted
+
+        expected = self.test_model.wv['responding']
+        predicted = vectors_for_all['responding']
+        assert np.allclose(expected, predicted)  # the vector must match the source model's vector
+
+        smaller_distance = np.linalg.norm(
+            vectors_for_all['an out-of-vocabulary word']
+            - vectors_for_all['another out-of-vocabulary word']
+        )
+        greater_distance = np.linalg.norm(
+            vectors_for_all['an out-of-vocabulary word']
+            - vectors_for_all['responding']
+        )
+        assert greater_distance > smaller_distance  # the two similar phrases lie closer to each other
+
+    def test_vectors_for_all_without_inference(self):
+        """Test vectors_for_all does not infer new vectors when prohibited."""
+        words = [
+            'responding',
+            'approached',
+            'chairman',
+            'an out-of-vocabulary word',
+            'another out-of-vocabulary word',
+        ]
+        vectors_for_all = self.test_model.wv.vectors_for_all(words, allow_inference=False)
+
+        expected = 3  # the two out-of-vocabulary phrases are left out
+        predicted = len(vectors_for_all)
+        assert expected == predicted
+
+        expected = self.test_model.wv['responding']
+        predicted = vectors_for_all['responding']
+        assert np.allclose(expected, predicted)  # the vector must match the source model's vector
+
 
 @pytest.mark.parametrize('shrink_windows', [True, False])
 def test_cbow_hs_training(shrink_windows):

diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py
@@ -39,6 +39,43 @@ def test_most_similar(self):
         predicted = [result[0] for result in self.vectors.most_similar('war', topn=5)]
         self.assertEqual(expected, predicted)
 
+    def test_vectors_for_all_list(self):
+        """Test vectors_for_all returns expected results with a list of keys."""
+        words = [
+            'conflict',
+            'administration',
+            'terrorism',
+            'an out-of-vocabulary word',
+            'another out-of-vocabulary word',
+        ]
+        vectors_for_all = self.vectors.vectors_for_all(words)
+
+        expected = 3  # the two out-of-vocabulary phrases are left out
+        predicted = len(vectors_for_all)
+        assert expected == predicted
+
+        expected = self.vectors['conflict']
+        predicted = vectors_for_all['conflict']
+        assert np.allclose(expected, predicted)  # the vector must match the source vector
+
+    def test_vectors_for_all_with_copy_vecattrs(self):
+        """Test vectors_for_all copies vector attributes when copy_vecattrs=True."""
+        words = ['conflict']
+        vectors_for_all = self.vectors.vectors_for_all(words, copy_vecattrs=True)
+
+        expected = self.vectors.get_vecattr('conflict', 'count')
+        predicted = vectors_for_all.get_vecattr('conflict', 'count')
+        assert expected == predicted
+
+    def test_vectors_for_all_without_copy_vecattrs(self):
+        """Test vectors_for_all does not copy vector attributes when copy_vecattrs=False."""
+        words = ['conflict']
+        vectors_for_all = self.vectors.vectors_for_all(words, copy_vecattrs=False)
+
+        not_expected = self.vectors.get_vecattr('conflict', 'count')
+        predicted = vectors_for_all.get_vecattr('conflict', 'count')
+        assert not_expected != predicted  # the source 'count' value must not have been carried over
+
     def test_most_similar_topn(self):
         """Test most_similar returns correct results when `topn` is specified."""
         self.assertEqual(len(self.vectors.most_similar('war', topn=5)), 5)