From 6438c82d3f5113228a94c9cddaed14731ad4aa2b Mon Sep 17 00:00:00 2001 From: Eric Lind Date: Sat, 22 Jul 2017 15:49:18 -0400 Subject: [PATCH 1/3] Improve speed of FastTextKeyedVectors __contains__ The current implementation of __contains__ in FastTextKeyedVectors is `O(n + m)` where `n` is the number of character ngrams in the query word and `m` is the number of ngrams stored in `self.ngrams`, because it rebuilds a set of all stored ngrams on every call. This is very slow for large corpora. The new implementation is O(n). --- gensim/models/wrappers/fasttext.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/gensim/models/wrappers/fasttext.py b/gensim/models/wrappers/fasttext.py index 9f68d67ca0..7124ceabac 100644 --- a/gensim/models/wrappers/fasttext.py +++ b/gensim/models/wrappers/fasttext.py @@ -123,11 +123,8 @@ def __contains__(self, word): if word in self.vocab: return True else: - word_ngrams = set(FastText.compute_ngrams(word, self.min_n, self.max_n)) - if len(word_ngrams & set(self.ngrams.keys())): - return True - else: - return False + word_ngrams = FastText.compute_ngrams(word, self.min_n, self.max_n) + return bool(any(ng in self.ngrams for ng in word_ngrams)) class FastText(Word2Vec): From 65c026eeda2eb2c9eba79fa29baf556009622b83 Mon Sep 17 00:00:00 2001 From: Eric Lind Date: Sat, 22 Jul 2017 20:40:47 -0400 Subject: [PATCH 2/3] bool() was unnecessary. 
--- gensim/models/wrappers/fasttext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/wrappers/fasttext.py b/gensim/models/wrappers/fasttext.py index 7124ceabac..40470cca77 100644 --- a/gensim/models/wrappers/fasttext.py +++ b/gensim/models/wrappers/fasttext.py @@ -124,7 +124,7 @@ def __contains__(self, word): return True else: word_ngrams = FastText.compute_ngrams(word, self.min_n, self.max_n) - return bool(any(ng in self.ngrams for ng in word_ngrams)) + return any(ng in self.ngrams for ng in word_ngrams) class FastText(Word2Vec): From 61487b3b228c82050e2a43c6489677efc8452419 Mon Sep 17 00:00:00 2001 From: Eric Lind Date: Fri, 28 Jul 2017 21:14:46 -0400 Subject: [PATCH 3/3] Update variable name and docstring to improve clarity --- gensim/models/wrappers/fasttext.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/gensim/models/wrappers/fasttext.py b/gensim/models/wrappers/fasttext.py index 40470cca77..300c6e5a58 100644 --- a/gensim/models/wrappers/fasttext.py +++ b/gensim/models/wrappers/fasttext.py @@ -116,15 +116,14 @@ def init_sims(self, replace=False): def __contains__(self, word): """ - Check if word is present in the vocabulary, or if any word ngrams are present. A vector for the word is - guaranteed to exist if `__contains__` returns True. - + Check if `word` or any character ngrams in `word` are present in the vocabulary. + A vector for the word is guaranteed to exist if `__contains__` returns True. """ if word in self.vocab: return True else: - word_ngrams = FastText.compute_ngrams(word, self.min_n, self.max_n) - return any(ng in self.ngrams for ng in word_ngrams) + char_ngrams = FastText.compute_ngrams(word, self.min_n, self.max_n) + return any(ng in self.ngrams for ng in char_ngrams) class FastText(Word2Vec):