Skip to content

Commit

Permalink
Remove direct access to properties moved to KeyedVectors (#1147)
Browse files Browse the repository at this point in the history
  • Loading branch information
tmylk authored Feb 16, 2017
1 parent 38393cc commit f05b7b1
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 140 deletions.
104 changes: 0 additions & 104 deletions gensim/models/word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,9 +330,6 @@ class Word2Vec(utils.SaveLoad):
"""

# TODO: delete this flag after direct access to syn0norm, syn0, vocab is removed
keyed_vector_warnings = True

def __init__(
self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
Expand Down Expand Up @@ -1092,21 +1089,6 @@ def seeded_vector(self, seed_string):
once = random.RandomState(self.hashfxn(seed_string) & 0xffffffff)
return (once.rand(self.vector_size) - 0.5) / self.vector_size

def save_word2vec_format(self, *args, **kwargs):
    """Deprecated shim that forwards to ``self.wv.save_word2vec_format``.

    All positional and keyword arguments are passed through unchanged and the
    delegate's return value is returned as-is. A warning is logged first,
    unless ``Word2Vec.keyed_vector_warnings`` is falsy.
    """
    if Word2Vec.keyed_vector_warnings:
        logger.warning(
            'word2vec.save_word2vec_format will be deprecated in future gensim releases. '
            'Please use model.wv.save_word2vec_format'
        )
    return self.wv.save_word2vec_format(*args, **kwargs)

@classmethod
def load_word2vec_format(cls, *args, **kwargs):
    """Deprecated shim around ``KeyedVectors.load_word2vec_format``.

    The arguments are forwarded unchanged to the KeyedVectors loader. The
    loaded vectors are then attached as ``wv`` to a fresh ``cls`` instance
    whose ``size`` is the second dimension of the loaded ``syn0`` matrix.
    A warning is logged first, unless ``Word2Vec.keyed_vector_warnings`` is
    falsy.

    Returns the new ``cls`` instance.
    """
    if Word2Vec.keyed_vector_warnings:
        logger.warning(
            'Word2vec.load_word2vec_format will be deprecated in future gensim releases. '
            'Please use KeyedVectors.load_word2vec_format'
        )

    wv = KeyedVectors.load_word2vec_format(*args, **kwargs)
    result = cls(size=wv.syn0.shape[1])
    result.wv = wv
    return result

def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='utf8', unicode_errors='strict'):
"""
Merge the input-hidden weight matrix from the original C word2vec-tool format
Expand Down Expand Up @@ -1177,86 +1159,6 @@ def doesnt_match(self, words):
def __getitem__(self, words):
    """Delegate item lookup to the underlying ``self.wv`` object."""
    keyed_vectors = self.wv
    return keyed_vectors.__getitem__(words)

@staticmethod
def disable_keyed_vectors_warnings():
    """Set the class-wide ``Word2Vec.keyed_vector_warnings`` flag to False."""
    setattr(Word2Vec, 'keyed_vector_warnings', False)

@staticmethod
def enable_keyed_vectors_warnings():
    """Set the class-wide ``Word2Vec.keyed_vector_warnings`` flag to True."""
    setattr(Word2Vec, 'keyed_vector_warnings', True)

def _deprecated_wv_property(name):
    """Build a property that proxies attribute ``name`` to ``self.wv``.

    Getting, setting and deleting are all forwarded to the same-named
    attribute on ``self.wv``. Each access first logs a warning, unless
    ``Word2Vec.keyed_vector_warnings`` is falsy.
    """
    # Formatted once so the logged text is identical for get, set and delete.
    message = (
        'direct access to %s will not be supported in future gensim releases, '
        'please use model.wv.%s' % (name, name)
    )

    def _warn():
        # The flag is read on every access, so toggling it takes effect immediately.
        if Word2Vec.keyed_vector_warnings:
            logger.warning(message)

    def _get(self):
        _warn()
        return getattr(self.wv, name)

    def _set(self, value):
        _warn()
        setattr(self.wv, name, value)

    def _delete(self):
        _warn()
        delattr(self.wv, name)

    return property(_get, _set, _delete)


# Proxies for attributes that now live on ``self.wv``.
syn0norm = _deprecated_wv_property('syn0norm')
syn0 = _deprecated_wv_property('syn0')
vocab = _deprecated_wv_property('vocab')
index2word = _deprecated_wv_property('index2word')

# The factory is only needed while the properties are being defined.
del _deprecated_wv_property

def __contains__(self, word):
    """Delegate the membership test to the underlying ``self.wv`` object."""
    keyed_vectors = self.wv
    return keyed_vectors.__contains__(word)

Expand Down Expand Up @@ -1329,19 +1231,14 @@ def delete_temporary_training_data(self, replace_word_vectors_with_normalized=Fa

def save(self, *args, **kwargs):
    # No docstring here: ``save.__doc__`` is assigned from ``utils.SaveLoad.save`` below.
    # TODO: after introducing KeyedVectors now syn0, vocab, id2word are saved TWO times. Once in word2vec and once in keyedvectors
    # After keyedvectors are deprecated it will be only once

    # don't bother storing the cached normalized vectors, recalculable table
    kwargs.setdefault('ignore', ['syn0norm', 'table', 'cum_table'])

    # Silence the deprecated-attribute warnings while the parent save runs, then put
    # the flag back exactly as it was: a failed save must not leave warnings off, and
    # a caller who disabled them on purpose must not find them switched back on.
    warnings_were_enabled = Word2Vec.keyed_vector_warnings
    Word2Vec.disable_keyed_vectors_warnings()
    try:
        super(Word2Vec, self).save(*args, **kwargs)
    finally:
        if warnings_were_enabled:
            Word2Vec.enable_keyed_vectors_warnings()


save.__doc__ = utils.SaveLoad.save.__doc__

@classmethod
def load(cls, *args, **kwargs):
Word2Vec.disable_keyed_vectors_warnings()
model = super(Word2Vec, cls).load(*args, **kwargs)
# update older models
if hasattr(model, 'table'):
Expand All @@ -1363,7 +1260,6 @@ def load(cls, *args, **kwargs):
if not hasattr(model, 'train_count'):
model.train_count = 0
model.total_train_time = 0
Word2Vec.enable_keyed_vectors_warnings()
return model

def _load_specials(self, *args, **kwargs):
Expand Down
2 changes: 1 addition & 1 deletion gensim/similarities/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def build_from_word2vec(self):
"""Build an Annoy index using word vectors from a Word2Vec model"""

self.model.init_sims()
return self._build_from_model(self.model.wv.syn0norm, self.model.index2word
return self._build_from_model(self.model.wv.syn0norm, self.model.wv.index2word
, self.model.vector_size)

def build_from_doc2vec(self):
Expand Down
4 changes: 2 additions & 2 deletions gensim/test/test_doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ def test_mixed_tag_types(self):

def models_equal(self, model, model2):
# check words/hidden-weights
self.assertEqual(len(model.vocab), len(model2.vocab))
self.assertEqual(len(model.wv.vocab), len(model2.wv.vocab))
self.assertTrue(np.allclose(model.wv.syn0, model2.wv.syn0))
if model.hs:
self.assertTrue(np.allclose(model.syn1, model2.syn1))
Expand All @@ -306,7 +306,7 @@ def test_delete_temporary_training_data(self):
self.assertTrue(hasattr(model, 'syn0_lockf'))
model.delete_temporary_training_data(keep_doctags_vectors=False, keep_inference=False)
self.assertTrue(len(model['human']), 10)
self.assertTrue(model.vocab['graph'].count, 5)
self.assertTrue(model.wv.vocab['graph'].count, 5)
self.assertTrue(not hasattr(model, 'syn1'))
self.assertTrue(not hasattr(model, 'syn1neg'))
self.assertTrue(not hasattr(model, 'syn0_lockf'))
Expand Down
6 changes: 3 additions & 3 deletions gensim/test/test_similarities.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,16 +485,16 @@ def testLoadMissingRaisesError(self):
self.assertRaises(IOError, test_index.load, fname='test-index')

def assertVectorIsSimilarToItself(self, model, index):
    """Assert that the nearest neighbour of the first vector is its own word.

    Takes row 0 of ``model.wv.syn0norm`` and entry 0 of
    ``model.wv.index2word``, asks ``index.most_similar`` for the single
    closest item, and checks that it is that same word with similarity 1.0.
    """
    # Read straight from ``model.wv``; the model-level ``syn0norm`` and
    # ``index2word`` lookups were dead stores overwritten by these values.
    keyed_vectors = model.wv
    vector = keyed_vectors.syn0norm[0]
    label = keyed_vectors.index2word[0]
    approx_neighbors = index.most_similar(vector, 1)
    word, similarity = approx_neighbors[0]

    self.assertEqual(word, label)
    self.assertEqual(similarity, 1.0)

def assertApproxNeighborsMatchExact(self, model, index):
vector = model.syn0norm[0]
vector = model.wv.syn0norm[0]
approx_neighbors = model.most_similar([vector], topn=5, indexer=index)
exact_neighbors = model.most_similar(positive=[vector], topn=5)

Expand Down
62 changes: 32 additions & 30 deletions gensim/test/test_word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,63 +210,65 @@ def testLoadPreKeyedVectorModel(self):

def testLoadPreKeyedVectorModelCFormat(self):
    """Test loading pre-KeyedVectors word2vec model saved in word2vec format"""
    # Load through KeyedVectors directly rather than the deprecated
    # Word2Vec.load_word2vec_format shim, which only repeated the same check.
    model = keyedvectors.KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'))
    # assertEqual reports both numbers when the row count and vocab size disagree.
    self.assertEqual(model.syn0.shape[0], len(model.vocab))

def testPersistenceWord2VecFormat(self):
    """Test storing/loading the entire model in word2vec format."""
    model = word2vec.Word2Vec(sentences, min_count=1)
    model.init_sims()
    model.wv.save_word2vec_format(testfile(), binary=True)

    # Raw vectors must survive the save/load round trip.
    binary_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True)
    binary_model_kv.init_sims(replace=False)
    self.assertTrue(np.allclose(model['human'], binary_model_kv['human']))

    # After init_sims(replace=True) a lookup must match the trained model's
    # syn0norm row instead of its raw vector.
    norm_only_model = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True)
    norm_only_model.init_sims(replace=True)
    self.assertFalse(np.allclose(model['human'], norm_only_model['human']))
    self.assertTrue(np.allclose(model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human']))

    # limit=3 must load exactly three vectors.
    limited_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True, limit=3)
    self.assertEqual(len(limited_model_kv.syn0), 3)

    # Loading as float16 must take half the bytes of the default load.
    half_precision_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True, datatype=np.float16)
    self.assertEqual(binary_model_kv.syn0.nbytes, half_precision_model_kv.syn0.nbytes * 2)

def testNoTrainingCFormat(self):
    """Training a model whose vectors were loaded from word2vec format must raise ValueError."""
    model = word2vec.Word2Vec(sentences, min_count=1)
    model.init_sims()
    model.wv.save_word2vec_format(testfile(), binary=True)
    kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True)
    # Attach the loaded vectors to a fresh, untrained model. The earlier
    # ``binary_model`` built via the deprecated loader was overwritten unused.
    binary_model = word2vec.Word2Vec()
    binary_model.wv = kv
    self.assertRaises(ValueError, binary_model.train, sentences)


def testTooShortBinaryWord2VecFormat(self):
    """Loading a binary file whose header claims too many vectors must raise EOFError."""
    tfile = testfile()
    model = word2vec.Word2Vec(sentences, min_count=1)
    model.init_sims()
    model.wv.save_word2vec_format(tfile, binary=True)
    # ``with`` closes the handle even if the write fails.
    with open(tfile, 'r+b') as f:
        f.write(b'13')  # write wrong (too-long) vector count
    self.assertRaises(EOFError, keyedvectors.KeyedVectors.load_word2vec_format, tfile, binary=True)

def testTooShortTextWord2VecFormat(self):
    """Loading a text file whose header claims too many vectors must raise EOFError."""
    tfile = testfile()
    model = word2vec.Word2Vec(sentences, min_count=1)
    model.init_sims()
    model.wv.save_word2vec_format(tfile, binary=False)
    # ``with`` closes the handle even if the write fails.
    with open(tfile, 'r+b') as f:
        f.write(b'13')  # write wrong (too-long) vector count
    self.assertRaises(EOFError, keyedvectors.KeyedVectors.load_word2vec_format, tfile, binary=False)

def testPersistenceWord2VecFormatNonBinary(self):
"""Test storing/loading the entire model in word2vec non-binary format."""
model = word2vec.Word2Vec(sentences, min_count=1)
model.init_sims()
model.save_word2vec_format(testfile(), binary=False)
text_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=False)
model.wv.save_word2vec_format(testfile(), binary=False)
text_model = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=False)
text_model.init_sims(False)
self.assertTrue(np.allclose(model['human'], text_model['human'], atol=1e-6))
norm_only_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=False)
norm_only_model = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=False)
norm_only_model.init_sims(True)
self.assertFalse(np.allclose(model['human'], norm_only_model['human'], atol=1e-6))
self.assertTrue(np.allclose(model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human'], atol=1e-4))
Expand All @@ -276,9 +278,9 @@ def testPersistenceWord2VecFormatWithVocab(self):
model = word2vec.Word2Vec(sentences, min_count=1)
model.init_sims()
testvocab = os.path.join(tempfile.gettempdir(), 'gensim_word2vec.vocab')
model.save_word2vec_format(testfile(), testvocab, binary=True)
binary_model_with_vocab = word2vec.Word2Vec.load_word2vec_format(testfile(), testvocab, binary=True)
self.assertEqual(model.wv.vocab['human'].count, binary_model_with_vocab.wv.vocab['human'].count)
model.wv.save_word2vec_format(testfile(), testvocab, binary=True)
binary_model_with_vocab_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), testvocab, binary=True)
self.assertEqual(model.wv.vocab['human'].count, binary_model_with_vocab_kv.vocab['human'].count)

def testPersistenceKeyedVectorsFormatWithVocab(self):
"""Test storing/loading the entire model and vocabulary in word2vec format."""
Expand All @@ -292,15 +294,15 @@ def testPersistenceKeyedVectorsFormatWithVocab(self):

def testPersistenceWord2VecFormatCombinationWithStandardPersistence(self):
    """Test storing/loading the entire model and vocabulary in word2vec format chained with
    saving and loading via `save` and `load` methods.
    It was possible prior to 1.0.0 release, now raises Exception"""
    model = word2vec.Word2Vec(sentences, min_count=1)
    model.init_sims()
    testvocab = os.path.join(tempfile.gettempdir(), 'gensim_word2vec.vocab')
    model.wv.save_word2vec_format(testfile(), testvocab, binary=True)
    binary_model_with_vocab_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), testvocab, binary=True)
    # Save the KeyedVectors object, then check that Word2Vec.load rejects
    # that file with AttributeError.
    binary_model_with_vocab_kv.save(testfile())
    self.assertRaises(AttributeError, word2vec.Word2Vec.load, testfile())


def testLargeMmap(self):
Expand Down Expand Up @@ -416,7 +418,7 @@ def model_sanity(self, model, train=True):
orig0 = np.copy(model.wv.syn0[0])
model.train(list_corpus)
self.assertFalse((orig0 == model.wv.syn0[1]).all()) # vector should vary after training
sims = model.most_similar('war', topn=len(model.index2word))
sims = model.most_similar('war', topn=len(model.wv.index2word))
t_rank = [word for word, score in sims].index('terrorism')
# in >200 calibration runs w/ calling parameters, 'terrorism' in 50-most_sim for 'war'
self.assertLess(t_rank, 50)
Expand Down

0 comments on commit f05b7b1

Please sign in to comment.