diff --git a/docs/notebooks/translation_matrix.ipynb b/docs/notebooks/translation_matrix.ipynb
index 0d252f4545..7e8bab2cbc 100644
--- a/docs/notebooks/translation_matrix.ipynb
+++ b/docs/notebooks/translation_matrix.ipynb
@@ -160,7 +160,7 @@
 },
 "outputs": [],
 "source": [
-    "#Load the target language word vector\n",
+    "# Load the target language word vector\n",
     "target_word_vec_file = \"IT.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt\"\n",
     "target_word_vec = KeyedVectors.load_word2vec_format(target_word_vec_file, binary=False)"
 ]
@@ -193,7 +193,7 @@
 }
 ],
 "source": [
-    "transmat = translation_matrix.TranslationMatrix(word_pair, source_word_vec, target_word_vec)\n",
+    "transmat = translation_matrix.TranslationMatrix(source_word_vec, target_word_vec, word_pair)\n",
     "transmat.train(word_pair)\n",
     "print \"the shape of translation matrix is: \", transmat.translation_matrix.shape"
 ]
@@ -232,9 +232,9 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-    "/home/robotcator/PycharmProjects/gensim/gensim/models/translation_matrix.py:220: UserWarning: The parameter source_lang_vec isn't specified, use the model's source language word vector as default.\n",
+    "/home/robotcator/PycharmProjects/gensim/gensim/models/translation_matrix.py:224: UserWarning: The parameter source_lang_vec isn't specified, use the model's source language word vector as default.\n",
     "  warnings.warn(\"The parameter source_lang_vec isn't specified, use the model's source language word vector as default.\")\n",
-    "/home/robotcator/PycharmProjects/gensim/gensim/models/translation_matrix.py:224: UserWarning: The parameter target_lang_vec isn't specified, use the model's target language word vector as default.\n",
+    "/home/robotcator/PycharmProjects/gensim/gensim/models/translation_matrix.py:228: UserWarning: The parameter target_lang_vec isn't specified, use the model's target language word vector as default.\n",
     "  warnings.warn(\"The parameter target_lang_vec isn't specified, use the model's target language word vector as default.\")\n"
 ]
 }
 ],
@@ -243,7 +243,7 @@
 "# the pair is (English, Italian); we can check whether each translated word is right or not\n",
 "words = [(\"one\", \"uno\"), (\"two\", \"due\"), (\"three\", \"tre\"), (\"four\", \"quattro\"), (\"five\", \"cinque\")]\n",
 "source_word, target_word = zip(*words)\n",
-    "translated_word = transmat.translate(source_word, 5)"
+    "translated_word = transmat.translate(source_word, topn=5)"
 ]
 },
 {
@@ -400,7 +400,7 @@
 },
 {
 "cell_type": "code",
-    "execution_count": 11,
+    "execution_count": 12,
 "metadata": {
 "collapsed": false,
 "deletable": true,
 "editable": true
 },
@@ -421,7 +421,7 @@
 "    sub_pair = word_pair[: (idx + 1) * step]\n",
 "\n",
 "    startTime = time.time()\n",
-    "    transmat = translation_matrix.TranslationMatrix(sub_pair, source_word_vec, target_word_vec)\n",
+    "    transmat = translation_matrix.TranslationMatrix(source_word_vec, target_word_vec, sub_pair)\n",
 "    transmat.train(sub_pair)\n",
 "    endTime = time.time()\n",
 "    \n",
@@ -431,7 +431,7 @@
 },
 {
 "cell_type": "code",
-    "execution_count": 12,
+    "execution_count": 13,
 "metadata": {
 "collapsed": false,
 "deletable": true,
 "editable": true
 },
@@ -469,16 +469,16 @@
 20940
 ],
 "y": [
-    0.6137721538543701,
-    0.5209219455718994,
-    0.728323221206665,
-    0.9210660457611084,
-    1.135694980621338,
-    1.3455538749694824,
-    1.5958888530731201,
-    1.819183111190796,
-    2.0376498699188232,
-    2.496438980102539
+    0.5877759456634521,
+    0.8401670455932617,
+    0.9247369766235352,
+    1.2453999519348145,
+    1.60801100730896,
+    1.892496109008789,
+    2.141044855117798,
+    2.1962528228759766,
+    2.7086141109466553,
+    3.2611770629882812
 ]
 }
 ],
@@ -487,10 +487,10 @@
 }
 },
 "text/html": [
- "[elided plotly <div> markup]"
+ "[elided plotly <div> markup]"
 ],
 "text/vnd.plotly.v1+html": [
- "[elided plotly <div> markup]"
+ "[elided plotly <div> markup]
" ] }, "metadata": {}, @@ -542,7 +542,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": { "collapsed": false, "deletable": true, @@ -572,7 +572,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": { "collapsed": false, "deletable": true, @@ -606,7 +606,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": { "collapsed": false, "deletable": true, @@ -675,10 +675,10 @@ } }, "text/html": [ - "
" + "
" ], "text/vnd.plotly.v1+html": [ - "
" + "
" ] }, "metadata": {}, @@ -723,7 +723,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": { "collapsed": false, "deletable": true, @@ -778,7 +778,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": { "collapsed": false, "deletable": true, @@ -882,10 +882,10 @@ } }, "text/html": [ - "
" + "
" ], "text/vnd.plotly.v1+html": [ - "
" + "
" ] }, "metadata": {}, @@ -963,7 +963,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": { "collapsed": false, "deletable": true, @@ -997,7 +997,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": { "collapsed": false, "deletable": true, @@ -1072,10 +1072,10 @@ } }, "text/html": [ - "
" + "
" ], "text/vnd.plotly.v1+html": [ - "
" + "
" ] }, "metadata": {}, @@ -1108,7 +1108,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "metadata": { "collapsed": false, "deletable": true, @@ -1163,7 +1163,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "metadata": { "collapsed": false, "deletable": true, @@ -1261,10 +1261,10 @@ } }, "text/html": [ - "
" + "
" ], "text/vnd.plotly.v1+html": [ - "
" + "
" ] }, "metadata": {}, @@ -1338,12 +1338,16 @@ "editable": true }, "source": [ - "# Tranlation Matrix Revisit" + "# Tranlation Matrix Revisit \n", + "## Warning: this part is unstable/experimental, it requires more experimentation and will change soon!" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "As dicussion in this [PR](https://github.com/RaRe-Technologies/gensim/pull/1434), Translation Matrix not only can used to translate the words from one source language to another target lanuage, but also to translate new document vectors back to old model space.\n", "\n", @@ -1352,7 +1356,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "In this notebook, we use the IMDB dataset as example. For more information about this dataset, please refer to [this](http://ai.stanford.edu/~amaas/data/sentiment/). And some of code are borrowed from this [notebook](http://localhost:8888/notebooks/docs/notebooks/doc2vec-IMDB.ipynb)" ] @@ -1361,7 +1368,9 @@ "cell_type": "code", "execution_count": 2, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1411,7 +1420,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Here, we train two Doc2vec model, the parameters can be determined by yourself. We trained on 15k documents for the `model1` and 50k documents for the `model2`. But you should mixed some documents which from the 15k document in `model` to the `model2` as dicussed before. " ] @@ -1420,7 +1432,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -1453,7 +1467,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "For the IMDB training dataset, we train an classifier on the train data which has 25k documents with positive and negative label. Then using this classifier to predict the test data. To see what accuracy can the document vectors which learned by different method achieve." ] @@ -1462,7 +1479,9 @@ "cell_type": "code", "execution_count": 3, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -1480,7 +1499,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "For the experiment one, we use the vector which learned by the Doc2vec method.To evalute those document vector, we use split those 50k document into two part, one for training and the other for testing." ] @@ -1489,7 +1511,9 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -1558,7 +1582,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "For the experiment two, the document vectors are learned by the back-mapping method, which has a linear mapping for the `model1` and `model2`. Using this method like translation matrix for the word translation, If we provide the vector for the addtional 35k document vector in `model2`, we can infer this vector for the `model1`." 
]
@@ -1567,7 +1594,9 @@
 "cell_type": "code",
 "execution_count": 5,
 "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
 },
 "outputs": [
 {
@@ -1634,14 +1663,20 @@
 },
 {
 "cell_type": "markdown",
-    "metadata": {},
+    "metadata": {
+        "deletable": true,
+        "editable": true
+    },
 "source": [
 "As we can see, the vectors learned by the back-mapping method perform reasonably well, but they still need improvement."
 ]
 },
 {
 "cell_type": "markdown",
-    "metadata": {},
+    "metadata": {
+        "deletable": true,
+        "editable": true
+    },
 "source": [
 "### Visualization\n",
 "We pick some documents and extract their vectors from both `model1` and `model2`; we can see that they also share a similar geometric arrangement."
 ]
@@ -1651,7 +1686,9 @@
 "cell_type": "code",
 "execution_count": 6,
 "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
 },
 "outputs": [
 {
@@ -1689,7 +1726,9 @@
 "cell_type": "code",
 "execution_count": 7,
 "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
 },
 "outputs": [
 {
@@ -1792,7 +1831,9 @@
 "cell_type": "code",
 "execution_count": 12,
 "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
 },
 "outputs": [
 {
@@ -1915,7 +1956,10 @@
 },
 {
 "cell_type": "markdown",
-    "metadata": {},
+    "metadata": {
+        "deletable": true,
+        "editable": true
+    },
 "source": [
 "You will probably see points of several colors. One set is for `model1`: the `sdoc0` to `sdoc4` document vectors are learned by Doc2vec, while `sdoc5` and `sdoc6` are learned by back-mapping. The other set is for `model2`: `tdoc0` to `tdoc6` are all learned by Doc2vec. We can see that some of the points learned by the back-mapping method still keep their relative positions to the points learned by Doc2vec."
 ]
 },
@@ -1924,7 +1968,9 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "deletable": true,
+    "editable": true
 },
 "outputs": [],
 "source": []
diff --git a/gensim/models/translation_matrix.py b/gensim/models/translation_matrix.py
index ac1155f20b..e32b580ffd 100644
--- a/gensim/models/translation_matrix.py
+++ b/gensim/models/translation_matrix.py
@@ -13,7 +13,8 @@
 standard nearest neighbour method or globally corrected neighbour retrieval method [1].
 
 This method can be used to augment the existing phrase tables with more candidate translations, or
-filter out errors from the translation tables and known dictionaries [2].
+filter out errors from the translation tables and known dictionaries [2]. Moreover, it also works
+for any two sets of named vectors where there are some paired guideposts from which to learn the transformation.
 
 Initialize a model with e.g.::
 
@@ -45,7 +46,7 @@ class Space(object):
 
     Attributes:
         `mat` (ndarray): each row is the word vector of the lexicon
-        `index2word` (list): a list of lexicon
+        `index2word` (list): a list of words in the `Space` object
         `word2index` (dict): map the word to index
     """
     def __init__(self, matrix, index2word):
@@ -65,7 +66,7 @@ def __init__(self, matrix, index2word):
     @classmethod
     def build(cls, lang_vec, lexicon=None):
         """
-        construct a space class for the lexicon, if it's provided.
+        Construct a space class for the lexicon, if it's provided.
        Args:
            `lang_vec`: word2vec model that extracts word vectors for the lexicon
            `lexicon`: the default is None, if it is not provided, the lexicon
@@ -91,7 +92,7 @@ def build(cls, lang_vec, lexicon=None):
         return Space(mat, words)
 
     def normalize(self):
-        """ normalized the word vector's matrix """
+        """ Normalize the word vector's matrix """
         self.mat = self.mat / np.sqrt(np.sum(np.multiply(self.mat, self.mat), axis=1, keepdims=True))
 
 
@@ -104,17 +105,17 @@ class TranslationMatrix(utils.SaveLoad):
     1. constructor,
     2. the `train` method, which initializes everything needed to build a translation matrix
     3. the `translate` method, which, given a new word and its vector representation,
-    we map it to the other language space by computing z = Wx, then return the
+    maps it to the other language space by computing z = Wx, and then returns the
     word whose representation is close to z.
 
-    the details use seen the notebook (translation_matrix.ipynb)
+    The details can be seen in the notebook (translation_matrix.ipynb).
 
-    >>> transmat = TranslationMatrix(word_pair, source_lang_vec, target_lang_vec)
+    >>> transmat = TranslationMatrix(source_lang_vec, target_lang_vec, word_pair)
     >>> transmat.train(word_pair)
     >>> translated_word = transmat.translate(words, topn=3)
     """
-    def __init__(self, word_pair, source_lang_vec, target_lang_vec, random_state=None):
+    def __init__(self, source_lang_vec, target_lang_vec, word_pairs=None, random_state=None):
         """
         Initialize the model from a list of `word_pairs`. Each word pair is a tuple
        with a source language word and a target language word.
@@ -127,10 +128,8 @@ def __init__(self, word_pair, source_lang_vec, target_lang_vec, random_state=Non
            `target_lang_vec` (KeyedVectors): a set of word vector of target language
         """
-        if len(word_pair[0]) != 2:
-            raise ValueError("Each training data item must contain two different language words.")
-
-        self.source_word, self.target_word = zip(*word_pair)
+        self.source_word = None
+        self.target_word = None
 
         self.source_lang_vec = source_lang_vec
         self.target_lang_vec = target_lang_vec
@@ -139,18 +138,23 @@ def __init__(self, word_pair, source_lang_vec, target_lang_vec, random_state=Non
         self.source_space = None
         self.target_space = None
 
-    def train(self, word_pair):
+        if word_pairs is not None:
+            if len(word_pairs[0]) != 2:
+                raise ValueError("Each training data item must contain two different language words.")
+            self.train(word_pairs)
+
+    def train(self, word_pairs):
         """
-        build the translation matrix that mapping from source space to target space.
+        Build the translation matrix that maps the source space onto the target space.
        Args:
-            `word_pair` (list): a list pair of words
-            `source_space` (Space object): source language space
-            `target_space` (Space object): target language space
+            `word_pairs` (list): a list of word pairs
 
        Returns:
            `translation matrix` that maps from the source language to the target language
        """
+        self.source_word, self.target_word = zip(*word_pairs)
+
         self.source_space = Space.build(self.source_lang_vec, set(self.source_word))
         self.target_space = Space.build(self.target_lang_vec, set(self.target_word))
@@ -161,7 +165,6 @@ def train(self, word_pair):
         m2 = self.target_space.mat[[self.target_space.word2index[item] for item in self.target_word], :]
 
         self.translation_matrix = np.linalg.lstsq(m1, m2, -1)[0]
-        return self.translation_matrix
 
     def save(self, *args, **kwargs):
         """
@@ -173,13 +176,13 @@ def save(self, *args, **kwargs):
 
     @classmethod
     def load(cls, *args, **kwargs):
-        """ load the pre-trained translation matrix model"""
+        """ Load the pre-trained translation matrix model"""
         model = super(TranslationMatrix, cls).load(*args, **kwargs)
         return model
 
     def apply_transmat(self, words_space):
         """
-        mapping the source word vector to the target word vector using translation matrix
+        Map the source word vectors to the target word vectors using the translation matrix
         Args:
            `words_space`: the `Space` object constructed for the words to be translated
@@ -190,7 +193,7 @@
 
     def translate(self, source_words, topn=5, gc=0, sample_num=None, source_lang_vec=None, target_lang_vec=None):
         """
-        translate the word from the source language to the target language, and return the topn
+        Translate the words from the source language to the target language, and return the topn
         most similar words.
         Args:
            `source_words`(str/list): single word or a list of words to be translated
@@ -214,7 +217,7 @@
         # pass only one word to translate
             source_words = [source_words]
 
-        # if the language word vector not provided by user, use the model's
+        # If the language word vector is not provided by the user, use the model's
         # language word vector as default
         if source_lang_vec is None:
             warnings.warn("The parameter source_lang_vec isn't specified, use the model's source language word vector as default.")
@@ -224,7 +227,7 @@ def translate(self, source_words, topn=5, gc=0, sample_num=None, source_lang_vec
             warnings.warn("The parameter target_lang_vec isn't specified, use the model's target language word vector as default.")
             target_lang_vec = self.target_lang_vec
 
-        # if additional is provided, bootstrapping vocabulary from the source language word vector model.
+        # If `gc` is set, bootstrap additional vocabulary from the source language word vector model.
         if gc:
             if sample_num is None:
                 raise RuntimeError("When using the globally corrected neighbour retrieval method, the `sample_num` parameter (i.e. the number of words sampled from the source space) must be provided.")
@@ -236,28 +239,28 @@ def translate(self, source_words, topn=5, gc=0, sample_num=None, source_lang_vec
             source_space = Space.build(source_lang_vec, source_words)
             target_space = Space.build(target_lang_vec, )
 
-        # normalize the source vector and target vector
+        # Normalize the source vectors and target vectors
         source_space.normalize()
         target_space.normalize()
 
-        # map the source language to the target language
+        # Map the source language to the target language
         mapped_source_space = self.apply_transmat(source_space)
 
-        # use the cosine similarity metric
+        # Use the cosine similarity metric
         sim_matrix = -np.dot(target_space.mat, mapped_source_space.mat.T)
 
-        # if `gc=1`, using corrected retrieval method
+        # If `gc=1`, use the globally corrected retrieval method
         if gc:
             srtd_idx = np.argsort(np.argsort(sim_matrix, axis=1), axis=1)
             sim_matrix_idx = np.argsort(srtd_idx + sim_matrix, axis=0)
         else:
             sim_matrix_idx = np.argsort(sim_matrix, axis=0)
 
-        # translate the words and for each word return the `topn` similar words
+        # Translate the words, and for each word return the `topn` most similar words
         translated_word = OrderedDict()
         for idx, word in enumerate(source_words):
             translated_target_word = []
-            # search the most `topn` similar words
+            # Search for the `topn` most similar words
             for j in range(topn):
                 map_space_id = sim_matrix_idx[j, source_space.word2index[word]]
                 translated_target_word.append(target_space.index2word[map_space_id])
@@ -292,7 +295,7 @@ def __init__(self, tagged_docs, source_lang_vec, target_lang_vec, random_state=N
 
         Examples: [("one", "uno"), ("two", "due")]
 
         Args:
-            `tagged_docs` (list): a list tagged document
+            `tagged_docs` (list): a list of tagged documents
            `source_lang_vec` (Doc2vec): provide the document vector
            `target_lang_vec` (Doc2vec): provide the document vector
         """
@@ -306,7 +309,7 @@ def __init__(self, tagged_docs, source_lang_vec, target_lang_vec, random_state=N
 
     def train(self, tagged_docs):
         """
-        build the translation matrix that mapping from the source model's vector to target model's vector
+        Build the translation matrix that maps the source model's vectors to the target model's vectors
 
         Returns:
            `translation matrix` that maps from the source model's vectors to the target model's vectors
         """
@@ -320,7 +323,7 @@ def train(self, tagged_docs):
 
     def infer_vector(self, target_doc_vec):
         """
-        translate the target model's document vector to the source model's document vector
+        Translate the target model's document vector into the source model's document vector
 
         Returns:
            `infered_vec` the tagged_doc's document vector in the source model
diff --git a/gensim/test/test_translation_matrix.py b/gensim/test/test_translation_matrix.py
index 72cf0637a0..fc241d978d 100644
--- a/gensim/test/test_translation_matrix.py
+++ b/gensim/test/test_translation_matrix.py
@@ -25,64 +25,55 @@ def temp_save_file():
 class TestTranslationMatrix(unittest.TestCase):
     def setUp(self):
-        self.train_file = datapath("OPUS_en_it_europarl_train_one2ten.txt")
-
         self.source_word_vec_file = datapath("EN.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt")
         self.target_word_vec_file = datapath("IT.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt")
 
-        with utils.smart_open(self.train_file, "r") as f:
-            self.word_pair = [tuple(utils.to_unicode(line).strip().split()) for line in f]
+        self.word_pairs = [("one", "uno"), ("two", "due"), ("three", "tre"),
+            ("four", "quattro"), ("five", "cinque"), ("seven", "sette"), ("eight", "otto"),
+            ("dog", "cane"), ("pig", "maiale"), ("fish", "cavallo"),
("birds", "uccelli"), + ("apple", "mela"), ("orange", "arancione"), ("grape", "acino"), ("banana", "banana") + ] + + self.test_word_pairs = [("ten", "dieci"), ("cat", "gatto")] self.source_word_vec = KeyedVectors.load_word2vec_format(self.source_word_vec_file, binary=False) self.target_word_vec = KeyedVectors.load_word2vec_format(self.target_word_vec_file, binary=False) def test_translation_matrix(self): - model = translation_matrix.TranslationMatrix(self.word_pair, self.source_word_vec, self.target_word_vec) - transmat = model.train(self.word_pair) - self.assertEqual(transmat.shape, (300, 300)) + model = translation_matrix.TranslationMatrix(self.source_word_vec, self.target_word_vec, self.word_pairs) + model.train(self.word_pairs) + self.assertEqual(model.translation_matrix.shape, (300, 300)) def testPersistence(self): """Test storing/loading the entire model.""" - model = translation_matrix.TranslationMatrix(self.word_pair, self.source_word_vec, self.target_word_vec) - model.train(self.word_pair) + model = translation_matrix.TranslationMatrix(self.source_word_vec, self.target_word_vec, self.word_pairs) + model.train(self.word_pairs) model.save(temp_save_file()) loaded_model = translation_matrix.TranslationMatrix.load(temp_save_file()) self.assertTrue(np.allclose(model.translation_matrix, loaded_model.translation_matrix)) def test_translate_nn(self): - # test the nearest neighbor retrieval method - model = translation_matrix.TranslationMatrix(self.word_pair, self.source_word_vec, self.target_word_vec) - model.train(self.word_pair) - - test_word_pair = [("one", "uno"), ("two", "due"), ("apple", "mela"), ("orange", "aranicione"), ("dog", "cane"), ("pig", "maiale"), ("cat", "gatto")] - test_source_word, test_target_word = zip(*test_word_pair) - translated_words = model.translate(test_source_word, topn=3) - - self.assertTrue("uno" in translated_words["one"]) - self.assertTrue("due" in translated_words["two"]) - self.assertTrue("mela" in translated_words["apple"]) - self.assertTrue("arancione" in translated_words["orange"]) - self.assertTrue("cane" in translated_words["dog"]) - self.assertTrue("maiale" in translated_words["pig"]) - self.assertTrue("gatto" in translated_words["cat"]) + # Test the nearest neighbor retrieval method + model = translation_matrix.TranslationMatrix(self.source_word_vec, self.target_word_vec, self.word_pairs) + model.train(self.word_pairs) + + test_source_word, test_target_word = zip(*self.test_word_pairs) + translated_words = model.translate(test_source_word, topn=5, source_lang_vec=self.source_word_vec, target_lang_vec=self.target_word_vec) + + for idx, item in enumerate(self.test_word_pairs): + self.assertTrue(item[1] in translated_words[item[0]]) def test_translate_gc(self): - # test globally corrected neighbour retrieval method - model = translation_matrix.TranslationMatrix(self.word_pair, self.source_word_vec, self.target_word_vec) - model.train(self.word_pair) - - test_word_pair = [("one", "uno"), ("two", "due"), ("apple", "mela"), ("orange", "aranicione"), ("dog", "cane"), ("pig", "maiale"), ("cat", "gatto")] - test_source_word, test_target_word = zip(*test_word_pair) - translated_words = model.translate(test_source_word, topn=3, gc=1, sample_num=10) - - self.assertTrue("uno" in translated_words["one"]) - self.assertTrue("due" in translated_words["two"]) - self.assertTrue("mela" in translated_words["apple"]) - self.assertTrue("arancione" in translated_words["orange"]) - self.assertTrue("cane" in translated_words["dog"]) - self.assertTrue("maiale" in 
translated_words["pig"]) - self.assertTrue("gatto" in translated_words["cat"]) + # Test globally corrected neighbour retrieval method + model = translation_matrix.TranslationMatrix(self.source_word_vec, self.target_word_vec, self.word_pairs) + model.train(self.word_pairs) + + test_source_word, test_target_word = zip(*self.test_word_pairs) + translated_words = model.translate(test_source_word, topn=5, gc=1, sample_num=3, source_lang_vec=self.source_word_vec, target_lang_vec=self.target_word_vec) + + for idx, item in enumerate(self.test_word_pairs): + self.assertTrue(item[1] in translated_words[item[0]]) def read_sentiment_docs(filename):