From d0c57e5d3abb5e9e4d49d009302354e039ae497c Mon Sep 17 00:00:00 2001 From: KokuKUSIAKU Date: Tue, 19 Dec 2017 21:50:59 +0000 Subject: [PATCH 1/7] convert Space class doc to numpy style --- gensim/models/translation_matrix.py | 34 +++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/gensim/models/translation_matrix.py b/gensim/models/translation_matrix.py index 78233136d2..5184c71e4e 100644 --- a/gensim/models/translation_matrix.py +++ b/gensim/models/translation_matrix.py @@ -45,16 +45,36 @@ class Space(object): """ An auxiliary class for storing the the words space - Attributes: - `mat` (ndarray): each row is the word vector of the lexicon - `index2word` (list): a list of words in the `Space` object - `word2index` (dict): map the word to index + Parameters + ---------- + matrix : numpy.array + N * length_of_word_vec, which store the word's vector + index2word : list + a list of words in the `Space` object + word2index : dict + a dict for word indexing + + Attributes + ---------- + mat : N-dimensional array + each row is the word vector of the lexicon + index2word : list + a list of words in the `Space` object + word2index : dict + map the word to index + + Methods + ------- + __init__(matrix, index2word) + build a dictionary to map word to index + normalise() + Normalize the word vector's matrix + """ def __init__(self, matrix, index2word): """ - `matrix`: N * length_of_word_vec, which store the word's vector - `index2word`: a list of words in the `Space` object - `word2index`: a dict which for word indexing + build a dictionary to map word to index + """ self.mat = matrix self.index2word = index2word From 004a11a54fbb7786bc59537a697251e119066ead Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 27 Dec 2017 13:01:11 +0500 Subject: [PATCH 2/7] fix docstrings[1] --- gensim/models/translation_matrix.py | 127 +++++++++++++++------------- 1 file changed, 66 insertions(+), 61 deletions(-) diff --git a/gensim/models/translation_matrix.py b/gensim/models/translation_matrix.py index 5184c71e4e..11641337e0 100644 --- a/gensim/models/translation_matrix.py +++ b/gensim/models/translation_matrix.py @@ -1,79 +1,78 @@ #!/usr/bin/env python # encoding: utf-8 -import warnings -import numpy as np - -from collections import OrderedDict -from gensim import utils -from six import string_types - -""" -Produce translation matrix to translate the word from one language to another language, using either -standard nearest neighbour method or globally corrected neighbour retrieval method [1]. +"""Produce translation matrix to translate the word from one language to another language, using either +standard nearest neighbour method or globally corrected neighbour retrieval method [1]_. This method can be used to augment the existing phrase tables with more candidate translations, or -filter out errors from the translation tables and known dictionaries [2]. What's more, It also work +filter out errors from the translation tables and known dictionaries [2]_. What's more, It also work for any two sets of named-vectors where there are some paired-guideposts to learn the transformation. -Initialize a model with e.g.:: +Examples +-------- +Initialize a word-vector models (this can be instances of :class:`~gensim.models.word2vec.Word2Vec` too) - >>> transmat = TranslationMatrix(word_pair, source_word_vec, target_word_vec) +>>> from gensim.models import KeyedVectors +>>> from gensim.test.utils import datapath, temporary_file +>>> +>>> model_en = KeyedVectors.load_word2vec_format(datapath("EN.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt")) +>>> model_it = KeyedVectors.load_word2vec_format(datapath("IT.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt")) -Train a model with e.g.:: +Define word pairs (that will be used for construction of translation matrix - >>> transmat.train(word_pair) +>>> word_pairs = [ +... ("one", "uno"), ("two", "due"), ("three", "tre"), ("four", "quattro"), ("five", "cinque"), +... ("seven", "sette"), ("eight", "otto"), +... ("dog", "cane"), ("pig", "maiale"), ("fish", "cavallo"), ("birds", "uccelli"), +... ("apple", "mela"), ("orange", "arancione"), ("grape", "acino"), ("banana", "banana") +... ] + +Fit :class:`~gensim.models.translation_matrix.TranslationMatrix` + +>>> trans_model = TranslationMatrix(model_en, model_it, word_pairs=word_pairs) + +Apply model (translate words "dog" and "one") -Persist a model to disk with:: +>>> trans_model.translate(["dog", "one"], topn=3) +OrderedDict([('dog', [u'cane', u'gatto', u'cavallo']), ('one', [u'uno', u'due', u'tre'])]) - >>> transmat.save(fname) - >>> transmat = TranslationMatrix.load(fname) -Translate the source words to target words, for example +Save / load model - >>> transmat.translate(["one", "two", "three"], topn=3) +>>> with temporary_file("model_file") as fname: +... trans_model.save(fname) # save model to file +... loaded_trans_model = TranslationMatrix.load(fname) # load model + +References +---------- .. [1] Dinu, Georgiana, Angeliki Lazaridou, and Marco Baroni. "Improving zero-shot learning by mitigating the - hubness problem." arXiv preprint arXiv:1412.6568 (2014). + hubness problem", https://arxiv.org/abs/1412.6568 .. [2] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. - Distributed Representations of Words and Phrases and their Compositionality. - In Proceedings of NIPS, 2013. + "Distributed Representations of Words and Phrases and their Compositionality", https://arxiv.org/abs/1310.4546 + """ +import warnings +import numpy as np + +from collections import OrderedDict +from gensim import utils +from six import string_types + class Space(object): - """ - An auxiliary class for storing the the words space - - Parameters - ---------- - matrix : numpy.array - N * length_of_word_vec, which store the word's vector - index2word : list - a list of words in the `Space` object - word2index : dict - a dict for word indexing - - Attributes - ---------- - mat : N-dimensional array - each row is the word vector of the lexicon - index2word : list - a list of words in the `Space` object - word2index : dict - map the word to index - - Methods - ------- - __init__(matrix, index2word) - build a dictionary to map word to index - normalise() - Normalize the word vector's matrix + """An auxiliary class for storing the the words space.""" - """ def __init__(self, matrix, index2word): """ - build a dictionary to map word to index + + Parameters + ---------- + matrix : iterable of numpy.ndarray + Matrix that contains word-vectors. + index2word : list of str + Words which correspond to the `matrix`. """ self.mat = matrix @@ -86,14 +85,20 @@ def __init__(self, matrix, index2word): @classmethod def build(cls, lang_vec, lexicon=None): - """ - Construct a space class for the lexicon, if it's provided. - Args: - `lang_vec`: word2vec model that extract word vector for lexicon - `lexicon`: the default is None, if it is not provided, the lexicon is all the lang_vec's word, - i.e. lang_vec.vocab.keys() - Returns: - `Space` object for the lexicon + """Construct a space class for the lexicon, if it's provided. + + Parameters + ---------- + lang_vec : {:class:`~gensim.models.keyedvectors.KeyedVectors`, :class:`~gensim.models.word2vec.Word2Vec`} + Model from which the vectors will be extracted. + lexicon : list of str, optional + Words which contains in the `lang_vec`, if `lexicon = None`, the lexicon is all the lang_vec's word. + + Returns + ------- + :class:`~gensim.models.translation_matrix.Space` + Object that stored word-vectors + """ # `words` to store all the word that # `mat` to store all the word vector for the word in 'words' list @@ -113,7 +118,7 @@ def build(cls, lang_vec, lexicon=None): return Space(mat, words) def normalize(self): - """ Normalize the word vector's matrix """ + """Normalize the word vector's matrix.""" self.mat = self.mat / np.sqrt(np.sum(np.multiply(self.mat, self.mat), axis=1, keepdims=True)) From 71e801a0633638bc5205a30ef58933e05fa8da32 Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 27 Dec 2017 14:43:53 +0500 Subject: [PATCH 3/7] fix docstrings[2] --- gensim/models/translation_matrix.py | 103 +++++++++++++++++----------- 1 file changed, 64 insertions(+), 39 deletions(-) diff --git a/gensim/models/translation_matrix.py b/gensim/models/translation_matrix.py index 11641337e0..dfe978a544 100644 --- a/gensim/models/translation_matrix.py +++ b/gensim/models/translation_matrix.py @@ -10,7 +10,7 @@ Examples -------- -Initialize a word-vector models (this can be instances of :class:`~gensim.models.word2vec.Word2Vec` too) +Initialize a word-vector models >>> from gensim.models import KeyedVectors >>> from gensim.test.utils import datapath, temporary_file @@ -89,7 +89,7 @@ def build(cls, lang_vec, lexicon=None): Parameters ---------- - lang_vec : {:class:`~gensim.models.keyedvectors.KeyedVectors`, :class:`~gensim.models.word2vec.Word2Vec`} + lang_vec : :class:`~gensim.models.keyedvectors.KeyedVectors` Model from which the vectors will be extracted. lexicon : list of str, optional Words which contains in the `lang_vec`, if `lexicon = None`, the lexicon is all the lang_vec's word. @@ -123,36 +123,53 @@ def normalize(self): class TranslationMatrix(utils.SaveLoad): - """ - Objects of this class realize the translation matrix which map the source language - to the target language. + """Objects of this class realize the translation matrix which map the source language to the target language. The main methods are: - 1. constructor, - 2. the `train` method, which initialize everything needed to build a translation matrix - 3. the `translate` method, which given new word and its vector representation. - We map it to the other language space by computing z = Wx, then return the word whose representation is close to z. - The details use seen the notebook (translation_matrix.ipynb) - - >>> transmat = TranslationMatrix(source_lang_vec, target_lang_vec, word_pair) - >>> transmat.train(word_pair) - >>> translated_word = transmat.translate(words, topn=3) + The details use seen the notebook [3]_ + + Examples + -------- + >>> from gensim.models import KeyedVectors + >>> from gensim.test.utils import datapath, temporary_file + >>> + >>> model_en = KeyedVectors.load_word2vec_format(datapath("EN.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt")) + >>> model_it = KeyedVectors.load_word2vec_format(datapath("IT.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt")) + >>> + >>> word_pairs = [ + ... ("one", "uno"), ("two", "due"), ("three", "tre"), ("four", "quattro"), ("five", "cinque"), + ... ("seven", "sette"), ("eight", "otto"), + ... ("dog", "cane"), ("pig", "maiale"), ("fish", "cavallo"), ("birds", "uccelli"), + ... ("apple", "mela"), ("orange", "arancione"), ("grape", "acino"), ("banana", "banana") + ... ] + >>> + >>> trans_model = TranslationMatrix(model_en, model_it) + >>> trans_model.train(word_pairs) + >>> trans_model.translate(["dog", "one"], topn=3) + OrderedDict([('dog', [u'cane', u'gatto', u'cavallo']), ('one', [u'uno', u'due', u'tre'])]) + + + References + ---------- + .. [3] https://github.com/RaRe-Technologies/gensim/blob/3.2.0/docs/notebooks/translation_matrix.ipynb """ def __init__(self, source_lang_vec, target_lang_vec, word_pairs=None, random_state=None): """ - Initialize the model from a list pair of `word_pair`. Each word_pair is tupe - with source language word and target language word. - - Examples: [("one", "uno"), ("two", "due")] + Parameters + ---------- + source_lang_vec : :class:`~gensim.models.keyedvectors.KeyedVectors` + Word vectors for source language. + target_lang_vec : :class:`~gensim.models.keyedvectors.KeyedVectors` + Word vectors for target language. + word_pairs : list of (str, str), optional + Pairs of words that will be used for training. + random_state : {None, int, array_like}, optional + Seed for random state. - Args: - `word_pair` (list): a list pair of words - `source_lang_vec` (KeyedVectors): a set of word vector of source language - `target_lang_vec` (KeyedVectors): a set of word vector of target language """ self.source_word = None @@ -171,14 +188,13 @@ def __init__(self, source_lang_vec, target_lang_vec, word_pairs=None, random_sta self.train(word_pairs) def train(self, word_pairs): - """ - Build the translation matrix that mapping from source space to target space. + """Build the translation matrix that mapping from source space to target space. - Args: - `word_pairs` (list): a list pair of words + Parameters + ---------- + word_pairs : list of (str, str), optional + Pairs of words that will be used for training. - Returns: - `translation matrix` that mapping from the source language to target language """ self.source_word, self.target_word = zip(*word_pairs) @@ -194,27 +210,36 @@ def train(self, word_pairs): self.translation_matrix = np.linalg.lstsq(m1, m2, -1)[0] def save(self, *args, **kwargs): - """ - Save the model to file but ignoring the souce_space and target_space - """ + """Save the model to file but ignoring the `source_space` and `target_space`""" kwargs['ignore'] = kwargs.get('ignore', ['source_space', 'target_space']) - super(TranslationMatrix, self).save(*args, **kwargs) @classmethod def load(cls, *args, **kwargs): - """ Load the pre-trained translation matrix model""" + """Load the pre-trained translation matrix model. + + Parameters + ---------- + *args : str + Path to stored model + + """ model = super(TranslationMatrix, cls).load(*args, **kwargs) return model def apply_transmat(self, words_space): - """ - Map the source word vector to the target word vector using translation matrix - Args: - `words_space`: the `Space` object that constructed for those words to be translate + """Map the source word vector to the target word vector using translation matrix. + + Parameters + ---------- + words_space : :class:`~gensim.models.translation_matrix.Space` + Object that constructed for those words to be translate. + + Returns + ------- + :class:`~gensim.models.translation_matrix.Space` + Object that constructed for those mapped words. - Returns: - A `Space` object that constructed for those mapped words """ return Space(np.dot(words_space.mat, self.translation_matrix), words_space.index2word) From 86aad4cf8f7a855f9fa0fd715c74838f2eccdad4 Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 27 Dec 2017 14:50:11 +0500 Subject: [PATCH 4/7] remove useless load --- gensim/models/translation_matrix.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/gensim/models/translation_matrix.py b/gensim/models/translation_matrix.py index dfe978a544..f3b9e9f6f4 100644 --- a/gensim/models/translation_matrix.py +++ b/gensim/models/translation_matrix.py @@ -214,19 +214,6 @@ def save(self, *args, **kwargs): kwargs['ignore'] = kwargs.get('ignore', ['source_space', 'target_space']) super(TranslationMatrix, self).save(*args, **kwargs) - @classmethod - def load(cls, *args, **kwargs): - """Load the pre-trained translation matrix model. - - Parameters - ---------- - *args : str - Path to stored model - - """ - model = super(TranslationMatrix, cls).load(*args, **kwargs) - return model - def apply_transmat(self, words_space): """Map the source word vector to the target word vector using translation matrix. @@ -261,8 +248,6 @@ def translate(self, source_words, topn=5, gc=0, sample_num=None, source_lang_vec Returns: A OrderedDict object, each item is (word : `topn` translated words) - [1] Dinu, Georgiana, Angeliki Lazaridou, and Marco Baroni. "Improving zero-shot learning by mitigating the - hubness problem." arXiv preprint arXiv:1412.6568 (2014). """ if isinstance(source_words, string_types): From e51ba9c9f179a86ca6b447afa8464a3ec90792fb Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 27 Dec 2017 16:40:36 +0500 Subject: [PATCH 5/7] fix docstrings[3] --- gensim/models/translation_matrix.py | 82 +++++++++++++++++------------ 1 file changed, 48 insertions(+), 34 deletions(-) diff --git a/gensim/models/translation_matrix.py b/gensim/models/translation_matrix.py index f3b9e9f6f4..70870ab49a 100644 --- a/gensim/models/translation_matrix.py +++ b/gensim/models/translation_matrix.py @@ -10,6 +10,8 @@ Examples -------- +**How to make translation between two set of word-vectors** + Initialize a word-vector models >>> from gensim.models import KeyedVectors @@ -44,6 +46,10 @@ ... loaded_trans_model = TranslationMatrix.load(fname) # load model +**How to make translation between two :class:`~gensim.models.doc2vec.Doc2Vec` models** + + + References ---------- .. [1] Dinu, Georgiana, Angeliki Lazaridou, and Marco Baroni. "Improving zero-shot learning by mitigating the @@ -231,22 +237,28 @@ def apply_transmat(self, words_space): return Space(np.dot(words_space.mat, self.translation_matrix), words_space.index2word) def translate(self, source_words, topn=5, gc=0, sample_num=None, source_lang_vec=None, target_lang_vec=None): - """ - Translate the word from the source language to the target language, and return the topn - most similar words. - Args: - `source_words`(str/list): single word or a list of words to be translated - `topn`: return the top N similar words. By default (`topn=5`) - `gc`: defines the training algorithm. By default (`gc=0`), use standard NN retrieval. - Otherwise use globally corrected neighbour retrieval method(as described in[1]). - `sample_num`: an int parameter that specify the number of word to sample from the source lexicon. - if `gc=1`, then `sample_num` must be provided. - `source_lang_vec`: you can specify the source language vector for translation, the default is to use - the model's source language vector. - `target_lang_vec`: you can specify the target language vector for retrieving the most similar word, - the default is to use the model's target language vector. - Returns: - A OrderedDict object, each item is (word : `topn` translated words) + """Translate the word from the source language to the target language. + + Parameters + ---------- + source_words : {str, list of str} + Single word or a list of words to be translated + topn : int, optional + Number of words than will be returned as translation for each `source_words` + gc : int, optional + Define translation algorithm, if `gc == 0` - use standard NN retrieval, + otherwise, use globally corrected neighbour retrieval method (as described in [1]_). + sample_num : int, optional + Number of word to sample from the source lexicon, if `gc == 1`, then `sample_num` **must** be provided. + source_lang_vec : :class:`~gensim.models.keyedvectors.KeyedVectors`, optional + New source language vectors for translation, by default, used the model's source language vector. + target_lang_vec : :class:`~gensim.models.keyedvectors.KeyedVectors`, optional + New target language vectors for translation, by default, used the model's target language vector. + + Returns + ------- + :class:`collections.OrderedDict` + Ordered dict where each item is `word`: [`translated_word_1`, `translated_word_2`, ...] """ @@ -315,25 +327,20 @@ def translate(self, source_words, topn=5, gc=0, sample_num=None, source_lang_vec class BackMappingTranslationMatrix(utils.SaveLoad): - """ - Objects of this class realize the BackMapping translation matrix which map the - source model's document vector to the target model's document vector(old model). - The main methods are: + """Realize the BackMapping translation matrix which map the source model's document vector + to the target model's document vector(old model). - 1. constructor, initializing - 2. the `train` method, which build a translation matrix - 3. the `infer_vector` method, which given the target model's document vector + We map it to the other language space by computing z = Wx, then return the + word whose representation is close to z. - We map it to the other language space by computing z = Wx, then return the - word whose representation is close to z. + the details use seen the notebook [3]_. - the details use seen the notebook (translation matrix revist.ipynb) + >>> transmat = BackMappingTranslationMatrix(tagged, source_lang_vec, target_lang_vec) + >>> transmat.train(word_pair) + >>> infered_vec = transmat.infer_vector(tagged_doc) - >>> transmat = BackMappingTranslationMatrix(tagged, source_lang_vec, target_lang_vec) - >>> transmat.train(word_pair) - >>> infered_vec = transmat.infer_vector(tagged_doc) - """ + """ def __init__(self, tagged_docs, source_lang_vec, target_lang_vec, random_state=None): """ Initialize the model from a list of `tagged_docs`. Each word_pair is tupe @@ -369,11 +376,18 @@ def train(self, tagged_docs): return self.translation_matrix def infer_vector(self, target_doc_vec): - """ - Translate the target model's document vector to the source model's document vector + """Translate the target model's document vector to the source model's document vector + + Parameters + ---------- + target_doc_vec : numpy.ndarray + Document vector + + Returns + ------- + numpy.ndarray + Vector `target_doc_vec` in the source model. - Returns: - `infered_vec` the tagged_doc's document vector in the source model """ infered_vec = np.dot(target_doc_vec, self.translation_matrix) return infered_vec From 0fcc2cc9fd551a297ad23b26815bccedbeacd2d7 Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 27 Dec 2017 17:17:31 +0500 Subject: [PATCH 6/7] add missing import --- gensim/models/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gensim/models/__init__.py b/gensim/models/__init__.py index c25e8e7795..88e0abf28d 100644 --- a/gensim/models/__init__.py +++ b/gensim/models/__init__.py @@ -20,6 +20,7 @@ from .atmodel import AuthorTopicModel # noqa:F401 from .ldaseqmodel import LdaSeqModel # noqa:F401 from .fasttext import FastText # noqa:F401 +from .translation_matrix import TranslationMatrix, BackMappingTranslationMatrix # noqa:F401 from . import wrappers # noqa:F401 From a9b0c6c6626f5b98ec73767b051db2040fd0e497 Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 27 Dec 2017 17:17:49 +0500 Subject: [PATCH 7/7] fix docstrings[4] --- gensim/models/translation_matrix.py | 75 +++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 19 deletions(-) diff --git a/gensim/models/translation_matrix.py b/gensim/models/translation_matrix.py index 70870ab49a..1dd7cd6b25 100644 --- a/gensim/models/translation_matrix.py +++ b/gensim/models/translation_matrix.py @@ -16,6 +16,7 @@ >>> from gensim.models import KeyedVectors >>> from gensim.test.utils import datapath, temporary_file +>>> from gensim.models import TranslationMatrix >>> >>> model_en = KeyedVectors.load_word2vec_format(datapath("EN.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt")) >>> model_it = KeyedVectors.load_word2vec_format(datapath("IT.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt")) @@ -48,6 +49,25 @@ **How to make translation between two :class:`~gensim.models.doc2vec.Doc2Vec` models** +Prepare data and models + +>>> from gensim.test.utils import datapath +>>> from gensim.test.test_translation_matrix import read_sentiment_docs +>>> from gensim.models import Doc2Vec, BackMappingTranslationMatrix +>>> +>>> data = read_sentiment_docs(datapath("alldata-id-10.txt"))[:5] +>>> src_model = Doc2Vec.load(datapath("small_tag_doc_5_iter50")) +>>> dst_model = Doc2Vec.load(datapath("large_tag_doc_10_iter50")) + +Train backward translation + +>>> model_trans = BackMappingTranslationMatrix(data, src_model, dst_model) +>>> trans_matrix = model_trans.train(data) + + +Apply model + +>>> result = model_trans.infer_vector(dst_model.docvecs[data[3].tags]) References @@ -335,25 +355,37 @@ class BackMappingTranslationMatrix(utils.SaveLoad): the details use seen the notebook [3]_. - >>> transmat = BackMappingTranslationMatrix(tagged, source_lang_vec, target_lang_vec) - >>> transmat.train(word_pair) - >>> infered_vec = transmat.infer_vector(tagged_doc) - + Examples + -------- + >>> from gensim.test.utils import datapath + >>> from gensim.test.test_translation_matrix import read_sentiment_docs + >>> from gensim.models import Doc2Vec, BackMappingTranslationMatrix + >>> + >>> data = read_sentiment_docs(datapath("alldata-id-10.txt"))[:5] + >>> src_model = Doc2Vec.load(datapath("small_tag_doc_5_iter50")) + >>> dst_model = Doc2Vec.load(datapath("large_tag_doc_10_iter50")) + >>> + >>> model_trans = BackMappingTranslationMatrix(data, src_model, dst_model) + >>> trans_matrix = model_trans.train(data) + >>> + >>> result = model_trans.infer_vector(dst_model.docvecs[data[3].tags]) """ def __init__(self, tagged_docs, source_lang_vec, target_lang_vec, random_state=None): """ - Initialize the model from a list of `tagged_docs`. Each word_pair is tupe - with source language word and target language word. - Examples: [("one", "uno"), ("two", "due")] + Parameters + ---------- + tagged_docs : list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional + Documents that will be used for training + source_lang_vec : :class:`~gensim.models.doc2vec.Doc2Vec` + Source Doc2Vec model. + target_lang_vec : :class:`~gensim.models.doc2vec.Doc2Vec` + Target Doc2Vec model. + random_state : {None, int, array_like}, optional + Seed for random state. - Args: - `tagged_docs` (list): a list of tagged document - `source_lang_vec` (Doc2vec): provide the document vector - `target_lang_vec` (Doc2vec): provide the document vector """ - self.tagged_docs = tagged_docs self.source_lang_vec = source_lang_vec self.target_lang_vec = target_lang_vec @@ -362,13 +394,19 @@ def __init__(self, tagged_docs, source_lang_vec, target_lang_vec, random_state=N self.translation_matrix = None def train(self, tagged_docs): - """ - Build the translation matrix that mapping from the source model's vector to target model's vector + """Build the translation matrix that mapping from the source model's vector to target model's vector - Returns: - `translation matrix` that mapping from the source model's vector to target model's vector - """ + Parameters + ---------- + tagged_docs : list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional + THIS ARGUMENT WILL BE IGNORED. + + Returns + ------- + numpy.ndarray + Translation matrix that mapping from the source model's vector to target model's vector. + """ m1 = [self.source_lang_vec.docvecs[item.tags].flatten() for item in self.tagged_docs] m2 = [self.target_lang_vec.docvecs[item.tags].flatten() for item in self.tagged_docs] @@ -389,5 +427,4 @@ def infer_vector(self, target_doc_vec): Vector `target_doc_vec` in the source model. """ - infered_vec = np.dot(target_doc_vec, self.translation_matrix) - return infered_vec + return np.dot(target_doc_vec, self.translation_matrix)