diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6828e3ac89..56eef755ca 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,8 +3,11 @@ Changes
 
 ## Unreleased
 
+This release contains a major refactoring.
+
 ### :+1: Improvements
 
+* KeyedVectors & X2Vec API streamlining, consistency (PR [#2698](https://github.com/RaRe-Technologies/gensim/pull/2698), __[@gojomo](https://github.com/gojomo)__)
 * No more wheels for x32 platforms (if you need x32 binaries, please build them yourself).
   (__[menshikh-iv](https://github.com/menshikh-iv)__, [#6](https://github.com/RaRe-Technologies/gensim-wheels/pull/6))
 * Speed up random number generation in word2vec model (PR [#2864](https://github.com/RaRe-Technologies/gensim/pull/2864), __[@zygm0nt](https://github.com/zygm0nt)__)
diff --git a/MANIFEST.in b/MANIFEST.in
index 2ad20ee9f8..8aa14d25b8 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -28,8 +28,6 @@
 include gensim/models/fasttext_inner.pxd
 include gensim/models/fasttext_corpusfile.cpp
 include gensim/models/fasttext_corpusfile.pyx
-include gensim/models/_utils_any2vec.c
-include gensim/models/_utils_any2vec.pyx
 include gensim/corpora/_mmreader.c
 include gensim/corpora/_mmreader.pyx
 include gensim/_matutils.c
diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst
index 6218336b06..1e3e341487 100644
--- a/docs/src/apiref.rst
+++ b/docs/src/apiref.rst
@@ -53,8 +53,6 @@ Modules:
    models/coherencemodel
    models/basemodel
    models/callbacks
-   models/utils_any2vec
-   models/_utils_any2vec
    models/word2vec_inner
    models/doc2vec_inner
    models/fasttext_inner
@@ -63,13 +61,6 @@ Modules:
    models/wrappers/ldavowpalwabbit.rst
    models/wrappers/wordrank
    models/wrappers/varembed
-   models/wrappers/fasttext
-   models/deprecated/doc2vec
-   models/deprecated/fasttext
-   models/deprecated/word2vec
-   models/deprecated/keyedvectors
-   models/deprecated/fasttext_wrapper
-   models/base_any2vec
    similarities/docsim
    similarities/termsim
    similarities/index
diff --git a/docs/src/auto_examples/tutorials/run_fasttext.rst b/docs/src/auto_examples/tutorials/run_fasttext.rst
index 23277ad4c3..1cef50800d 100644
--- a/docs/src/auto_examples/tutorials/run_fasttext.rst
+++ b/docs/src/auto_examples/tutorials/run_fasttext.rst
@@ -479,7 +479,7 @@ The example training corpus is a toy corpus, results are not expected to be good
 .. code-block:: none
 
     /Volumes/work/workspace/gensim_misha/gensim/models/keyedvectors.py:877: FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.
-      vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
+      vectors = vstack(self.get_vector(word, use_norm=True) for word in used_words).astype(REAL)
 
     'breakfast'
diff --git a/docs/src/auto_examples/tutorials/run_word2vec.rst b/docs/src/auto_examples/tutorials/run_word2vec.rst
index 6bc27f3bf6..67921de03c 100644
--- a/docs/src/auto_examples/tutorials/run_word2vec.rst
+++ b/docs/src/auto_examples/tutorials/run_word2vec.rst
@@ -308,7 +308,7 @@ Which of the below does not belong in the sequence?
 .. code-block:: none
 
     /home/misha/git/gensim/gensim/models/keyedvectors.py:877: FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.
- vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL) + vectors = vstack(self.get_vector(word, use_norm=True) for word in used_words).astype(REAL) car diff --git a/docs/src/models/_utils_any2vec.rst b/docs/src/models/_utils_any2vec.rst deleted file mode 100644 index 46e5541ec3..0000000000 --- a/docs/src/models/_utils_any2vec.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`models._utils_any2vec` -- Cython utils for any2vec models -=============================================================== - -.. automodule:: gensim.models._utils_any2vec - :synopsis: Cython utils for any2vec models - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/models/base_any2vec.rst b/docs/src/models/base_any2vec.rst deleted file mode 100644 index e6685cda66..0000000000 --- a/docs/src/models/base_any2vec.rst +++ /dev/null @@ -1,10 +0,0 @@ -:mod:`models.base_any2vec` -- Base classes for any2vec models -============================================================= - -.. automodule:: gensim.models.base_any2vec - :synopsis: Base classes for any2vec models - :members: - :inherited-members: - :special-members: __getitem__ - :undoc-members: - :show-inheritance: diff --git a/docs/src/models/deprecated/doc2vec.rst b/docs/src/models/deprecated/doc2vec.rst deleted file mode 100644 index e8fb2d96b3..0000000000 --- a/docs/src/models/deprecated/doc2vec.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`models.deprecated.doc2vec` -- Deep learning with paragraph2vec -==================================================================== - -.. automodule:: gensim.models.deprecated.doc2vec - :synopsis: Deep learning with doc2vec - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/models/deprecated/fasttext.rst b/docs/src/models/deprecated/fasttext.rst deleted file mode 100644 index 08de0234d2..0000000000 --- a/docs/src/models/deprecated/fasttext.rst +++ /dev/null @@ -1,10 +0,0 @@ -:mod:`models.deprecated.fasttext` -- FastText model -=================================================== - -.. automodule:: gensim.models.deprecated.fasttext - :synopsis: FastText model - :members: - :inherited-members: - :special-members: __getitem__ - :undoc-members: - :show-inheritance: diff --git a/docs/src/models/deprecated/fasttext_wrapper.rst b/docs/src/models/deprecated/fasttext_wrapper.rst deleted file mode 100644 index 020504de24..0000000000 --- a/docs/src/models/deprecated/fasttext_wrapper.rst +++ /dev/null @@ -1,10 +0,0 @@ -:mod:`models.deprecated.fasttext_wrapper` -- Wrapper for Facebook implementation of FastText model -================================================================================================== - -.. automodule:: gensim.models.deprecated.fasttext_wrapper - :synopsis: FastText model - :members: - :inherited-members: - :special-members: __getitem__ - :undoc-members: - :show-inheritance: diff --git a/docs/src/models/deprecated/keyedvectors.rst b/docs/src/models/deprecated/keyedvectors.rst deleted file mode 100644 index 7d55cbc798..0000000000 --- a/docs/src/models/deprecated/keyedvectors.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`models.deprecated.keyedvectors` -- Store and query word vectors -===================================================================== - -.. 
automodule:: gensim.models.deprecated.keyedvectors - :synopsis: Store and query word vectors - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/models/deprecated/word2vec.rst b/docs/src/models/deprecated/word2vec.rst deleted file mode 100644 index 3b80aaf196..0000000000 --- a/docs/src/models/deprecated/word2vec.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`models.deprecated.word2vec` -- Deep learning with word2vec -================================================================ - -.. automodule:: gensim.models.deprecated.word2vec - :synopsis: Deep learning with word2vec - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/models/utils_any2vec.rst b/docs/src/models/utils_any2vec.rst deleted file mode 100644 index 123ee265e6..0000000000 --- a/docs/src/models/utils_any2vec.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`models.utils_any2vec` -- Utils for any2vec models -======================================================= - -.. automodule:: gensim.models.utils_any2vec - :synopsis: Utils for any2vec models - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/models/wrappers/fasttext.rst b/docs/src/models/wrappers/fasttext.rst deleted file mode 100644 index 4476cc7b43..0000000000 --- a/docs/src/models/wrappers/fasttext.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`models.wrappers.fasttext` -- Wrapper for FastText implementation from Facebook -==================================================================================== - -.. automodule:: gensim.models.wrappers.fasttext - :synopsis: FastText - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py index a8d8e498fa..c9ebf1841b 100644 --- a/gensim/corpora/sharded_corpus.py +++ b/gensim/corpora/sharded_corpus.py @@ -22,10 +22,10 @@ import logging import os import math -import numpy -import scipy.sparse as sparse import time +import numpy +import scipy.sparse as sparse from six.moves import range import gensim @@ -263,9 +263,7 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtyp is_corpus, corpus = gensim.utils.is_corpus(corpus) if not is_corpus: - raise ValueError( - "Cannot initialize shards without a corpus to read from! (Got corpus type: {0})".format(type(corpus)) - ) + raise ValueError("Cannot initialize shards without a corpus to read from! Corpus type: %s" % type(corpus)) proposed_dim = self._guess_n_features(corpus) if proposed_dim != self.dim: @@ -360,7 +358,7 @@ def load_shard(self, n): filename = self._shard_name(n) if not os.path.isfile(filename): - raise ValueError('Attempting to load nonexistent shard no. {0}'.format(n)) + raise ValueError('Attempting to load nonexistent shard no. %s' % n) shard = gensim.utils.unpickle(filename) self.current_shard = shard @@ -387,11 +385,9 @@ def shard_by_offset(self, offset): """ k = int(offset / self.shardsize) if offset >= self.n_docs: - raise ValueError('Too high offset specified ({0}), available ' - 'docs: {1}'.format(offset, self.n_docs)) + raise ValueError('Too high offset specified (%s), available docs: %s' % (offset, self.n_docs)) if offset < 0: - raise ValueError('Negative offset {0} currently not' - ' supported.'.format(offset)) + raise ValueError('Negative offset %s currently not supported.' 
% offset) return k def in_current(self, offset): @@ -411,7 +407,7 @@ def in_next(self, offset): """ if self.current_shard_n == self.n_shards: return False # There's no next shard. - return (self.offsets[self.current_shard_n + 1] <= offset) and (offset < self.offsets[self.current_shard_n + 2]) + return self.offsets[self.current_shard_n + 1] <= offset and offset < self.offsets[self.current_shard_n + 2] def resize_shards(self, shardsize): """ @@ -440,9 +436,8 @@ def resize_shards(self, shardsize): if new_stop > self.n_docs: # Sanity check assert new_shard_idx == n_new_shards - 1, \ - 'Shard no. {0} that ends at {1} over last document' \ - ' ({2}) is not the last projected shard ({3})???' \ - ''.format(new_shard_idx, new_stop, self.n_docs, n_new_shards) + 'Shard no. %r that ends at %r over last document (%r) is not the last projected shard (%r)' % ( + new_shard_idx, new_stop, self.n_docs, n_new_shards) new_stop = self.n_docs new_shard = self[new_start:new_stop] @@ -466,9 +461,9 @@ def resize_shards(self, shardsize): for old_shard_n, old_shard_name in enumerate(old_shard_names): os.remove(old_shard_name) except Exception as e: - logger.error( - 'Exception occurred during old shard no. %d removal: %s.\nAttempting to at least move new shards in.', - old_shard_n, str(e) + logger.exception( + 'Error during old shard no. %d removal: %s.\nAttempting to at least move new shards in.', + old_shard_n, str(e), ) finally: # If something happens with cleaning up - try to at least get the @@ -479,7 +474,7 @@ def resize_shards(self, shardsize): # If something happens when we're in this stage, we're screwed. except Exception as e: logger.exception(e) - raise RuntimeError('Resizing completely failed for some reason. Sorry, dataset is probably ruined...') + raise RuntimeError('Resizing completely failed. Sorry, dataset is probably ruined...') finally: # Sets the new shard stats. self.n_shards = n_new_shards @@ -524,19 +519,18 @@ def _guess_n_features(self, corpus): else: if not self.dim: raise TypeError( - "Couldn't find number of features, refusing to guess " - "(dimension set to {0}, type of corpus: {1})." - .format(self.dim, type(corpus)) + "Couldn't find number of features, refusing to guess. Dimension: %s, corpus: %s)" % ( + self.dim, type(corpus), + ) ) - else: - logger.warning("Couldn't find number of features, trusting supplied dimension (%d)", self.dim) - n_features = self.dim + logger.warning("Couldn't find number of features, trusting supplied dimension (%d)", self.dim) + n_features = self.dim if self.dim and n_features != self.dim: logger.warning( "Discovered inconsistent dataset dim (%d) and feature count from corpus (%d). " "Coercing to dimension given by argument.", - self.dim, n_features + self.dim, n_features, ) return n_features @@ -591,7 +585,7 @@ def __getitem__(self, offset): start = offset.start stop = offset.stop if stop > self.n_docs: - raise IndexError('Requested slice offset {0} out of range ({1} docs)'.format(stop, self.n_docs)) + raise IndexError('Requested slice offset %s out of range (%s docs)' % (stop, self.n_docs)) # - get range of shards over which to iterate first_shard = self.shard_by_offset(start) @@ -674,21 +668,23 @@ def __getitem__(self, offset): def __add_to_slice(self, s_result, result_start, result_stop, start, stop): """ - Add the rows of the current shard from `start` to `stop` + Add rows of the current shard from `start` to `stop` into rows `result_start` to `result_stop` of `s_result`. - Operation is based on the self.sparse_serialize setting. 
If the shard + Operation is based on the ``self.sparse_serialize`` setting. If the shard contents are dense, then s_result is assumed to be an ndarray that already supports row indices `result_start:result_stop`. If the shard contents are sparse, assumes that s_result has `result_start` rows and we should add them up to `result_stop`. - Returns the resulting s_result. + Return the resulting ``s_result``. + """ if (result_stop - result_start) != (stop - start): raise ValueError( - 'Result start/stop range different than stop/start range (%d - %d vs. %d - %d)' - % (result_start, result_stop, start, stop) + 'Result start/stop range different than stop/start range (%s - %s vs. %s - %s)' % ( + result_start, result_stop, start, stop, + ) ) # Dense data: just copy using numpy's slice notation @@ -699,16 +695,16 @@ def __add_to_slice(self, s_result, result_start, result_stop, start, stop): # A bit more difficult, we're using a different structure to build the # result. - else: - if s_result.shape != (result_start, self.dim): - raise ValueError( - 'Assuption about sparse s_result shape invalid: {0} expected rows, {1} real rows.' - .format(result_start, s_result.shape[0]) + if s_result.shape != (result_start, self.dim): + raise ValueError( + 'Assuption about sparse s_result shape invalid: %s expected rows, %s real rows.' % ( + result_start, s_result.shape[0], ) + ) - tmp_matrix = self.current_shard[start:stop] - s_result = sparse.vstack([s_result, tmp_matrix]) - return s_result + tmp_matrix = self.current_shard[start:stop] + s_result = sparse.vstack([s_result, tmp_matrix]) + return s_result def _getitem_format(self, s_result): if self.sparse_serialization: @@ -817,5 +813,9 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres Ignore the parameters id2word, index_fname, progress_cnt, labels and metadata. They currently do nothing and are here only to - provide a compatible method signature with superclass.""" - serializer.save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs) + provide a compatible method signature with superclass. + + """ + serializer.save_corpus( + fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs, + ) diff --git a/gensim/models/__init__.py b/gensim/models/__init__.py index a0ee690550..ee054b167d 100644 --- a/gensim/models/__init__.py +++ b/gensim/models/__init__.py @@ -13,7 +13,7 @@ from .logentropy_model import LogEntropyModel # noqa:F401 from .word2vec import Word2Vec # noqa:F401 from .doc2vec import Doc2Vec # noqa:F401 -from .keyedvectors import KeyedVectors, WordEmbeddingSimilarityIndex # noqa:F401 +from .keyedvectors import KeyedVectors # noqa:F401 from .ldamulticore import LdaMulticore # noqa:F401 from .phrases import Phrases # noqa:F401 from .normmodel import NormModel # noqa:F401 @@ -23,7 +23,6 @@ from .translation_matrix import TranslationMatrix, BackMappingTranslationMatrix # noqa:F401 from . import wrappers # noqa:F401 -from . 
import deprecated # noqa:F401 from gensim import interfaces, utils diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py index 3b7af85f9e..26337d51eb 100644 --- a/gensim/models/_fasttext_bin.py +++ b/gensim/models/_fasttext_bin.py @@ -435,7 +435,7 @@ def _get_field_from_model(model, field): requested field name, fields are listed in the `_NEW_HEADER_FORMAT` list """ if field == 'bucket': - return model.trainables.bucket + return model.wv.bucket elif field == 'dim': return model.vector_size elif field == 'epoch': @@ -457,7 +457,7 @@ def _get_field_from_model(model, field): elif field == 'minn': return model.wv.min_n elif field == 'min_count': - return model.vocabulary.min_count + return model.min_count elif field == 'model': # `model` => cbow:1, sg:2, sup:3 # cbow = continous bag of words (default) @@ -467,7 +467,7 @@ def _get_field_from_model(model, field): elif field == 'neg': return model.negative elif field == 't': - return model.vocabulary.sample + return model.sample elif field == 'word_ngrams': # This is skipped in gensim loading setting, using the default from FB C++ code return 1 @@ -531,9 +531,9 @@ def _dict_save(fout, model, encoding): # In the unsupervised case we have only words (no labels). Hence both fields # are equal. - fout.write(np.int32(len(model.wv.vocab)).tobytes()) + fout.write(np.int32(len(model.wv)).tobytes()) - fout.write(np.int32(len(model.wv.vocab)).tobytes()) + fout.write(np.int32(len(model.wv)).tobytes()) # nlabels=0 <- no labels we are in unsupervised mode fout.write(np.int32(0).tobytes()) @@ -544,7 +544,7 @@ def _dict_save(fout, model, encoding): fout.write(np.int64(-1)) for word in model.wv.index2word: - word_count = model.wv.vocab[word].count + word_count = model.wv.get_vecattr(word, 'count') fout.write(word.encode(encoding)) fout.write(_END_OF_WORD_MARKER) fout.write(np.int64(word_count).tobytes()) @@ -572,7 +572,7 @@ def _input_save(fout, model): ngrams_n, ngrams_dim = model.wv.vectors_ngrams.shape assert vocab_dim == ngrams_dim - assert vocab_n == len(model.wv.vocab) + assert vocab_n == len(model.wv) assert ngrams_n == model.wv.bucket fout.write(struct.pack('@2q', vocab_n + ngrams_n, vocab_dim)) @@ -596,9 +596,9 @@ def _output_save(fout, model): saved model """ if model.hs: - hidden_output = model.trainables.syn1 + hidden_output = model.syn1 if model.negative: - hidden_output = model.trainables.syn1neg + hidden_output = model.syn1neg hidden_n, hidden_dim = hidden_output.shape fout.write(struct.pack('@2q', hidden_n, hidden_dim)) diff --git a/gensim/models/_utils_any2vec.pyx b/gensim/models/_utils_any2vec.pyx deleted file mode 100644 index cc4ba9bbb4..0000000000 --- a/gensim/models/_utils_any2vec.pyx +++ /dev/null @@ -1,147 +0,0 @@ -#!/usr/bin/env cython -# cython: boundscheck=False -# cython: wraparound=False -# cython: cdivision=True -# cython: embedsignature=True -# coding: utf-8 - -"""General functions used for any2vec models.""" - -# -# This is here to support older versions of the MSVC compiler that don't have stdint.h. -# -cdef extern from "stdint_wrapper.h": - ctypedef unsigned int uint32_t - ctypedef signed char int8_t - -from six import PY2 -import numpy as np -cimport numpy as np - - -cpdef ft_hash_bytes(bytes bytez): - """Calculate hash based on `bytez`. - Reproduce `hash method from Facebook fastText implementation - `_. - - Parameters - ---------- - bytez : bytes - The string whose hash needs to be calculated, encoded as UTF-8. - - Returns - ------- - unsigned int - The hash of the string. 
- - """ - cdef uint32_t h = 2166136261 - cdef char b - - for b in bytez: - h = h ^ (b) - h = h * 16777619 - return h - - -cpdef ft_hash_broken(unicode string): - """Calculate hash based on `string`. - - This implementation is broken, see https://github.com/RaRe-Technologies/gensim/issues/2059. - It is here only for maintaining backwards compatibility with older models. - - Parameters - ---------- - string : unicode - The string whose hash needs to be calculated. - - Returns - ------- - unsigned int - The hash of the string. - - """ - cdef unsigned int h = 2166136261 - for c in string: - h ^= ord(c) - h *= 16777619 - return h - - -cpdef compute_ngrams(word, unsigned int min_n, unsigned int max_n): - """Get the list of all possible ngrams for a given word. - - Parameters - ---------- - word : str - The word whose ngrams need to be computed. - min_n : unsigned int - Minimum character length of the ngrams. - max_n : unsigned int - Maximum character length of the ngrams. - - Returns - ------- - list of str - Sequence of character ngrams. - - """ - cdef unicode extended_word = f'<{word}>' - ngrams = [] - for ngram_length in range(min_n, min(len(extended_word), max_n) + 1): - for i in range(0, len(extended_word) - ngram_length + 1): - ngrams.append(extended_word[i:i + ngram_length]) - return ngrams - -# -# UTF-8 bytes that begin with 10 are subsequent bytes of a multi-byte sequence, -# as opposed to a new character. -# -cdef unsigned char _MB_MASK = 0xC0 -cdef unsigned char _MB_START = 0x80 - - -cpdef compute_ngrams_bytes(word, unsigned int min_n, unsigned int max_n): - """Computes ngrams for a word. - - Ported from the original FB implementation. - - Parameters - ---------- - word : str - A unicode string. - min_n : unsigned int - The minimum ngram length. - max_n : unsigned int - The maximum ngram length. - - Returns: - -------- - list of str - A list of ngrams, where each ngram is a list of **bytes**. - - See Also - -------- - `Original implementation `__ - - """ - cdef bytes utf8_word = ('<%s>' % word).encode("utf-8") - cdef const unsigned char *bytez = utf8_word - cdef size_t num_bytes = len(utf8_word) - cdef size_t j, i, n - - ngrams = [] - for i in range(num_bytes): - if bytez[i] & _MB_MASK == _MB_START: - continue - - j, n = i, 1 - while j < num_bytes and n <= max_n: - j += 1 - while j < num_bytes and (bytez[j] & _MB_MASK) == _MB_START: - j += 1 - if n >= min_n and not (n == 1 and (i == 0 or j == num_bytes)): - ngram = bytes(bytez[i:j]) - ngrams.append(ngram) - n += 1 - return ngrams diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py deleted file mode 100644 index ba7e941f57..0000000000 --- a/gensim/models/base_any2vec.py +++ /dev/null @@ -1,1458 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Author: Shiva Manne -# Copyright (C) 2018 RaRe Technologies s.r.o. -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -r"""This module contains base classes required for implementing \*2vec algorithms. - -The class hierarchy is designed to facilitate adding more concrete implementations for creating embeddings. -In the most general case, the purpose of this class is to transform an arbitrary representation to a numerical vector -(embedding). This is represented by the base :class:`~gensim.models.base_any2vec.BaseAny2VecModel`. The input space in -most cases (in the NLP field at least) is plain text. 
For this reason, we enrich the class hierarchy with the abstract -:class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` to be used as a base for models where the input -space is text. - -Notes ------ -Even though this is the usual case, not all embeddings transform text, such as the -:class:`~gensim.models.poincare.PoincareModel` that embeds graphs. - -See Also --------- -:class:`~gensim.models.word2vec.Word2Vec`. - Word2Vec model - embeddings for words. -:class:`~gensim.models.fasttext.FastText`. - FastText model - embeddings for words (ngram-based). -:class:`~gensim.models.doc2vec.Doc2Vec`. - Doc2Vec model - embeddings for documents. -:class:`~gensim.models.poincare.PoincareModel` - Poincare model - embeddings for graphs. - -""" - -from gensim import utils -import logging -from timeit import default_timer -import threading -from six.moves import range -from six import itervalues, string_types -from gensim import matutils -from numpy import float32 as REAL, ones, random, dtype -from types import GeneratorType -from gensim.utils import deprecated -import os -import copy - - -try: - from queue import Queue -except ImportError: - from Queue import Queue - -logger = logging.getLogger(__name__) - - -class BaseAny2VecModel(utils.SaveLoad): - r"""Base class for training, using and evaluating \*2vec model. - - Contains implementation for multi-threaded training. The purpose of this class is to provide a - reference interface for concrete embedding implementations, whether the input space is a corpus - of words, documents or anything else. At the same time, functionality that we expect to be common - for those implementations is provided here to avoid code duplication. - - In the special but usual case where the input space consists of words, a more specialized layer - is provided, consider inheriting from :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` - - Notes - ----- - A subclass should initialize the following attributes: - - * self.kv - keyed vectors in model (see :class:`~gensim.models.keyedvectors.Word2VecKeyedVectors` as example) - * self.vocabulary - vocabulary (see :class:`~gensim.models.word2vec.Word2VecVocab` as example) - * self.trainables - internal matrices (see :class:`~gensim.models.word2vec.Word2VecTrainables` as example) - - """ - def __init__(self, workers=3, vector_size=100, epochs=5, callbacks=(), batch_words=10000): - """ - - Parameters - ---------- - workers : int, optional - Number of working threads, used for multithreading. - vector_size : int, optional - Dimensionality of the feature vectors. - epochs : int, optional - Number of iterations (epochs) of training through the corpus. - callbacks : list of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional - List of callbacks that need to be executed/run at specific stages during training. - batch_words : int, optional - Number of words to be processed by a single job. 
- - """ - self.vector_size = int(vector_size) - self.workers = int(workers) - self.epochs = epochs - self.train_count = 0 - self.total_train_time = 0 - self.batch_words = batch_words - self.model_trimmed_post_training = False - self.callbacks = callbacks - - def _get_job_params(self, cur_epoch): - """Get job parameters required for each batch.""" - raise NotImplementedError() - - def _set_train_params(self, **kwargs): - """Set model parameters required for training.""" - raise NotImplementedError() - - def _update_job_params(self, job_params, epoch_progress, cur_epoch): - """Get updated job parameters based on the epoch_progress and cur_epoch.""" - raise NotImplementedError() - - def _get_thread_working_mem(self): - """Get private working memory per thread.""" - raise NotImplementedError() - - def _raw_word_count(self, job): - """Get the number of words in a given job.""" - raise NotImplementedError() - - def _clear_post_train(self): - """Resets certain properties of the model post training. eg. `keyedvectors.vectors_norm`.""" - raise NotImplementedError() - - def _do_train_epoch(self, corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch, - total_examples=None, total_words=None, **kwargs): - raise NotImplementedError() - - def _do_train_job(self, data_iterable, job_parameters, thread_private_mem): - """Train a single batch. Return 2-tuple `(effective word count, total word count)`.""" - raise NotImplementedError() - - def _check_training_sanity(self, epochs=None, total_examples=None, total_words=None, **kwargs): - """Check that the training parameters provided make sense. e.g. raise error if `epochs` not provided.""" - raise NotImplementedError() - - def _check_input_data_sanity(self, data_iterable=None, corpus_file=None): - """Check that only one argument is None.""" - if not (data_iterable is None) ^ (corpus_file is None): - raise ValueError("You must provide only one of singlestream or corpus_file arguments.") - - def _worker_loop_corpusfile(self, corpus_file, thread_id, offset, cython_vocab, progress_queue, cur_epoch=0, - total_examples=None, total_words=None, **kwargs): - """Train the model on a `corpus_file` in LineSentence format. - - This function will be called in parallel by multiple workers (threads or processes) to make - optimal use of multicore machines. - - Parameters - ---------- - corpus_file : str - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. - thread_id : int - Thread index starting from 0 to `number of workers - 1`. - offset : int - Offset (in bytes) in the `corpus_file` for particular worker. - cython_vocab : :class:`~gensim.models.word2vec_inner.CythonVocab` - Copy of the vocabulary in order to access it without GIL. - progress_queue : Queue of (int, int, int) - A queue of progress reports. Each report is represented as a tuple of these 3 elements: - * Size of data chunk processed, for example number of sentences in the corpus chunk. - * Effective word count used in training (after ignoring unknown words and trimming the sentence length). - * Total word count used in training. - **kwargs : object - Additional key word parameters for the specific model inheriting from this class. 
- - """ - thread_private_mem = self._get_thread_working_mem() - - examples, tally, raw_tally = self._do_train_epoch( - corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch, - total_examples=total_examples, total_words=total_words, **kwargs) - - progress_queue.put((examples, tally, raw_tally)) - progress_queue.put(None) - - def _worker_loop(self, job_queue, progress_queue): - """Train the model, lifting batches of data from the queue. - - This function will be called in parallel by multiple workers (threads or processes) to make - optimal use of multicore machines. - - Parameters - ---------- - job_queue : Queue of (list of objects, (str, int)) - A queue of jobs still to be processed. The worker will take up jobs from this queue. - Each job is represented by a tuple where the first element is the corpus chunk to be processed and - the second is the dictionary of parameters. - progress_queue : Queue of (int, int, int) - A queue of progress reports. Each report is represented as a tuple of these 3 elements: - * Size of data chunk processed, for example number of sentences in the corpus chunk. - * Effective word count used in training (after ignoring unknown words and trimming the sentence length). - * Total word count used in training. - - """ - thread_private_mem = self._get_thread_working_mem() - jobs_processed = 0 - while True: - job = job_queue.get() - if job is None: - progress_queue.put(None) - break # no more jobs => quit this worker - data_iterable, job_parameters = job - - for callback in self.callbacks: - callback.on_batch_begin(self) - - tally, raw_tally = self._do_train_job(data_iterable, job_parameters, thread_private_mem) - - for callback in self.callbacks: - callback.on_batch_end(self) - - progress_queue.put((len(data_iterable), tally, raw_tally)) # report back progress - jobs_processed += 1 - logger.debug("worker exiting, processed %i jobs", jobs_processed) - - def _job_producer(self, data_iterator, job_queue, cur_epoch=0, total_examples=None, total_words=None): - """Fill the jobs queue using the data found in the input stream. - - Each job is represented by a tuple where the first element is the corpus chunk to be processed and - the second is a dictionary of parameters. - - Parameters - ---------- - data_iterator : iterable of list of objects - The input dataset. This will be split in chunks and these chunks will be pushed to the queue. - job_queue : Queue of (list of object, dict of (str, int)) - A queue of jobs still to be processed. The worker will take up jobs from this queue. - Each job is represented by a tuple where the first element is the corpus chunk to be processed and - the second is the dictionary of parameters. - cur_epoch : int, optional - The current training epoch, needed to compute the training parameters for each job. - For example in many implementations the learning rate would be dropping with the number of epochs. - total_examples : int, optional - Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences - in a corpus. Used to log progress. - total_words : int, optional - Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw - words in a corpus. Used to log progress. 
- - """ - job_batch, batch_size = [], 0 - pushed_words, pushed_examples = 0, 0 - next_job_params = self._get_job_params(cur_epoch) - job_no = 0 - - for data_idx, data in enumerate(data_iterator): - data_length = self._raw_word_count([data]) - - # can we fit this sentence into the existing job batch? - if batch_size + data_length <= self.batch_words: - # yes => add it to the current job - job_batch.append(data) - batch_size += data_length - else: - job_no += 1 - job_queue.put((job_batch, next_job_params)) - - # update the learning rate for the next job - if total_examples: - # examples-based decay - pushed_examples += len(job_batch) - epoch_progress = 1.0 * pushed_examples / total_examples - else: - # words-based decay - pushed_words += self._raw_word_count(job_batch) - epoch_progress = 1.0 * pushed_words / total_words - next_job_params = self._update_job_params(next_job_params, epoch_progress, cur_epoch) - - # add the sentence that didn't fit as the first item of a new job - job_batch, batch_size = [data], data_length - # add the last job too (may be significantly smaller than batch_words) - if job_batch: - job_no += 1 - job_queue.put((job_batch, next_job_params)) - - if job_no == 0 and self.train_count == 0: - logger.warning( - "train() called with an empty iterator (if not intended, " - "be sure to provide a corpus that offers restartable iteration = an iterable)." - ) - - # give the workers heads up that they can finish -- no more work! - for _ in range(self.workers): - job_queue.put(None) - logger.debug("job loop exiting, total %i jobs", job_no) - - def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples, - raw_word_count, total_words, trained_word_count, elapsed): - raise NotImplementedError() - - def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words, - trained_word_count, elapsed, is_corpus_file_mode): - raise NotImplementedError() - - def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_tally): - raise NotImplementedError() - - def _log_epoch_progress(self, progress_queue=None, job_queue=None, cur_epoch=0, total_examples=None, - total_words=None, report_delay=1.0, is_corpus_file_mode=None): - """Get the progress report for a single training epoch. - - Parameters - ---------- - progress_queue : Queue of (int, int, int) - A queue of progress reports. Each report is represented as a tuple of these 3 elements: - * size of data chunk processed, for example number of sentences in the corpus chunk. - * Effective word count used in training (after ignoring unknown words and trimming the sentence length). - * Total word count used in training. - job_queue : Queue of (list of object, dict of (str, int)) - A queue of jobs still to be processed. The worker will take up jobs from this queue. - Each job is represented by a tuple where the first element is the corpus chunk to be processed and - the second is the dictionary of parameters. - cur_epoch : int, optional - The current training epoch, needed to compute the training parameters for each job. - For example in many implementations the learning rate would be dropping with the number of epochs. - total_examples : int, optional - Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences - in a corpus. Used to log progress. - total_words : int, optional - Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw - words in a corpus. 
Used to log progress. - report_delay : float, optional - Number of seconds between two consecutive progress report messages in the logger. - is_corpus_file_mode : bool, optional - Whether training is file-based (corpus_file argument) or not. - - Returns - ------- - (int, int, int) - The epoch report consisting of three elements: - * size of data chunk processed, for example number of sentences in the corpus chunk. - * Effective word count used in training (after ignoring unknown words and trimming the sentence length). - * Total word count used in training. - - """ - example_count, trained_word_count, raw_word_count = 0, 0, 0 - start, next_report = default_timer() - 0.00001, 1.0 - job_tally = 0 - unfinished_worker_count = self.workers - - while unfinished_worker_count > 0: - report = progress_queue.get() # blocks if workers too slow - if report is None: # a thread reporting that it finished - unfinished_worker_count -= 1 - logger.info("worker thread finished; awaiting finish of %i more threads", unfinished_worker_count) - continue - examples, trained_words, raw_words = report - job_tally += 1 - - # update progress stats - example_count += examples - trained_word_count += trained_words # only words in vocab & sampled - raw_word_count += raw_words - - # log progress once every report_delay seconds - elapsed = default_timer() - start - if elapsed >= next_report: - self._log_progress( - job_queue, progress_queue, cur_epoch, example_count, total_examples, - raw_word_count, total_words, trained_word_count, elapsed) - next_report = elapsed + report_delay - # all done; report the final stats - elapsed = default_timer() - start - self._log_epoch_end( - cur_epoch, example_count, total_examples, raw_word_count, total_words, - trained_word_count, elapsed, is_corpus_file_mode) - self.total_train_time += elapsed - return trained_word_count, raw_word_count, job_tally - - def _train_epoch_corpusfile(self, corpus_file, cur_epoch=0, total_examples=None, total_words=None, **kwargs): - """Train the model for a single epoch. - - Parameters - ---------- - corpus_file : str - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. - cur_epoch : int, optional - The current training epoch, needed to compute the training parameters for each job. - For example in many implementations the learning rate would be dropping with the number of epochs. - total_examples : int, optional - Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences - in a corpus, used to log progress. - total_words : int - Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw - words in a corpus, used to log progress. Must be provided in order to seek in `corpus_file`. - **kwargs : object - Additional key word parameters for the specific model inheriting from this class. - - Returns - ------- - (int, int, int) - The training report for this epoch consisting of three elements: - * Size of data chunk processed, for example number of sentences in the corpus chunk. - * Effective word count used in training (after ignoring unknown words and trimming the sentence length). - * Total word count used in training. 
- - """ - if not total_words: - raise ValueError("total_words must be provided alongside corpus_file argument.") - - from gensim.models.word2vec_corpusfile import CythonVocab - from gensim.models.fasttext import FastText - cython_vocab = CythonVocab(self.wv, hs=self.hs, fasttext=isinstance(self, FastText)) - - progress_queue = Queue() - - corpus_file_size = os.path.getsize(corpus_file) - - thread_kwargs = copy.copy(kwargs) - thread_kwargs['cur_epoch'] = cur_epoch - thread_kwargs['total_examples'] = total_examples - thread_kwargs['total_words'] = total_words - workers = [ - threading.Thread( - target=self._worker_loop_corpusfile, - args=( - corpus_file, thread_id, corpus_file_size / self.workers * thread_id, cython_vocab, progress_queue - ), - kwargs=thread_kwargs - ) for thread_id in range(self.workers) - ] - - for thread in workers: - thread.daemon = True - thread.start() - - trained_word_count, raw_word_count, job_tally = self._log_epoch_progress( - progress_queue=progress_queue, job_queue=None, cur_epoch=cur_epoch, - total_examples=total_examples, total_words=total_words, is_corpus_file_mode=True) - - return trained_word_count, raw_word_count, job_tally - - def _train_epoch(self, data_iterable, cur_epoch=0, total_examples=None, total_words=None, - queue_factor=2, report_delay=1.0): - """Train the model for a single epoch. - - Parameters - ---------- - data_iterable : iterable of list of object - The input corpus. This will be split in chunks and these chunks will be pushed to the queue. - cur_epoch : int, optional - The current training epoch, needed to compute the training parameters for each job. - For example in many implementations the learning rate would be dropping with the number of epochs. - total_examples : int, optional - Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences - in a corpus, used to log progress. - total_words : int, optional - Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw - words in a corpus, used to log progress. - queue_factor : int, optional - Multiplier for size of queue -> size = number of workers * queue_factor. - report_delay : float, optional - Number of seconds between two consecutive progress report messages in the logger. - - Returns - ------- - (int, int, int) - The training report for this epoch consisting of three elements: - * Size of data chunk processed, for example number of sentences in the corpus chunk. - * Effective word count used in training (after ignoring unknown words and trimming the sentence length). - * Total word count used in training. 
- - """ - job_queue = Queue(maxsize=queue_factor * self.workers) - progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers) - - workers = [ - threading.Thread( - target=self._worker_loop, - args=(job_queue, progress_queue,)) - for _ in range(self.workers) - ] - - workers.append(threading.Thread( - target=self._job_producer, - args=(data_iterable, job_queue), - kwargs={'cur_epoch': cur_epoch, 'total_examples': total_examples, 'total_words': total_words})) - - for thread in workers: - thread.daemon = True # make interrupting the process with ctrl+c easier - thread.start() - - trained_word_count, raw_word_count, job_tally = self._log_epoch_progress( - progress_queue, job_queue, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words, - report_delay=report_delay, is_corpus_file_mode=False) - - return trained_word_count, raw_word_count, job_tally - - def train(self, data_iterable=None, corpus_file=None, epochs=None, total_examples=None, - total_words=None, queue_factor=2, report_delay=1.0, callbacks=(), **kwargs): - """Train the model for multiple epochs using multiple workers. - - Parameters - ---------- - data_iterable : iterable of list of object - The input corpus. This will be split in chunks and these chunks will be pushed to the queue. - corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. - If you use this argument instead of `data_iterable`, you must provide `total_words` argument as well. - epochs : int, optional - Number of epochs (training iterations over the whole input) of training. - total_examples : int, optional - Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences - in a corpus, used to log progress. - total_words : int, optional - Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw - words in a corpus, used to log progress. - queue_factor : int, optional - Multiplier for size of queue -> size = number of workers * queue_factor. - report_delay : float, optional - Number of seconds between two consecutive progress report messages in the logger. - callbacks : list of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional - List of callbacks to execute at specific stages during training. - **kwargs : object - Additional key word parameters for the specific model inheriting from this class. - - Returns - ------- - (int, int) - The total training report consisting of two elements: - * size of total data processed, for example number of sentences in the whole corpus. - * Effective word count used in training (after ignoring unknown words and trimming the sentence length). 
- - """ - self._set_train_params(**kwargs) - if callbacks: - self.callbacks = callbacks - self.epochs = epochs - self._check_training_sanity( - epochs=epochs, - total_examples=total_examples, - total_words=total_words, **kwargs) - - for callback in self.callbacks: - callback.on_train_begin(self) - - trained_word_count = 0 - raw_word_count = 0 - start = default_timer() - 0.00001 - job_tally = 0 - - for cur_epoch in range(self.epochs): - for callback in self.callbacks: - callback.on_epoch_begin(self) - - if data_iterable is not None: - trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch = self._train_epoch( - data_iterable, cur_epoch=cur_epoch, total_examples=total_examples, - total_words=total_words, queue_factor=queue_factor, report_delay=report_delay) - else: - trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch = self._train_epoch_corpusfile( - corpus_file, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words, **kwargs) - - trained_word_count += trained_word_count_epoch - raw_word_count += raw_word_count_epoch - job_tally += job_tally_epoch - - for callback in self.callbacks: - callback.on_epoch_end(self) - - # Log overall time - total_elapsed = default_timer() - start - self._log_train_end(raw_word_count, trained_word_count, total_elapsed, job_tally) - - self.train_count += 1 # number of times train() has been called - self._clear_post_train() - - for callback in self.callbacks: - callback.on_train_end(self) - return trained_word_count, raw_word_count - - @classmethod - def load(cls, fname_or_handle, **kwargs): - """Load a previously saved object (using :meth:`gensim.models.base_any2vec.BaseAny2VecModel.save`) from a file. - - Parameters - ---------- - fname_or_handle : {str, file-like object} - Path to file that contains needed object or handle to an open file. - **kwargs : object - Keyword arguments propagated to :meth:`~gensim.utils.SaveLoad.load`. - - See Also - -------- - :meth:`~gensim.models.base_any2vec.BaseAny2VecModel.save` - Method for save a model. - - Returns - ------- - object - Object loaded from `fname_or_handle`. - - Raises - ------ - IOError - When methods are called on an instance (should be called on a class, this is a class method). - - """ - return super(BaseAny2VecModel, cls).load(fname_or_handle, **kwargs) - - def save(self, fname_or_handle, **kwargs): - """"Save the object to file. - - Parameters - ---------- - fname_or_handle : {str, file-like object} - Path to file where the model will be persisted. - **kwargs : object - Key word arguments propagated to :meth:`~gensim.utils.SaveLoad.save`. - - See Also - -------- - :meth:`~gensim.models.base_any2vec.BaseAny2VecModel.load` - Method for load model after current method. - - """ - super(BaseAny2VecModel, self).save(fname_or_handle, **kwargs) - - -class BaseWordEmbeddingsModel(BaseAny2VecModel): - """Base class containing common methods for training, using & evaluating word embeddings learning models. - - See Also - -------- - :class:`~gensim.models.word2vec.Word2Vec`. - Word2Vec model - embeddings for words. - :class:`~gensim.models.fasttext.FastText`. - FastText model - embeddings for words (ngram-based). - :class:`~gensim.models.doc2vec.Doc2Vec`. - Doc2Vec model - embeddings for documents. - :class:`~gensim.models.poincare.PoincareModel` - Poincare model - embeddings for graphs. 
- - """ - def _clear_post_train(self): - raise NotImplementedError() - - def _do_train_job(self, data_iterable, job_parameters, thread_private_mem): - raise NotImplementedError() - - def _set_train_params(self, **kwargs): - raise NotImplementedError() - - def __init__(self, sentences=None, corpus_file=None, workers=3, vector_size=100, epochs=5, callbacks=(), - batch_words=10000, trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5, - ns_exponent=0.75, cbow_mean=1, min_alpha=0.0001, compute_loss=False, **kwargs): - """ - - Parameters - ---------- - sentences : iterable of list of str, optional - Can be simply a list of lists of tokens, but for larger corpora, - consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` for such examples. - corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. - You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or - `corpus_file` arguments need to be passed (or none of them, in that case, the model is left uninitialized). - workers : int, optional - Number of working threads, used for multiprocessing. - vector_size : int, optional - Dimensionality of the feature vectors. - epochs : int, optional - Number of iterations (epochs) of training through the corpus. - callbacks : list of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional - List of callbacks that need to be executed/run at specific stages during training. - batch_words : int, optional - Number of words to be processed by a single job. - trim_rule : function, optional - Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, - be trimmed away, or handled using the default (discard if word count < min_count). - Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), - or a callable that accepts parameters (word, count, min_count) and returns either - :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - The rule, if given, is only used to prune vocabulary during current method call and is not stored as part - of the model. - - The input parameters are of the following types: - * `word` (str) - the word we are examining - * `count` (int) - the word's frequency count in the corpus - * `min_count` (int) - the minimum count threshold. - - sg : {1, 0}, optional - Defines the training algorithm. If 1, skip-gram is used, otherwise, CBOW is employed. - alpha : float, optional - The beginning learning rate. This will linearly reduce with iterations until it reaches `min_alpha`. - window : int, optional - The maximum distance between the current and predicted word within a sentence. - seed : int, optional - Seed for the random number generator. Initial vectors for each word are seeded with a hash of - the concatenation of word + `str(seed)`. - Note that for a fully deterministically-reproducible run, you must also limit the model to a single worker - thread (`workers=1`), to eliminate ordering jitter from OS thread scheduling. - In Python 3, reproducibility between interpreter launches also requires use of the `PYTHONHASHSEED` - environment variable to control hash randomization. - hs : {1,0}, optional - If 1, hierarchical softmax will be used for model training. 
- If set to 0, and `negative` is non-zero, negative sampling will be used. - negative : int, optional - If > 0, negative sampling will be used, the int for negative specifies how many "noise words" - should be drawn (usually between 5-20). - If set to 0, no negative sampling is used. - cbow_mean : {1,0}, optional - If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used. - min_alpha : float, optional - Final learning rate. Drops linearly with the number of iterations from `alpha`. - compute_loss : bool, optional - If True, loss will be computed while training the Word2Vec model and stored in - :attr:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.running_training_loss` attribute. - **kwargs : object - Key word arguments needed to allow children classes to accept more arguments. - - """ - self.sg = int(sg) - if vector_size % 4 != 0: - logger.warning("consider setting layer size to a multiple of 4 for greater performance") - self.alpha = float(alpha) - self.window = int(window) - self.random = random.RandomState(seed) - self.min_alpha = float(min_alpha) - self.hs = int(hs) - self.negative = int(negative) - self.ns_exponent = ns_exponent - self.cbow_mean = int(cbow_mean) - self.compute_loss = bool(compute_loss) - self.running_training_loss = 0 - self.min_alpha_yet_reached = float(alpha) - self.corpus_count = 0 - self.corpus_total_words = 0 - - super(BaseWordEmbeddingsModel, self).__init__( - workers=workers, vector_size=vector_size, epochs=epochs, callbacks=callbacks, batch_words=batch_words) - - if sentences is not None or corpus_file is not None: - self._check_input_data_sanity(data_iterable=sentences, corpus_file=corpus_file) - if corpus_file is not None and not isinstance(corpus_file, string_types): - raise TypeError("You must pass string as the corpus_file argument.") - elif isinstance(sentences, GeneratorType): - raise TypeError("You can't pass a generator as the sentences argument. Try a sequence.") - - self.build_vocab(sentences=sentences, corpus_file=corpus_file, trim_rule=trim_rule) - self.train( - sentences=sentences, corpus_file=corpus_file, total_examples=self.corpus_count, - total_words=self.corpus_total_words, epochs=self.epochs, start_alpha=self.alpha, - end_alpha=self.min_alpha, compute_loss=compute_loss) - else: - if trim_rule is not None: - logger.warning( - "The rule, if given, is only used to prune vocabulary during build_vocab() " - "and is not stored as part of the model. Model initialized without sentences. 
" - "trim_rule provided, if any, will be ignored.") - - # for backward compatibility (aliases pointing to corresponding variables in trainables, vocabulary) - @property - @deprecated("Attribute will be removed in 4.0.0, use self.epochs instead") - def iter(self): - return self.epochs - - @iter.setter - @deprecated("Attribute will be removed in 4.0.0, use self.epochs instead") - def iter(self, value): - self.epochs = value - - @property - @deprecated("Attribute will be removed in 4.0.0, use self.trainables.syn1 instead") - def syn1(self): - return self.trainables.syn1 - - @syn1.setter - @deprecated("Attribute will be removed in 4.0.0, use self.trainables.syn1 instead") - def syn1(self, value): - self.trainables.syn1 = value - - @syn1.deleter - @deprecated("Attribute will be removed in 4.0.0, use self.trainables.syn1 instead") - def syn1(self): - del self.trainables.syn1 - - @property - @deprecated("Attribute will be removed in 4.0.0, use self.trainables.syn1neg instead") - def syn1neg(self): - return self.trainables.syn1neg - - @syn1neg.setter - @deprecated("Attribute will be removed in 4.0.0, use self.trainables.syn1neg instead") - def syn1neg(self, value): - self.trainables.syn1neg = value - - @syn1neg.deleter - @deprecated("Attribute will be removed in 4.0.0, use self.trainables.syn1neg instead") - def syn1neg(self): - del self.trainables.syn1neg - - @property - @deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_lockf instead") - def syn0_lockf(self): - return self.trainables.vectors_lockf - - @syn0_lockf.setter - @deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_lockf instead") - def syn0_lockf(self, value): - self.trainables.vectors_lockf = value - - @syn0_lockf.deleter - @deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_lockf instead") - def syn0_lockf(self): - del self.trainables.vectors_lockf - - @property - @deprecated("Attribute will be removed in 4.0.0, use self.trainables.layer1_size instead") - def layer1_size(self): - return self.trainables.layer1_size - - @layer1_size.setter - @deprecated("Attribute will be removed in 4.0.0, use self.trainables.layer1_size instead") - def layer1_size(self, value): - self.trainables.layer1_size = value - - @property - @deprecated("Attribute will be removed in 4.0.0, use self.trainables.hashfxn instead") - def hashfxn(self): - return self.trainables.hashfxn - - @hashfxn.setter - @deprecated("Attribute will be removed in 4.0.0, use self.trainables.hashfxn instead") - def hashfxn(self, value): - self.trainables.hashfxn = value - - @property - @deprecated("Attribute will be removed in 4.0.0, use self.vocabulary.sample instead") - def sample(self): - return self.vocabulary.sample - - @sample.setter - @deprecated("Attribute will be removed in 4.0.0, use self.vocabulary.sample instead") - def sample(self, value): - self.vocabulary.sample = value - - @property - @deprecated("Attribute will be removed in 4.0.0, use self.vocabulary.min_count instead") - def min_count(self): - return self.vocabulary.min_count - - @min_count.setter - @deprecated("Attribute will be removed in 4.0.0, use self.vocabulary.min_count instead") - def min_count(self, value): - self.vocabulary.min_count = value - - @property - @deprecated("Attribute will be removed in 4.0.0, use self.vocabulary.cum_table instead") - def cum_table(self): - return self.vocabulary.cum_table - - @cum_table.setter - @deprecated("Attribute will be removed in 4.0.0, use self.vocabulary.cum_table instead") - def 
cum_table(self, value): - self.vocabulary.cum_table = value - - @cum_table.deleter - @deprecated("Attribute will be removed in 4.0.0, use self.vocabulary.cum_table instead") - def cum_table(self): - del self.vocabulary.cum_table - - def __str__(self): - """Get a human readable representation of the object. - - Returns - ------- - str - A human readable string containing the class name, as well as the size of dictionary, number of - features and starting learning rate used by the object. - - """ - return "%s(vocab=%s, size=%s, alpha=%s)" % ( - self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha - ) - - def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_per=10000, - keep_raw_vocab=False, trim_rule=None, **kwargs): - """Build vocabulary from a sequence of sentences (can be a once-only generator stream). - - Parameters - ---------- - sentences : iterable of list of str - Can be simply a list of lists of tokens, but for larger corpora, - consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` module for such examples. - corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. - You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or - `corpus_file` arguments need to be passed (not both of them). - update : bool - If true, the new words in `sentences` will be added to model's vocab. - progress_per : int, optional - Indicates how many words to process before showing/updating the progress. - keep_raw_vocab : bool, optional - If False, the raw vocabulary will be deleted after the scaling is done to free up RAM. - trim_rule : function, optional - Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, - be trimmed away, or handled using the default (discard if word count < min_count). - Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), - or a callable that accepts parameters (word, count, min_count) and returns either - :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - The rule, if given, is only used to prune vocabulary during current method call and is not stored as part - of the model. - - The input parameters are of the following types: - * `word` (str) - the word we are examining - * `count` (int) - the word's frequency count in the corpus - * `min_count` (int) - the minimum count threshold. - - **kwargs : object - Key word arguments propagated to `self.vocabulary.prepare_vocab` - - """ - total_words, corpus_count = self.vocabulary.scan_vocab( - sentences=sentences, corpus_file=corpus_file, progress_per=progress_per, trim_rule=trim_rule) - self.corpus_count = corpus_count - self.corpus_total_words = total_words - report_values = self.vocabulary.prepare_vocab( - self.hs, self.negative, self.wv, update=update, keep_raw_vocab=keep_raw_vocab, - trim_rule=trim_rule, **kwargs) - report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words']) - self.trainables.prepare_weights(self.hs, self.negative, self.wv, update=update, vocabulary=self.vocabulary) - - def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False): - """Build vocabulary from a dictionary of word frequencies. 
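The `trim_rule` contract spelled out in the `build_vocab()` docstring above (a callable over `(word, count, min_count)` returning one of the `RULE_*` constants) is easiest to see in a small call site. A minimal sketch, assuming a toy in-memory corpus and a hypothetical rule that always keeps tokens starting with `brand_`:

.. sourcecode:: pycon

    >>> from gensim import utils
    >>> from gensim.models import Word2Vec
    >>>
    >>> def keep_brands(word, count, min_count):
    ...     """Hypothetical rule: always keep brand tokens, defer to min_count for the rest."""
    ...     if word.startswith('brand_'):
    ...         return utils.RULE_KEEP
    ...     return utils.RULE_DEFAULT
    >>>
    >>> sentences = [['hello', 'world', 'brand_42'], ['hello', 'gensim']]
    >>> model = Word2Vec(min_count=2)                         # uninitialized model, vocab built explicitly below
    >>> model.build_vocab(sentences, trim_rule=keep_brands)   # 'brand_42' survives despite a count of 1

As the docstring notes, the rule only affects this `build_vocab()` call and is not persisted with the model.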
- - Parameters - ---------- - word_freq : dict of (str, int) - A mapping from a word in the vocabulary to its frequency count. - keep_raw_vocab : bool, optional - If False, delete the raw vocabulary after the scaling is done to free up RAM. - corpus_count : int, optional - Even if no corpus is provided, this argument can set corpus_count explicitly. - trim_rule : function, optional - Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, - be trimmed away, or handled using the default (discard if word count < min_count). - Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), - or a callable that accepts parameters (word, count, min_count) and returns either - :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - The rule, if given, is only used to prune vocabulary during current method call and is not stored as part - of the model. - - The input parameters are of the following types: - * `word` (str) - the word we are examining - * `count` (int) - the word's frequency count in the corpus - * `min_count` (int) - the minimum count threshold. - - update : bool, optional - If true, the new provided words in `word_freq` dict will be added to model's vocab. - - """ - logger.info("Processing provided word frequencies") - # Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) - # to be directly the raw vocab - raw_vocab = word_freq - logger.info( - "collected %i different raw word, with total frequency of %i", - len(raw_vocab), sum(itervalues(raw_vocab)) - ) - - # Since no sentences are provided, this is to control the corpus_count. - self.corpus_count = corpus_count or 0 - self.vocabulary.raw_vocab = raw_vocab - - # trim by min_count & precalculate downsampling - report_values = self.vocabulary.prepare_vocab( - self.hs, self.negative, self.wv, keep_raw_vocab=keep_raw_vocab, - trim_rule=trim_rule, update=update) - report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words']) - self.trainables.prepare_weights( - self.hs, self.negative, self.wv, update=update, vocabulary=self.vocabulary) # build tables & arrays - - def estimate_memory(self, vocab_size=None, report=None): - """Estimate required memory for a model using current settings and provided vocabulary size. - - Parameters - ---------- - vocab_size : int, optional - Number of unique tokens in the vocabulary - report : dict of (str, int), optional - A dictionary from string representations of the model's memory consuming members to their size in bytes. - - Returns - ------- - dict of (str, int) - A dictionary from string representations of the model's memory consuming members to their size in bytes. 
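`build_vocab_from_freq()` above accepts precomputed counts instead of a corpus scan. A minimal sketch, assuming the frequency dict was produced elsewhere (the counts here are made up):

.. sourcecode:: pycon

    >>> from gensim.models import Word2Vec
    >>>
    >>> freqs = {'cat': 10, 'dog': 8, 'meow': 3, 'woof': 1}   # hypothetical precomputed counts
    >>> model = Word2Vec(min_count=2)
    >>> model.build_vocab_from_freq(freqs)                    # 'woof' is dropped by min_count; no corpus scan happens

Only the vocabulary is initialized here; training still needs an actual corpus (and explicit `total_examples`/`total_words`).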
- - """ - vocab_size = vocab_size or len(self.wv.vocab) - report = report or {} - report['vocab'] = vocab_size * (700 if self.hs else 500) - report['vectors'] = vocab_size * self.vector_size * dtype(REAL).itemsize - if self.hs: - report['syn1'] = vocab_size * self.trainables.layer1_size * dtype(REAL).itemsize - if self.negative: - report['syn1neg'] = vocab_size * self.trainables.layer1_size * dtype(REAL).itemsize - report['total'] = sum(report.values()) - logger.info( - "estimated required memory for %i words and %i dimensions: %i bytes", - vocab_size, self.vector_size, report['total'] - ) - return report - - def train(self, sentences=None, corpus_file=None, total_examples=None, total_words=None, - epochs=None, start_alpha=None, end_alpha=None, word_count=0, - queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=(), **kwargs): - """Train the model. If the hyper-parameters are passed, they override the ones set in the constructor. - - Parameters - ---------- - sentences : iterable of list of str - Can be simply a list of lists of tokens, but for larger corpora, - consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` module for such examples. - corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. - You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or - `corpus_file` arguments need to be passed (not both of them). - total_examples : int, optional - Count of sentences. - total_words : int, optional - Count of raw words in sentences. - epochs : int, optional - Number of iterations (epochs) over the corpus. - start_alpha : float, optional - Initial learning rate. - end_alpha : float, optional - Final learning rate. Drops linearly with the number of iterations from `start_alpha`. - word_count : int, optional - Count of words already trained. Leave this to 0 for the usual case of training on all words in sentences. - queue_factor : int, optional - Multiplier for size of queue -> size = number of workers * queue_factor. - report_delay : float, optional - Seconds to wait before reporting progress. - compute_loss : bool, optional - If True, loss will be computed while training the Word2Vec model and stored in - :attr:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.running_training_loss`. - callbacks : list of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional - List of callbacks that need to be executed/run at specific stages during training. - **kwargs : object - Additional key word parameters for the specific model inheriting from this class. - - Returns - ------- - (int, int) - Tuple of (effective word count after ignoring unknown words and sentence length trimming, total word count). - - """ - - self.alpha = start_alpha or self.alpha - self.min_alpha = end_alpha or self.min_alpha - self.compute_loss = compute_loss - self.running_training_loss = 0.0 - return super(BaseWordEmbeddingsModel, self).train( - data_iterable=sentences, corpus_file=corpus_file, total_examples=total_examples, - total_words=total_words, epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count, - queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss, callbacks=callbacks, - **kwargs) - - def _get_job_params(self, cur_epoch): - """Get the learning rate used in the current epoch. 
- - Parameters - ---------- - cur_epoch : int - Current iteration through the corpus - - Returns - ------- - float - The learning rate for this epoch (it is linearly reduced with epochs from `self.alpha` to `self.min_alpha`). - - """ - alpha = self.alpha - ((self.alpha - self.min_alpha) * float(cur_epoch) / self.epochs) - return alpha - - def _update_job_params(self, job_params, epoch_progress, cur_epoch): - """Get the correct learning rate for the next iteration. - - Parameters - ---------- - job_params : dict of (str, obj) - UNUSED. - epoch_progress : float - Ratio of finished work in the current epoch. - cur_epoch : int - Number of current iteration. - - Returns - ------- - float - The learning rate to be used in the next training epoch. - - """ - start_alpha = self.alpha - end_alpha = self.min_alpha - progress = (cur_epoch + epoch_progress) / self.epochs - next_alpha = start_alpha - (start_alpha - end_alpha) * progress - next_alpha = max(end_alpha, next_alpha) - self.min_alpha_yet_reached = next_alpha - return next_alpha - - def _get_thread_working_mem(self): - """Computes the memory used per worker thread. - - Returns - ------- - (np.ndarray, np.ndarray) - Each worker threads private work memory. - - """ - work = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL) # per-thread private work memory - neu1 = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL) - return work, neu1 - - def _raw_word_count(self, job): - """Get the number of words in a given job. - - Parameters - ---------- - job: iterable of list of str - The corpus chunk processed in a single batch. - - Returns - ------- - int - Number of raw words in the corpus chunk. - - """ - return sum(len(sentence) for sentence in job) - - def _check_training_sanity(self, epochs=None, total_examples=None, total_words=None, **kwargs): - """Checks whether the training parameters make sense. - - Called right before training starts in :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.train` - and raises warning or errors depending on the severity of the issue in case an inconsistent parameter - combination is detected. - - Parameters - ---------- - epochs : int, optional - Number of training epochs. Must have a (non None) value. - total_examples : int, optional - Number of documents in the corpus. Either `total_examples` or `total_words` **must** be supplied. - total_words : int, optional - Number of words in the corpus. Either `total_examples` or `total_words` **must** be supplied. - **kwargs : object - Unused. Present to preserve signature among base and inherited implementations. - - Raises - ------ - RuntimeError - If one of the required training pre/post processing steps have not been performed. - ValueError - If the combination of input parameters is inconsistent. - - """ - if self.alpha > self.min_alpha_yet_reached: - logger.warning("Effective 'alpha' higher than previous training cycles") - if self.model_trimmed_post_training: - raise RuntimeError("Parameters for training were discarded using model_trimmed_post_training method") - - if not self.wv.vocab: # should be set by `build_vocab` - raise RuntimeError("you must first build vocabulary before training the model") - if not len(self.wv.vectors): - raise RuntimeError("you must initialize vectors before training the model") - - if not hasattr(self, 'corpus_count'): - raise ValueError( - "The number of examples in the training corpus is missing. " - "Please make sure this is set inside `build_vocab` function." 
- "Call the `build_vocab` function before calling `train`." - ) - - if total_words is None and total_examples is None: - raise ValueError( - "You must specify either total_examples or total_words, for proper job parameters updation" - "and progress calculations. " - "The usual value is total_examples=model.corpus_count." - ) - if epochs is None: - raise ValueError("You must specify an explict epochs count. The usual value is epochs=model.epochs.") - logger.info( - "training model with %i workers on %i vocabulary and %i features, " - "using sg=%s hs=%s sample=%s negative=%s window=%s", - self.workers, len(self.wv.vocab), self.trainables.layer1_size, self.sg, - self.hs, self.vocabulary.sample, self.negative, self.window - ) - - @classmethod - def load(cls, *args, **kwargs): - """Load a previously saved object (using :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.save`) from file. - - Also initializes extra instance attributes in case the loaded model does not include them. - `*args` or `**kwargs` **MUST** include the fname argument (path to saved file). - See :meth:`~gensim.utils.SaveLoad.load`. - - Parameters - ---------- - *args : object - Positional arguments passed to :meth:`~gensim.utils.SaveLoad.load`. - **kwargs : object - Key word arguments passed to :meth:`~gensim.utils.SaveLoad.load`. - - See Also - -------- - :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.save` - Method for save a model. - - Returns - ------- - :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` - Model loaded from disk. - - Raises - ------ - IOError - When methods are called on instance (should be called from class). - - """ - model = super(BaseWordEmbeddingsModel, cls).load(*args, **kwargs) - if not hasattr(model, 'ns_exponent'): - model.ns_exponent = 0.75 - if not hasattr(model.vocabulary, 'ns_exponent'): - model.vocabulary.ns_exponent = 0.75 - if model.negative and hasattr(model.wv, 'index2word'): - model.vocabulary.make_cum_table(model.wv) # rebuild cum_table from vocabulary - if not hasattr(model, 'corpus_count'): - model.corpus_count = None - if not hasattr(model, 'corpus_total_words'): - model.corpus_total_words = None - if not hasattr(model.trainables, 'vectors_lockf') and hasattr(model.wv, 'vectors'): - model.trainables.vectors_lockf = ones(len(model.wv.vectors), dtype=REAL) - if not hasattr(model, 'random'): - model.random = random.RandomState(model.trainables.seed) - if not hasattr(model, 'train_count'): - model.train_count = 0 - model.total_train_time = 0 - return model - - def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples, - raw_word_count, total_words, trained_word_count, elapsed): - """Callback used to log progress for long running jobs. - - Parameters - ---------- - job_queue : Queue of (list of object, dict of (str, float)) - The queue of jobs still to be performed by workers. Each job is represented as a tuple containing - the batch of data to be processed and the parameters to be used for the processing as a dict. - progress_queue : Queue of (int, int, int) - A queue of progress reports. Each report is represented as a tuple of these 3 elements: - * size of data chunk processed, for example number of sentences in the corpus chunk. - * Effective word count used in training (after ignoring unknown words and trimming the sentence length). - * Total word count used in training. - cur_epoch : int - The current training iteration through the corpus. 
- example_count : int - Number of examples (could be sentences for example) processed until now. - total_examples : int - Number of all examples present in the input corpus. - raw_word_count : int - Number of words used in training until now. - total_words : int - Number of all words in the input corpus. - trained_word_count : int - Number of effective words used in training until now (after ignoring unknown words and trimming - the sentence length). - elapsed : int - Elapsed time since the beginning of training in seconds. - - Notes - ----- - If you train the model via `corpus_file` argument, there is no job_queue, so reported job_queue size will - always be equal to -1. - - """ - if total_examples: - # examples-based progress % - logger.info( - "EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i", - cur_epoch + 1, 100.0 * example_count / total_examples, trained_word_count / elapsed, - -1 if job_queue is None else utils.qsize(job_queue), utils.qsize(progress_queue) - ) - else: - # words-based progress % - logger.info( - "EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i", - cur_epoch + 1, 100.0 * raw_word_count / total_words, trained_word_count / elapsed, - -1 if job_queue is None else utils.qsize(job_queue), utils.qsize(progress_queue) - ) - - def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words, - trained_word_count, elapsed, is_corpus_file_mode): - """Callback used to log the end of a training epoch. - - Parameters - ---------- - cur_epoch : int - The current training iteration through the corpus. - example_count : int - Number of examples (could be sentences for example) processed until now. - total_examples : int - Number of all examples present in the input corpus. - raw_word_count : int - Number of words used in training until now. - total_words : int - Number of all words in the input corpus. - trained_word_count : int - Number of effective words used in training until now (after ignoring unknown words and trimming - the sentence length). - elapsed : int - Elapsed time since the beginning of training in seconds. - is_corpus_file_mode : bool - Whether training is file-based (corpus_file argument) or not. - - Warnings - -------- - In case the corpus is changed while the epoch was running. - - """ - logger.info( - "EPOCH - %i : training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s", - cur_epoch + 1, raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed - ) - - # don't warn if training in file-based mode, because it's expected behavior - if is_corpus_file_mode: - return - - # check that the input corpus hasn't changed during iteration - if total_examples and total_examples != example_count: - logger.warning( - "EPOCH - %i : supplied example count (%i) did not equal expected count (%i)", cur_epoch + 1, - example_count, total_examples - ) - if total_words and total_words != raw_word_count: - logger.warning( - "EPOCH - %i : supplied raw word count (%i) did not equal expected count (%i)", cur_epoch + 1, - raw_word_count, total_words - ) - - def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_tally): - """Callback to log the end of training. - - Parameters - ---------- - raw_word_count : int - Number of words used in the whole training. - trained_word_count : int - Number of effective words used in training (after ignoring unknown words and trimming the sentence length). 
- total_elapsed : int - Total time spent during training in seconds. - job_tally : int - Total number of jobs processed during training. - - """ - logger.info( - "training on a %i raw words (%i effective words) took %.1fs, %.0f effective words/s", - raw_word_count, trained_word_count, total_elapsed, trained_word_count / total_elapsed - ) - if job_tally < 10 * self.workers: - logger.warning( - "under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay" - ) - - # for backward compatibility - @deprecated("Method will be removed in 4.0.0, use self.wv.most_similar() instead") - def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None): - """Deprecated, use self.wv.most_similar() instead. - - Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.most_similar`. - - """ - return self.wv.most_similar(positive, negative, topn, restrict_vocab, indexer) - - @deprecated("Method will be removed in 4.0.0, use self.wv.wmdistance() instead") - def wmdistance(self, document1, document2): - """Deprecated, use self.wv.wmdistance() instead. - - Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.wmdistance`. - - """ - return self.wv.wmdistance(document1, document2) - - @deprecated("Method will be removed in 4.0.0, use self.wv.most_similar_cosmul() instead") - def most_similar_cosmul(self, positive=None, negative=None, topn=10): - """Deprecated, use self.wv.most_similar_cosmul() instead. - - Refer to the documentation for - :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.most_similar_cosmul`. - - """ - return self.wv.most_similar_cosmul(positive, negative, topn) - - @deprecated("Method will be removed in 4.0.0, use self.wv.similar_by_word() instead") - def similar_by_word(self, word, topn=10, restrict_vocab=None): - """Deprecated, use self.wv.similar_by_word() instead. - - Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similar_by_word`. - - """ - return self.wv.similar_by_word(word, topn, restrict_vocab) - - @deprecated("Method will be removed in 4.0.0, use self.wv.similar_by_vector() instead") - def similar_by_vector(self, vector, topn=10, restrict_vocab=None): - """Deprecated, use self.wv.similar_by_vector() instead. - - Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similar_by_vector`. - - """ - return self.wv.similar_by_vector(vector, topn, restrict_vocab) - - @deprecated("Method will be removed in 4.0.0, use self.wv.doesnt_match() instead") - def doesnt_match(self, words): - """Deprecated, use self.wv.doesnt_match() instead. - - Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.doesnt_match`. - - """ - return self.wv.doesnt_match(words) - - @deprecated("Method will be removed in 4.0.0, use self.wv.similarity() instead") - def similarity(self, w1, w2): - """Deprecated, use self.wv.similarity() instead. - - Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity`. - - """ - return self.wv.similarity(w1, w2) - - @deprecated("Method will be removed in 4.0.0, use self.wv.n_similarity() instead") - def n_similarity(self, ws1, ws2): - """Deprecated, use self.wv.n_similarity() instead. - - Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.n_similarity`. 
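All of the deprecated pass-through query methods above simply forward to the model's `wv` attribute, which is the object to call directly. A minimal sketch on a toy corpus (results are meaningless at this size; the point is the call path):

.. sourcecode:: pycon

    >>> from gensim.models import Word2Vec
    >>>
    >>> sentences = [['cat', 'say', 'meow'], ['dog', 'say', 'woof']]
    >>> model = Word2Vec(sentences, min_count=1)
    >>> neighbours = model.wv.most_similar('cat', topn=2)         # instead of model.most_similar(...)
    >>> score = model.wv.similarity('cat', 'dog')                 # instead of model.similarity(...)
    >>> odd_one = model.wv.doesnt_match(['cat', 'dog', 'say'])    # instead of model.doesnt_match(...)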
- - """ - return self.wv.n_similarity(ws1, ws2) - - @deprecated("Method will be removed in 4.0.0, use self.wv.evaluate_word_pairs() instead") - def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, - case_insensitive=True, dummy4unknown=False): - """Deprecated, use self.wv.evaluate_word_pairs() instead. - - Refer to the documentation for - :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.evaluate_word_pairs`. - - """ - return self.wv.evaluate_word_pairs(pairs, delimiter, restrict_vocab, case_insensitive, dummy4unknown) diff --git a/gensim/models/callbacks.py b/gensim/models/callbacks.py index dd77348c8f..cefdd33091 100644 --- a/gensim/models/callbacks.py +++ b/gensim/models/callbacks.py @@ -569,7 +569,7 @@ def on_epoch_end(self, epoch, topics=None): class CallbackAny2Vec(object): - """Base class to build callbacks for :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`. + """Base class to build callbacks for :class:`~gensim.models.word2vec.Word2Vec` & subclasses. Callbacks are used to apply custom functions over the model at specific points during training (epoch start, batch end etc.). This is a base class and its purpose is to be inherited by @@ -584,7 +584,7 @@ def on_epoch_begin(self, model): Parameters ---------- - model : :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` + model : :class:`~gensim.models.word2vec.Word2Vec` or subclass Current model. """ @@ -595,7 +595,7 @@ def on_epoch_end(self, model): Parameters ---------- - model : :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` + model : :class:`~gensim.models.word2vec.Word2Vec` or subclass Current model. """ @@ -606,7 +606,7 @@ def on_batch_begin(self, model): Parameters ---------- - model : :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` + model : :class:`~gensim.models.word2vec.Word2Vec` or subclass Current model. """ @@ -617,7 +617,7 @@ def on_batch_end(self, model): Parameters ---------- - model : :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` + model : :class:`~gensim.models.word2vec.Word2Vec` or subclass Current model. """ @@ -628,7 +628,7 @@ def on_train_begin(self, model): Parameters ---------- - model : :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` + model : :class:`~gensim.models.word2vec.Word2Vec` or subclass Current model. """ @@ -639,7 +639,7 @@ def on_train_end(self, model): Parameters ---------- - model : :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` + model : :class:`~gensim.models.word2vec.Word2Vec` or subclass Current model. """ diff --git a/gensim/models/deprecated/__init__.py b/gensim/models/deprecated/__init__.py deleted file mode 100644 index cfa71654f5..0000000000 --- a/gensim/models/deprecated/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""This package contains some deprecated implementations of algorithm, will be removed soon.""" diff --git a/gensim/models/deprecated/doc2vec.py b/gensim/models/deprecated/doc2vec.py deleted file mode 100644 index 9378b77d88..0000000000 --- a/gensim/models/deprecated/doc2vec.py +++ /dev/null @@ -1,1042 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2013 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - - -""" -Warnings --------- -.. deprecated:: 3.3.0 - Use :mod:`gensim.models.doc2vec` instead. - - - -Deep learning via the distributed memory and distributed bag of words models from -[1]_, using either hierarchical softmax or negative sampling [2]_ [3]_. 
See [#tutorial]_ - -**Make sure you have a C compiler before installing gensim, to use optimized (compiled) -doc2vec training** (70x speedup [blog]_). - -Initialize a model with e.g.:: - -.. sourcecode:: pycon - - >>> model = Doc2Vec(documents, size=100, window=8, min_count=5, workers=4) - -Persist a model to disk with:: - -.. sourcecode:: pycon - - >>> model.save(fname) - >>> model = Doc2Vec.load(fname) # you can continue training with the loaded model! - -If you're finished training a model (=no more updates, only querying), you can do - -.. sourcecode:: pycon - - >>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True): - -to trim unneeded model memory = use (much) less RAM. - - - -.. [1] Quoc Le and Tomas Mikolov. Distributed Representations of Sentences and Documents. - http://arxiv.org/pdf/1405.4053v2.pdf -.. [2] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. - Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013. -.. [3] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. - Distributed Representations of Words and Phrases and their Compositionality. In Proceedings of NIPS, 2013. -.. [blog] Optimizing word2vec in gensim, http://radimrehurek.com/2013/09/word2vec-in-python-part-two-optimizing/ - -.. [#tutorial] Doc2vec in gensim tutorial, - https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb - - - -""" - -import logging -import os - -try: - from queue import Queue -except ImportError: - from Queue import Queue # noqa:F401 - -from collections import namedtuple, defaultdict -from timeit import default_timer - -from numpy import zeros, sum as np_sum, add as np_add, concatenate, \ - repeat as np_repeat, array, float32 as REAL, empty, ones, memmap as np_memmap, \ - sqrt, newaxis, ndarray, dot, vstack, dtype, divide as np_divide, integer - -from gensim import utils -from gensim.utils import call_on_class_only, deprecated -from gensim.models.deprecated.word2vec import Word2Vec, train_cbow_pair, train_sg_pair, train_batch_sg,\ - MAX_WORDS_IN_BATCH -from gensim.models.deprecated.keyedvectors import KeyedVectors -from gensim.models.doc2vec import Doc2Vec as NewDoc2Vec -from gensim.models.deprecated.old_saveload import SaveLoad - -from gensim import matutils # utility fnc for pickling, common scipy operations etc -from six.moves import zip, range -from six import string_types, integer_types - -logger = logging.getLogger(__name__) - - -def load_old_doc2vec(*args, **kwargs): - old_model = Doc2Vec.load(*args, **kwargs) - params = { - 'dm_mean': old_model.__dict__.get('dm_mean', None), - 'dm': old_model.dm, - 'dbow_words': old_model.dbow_words, - 'dm_concat': old_model.dm_concat, - 'dm_tag_count': old_model.dm_tag_count, - 'docvecs_mapfile': old_model.__dict__.get('docvecs_mapfile', None), - 'comment': old_model.__dict__.get('comment', None), - 'vector_size': old_model.vector_size, - 'alpha': old_model.alpha, - 'window': old_model.window, - 'min_count': old_model.min_count, - 'max_vocab_size': old_model.__dict__.get('max_vocab_size', None), - 'sample': old_model.sample, - 'seed': old_model.seed, - 'workers': old_model.workers, - 'min_alpha': old_model.min_alpha, - 'hs': old_model.hs, - 'negative': old_model.negative, - 'cbow_mean': old_model.cbow_mean, - 'hashfxn': old_model.hashfxn, - 'epochs': old_model.iter, - 'sorted_vocab': old_model.__dict__.get('sorted_vocab', 1), - 'batch_words': old_model.__dict__.get('batch_words', MAX_WORDS_IN_BATCH), - 
'compute_loss': old_model.__dict__.get('compute_loss', None) - } - new_model = NewDoc2Vec(**params) - # set word2vec trainables attributes - new_model.wv.vectors = old_model.wv.syn0 - if hasattr(old_model.wv, 'syn0norm'): - new_model.docvecs.vectors_norm = old_model.wv.syn0norm - if hasattr(old_model, 'syn1'): - new_model.trainables.syn1 = old_model.syn1 - if hasattr(old_model, 'syn1neg'): - new_model.trainables.syn1neg = old_model.syn1neg - if hasattr(old_model, 'syn0_lockf'): - new_model.trainables.vectors_lockf = old_model.syn0_lockf - - # set doc2vec trainables attributes - new_model.docvecs.vectors_docs = old_model.docvecs.doctag_syn0 - if hasattr(old_model.docvecs, 'doctag_syn0norm'): - new_model.docvecs.vectors_docs_norm = old_model.docvecs.doctag_syn0norm - if hasattr(old_model.docvecs, 'doctag_syn0_lockf'): - new_model.trainables.vectors_docs_lockf = old_model.docvecs.doctag_syn0_lockf - if hasattr(old_model.docvecs, 'mapfile_path'): - new_model.docvecs.mapfile_path = old_model.docvecs.mapfile_path - - # set word2vec vocabulary attributes - new_model.wv.vocab = old_model.wv.vocab - new_model.wv.index2word = old_model.wv.index2word - new_model.vocabulary.cum_table = old_model.cum_table - - # set doc2vec vocabulary attributes - new_model.docvecs.doctags = old_model.docvecs.doctags - new_model.docvecs.count = old_model.docvecs.count - if hasattr(old_model.docvecs, 'max_rawint'): # `doc2vec` models before `0.12.3` do not have these 2 attributes - new_model.docvecs.max_rawint = old_model.docvecs.__dict__.get('max_rawint') - new_model.docvecs.offset2doctag = old_model.docvecs.__dict__.get('offset2doctag') - else: - # Doc2Vec models before Gensim version 0.12.3 did not have `max_rawint` and `offset2doctag` as they did not - # mixing of string and int tags. This implies the new attribute `offset2doctag` equals the old `index2doctag` - # (which was only filled if the documents had string tags). - # This also implies that the new attribute, `max_rawint`(highest rawint-indexed doctag) would either be equal - # to the initial value -1, in case only string tags are used or would be equal to `count` if only int indexing - # was used. - new_model.docvecs.max_rawint = -1 if old_model.docvecs.index2doctag else old_model.docvecs.count - 1 - new_model.docvecs.offset2doctag = old_model.docvecs.index2doctag - - new_model.train_count = old_model.__dict__.get('train_count', None) - new_model.corpus_count = old_model.__dict__.get('corpus_count', None) - new_model.corpus_total_words = old_model.__dict__.get('corpus_total_words', None) - new_model.running_training_loss = old_model.__dict__.get('running_training_loss', 0) - new_model.total_train_time = old_model.__dict__.get('total_train_time', None) - new_model.min_alpha_yet_reached = old_model.__dict__.get('min_alpha_yet_reached', old_model.alpha) - new_model.model_trimmed_post_training = old_model.__dict__.get('model_trimmed_post_training', None) - - return new_model - - -def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, - train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True, - word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): - """ - Update distributed bag of words model ("PV-DBOW") by training on a single document. - - Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. 
- - The document is provided as `doc_words`, a list of word tokens which are looked up - in the model's vocab dictionary, and `doctag_indexes`, which provide indexes - into the doctag_vectors array. - - If `train_words` is True, simultaneously train word-to-word (not just doc-to-word) - examples, exactly as per Word2Vec skip-gram training. (Without this option, - word vectors are neither consulted nor updated during DBOW doc vector training.) - - Any of `learn_doctags', `learn_words`, and `learn_hidden` may be set False to - prevent learning-updates to those respective model weights, as if using the - (partially-)frozen model to infer other compatible vectors. - - This is the non-optimized, Python version. If you have cython installed, gensim - will use the optimized version from doc2vec_inner instead. - - """ - if doctag_vectors is None: - doctag_vectors = model.docvecs.doctag_syn0 - if doctag_locks is None: - doctag_locks = model.docvecs.doctag_syn0_lockf - - if train_words and learn_words: - train_batch_sg(model, [doc_words], alpha, work) - for doctag_index in doctag_indexes: - for word in doc_words: - train_sg_pair( - model, word, doctag_index, alpha, learn_vectors=learn_doctags, learn_hidden=learn_hidden, - context_vectors=doctag_vectors, context_locks=doctag_locks - ) - - return len(doc_words) - - -def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, - learn_doctags=True, learn_words=True, learn_hidden=True, - word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): - """ - Update distributed memory model ("PV-DM") by training on a single document. - - Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. This - method implements the DM model with a projection (input) layer that is - either the sum or mean of the context vectors, depending on the model's - `dm_mean` configuration field. See `train_document_dm_concat()` for the DM - model with a concatenated input layer. - - The document is provided as `doc_words`, a list of word tokens which are looked up - in the model's vocab dictionary, and `doctag_indexes`, which provide indexes - into the doctag_vectors array. - - Any of `learn_doctags', `learn_words`, and `learn_hidden` may be set False to - prevent learning-updates to those respective model weights, as if using the - (partially-)frozen model to infer other compatible vectors. - - This is the non-optimized, Python version. If you have a C compiler, gensim - will use the optimized version from doc2vec_inner instead. 
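The PV-DBOW and PV-DM routines above (and the concatenative variant that follows) correspond to training modes that remain selectable through the public :class:`~gensim.models.doc2vec.Doc2Vec` constructor. A minimal sketch of picking each mode on a toy corpus (vector sizes and epoch counts are illustrative only):

.. sourcecode:: pycon

    >>> from gensim.models.doc2vec import Doc2Vec, TaggedDocument
    >>>
    >>> docs = [TaggedDocument(words=['cat', 'say', 'meow'], tags=[0]),
    ...         TaggedDocument(words=['dog', 'say', 'woof'], tags=[1])]
    >>> dbow = Doc2Vec(docs, dm=0, dbow_words=1, vector_size=32, min_count=1, epochs=5)      # PV-DBOW, plus word training
    >>> dm_mean = Doc2Vec(docs, dm=1, dm_mean=1, vector_size=32, min_count=1, epochs=5)      # PV-DM, averaged context
    >>> dm_concat = Doc2Vec(docs, dm=1, dm_concat=1, vector_size=32, min_count=1, epochs=5)  # PV-DM, concatenated context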
- - """ - if word_vectors is None: - word_vectors = model.wv.syn0 - if word_locks is None: - word_locks = model.syn0_lockf - if doctag_vectors is None: - doctag_vectors = model.docvecs.doctag_syn0 - if doctag_locks is None: - doctag_locks = model.docvecs.doctag_syn0_lockf - - word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab - and model.wv.vocab[w].sample_int > model.random.rand() * 2**32] - - for pos, word in enumerate(word_vocabs): - reduced_window = model.random.randint(model.window) # `b` in the original doc2vec code - start = max(0, pos - model.window + reduced_window) - window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start) - word2_indexes = [word2.index for pos2, word2 in window_pos if pos2 != pos] - l1 = np_sum(word_vectors[word2_indexes], axis=0) + np_sum(doctag_vectors[doctag_indexes], axis=0) - count = len(word2_indexes) + len(doctag_indexes) - if model.cbow_mean and count > 1: - l1 /= count - neu1e = train_cbow_pair(model, word, word2_indexes, l1, alpha, - learn_vectors=False, learn_hidden=learn_hidden) - if not model.cbow_mean and count > 1: - neu1e /= count - if learn_doctags: - for i in doctag_indexes: - doctag_vectors[i] += neu1e * doctag_locks[i] - if learn_words: - for i in word2_indexes: - word_vectors[i] += neu1e * word_locks[i] - - return len(word_vocabs) - - -def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, learn_doctags=True, - learn_words=True, learn_hidden=True, word_vectors=None, word_locks=None, - doctag_vectors=None, doctag_locks=None): - """ - Update distributed memory model ("PV-DM") by training on a single document, using a - concatenation of the context window word vectors (rather than a sum or average). - - Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. - - The document is provided as `doc_words`, a list of word tokens which are looked up - in the model's vocab dictionary, and `doctag_indexes`, which provide indexes - into the doctag_vectors array. - - Any of `learn_doctags', `learn_words`, and `learn_hidden` may be set False to - prevent learning-updates to those respective model weights, as if using the - (partially-)frozen model to infer other compatible vectors. - - This is the non-optimized, Python version. If you have a C compiler, gensim - will use the optimized version from doc2vec_inner instead. - - """ - if word_vectors is None: - word_vectors = model.wv.syn0 - if word_locks is None: - word_locks = model.syn0_lockf - if doctag_vectors is None: - doctag_vectors = model.docvecs.doctag_syn0 - if doctag_locks is None: - doctag_locks = model.docvecs.doctag_syn0_lockf - - word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab - and model.wv.vocab[w].sample_int > model.random.rand() * 2**32] - doctag_len = len(doctag_indexes) - if doctag_len != model.dm_tag_count: - return 0 # skip doc without expected number of doctag(s) (TODO: warn/pad?) 
- - null_word = model.wv.vocab['\0'] - pre_pad_count = model.window - post_pad_count = model.window - padded_document_indexes = ( - (pre_pad_count * [null_word.index]) # pre-padding - + [word.index for word in word_vocabs if word is not None] # elide out-of-Vocabulary words - + (post_pad_count * [null_word.index]) # post-padding - ) - - for pos in range(pre_pad_count, len(padded_document_indexes) - post_pad_count): - word_context_indexes = ( - padded_document_indexes[(pos - pre_pad_count): pos] # preceding words - + padded_document_indexes[(pos + 1):(pos + 1 + post_pad_count)] # following words - ) - predict_word = model.wv.vocab[model.wv.index2word[padded_document_indexes[pos]]] - # numpy advanced-indexing copies; concatenate, flatten to 1d - l1 = concatenate((doctag_vectors[doctag_indexes], word_vectors[word_context_indexes])).ravel() - neu1e = train_cbow_pair(model, predict_word, None, l1, alpha, - learn_hidden=learn_hidden, learn_vectors=False) - - # filter by locks and shape for addition to source vectors - e_locks = concatenate((doctag_locks[doctag_indexes], word_locks[word_context_indexes])) - neu1e_r = (neu1e.reshape(-1, model.vector_size) - * np_repeat(e_locks, model.vector_size).reshape(-1, model.vector_size)) - - if learn_doctags: - np_add.at(doctag_vectors, doctag_indexes, neu1e_r[:doctag_len]) - if learn_words: - np_add.at(word_vectors, word_context_indexes, neu1e_r[doctag_len:]) - - return len(padded_document_indexes) - pre_pad_count - post_pad_count - - -class TaggedDocument(namedtuple('TaggedDocument', 'words tags')): - """ - A single document, made up of `words` (a list of unicode string tokens) - and `tags` (a list of tokens). Tags may be one or more unicode string - tokens, but typical practice (which will also be most memory-efficient) is - for the tags list to include a unique integer id as the only tag. - - Replaces "sentence as a list of words" from Word2Vec. - - """ - - def __str__(self): - return '%s(%s, %s)' % (self.__class__.__name__, self.words, self.tags) - - -# for compatibility -@deprecated("Class will be removed in 4.0.0, use TaggedDocument instead") -class LabeledSentence(TaggedDocument): - pass - - -class DocvecsArray(SaveLoad): - """ - Default storage of doc vectors during/after training, in a numpy array. - - As the 'docvecs' property of a Doc2Vec model, allows access and - comparison of document vectors. - - .. sourcecode:: pycon - - >>> docvec = d2v_model.docvecs[99] - >>> docvec = d2v_model.docvecs['SENT_99'] # if string tag used in training - >>> sims = d2v_model.docvecs.most_similar(99) - >>> sims = d2v_model.docvecs.most_similar('SENT_99') - >>> sims = d2v_model.docvecs.most_similar(docvec) - - If only plain int tags are presented during training, the dict (of - string tag -> index) and list (of index -> string tag) stay empty, - saving memory. - - Supplying a mapfile_path (as by initializing a Doc2Vec model with a - 'docvecs_mapfile' value) will use a pair of memory-mapped - files as the array backing for doctag_syn0/doctag_syn0_lockf values. - - The Doc2Vec model automatically uses this class, but a future alternative - implementation, based on another persistence mechanism like LMDB, LevelDB, - or SQLite, should also be possible. 
- """ - - def __init__(self, mapfile_path=None): - self.doctags = {} # string -> Doctag (only filled if necessary) - self.max_rawint = -1 # highest rawint-indexed doctag - self.offset2doctag = [] # int offset-past-(max_rawint+1) -> String (only filled if necessary) - self.count = 0 - self.mapfile_path = mapfile_path - - def note_doctag(self, key, document_no, document_length): - """Note a document tag during initial corpus scan, for structure sizing.""" - if isinstance(key, integer_types + (integer,)): - self.max_rawint = max(self.max_rawint, key) - else: - if key in self.doctags: - self.doctags[key] = self.doctags[key].repeat(document_length) - else: - self.doctags[key] = Doctag(len(self.offset2doctag), document_length, 1) - self.offset2doctag.append(key) - self.count = self.max_rawint + 1 + len(self.offset2doctag) - - def indexed_doctags(self, doctag_tokens): - """Return indexes and backing-arrays used in training examples.""" - return ([self._int_index(index) for index in doctag_tokens if index in self], - self.doctag_syn0, self.doctag_syn0_lockf, doctag_tokens) - - def trained_item(self, indexed_tuple): - """Persist any changes made to the given indexes (matching tuple previously - returned by indexed_doctags()); a no-op for this implementation""" - pass - - def _int_index(self, index): - """Return int index for either string or int index""" - if isinstance(index, integer_types + (integer,)): - return index - else: - return self.max_rawint + 1 + self.doctags[index].offset - - @deprecated("Method will be removed in 4.0.0, use self.index_to_doctag instead") - def _key_index(self, i_index, missing=None): - """Return string index for given int index, if available""" - return self.index_to_doctag(i_index) - - def index_to_doctag(self, i_index): - """Return string key for given i_index, if available. Otherwise return raw int doctag (same int).""" - candidate_offset = i_index - self.max_rawint - 1 - if 0 <= candidate_offset < len(self.offset2doctag): - return self.offset2doctag[candidate_offset] - else: - return i_index - - def __getitem__(self, index): - """ - Accept a single key (int or string tag) or list of keys as input. - - If a single string or int, return designated tag's vector - representation, as a 1D numpy array. - - If a list, return designated tags' vector representations as a - 2D numpy array: #tags x #vector_size. 
- """ - if isinstance(index, string_types + integer_types + (integer,)): - return self.doctag_syn0[self._int_index(index)] - - return vstack([self[i] for i in index]) - - def __len__(self): - return self.count - - def __contains__(self, index): - if isinstance(index, integer_types + (integer,)): - return index < self.count - else: - return index in self.doctags - - def save(self, *args, **kwargs): - # don't bother storing the cached normalized vectors - kwargs['ignore'] = kwargs.get('ignore', ['syn0norm']) - super(DocvecsArray, self).save(*args, **kwargs) - - def borrow_from(self, other_docvecs): - self.count = other_docvecs.count - self.doctags = other_docvecs.doctags - self.offset2doctag = other_docvecs.offset2doctag - - def clear_sims(self): - self.doctag_syn0norm = None - - def estimated_lookup_memory(self): - """Estimated memory for tag lookup; 0 if using pure int tags.""" - return 60 * len(self.offset2doctag) + 140 * len(self.doctags) - - def reset_weights(self, model): - length = max(len(self.doctags), self.count) - if self.mapfile_path: - self.doctag_syn0 = np_memmap( - self.mapfile_path + '.doctag_syn0', dtype=REAL, mode='w+', shape=(length, model.vector_size) - ) - self.doctag_syn0_lockf = np_memmap( - self.mapfile_path + '.doctag_syn0_lockf', dtype=REAL, mode='w+', shape=(length,) - ) - self.doctag_syn0_lockf.fill(1.0) - else: - self.doctag_syn0 = empty((length, model.vector_size), dtype=REAL) - self.doctag_syn0_lockf = ones((length,), dtype=REAL) # zeros suppress learning - - for i in range(length): - # construct deterministic seed from index AND model seed - seed = "%d %s" % (model.seed, self.index_to_doctag(i)) - self.doctag_syn0[i] = model.seeded_vector(seed) - - def init_sims(self, replace=False): - """ - Precompute L2-normalized vectors. - - If `replace` is set, forget the original vectors and only keep the normalized - ones = saves lots of memory! - - Note that you **cannot continue training or inference** after doing a replace. - The model becomes effectively read-only = you can call `most_similar`, `similarity` - etc., but not `train` or `infer_vector`. - - """ - if getattr(self, 'doctag_syn0norm', None) is None or replace: - logger.info("precomputing L2-norms of doc weight vectors") - if replace: - for i in range(self.doctag_syn0.shape[0]): - self.doctag_syn0[i, :] /= sqrt((self.doctag_syn0[i, :] ** 2).sum(-1)) - self.doctag_syn0norm = self.doctag_syn0 - else: - if self.mapfile_path: - self.doctag_syn0norm = np_memmap( - self.mapfile_path + '.doctag_syn0norm', dtype=REAL, - mode='w+', shape=self.doctag_syn0.shape) - else: - self.doctag_syn0norm = empty(self.doctag_syn0.shape, dtype=REAL) - np_divide(self.doctag_syn0, sqrt((self.doctag_syn0 ** 2).sum(-1))[..., newaxis], self.doctag_syn0norm) - - def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip_end=None, indexer=None): - """ - Find the top-N most similar docvecs known from training. Positive docs contribute - positively towards the similarity, negative docs negatively. - - This method computes cosine similarity between a simple mean of the projection - weight vectors of the given docs. Docs may be specified as vectors, integer indexes - of trained docvecs, or if the documents were originally presented with string tags, - by the corresponding tags. - - The 'clip_start' and 'clip_end' allow limiting results to a particular contiguous - range of the underlying doctag_syn0norm vectors. 
(This may be useful if the ordering - there was chosen to be significant, such as more popular tag IDs in lower indexes.) - """ - if positive is None: - positive = [] - if negative is None: - negative = [] - - self.init_sims() - clip_end = clip_end or len(self.doctag_syn0norm) - - if isinstance(positive, string_types + integer_types + (integer,)) and not negative: - # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog']) - positive = [positive] - - # add weights for each doc, if not already present; default to 1.0 for positive and -1.0 for negative docs - positive = [ - (doc, 1.0) if isinstance(doc, string_types + integer_types + (ndarray, integer)) - else doc for doc in positive - ] - negative = [ - (doc, -1.0) if isinstance(doc, string_types + integer_types + (ndarray, integer)) - else doc for doc in negative - ] - - # compute the weighted average of all docs - all_docs, mean = set(), [] - for doc, weight in positive + negative: - if isinstance(doc, ndarray): - mean.append(weight * doc) - elif doc in self.doctags or doc < self.count: - mean.append(weight * self.doctag_syn0norm[self._int_index(doc)]) - all_docs.add(self._int_index(doc)) - else: - raise KeyError("doc '%s' not in trained set" % doc) - if not mean: - raise ValueError("cannot compute similarity with no input") - mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL) - - if indexer is not None: - return indexer.most_similar(mean, topn) - - dists = dot(self.doctag_syn0norm[clip_start:clip_end], mean) - if not topn: - return dists - best = matutils.argsort(dists, topn=topn + len(all_docs), reverse=True) - # ignore (don't return) docs from the input - result = [ - (self.index_to_doctag(sim + clip_start), float(dists[sim])) - for sim in best - if (sim + clip_start) not in all_docs - ] - return result[:topn] - - def doesnt_match(self, docs): - """ - Which doc from the given list doesn't go with the others? - - (TODO: Accept vectors of out-of-training-set docs, as if from inference.) - - """ - self.init_sims() - - docs = [doc for doc in docs if doc in self.doctags or 0 <= doc < self.count] # filter out unknowns - logger.debug("using docs %s", docs) - if not docs: - raise ValueError("cannot select a doc from an empty list") - vectors = vstack(self.doctag_syn0norm[self._int_index(doc)] for doc in docs).astype(REAL) - mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL) - dists = dot(vectors, mean) - return sorted(zip(dists, docs))[0][1] - - def similarity(self, d1, d2): - """ - Compute cosine similarity between two docvecs in the trained set, specified by int index or - string tag. (TODO: Accept vectors of out-of-training-set docs, as if from inference.) - - """ - return dot(matutils.unitvec(self[d1]), matutils.unitvec(self[d2])) - - def n_similarity(self, ds1, ds2): - """ - Compute cosine similarity between two sets of docvecs from the trained set, specified by int - index or string tag. (TODO: Accept vectors of out-of-training-set docs, as if from inference.) - - """ - v1 = [self[doc] for doc in ds1] - v2 = [self[doc] for doc in ds2] - return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0))) - - def similarity_unseen_docs(self, model, doc_words1, doc_words2, alpha=0.1, min_alpha=0.0001, steps=5): - """ - Compute cosine similarity between two post-bulk out of training documents. - - Document should be a list of (word) tokens. 
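The similarity queries above all reduce to dot products between L2-normalized vectors (the `init_sims()` precomputation). A self-contained numpy sketch of that core step, independent of any model:

.. sourcecode:: pycon

    >>> import numpy as np
    >>>
    >>> def unitvec(v):
    ...     """L2-normalize a vector; zero vectors are returned unchanged."""
    ...     norm = np.linalg.norm(v)
    ...     return v / norm if norm else v
    >>>
    >>> a = np.array([1.0, 0.0])
    >>> b = np.array([1.0, 1.0])
    >>> cosine = float(np.dot(unitvec(a), unitvec(b)))   # ~0.7071, the cosine of 45 degrees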
- """ - d1 = model.infer_vector(doc_words=doc_words1, alpha=alpha, min_alpha=min_alpha, steps=steps) - d2 = model.infer_vector(doc_words=doc_words2, alpha=alpha, min_alpha=min_alpha, steps=steps) - return dot(matutils.unitvec(d1), matutils.unitvec(d2)) - - -class Doctag(namedtuple('Doctag', 'offset, word_count, doc_count')): - """A string document tag discovered during the initial vocabulary - scan. (The document-vector equivalent of a Vocab object.) - - Will not be used if all presented document tags are ints. - - The offset is only the true index into the doctags_syn0/doctags_syn0_lockf - if-and-only-if no raw-int tags were used. If any raw-int tags were used, - string Doctag vectors begin at index (max_rawint + 1), so the true index is - (rawint_index + 1 + offset). See also DocvecsArray.index_to_doctag(). - """ - __slots__ = () - - def repeat(self, word_count): - return self._replace(word_count=self.word_count + word_count, doc_count=self.doc_count + 1) - - -class Doc2Vec(Word2Vec): - """Class for training, using and evaluating neural networks described in http://arxiv.org/pdf/1405.4053v2.pdf""" - - def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, - docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, **kwargs): - """ - Initialize the model from an iterable of `documents`. Each document is a - TaggedDocument object that will be used for training. - - The `documents` iterable can be simply a list of TaggedDocument elements, but for larger corpora, - consider an iterable that streams the documents directly from disk/network. - - If you don't supply `documents`, the model is left uninitialized -- use if - you plan to initialize it in some other way. - - `dm` defines the training algorithm. By default (`dm=1`), 'distributed memory' (PV-DM) is used. - Otherwise, `distributed bag of words` (PV-DBOW) is employed. - - `size` is the dimensionality of the feature vectors. - - `window` is the maximum distance between the predicted word and context words used for prediction - within a document. - - `alpha` is the initial learning rate (will linearly drop to `min_alpha` as training progresses). - - `seed` = for the random number generator. - Note that for a fully deterministically-reproducible run, you must also limit the model to - a single worker thread, to eliminate ordering jitter from OS thread scheduling. (In Python - 3, reproducibility between interpreter launches also requires use of the PYTHONHASHSEED - environment variable to control hash randomization.) - - `min_count` = ignore all words with total frequency lower than this. - - `max_vocab_size` = limit RAM during vocabulary building; if there are more unique - words than this, then prune the infrequent ones. Every 10 million word types - need about 1GB of RAM. Set to `None` for no limit (default). - - `sample` = threshold for configuring which higher-frequency words are randomly downsampled; - default is 1e-3, values of 1e-5 (or lower) may also be useful, set to 0.0 to disable downsampling. - - `workers` = use this many worker threads to train the model (=faster training with multicore machines). - - `iter` = number of iterations (epochs) over the corpus. The default inherited from Word2Vec is 5, - but values of 10 or 20 are common in published 'Paragraph Vector' experiments. - - `hs` = if 1, hierarchical softmax will be used for model training. - If set to 0 (default), and `negative` is non-zero, negative sampling will be used. 
- - `negative` = if > 0, negative sampling will be used, the int for negative - specifies how many "noise words" should be drawn (usually between 5-20). - Default is 5. If set to 0, no negative samping is used. - - `dm_mean` = if 0 (default), use the sum of the context word vectors. If 1, use the mean. - Only applies when dm is used in non-concatenative mode. - - `dm_concat` = if 1, use concatenation of context vectors rather than sum/average; - default is 0 (off). Note concatenation results in a much-larger model, as the input - is no longer the size of one (sampled or arithmetically combined) word vector, but the - size of the tag(s) and all words in the context strung together. - - `dm_tag_count` = expected constant number of document tags per document, when using - dm_concat mode; default is 1. - - `dbow_words` if set to 1 trains word-vectors (in skip-gram fashion) simultaneous with DBOW - doc-vector training; default is 0 (faster training of doc-vectors only). - - `trim_rule` = vocabulary trimming rule, specifies whether certain words should remain - in the vocabulary, be trimmed away, or handled using the default (discard if word count < min_count). - Can be None (min_count will be used), or a callable that accepts parameters (word, count, min_count) and - returns either util.RULE_DISCARD, util.RULE_KEEP or util.RULE_DEFAULT. - Note: The rule, if given, is only used prune vocabulary during build_vocab() and is not stored as part - of the model. - """ - - if 'sentences' in kwargs: - raise DeprecationWarning( - "Parameter 'sentences' was renamed to 'documents', and will be removed in 4.0.0, " - "use 'documents' instead." - ) - - super(Doc2Vec, self).__init__( - sg=(1 + dm) % 2, - null_word=dm_concat, - **kwargs) - - self.load = call_on_class_only - - if dm_mean is not None: - self.cbow_mean = dm_mean - - self.dbow_words = dbow_words - self.dm_concat = dm_concat - self.dm_tag_count = dm_tag_count - if self.dm and self.dm_concat: - self.layer1_size = (self.dm_tag_count + (2 * self.window)) * self.vector_size - - self.docvecs = docvecs or DocvecsArray(docvecs_mapfile) - self.comment = comment - if documents is not None: - self.build_vocab(documents, trim_rule=trim_rule) - self.train(documents, total_examples=self.corpus_count, epochs=self.iter) - - @property - def dm(self): - return not self.sg # opposite of SG - - @property - def dbow(self): - return self.sg # same as SG - - def clear_sims(self): - super(Doc2Vec, self).clear_sims() - self.docvecs.clear_sims() - - def reset_weights(self): - if self.dm and self.dm_concat: - # expand l1 size to match concatenated tags+words length - self.layer1_size = (self.dm_tag_count + (2 * self.window)) * self.vector_size - logger.info("using concatenative %d-dimensional layer1", self.layer1_size) - super(Doc2Vec, self).reset_weights() - self.docvecs.reset_weights(self) - - def reset_from(self, other_model): - """Reuse shareable structures from other_model.""" - self.docvecs.borrow_from(other_model.docvecs) - super(Doc2Vec, self).reset_from(other_model) - - def scan_vocab(self, documents, progress_per=10000, trim_rule=None, update=False): - logger.info("collecting all words and their counts") - document_no = -1 - total_words = 0 - min_reduce = 1 - interval_start = default_timer() - 0.00001 # guard against next sample being identical - interval_count = 0 - checked_string_types = 0 - vocab = defaultdict(int) - for document_no, document in enumerate(documents): - if not checked_string_types: - if isinstance(document.words, string_types): - logger.warning( 
- "Each 'words' should be a list of words (usually unicode strings). " - "First 'words' here is instead plain %s.", - type(document.words) - ) - checked_string_types += 1 - if document_no % progress_per == 0: - interval_rate = (total_words - interval_count) / (default_timer() - interval_start) - logger.info( - "PROGRESS: at example #%i, processed %i words (%i/s), %i word types, %i tags", - document_no, total_words, interval_rate, len(vocab), len(self.docvecs) - ) - interval_start = default_timer() - interval_count = total_words - document_length = len(document.words) - - for tag in document.tags: - self.docvecs.note_doctag(tag, document_no, document_length) - - for word in document.words: - vocab[word] += 1 - total_words += len(document.words) - - if self.max_vocab_size and len(vocab) > self.max_vocab_size: - utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) - min_reduce += 1 - - logger.info( - "collected %i word types and %i unique tags from a corpus of %i examples and %i words", - len(vocab), len(self.docvecs), document_no + 1, total_words - ) - self.corpus_count = document_no + 1 - self.raw_vocab = vocab - - def _do_train_job(self, job, alpha, inits): - work, neu1 = inits - tally = 0 - for doc in job: - indexed_doctags = self.docvecs.indexed_doctags(doc.tags) - doctag_indexes, doctag_vectors, doctag_locks, ignored = indexed_doctags - if self.sg: - tally += train_document_dbow( - self, doc.words, doctag_indexes, alpha, work, train_words=self.dbow_words, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks - ) - elif self.dm_concat: - tally += train_document_dm_concat( - self, doc.words, doctag_indexes, alpha, work, neu1, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks - ) - else: - tally += train_document_dm( - self, doc.words, doctag_indexes, alpha, work, neu1, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks - ) - self.docvecs.trained_item(indexed_doctags) - return tally, self._raw_word_count(job) - - def _raw_word_count(self, job): - """Return the number of words in a given job.""" - return sum(len(sentence.words) for sentence in job) - - def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5): - """ - Infer a vector for given post-bulk training document. - - Document should be a list of (word) tokens. 
- """ - doctag_vectors = empty((1, self.vector_size), dtype=REAL) - doctag_vectors[0] = self.seeded_vector(' '.join(doc_words)) - doctag_locks = ones(1, dtype=REAL) - doctag_indexes = [0] - - work = zeros(self.layer1_size, dtype=REAL) - if not self.sg: - neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) - - for i in range(steps): - if self.sg: - train_document_dbow( - self, doc_words, doctag_indexes, alpha, work, - learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks - ) - elif self.dm_concat: - train_document_dm_concat( - self, doc_words, doctag_indexes, alpha, work, neu1, - learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks - ) - else: - train_document_dm( - self, doc_words, doctag_indexes, alpha, work, neu1, - learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks - ) - alpha = ((alpha - min_alpha) / (steps - i)) + min_alpha - - return doctag_vectors[0] - - def estimate_memory(self, vocab_size=None, report=None): - """Estimate required memory for a model using current settings.""" - report = report or {} - report['doctag_lookup'] = self.docvecs.estimated_lookup_memory() - report['doctag_syn0'] = self.docvecs.count * self.vector_size * dtype(REAL).itemsize - return super(Doc2Vec, self).estimate_memory(vocab_size, report=report) - - def __str__(self): - """Abbreviated name reflecting major configuration paramaters.""" - segments = [] - if self.comment: - segments.append('"%s"' % self.comment) - if self.sg: - if self.dbow_words: - segments.append('dbow+w') # also training words - else: - segments.append('dbow') # PV-DBOW (skip-gram-style) - - else: # PV-DM... - if self.dm_concat: - segments.append('dm/c') # ...with concatenative context layer - else: - if self.cbow_mean: - segments.append('dm/m') - else: - segments.append('dm/s') - segments.append('d%d' % self.vector_size) # dimensions - if self.negative: - segments.append('n%d' % self.negative) # negative samples - if self.hs: - segments.append('hs') - if not self.sg or (self.sg and self.dbow_words): - segments.append('w%d' % self.window) # window size, when relevant - if self.min_count > 1: - segments.append('mc%d' % self.min_count) - if self.sample > 0: - segments.append('s%g' % self.sample) - if self.workers > 1: - segments.append('t%d' % self.workers) - return '%s(%s)' % (self.__class__.__name__, ','.join(segments)) - - def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inference=True): - """ - Discard parameters that are used in training and score. Use if you're sure you're done training a model. - Set `keep_doctags_vectors` to False if you don't want to save doctags vectors, - in this case you can't to use docvecs's most_similar, similarity etc. methods. - Set `keep_inference` to False if you don't want to store parameters that is used for infer_vector method - """ - if not keep_inference: - self._minimize_model(False, False, False) - if self.docvecs and hasattr(self.docvecs, 'doctag_syn0') and not keep_doctags_vectors: - del self.docvecs.doctag_syn0 - if self.docvecs and hasattr(self.docvecs, 'doctag_syn0_lockf'): - del self.docvecs.doctag_syn0_lockf - - def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*dt_', fvocab=None, binary=False): - """ - Store the input-hidden weight matrix. 
- - `fname` is the file used to save the vectors in - `doctag_vec` is an optional boolean indicating whether to store document vectors - `word_vec` is an optional boolean indicating whether to store word vectors - (if both doctag_vec and word_vec are True, then both vectors are stored in the same file) - `prefix` to uniquely identify doctags from word vocab, and avoid collision - in case of repeated string in doctag and word vocab - `fvocab` is an optional file used to save the vocabulary - `binary` is an optional boolean indicating whether the data is to be saved - in binary word2vec format (default: False) - - """ - total_vec = len(self.wv.vocab) + len(self.docvecs) - # save word vectors - if word_vec: - if not doctag_vec: - total_vec = len(self.wv.vocab) - KeyedVectors.save_word2vec_format(self.wv, fname, fvocab, binary, total_vec) - # save document vectors - if doctag_vec: - with utils.open(fname, 'ab') as fout: - if not word_vec: - total_vec = len(self.docvecs) - logger.info("storing %sx%s projection weights into %s", total_vec, self.vector_size, fname) - fout.write(utils.to_utf8("%s %s\n" % (total_vec, self.vector_size))) - # store as in input order - for i in range(len(self.docvecs)): - doctag = u"%s%s" % (prefix, self.docvecs.index_to_doctag(i)) - row = self.docvecs.doctag_syn0[i] - if binary: - fout.write(utils.to_utf8(doctag) + b" " + row.tostring()) - else: - fout.write(utils.to_utf8("%s %s\n" % (doctag, ' '.join("%f" % val for val in row)))) - - -class TaggedBrownCorpus(object): - """Iterate over documents from the Brown corpus (part of NLTK data), yielding - each document out as a TaggedDocument object.""" - - def __init__(self, dirname): - self.dirname = dirname - - def __iter__(self): - for fname in os.listdir(self.dirname): - fname = os.path.join(self.dirname, fname) - if not os.path.isfile(fname): - continue - with utils.open(fname, 'rb') as f: - for item_no, line in enumerate(f): - line = utils.to_unicode(line) - # each file line is a single document in the Brown corpus - # each token is WORD/POS_TAG - token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2] - # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff) - words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()] - if not words: # don't bother sending out empty documents - continue - yield TaggedDocument(words, ['%s_SENT_%s' % (fname, item_no)]) - - -class TaggedLineDocument(object): - """Simple format: one document = one line = one TaggedDocument object. - - Words are expected to be already preprocessed and separated by whitespace, - tags are constructed automatically from the document line number.""" - - def __init__(self, source): - """ - `source` can be either a string (filename) or a file object. 
- - Example:: - - documents = TaggedLineDocument('myfile.txt') - - Or for compressed files:: - - documents = TaggedLineDocument('compressed_text.txt.bz2') - documents = TaggedLineDocument('compressed_text.txt.gz') - - """ - self.source = source - - def __iter__(self): - """Iterate through the lines in the source.""" - try: - # Assume it is a file-like object and try treating it as such - # Things that don't have seek will trigger an exception - self.source.seek(0) - for item_no, line in enumerate(self.source): - yield TaggedDocument(utils.to_unicode(line).split(), [item_no]) - except AttributeError: - # If it didn't work like a file, use it as a string filename - with utils.open(self.source, 'rb') as fin: - for item_no, line in enumerate(fin): - yield TaggedDocument(utils.to_unicode(line).split(), [item_no]) diff --git a/gensim/models/deprecated/fasttext.py b/gensim/models/deprecated/fasttext.py deleted file mode 100644 index 0d46b6f1cc..0000000000 --- a/gensim/models/deprecated/fasttext.py +++ /dev/null @@ -1,711 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# Authors: Chinmaya Pancholi , Shiva Manne -# Copyright (C) 2017 RaRe Technologies s.r.o. - -""" -Warnings --------- -.. deprecated:: 3.3.0 - Use :mod:`gensim.models.fasttext` instead. - - -Learn word representations via fasttext's "skip-gram and CBOW models", using either -hierarchical softmax or negative sampling [1]_. - -Notes ------ -There are more ways to get word vectors in Gensim than just FastText. -See wrappers for VarEmbed and WordRank or Word2Vec - -This module allows training a word embedding from a training corpus with the additional ability -to obtain word vectors for out-of-vocabulary words. - -For a tutorial on gensim's native fasttext, refer to the noteboook -- [2]_ - -**Make sure you have a C compiler before installing gensim, to use optimized (compiled) fasttext training** - -.. [1] P. Bojanowski, E. Grave, A. Joulin, T. Mikolov - Enriching Word Vectors with Subword Information. In arXiv preprint arXiv:1607.04606. - https://arxiv.org/abs/1607.04606 - -.. 
[2] https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/FastText_Tutorial.ipynb - -""" - -import logging - -import numpy as np -from numpy import zeros, ones, vstack, sum as np_sum, empty, float32 as REAL - -from gensim.models.deprecated.word2vec import Word2Vec, train_sg_pair, train_cbow_pair -from gensim.models.deprecated.fasttext_wrapper import FastTextKeyedVectors -from gensim.models.deprecated.fasttext_wrapper import FastText as Ft_Wrapper, compute_ngrams, ft_hash -from gensim.models.fasttext import FastText as NewFastText - -logger = logging.getLogger(__name__) - -MAX_WORDS_IN_BATCH = 10000 - - -def load_old_fasttext(*args, **kwargs): - old_model = FastText.load(*args, **kwargs) - params = { - 'size': old_model.vector_size, - 'alpha': old_model.alpha, - 'window': old_model.window, - 'min_count': old_model.min_count, - 'max_vocab_size': old_model.__dict__.get('max_vocab_size', None), - 'sample': old_model.sample, - 'seed': old_model.seed, - 'workers': old_model.workers, - 'min_alpha': old_model.min_alpha, - 'sg': old_model.sg, - 'hs': old_model.hs, - 'negative': old_model.negative, - 'cbow_mean': old_model.cbow_mean, - 'hashfxn': old_model.hashfxn, - 'iter': old_model.iter, - 'null_word': old_model.null_word, - 'sorted_vocab': old_model.sorted_vocab, - 'batch_words': old_model.batch_words, - 'min_n': old_model.min_n, - 'max_n': old_model.max_n, - 'word_ngrams': old_model.word_ngrams, - 'bucket': old_model.bucket - } - new_model = NewFastText(**params) - # set trainables attributes - new_model.wv.vectors = old_model.wv.syn0 - new_model.wv.vectors_vocab = old_model.wv.syn0_vocab - new_model.wv.vectors_ngrams = old_model.wv.syn0_ngrams - if hasattr(old_model.wv, 'syn0norm'): - new_model.wv.vectors_norm = old_model.wv.syn0norm - if hasattr(old_model, 'syn1'): - new_model.trainables.syn1 = old_model.syn1 - if hasattr(old_model, 'syn1neg'): - new_model.trainables.syn1neg = old_model.syn1neg - if hasattr(old_model, 'syn0_lockf'): - new_model.trainables.vectors_lockf = old_model.syn0_lockf - - if hasattr(old_model, 'syn0_vocab_lockf'): - new_model.trainables.vectors_vocab_lockf = old_model.syn0_vocab_lockf - if hasattr(old_model, 'syn0_ngrams_lockf'): - new_model.trainables.vectors_ngrams_lockf = old_model.syn0_ngrams_lockf - if hasattr(old_model.wv, 'syn0_vocab_norm'): - new_model.trainables.vectors_vocab_norm = old_model.wv.syn0_vocab_norm - if hasattr(old_model.wv, 'syn0_ngrams_norm'): - new_model.trainables.vectors_ngrams_norm = old_model.wv.syn0_ngrams_norm - - # set vocabulary attributes - new_model.wv.vocab = old_model.wv.vocab - new_model.wv.index2word = old_model.wv.index2word - new_model.vocabulary.cum_table = old_model.cum_table - - new_model.wv.hash2index = old_model.wv.hash2index - - new_model.train_count = old_model.train_count - new_model.corpus_count = old_model.corpus_count - new_model.corpus_total_words = old_model.corpus_total_words - new_model.running_training_loss = old_model.running_training_loss - new_model.total_train_time = old_model.total_train_time - new_model.min_alpha_yet_reached = old_model.min_alpha_yet_reached - new_model.model_trimmed_post_training = old_model.model_trimmed_post_training - - new_model.trainables.num_ngram_vectors = old_model.num_ngram_vectors - - return new_model - - -def train_batch_cbow(model, sentences, alpha, work=None, neu1=None): - """Update CBOW model by training on a sequence of sentences. - - Each sentence is a list of string tokens, which are looked up in the model's - vocab dictionary. 
Called internally from :meth:`gensim.models.fasttext.FastText.train()`. - - This is the non-optimized, Python version. If you have cython installed, gensim - will use the optimized version from fasttext_inner instead. - - Parameters - ---------- - model : :class:`~gensim.models.fasttext.FastText` - `FastText` instance. - sentences : iterable of iterables - Iterable of the sentences directly from disk/network. - alpha : float - Learning rate. - work : :class:`numpy.ndarray` - Private working memory for each worker. - neu1 : :class:`numpy.ndarray` - Private working memory for each worker. - - Returns - ------- - int - Effective number of words trained. - - """ - result = 0 - for sentence in sentences: - word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab - and model.wv.vocab[w].sample_int > model.random.rand() * 2**32] - for pos, word in enumerate(word_vocabs): - reduced_window = model.random.randint(model.window) - start = max(0, pos - model.window + reduced_window) - window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start) - word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)] - - word2_subwords = [] - vocab_subwords_indices = [] - ngrams_subwords_indices = [] - - for index in word2_indices: - vocab_subwords_indices += [index] - word2_subwords += model.wv.ngrams_word[model.wv.index2word[index]] - - for subword in word2_subwords: - ngrams_subwords_indices.append(model.wv.ngrams[subword]) - - l1_vocab = np_sum(model.wv.syn0_vocab[vocab_subwords_indices], axis=0) # 1 x vector_size - l1_ngrams = np_sum(model.wv.syn0_ngrams[ngrams_subwords_indices], axis=0) # 1 x vector_size - - l1 = np_sum([l1_vocab, l1_ngrams], axis=0) - subwords_indices = [vocab_subwords_indices] + [ngrams_subwords_indices] - if (subwords_indices[0] or subwords_indices[1]) and model.cbow_mean: - l1 /= (len(subwords_indices[0]) + len(subwords_indices[1])) - - # train on the sliding window for target word - train_cbow_pair(model, word, subwords_indices, l1, alpha, is_ft=True) - result += len(word_vocabs) - return result - - -def train_batch_sg(model, sentences, alpha, work=None, neu1=None): - """Update skip-gram model by training on a sequence of sentences. - - Each sentence is a list of string tokens, which are looked up in the model's - vocab dictionary. Called internally from :meth:`gensim.models.fasttext.FastText.train()`. - - This is the non-optimized, Python version. If you have cython installed, gensim - will use the optimized version from fasttext_inner instead. - - Parameters - ---------- - model : :class:`~gensim.models.fasttext.FastText` - `FastText` instance. - sentences : iterable of iterables - Iterable of the sentences directly from disk/network. - alpha : float - Learning rate. - work : :class:`numpy.ndarray` - Private working memory for each worker. - neu1 : :class:`numpy.ndarray` - Private working memory for each worker. - - Returns - ------- - int - Effective number of words trained. 
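As a side note on the `reduced_window` handling shared by these pure-Python batch routines: the effective context window is shrunk by a random amount for every target word. A small stand-alone sketch of that index arithmetic (the function name and arguments are illustrative, not part of this module):

.. sourcecode:: python

    import random

    def window_positions(pos, sentence_len, window, rng=random):
        """Positions of context words around `pos`, with word2vec-style random
        window shrinking (`reduced_window` is the `b` of the original C code)."""
        reduced_window = rng.randint(0, window - 1)
        start = max(0, pos - window + reduced_window)
        stop = min(sentence_len, pos + window + 1 - reduced_window)
        return [p for p in range(start, stop) if p != pos]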
- - """ - result = 0 - for sentence in sentences: - word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab - and model.wv.vocab[w].sample_int > model.random.rand() * 2**32] - for pos, word in enumerate(word_vocabs): - reduced_window = model.random.randint(model.window) # `b` in the original word2vec code - # now go over all words from the (reduced) window, predicting each one in turn - start = max(0, pos - model.window + reduced_window) - - subwords_indices = [word.index] - word2_subwords = model.wv.ngrams_word[model.wv.index2word[word.index]] - - for subword in word2_subwords: - subwords_indices.append(model.wv.ngrams[subword]) - - for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start): - if pos2 != pos: # don't train on the `word` itself - train_sg_pair(model, model.wv.index2word[word2.index], subwords_indices, alpha, is_ft=True) - - result += len(word_vocabs) - return result - - -class FastText(Word2Vec): - """Class for training, using and evaluating word representations learned using method - described in [1]_ aka Fasttext. - - The model can be stored/loaded via its :meth:`~gensim.models.fasttext.FastText.save()` and - :meth:`~gensim.models.fasttext.FastText.load()` methods, or loaded in a format compatible with the original - fasttext implementation via :meth:`~gensim.models.fasttext.FastText.load_fasttext_format()`. - - """ - def __init__( - self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, - max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1, - bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH): - """Initialize the model from an iterable of `sentences`. Each sentence is a - list of words (unicode strings) that will be used for training. - - Parameters - ---------- - sentences : iterable of iterables - The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, - consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. - If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it - in some other way. - sg : int {1, 0} - Defines the training algorithm. If 1, skip-gram is used, otherwise, CBOW is employed. - size : int - Dimensionality of the feature vectors. - window : int - The maximum distance between the current and predicted word within a sentence. - alpha : float - The initial learning rate. - min_alpha : float - Learning rate will linearly drop to `min_alpha` as training progresses. - seed : int - Seed for the random number generator. Initial vectors for each word are seeded with a hash of - the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run, - you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter - from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires - use of the `PYTHONHASHSEED` environment variable to control hash randomization). - min_count : int - Ignores all words with total frequency lower than this. 
- max_vocab_size : int - Limits the RAM during vocabulary building; if there are more unique - words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM. - Set to `None` for no limit. - sample : float - The threshold for configuring which higher-frequency words are randomly downsampled, - useful range is (0, 1e-5). - workers : int - Use these many worker threads to train the model (=faster training with multicore machines). - hs : int {1,0} - If 1, hierarchical softmax will be used for model training. - If set to 0, and `negative` is non-zero, negative sampling will be used. - negative : int - If > 0, negative sampling will be used, the int for negative specifies how many "noise words" - should be drawn (usually between 5-20). - If set to 0, no negative sampling is used. - cbow_mean : int {1,0} - If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used. - hashfxn : function - Hash function to use to randomly initialize weights, for increased training reproducibility. - iter : int - Number of iterations (epochs) over the corpus. - trim_rule : function - Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, - be trimmed away, or handled using the default (discard if word count < min_count). - Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), - or a callable that accepts parameters (word, count, min_count) and returns either - :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part - of the model. - sorted_vocab : int {1,0} - If 1, sort the vocabulary by descending frequency before assigning word indexes. - batch_words : int - Target size (in words) for batches of examples passed to worker threads (and - thus cython routines).(Larger batches will be passed if individual - texts are longer than 10000 words, but the standard cython code truncates to that maximum.) - min_n : int - Min length of char ngrams to be used for training word representations. - max_n : int - Max length of char ngrams to be used for training word representations. Set `max_n` to be - lesser than `min_n` to avoid char ngrams being used. - word_ngrams : int {1,0} - If 1, uses enriches word vectors with subword(ngrams) information. - If 0, this is equivalent to word2vec. - bucket : int - Character ngrams are hashed into a fixed number of buckets, in order to limit the - memory usage of the model. This option specifies the number of buckets used by the model. - - Examples - -------- - Initialize and train a `FastText` model - - .. 
sourcecode:: pycon - - >>> from gensim.models import FastText - >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] - >>> - >>> model = FastText(sentences, min_count=1) - >>> say_vector = model['say'] # get vector for word - >>> of_vector = model['of'] # get vector for out-of-vocab word - - """ - # fastText specific params - self.bucket = bucket - self.word_ngrams = word_ngrams - self.min_n = min_n - self.max_n = max_n - if self.word_ngrams <= 1 and self.max_n == 0: - self.bucket = 0 - - super(FastText, self).__init__( - sentences=sentences, size=size, alpha=alpha, window=window, min_count=min_count, - max_vocab_size=max_vocab_size, sample=sample, seed=seed, workers=workers, min_alpha=min_alpha, - sg=sg, hs=hs, negative=negative, cbow_mean=cbow_mean, hashfxn=hashfxn, iter=iter, null_word=null_word, - trim_rule=trim_rule, sorted_vocab=sorted_vocab, batch_words=batch_words) - - def initialize_word_vectors(self): - """Initializes FastTextKeyedVectors instance to store all vocab/ngram vectors for the model.""" - self.wv = FastTextKeyedVectors() - self.wv.min_n = self.min_n - self.wv.max_n = self.max_n - - def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_per=10000, update=False): - """Build vocabulary from a sequence of sentences (can be a once-only generator stream). - Each sentence must be a list of unicode strings. - - Parameters - ---------- - sentences : iterable of iterables - The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, - consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. - keep_raw_vocab : bool - If not true, delete the raw vocabulary after the scaling is done and free up RAM. - trim_rule : function - Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, - be trimmed away, or handled using the default (discard if word count < min_count). - Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), - or a callable that accepts parameters (word, count, min_count) and returns either - :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part - of the model. - progress_per : int - Indicates how many words to process before showing/updating the progress. - update: bool - If true, the new words in `sentences` will be added to model's vocab. - - Example - ------- - Train a model and update vocab for online training - - .. sourcecode:: pycon - - >>> from gensim.models import FastText - >>> sentences_1 = [["cat", "say", "meow"], ["dog", "say", "woof"]] - >>> sentences_2 = [["dude", "say", "wazzup!"]] - >>> - >>> model = FastText(min_count=1) - >>> model.build_vocab(sentences_1) - >>> model.train(sentences_1, total_examples=model.corpus_count, epochs=model.iter) - >>> model.build_vocab(sentences_2, update=True) - >>> model.train(sentences_2, total_examples=model.corpus_count, epochs=model.iter) - - """ - if update: - if not len(self.wv.vocab): - raise RuntimeError( - "You cannot do an online vocabulary-update of a model which has no prior vocabulary. 
" - "First build the vocabulary of your model with a corpus " - "before doing an online update.") - self.old_vocab_len = len(self.wv.vocab) - self.old_hash2index_len = len(self.wv.hash2index) - - super(FastText, self).build_vocab( - sentences, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, progress_per=progress_per, update=update) - self.init_ngrams(update=update) - - def init_ngrams(self, update=False): - """Compute ngrams of all words present in vocabulary and stores vectors for only those ngrams. - Vectors for other ngrams are initialized with a random uniform distribution in FastText. - - Parameters - ---------- - update : bool - If True, the new vocab words and their new ngrams word vectors are initialized - with random uniform distribution and updated/added to the existing vocab word and ngram vectors. - - """ - if not update: - self.wv.ngrams = {} - self.wv.syn0_vocab = empty((len(self.wv.vocab), self.vector_size), dtype=REAL) - self.syn0_vocab_lockf = ones((len(self.wv.vocab), self.vector_size), dtype=REAL) - - self.wv.syn0_ngrams = empty((self.bucket, self.vector_size), dtype=REAL) - self.syn0_ngrams_lockf = ones((self.bucket, self.vector_size), dtype=REAL) - - all_ngrams = [] - for w, v in self.wv.vocab.items(): - self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n) - all_ngrams += self.wv.ngrams_word[w] - - all_ngrams = list(set(all_ngrams)) - self.num_ngram_vectors = len(all_ngrams) - logger.info("Total number of ngrams is %d", len(all_ngrams)) - - self.wv.hash2index = {} - ngram_indices = [] - new_hash_count = 0 - for i, ngram in enumerate(all_ngrams): - ngram_hash = ft_hash(ngram) % self.bucket - if ngram_hash in self.wv.hash2index: - self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash] - else: - ngram_indices.append(ngram_hash % self.bucket) - self.wv.hash2index[ngram_hash] = new_hash_count - self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash] - new_hash_count = new_hash_count + 1 - - self.wv.syn0_ngrams = self.wv.syn0_ngrams.take(ngram_indices, axis=0) - self.syn0_ngrams_lockf = self.syn0_ngrams_lockf.take(ngram_indices, axis=0) - self.reset_ngram_weights() - else: - new_ngrams = [] - for w, v in self.wv.vocab.items(): - self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n) - new_ngrams += [ng for ng in self.wv.ngrams_word[w] if ng not in self.wv.ngrams] - - new_ngrams = list(set(new_ngrams)) - logger.info("Number of new ngrams is %d", len(new_ngrams)) - new_hash_count = 0 - for i, ngram in enumerate(new_ngrams): - ngram_hash = ft_hash(ngram) % self.bucket - if ngram_hash not in self.wv.hash2index: - self.wv.hash2index[ngram_hash] = new_hash_count + self.old_hash2index_len - self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash] - new_hash_count = new_hash_count + 1 - else: - self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash] - - rand_obj = np.random - rand_obj.seed(self.seed) - new_vocab_rows = rand_obj.uniform( - -1.0 / self.vector_size, 1.0 / self.vector_size, - (len(self.wv.vocab) - self.old_vocab_len, self.vector_size) - ).astype(REAL) - new_vocab_lockf_rows = ones((len(self.wv.vocab) - self.old_vocab_len, self.vector_size), dtype=REAL) - new_ngram_rows = rand_obj.uniform( - -1.0 / self.vector_size, 1.0 / self.vector_size, - (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size) - ).astype(REAL) - new_ngram_lockf_rows = ones( - (len(self.wv.hash2index) - self.old_hash2index_len, - self.vector_size), - dtype=REAL) - - self.wv.syn0_vocab = vstack([self.wv.syn0_vocab, new_vocab_rows]) - self.syn0_vocab_lockf = 
vstack([self.syn0_vocab_lockf, new_vocab_lockf_rows]) - self.wv.syn0_ngrams = vstack([self.wv.syn0_ngrams, new_ngram_rows]) - self.syn0_ngrams_lockf = vstack([self.syn0_ngrams_lockf, new_ngram_lockf_rows]) - - def reset_ngram_weights(self): - """Reset all projection weights to an initial (untrained) state, - but keep the existing vocabulary and their ngrams. - - """ - rand_obj = np.random - rand_obj.seed(self.seed) - for index in range(len(self.wv.vocab)): - self.wv.syn0_vocab[index] = rand_obj.uniform( - -1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size - ).astype(REAL) - for index in range(len(self.wv.hash2index)): - self.wv.syn0_ngrams[index] = rand_obj.uniform( - -1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size - ).astype(REAL) - - def _do_train_job(self, sentences, alpha, inits): - """Train a single batch of sentences. Return 2-tuple `(effective word count after - ignoring unknown words and sentence length trimming, total word count)`. - - Parameters - ---------- - sentences : iterable of iterables - The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, - consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. - alpha : float - The current learning rate. - inits : (:class:`numpy.ndarray`, :class:`numpy.ndarray`) - Each worker's private work memory. - - Returns - ------- - (int, int) - Tuple of (effective word count after ignoring unknown words and sentence length trimming, total word count) - - """ - work, neu1 = inits - tally = 0 - if self.sg: - tally += train_batch_sg(self, sentences, alpha, work, neu1) - else: - tally += train_batch_cbow(self, sentences, alpha, work, neu1) - - return tally, self._raw_word_count(sentences) - - def train(self, sentences, total_examples=None, total_words=None, - epochs=None, start_alpha=None, end_alpha=None, - word_count=0, queue_factor=2, report_delay=1.0): - """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). - For FastText, each sentence must be a list of unicode strings. (Subclasses may accept other examples.) - - To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate - progress-percentage logging, either total_examples (count of sentences) or total_words (count of - raw words in sentences) **MUST** be provided (if the corpus is the same as was provided to - :meth:`~gensim.models.fasttext.FastText.build_vocab()`, the count of examples in that corpus - will be available in the model's :attr:`corpus_count` property). - - To avoid common mistakes around the model's ability to do multiple training passes itself, an - explicit `epochs` argument **MUST** be provided. In the common and recommended case, - where :meth:`~gensim.models.fasttext.FastText.train()` is only called once, - the model's cached `iter` value should be supplied as `epochs` value. - - Parameters - ---------- - sentences : iterable of iterables - The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, - consider an iterable that streams the sentences directly from disk/network. 
- See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. - total_examples : int - Count of sentences. - total_words : int - Count of raw words in sentences. - epochs : int - Number of iterations (epochs) over the corpus. - start_alpha : float - Initial learning rate. - end_alpha : float - Final learning rate. Drops linearly from `start_alpha`. - word_count : int - Count of words already trained. Set this to 0 for the usual - case of training on all words in sentences. - queue_factor : int - Multiplier for size of queue (number of workers * queue_factor). - report_delay : float - Seconds to wait before reporting progress. - - Examples - -------- - - .. sourcecode:: pycon - - >>> from gensim.models import FastText - >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] - >>> - >>> model = FastText(min_count=1) - >>> model.build_vocab(sentences) - >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) - - """ - self.neg_labels = [] - if self.negative > 0: - # precompute negative labels optimization for pure-python training - self.neg_labels = zeros(self.negative + 1) - self.neg_labels[0] = 1. - - Word2Vec.train( - self, sentences, total_examples=self.corpus_count, epochs=self.iter, - start_alpha=self.alpha, end_alpha=self.min_alpha) - self.get_vocab_word_vecs() - - def __getitem__(self, word): - """Get `word` representations in vector space, as a 1D numpy array. - - Parameters - ---------- - word : str - A single word whose vector needs to be returned. - - Returns - ------- - :class:`numpy.ndarray` - The word's representations in vector space, as a 1D numpy array. - - Raises - ------ - KeyError - For words with all ngrams absent, a KeyError is raised. - - Example - ------- - .. sourcecode:: pycon - - >>> from gensim.models import FastText - >>> from gensim.test.utils import datapath - >>> - >>> trained_model = FastText.load_fasttext_format(datapath('lee_fasttext')) - >>> meow_vector = trained_model['hello'] # get vector for word - - """ - return self.word_vec(word) - - def get_vocab_word_vecs(self): - """Calculate vectors for words in vocabulary and stores them in `wv.syn0`.""" - for w, v in self.wv.vocab.items(): - word_vec = np.copy(self.wv.syn0_vocab[v.index]) - ngrams = self.wv.ngrams_word[w] - ngram_weights = self.wv.syn0_ngrams - for ngram in ngrams: - word_vec += ngram_weights[self.wv.ngrams[ngram]] - word_vec /= (len(ngrams) + 1) - self.wv.syn0[v.index] = word_vec - - def word_vec(self, word, use_norm=False): - """Get the word's representations in vector space, as a 1D numpy array. - - Parameters - ---------- - word : str - A single word whose vector needs to be returned. - use_norm : bool - If True, returns normalized vector. - - Returns - ------- - :class:`numpy.ndarray` - The word's representations in vector space, as a 1D numpy array. - - Raises - ------ - KeyError - For words with all ngrams absent, a KeyError is raised. - - Example - ------- - .. 
sourcecode:: pycon - - >>> from gensim.models import FastText - >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] - >>> - >>> model = FastText(sentences, min_count=1) - >>> meow_vector = model.word_vec('meow') # get vector for word - - """ - return FastTextKeyedVectors.word_vec(self.wv, word, use_norm=use_norm) - - @classmethod - def load_fasttext_format(cls, *args, **kwargs): - """Load a :class:`~gensim.models.fasttext.FastText` model from a format compatible with - the original fasttext implementation. - - Parameters - ---------- - fname : str - Path to the file. - - """ - return Ft_Wrapper.load_fasttext_format(*args, **kwargs) - - def save(self, *args, **kwargs): - """Save the model. This saved model can be loaded again using :func:`~gensim.models.fasttext.FastText.load`, - which supports online training and getting vectors for out-of-vocabulary words. - - Parameters - ---------- - fname : str - Path to the file. - - """ - kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'syn0_vocab_norm', 'syn0_ngrams_norm']) - super(FastText, self).save(*args, **kwargs) diff --git a/gensim/models/deprecated/fasttext_wrapper.py b/gensim/models/deprecated/fasttext_wrapper.py deleted file mode 100644 index 727db0e1e0..0000000000 --- a/gensim/models/deprecated/fasttext_wrapper.py +++ /dev/null @@ -1,461 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Author: Jayant Jain -# Copyright (C) 2017 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - - -""" -Warnings --------- -.. deprecated:: 3.2.0 - Use :mod:`gensim.models.fasttext` instead. - - -Python wrapper around word representation learning from FastText, a library for efficient learning -of word representations and sentence classification [1]. - -This module allows training a word embedding from a training corpus with the additional ability -to obtain word vectors for out-of-vocabulary words, using the fastText C implementation. - -The wrapped model can NOT be updated with new documents for online training -- use gensim's -`Word2Vec` for that. - -Example: -.. sourcecode:: pycon - - >>> from gensim.models.wrappers import FastText - >>> model = FastText.train('/Users/kofola/fastText/fasttext', corpus_file='text8') - >>> print model['forests'] # prints vector for given out-of-vocabulary word - -.. [1] https://github.com/facebookresearch/fastText#enriching-word-vectors-with-subword-information - - - -""" - - -import logging -import tempfile -import os -import struct - -import numpy as np -from numpy import float32 as REAL, sqrt, newaxis -from gensim import utils -from gensim.models.deprecated.keyedvectors import KeyedVectors, Vocab -from gensim.models.deprecated.word2vec import Word2Vec - -logger = logging.getLogger(__name__) - -try: - FileNotFoundError -except NameError: - FileNotFoundError = IOError - -FASTTEXT_FILEFORMAT_MAGIC = 793712314 - - -class FastTextKeyedVectors(KeyedVectors): - """ - Class to contain vectors, vocab and ngrams for the FastText training class and other methods not directly - involved in training such as most_similar(). 
- Subclasses KeyedVectors to implement oov lookups, storing ngrams and other FastText specific methods - - """ - - def __init__(self): - super(FastTextKeyedVectors, self).__init__() - self.syn0_vocab = None - self.syn0_vocab_norm = None - self.syn0_ngrams = None - self.syn0_ngrams_norm = None - self.ngrams = {} - self.hash2index = {} - self.ngrams_word = {} - self.min_n = 0 - self.max_n = 0 - - def save(self, *args, **kwargs): - # don't bother storing the cached normalized vectors - kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'syn0_vocab_norm', 'syn0_ngrams_norm']) - super(FastTextKeyedVectors, self).save(*args, **kwargs) - - def word_vec(self, word, use_norm=False): - """ - Accept a single word as input. - Returns the word's representations in vector space, as a 1D numpy array. - - The word can be out-of-vocabulary as long as ngrams for the word are present. - For words with all ngrams absent, a KeyError is raised. - - Example: - - .. sourcecode:: pycon - - >>> trained_model['office'] - array([ -1.40128313e-02, ...]) - - """ - if word in self.vocab: - return super(FastTextKeyedVectors, self).word_vec(word, use_norm) - else: - word_vec = np.zeros(self.syn0_ngrams.shape[1], dtype=np.float32) - ngrams = compute_ngrams(word, self.min_n, self.max_n) - ngrams = [ng for ng in ngrams if ng in self.ngrams] - if use_norm: - ngram_weights = self.syn0_ngrams_norm - else: - ngram_weights = self.syn0_ngrams - for ngram in ngrams: - word_vec += ngram_weights[self.ngrams[ngram]] - if word_vec.any(): - return word_vec / len(ngrams) - else: # No ngrams of the word are present in self.ngrams - raise KeyError('all ngrams for word %s absent from model' % word) - - def init_sims(self, replace=False): - """ - Precompute L2-normalized vectors. - - If `replace` is set, forget the original vectors and only keep the normalized - ones = saves lots of memory! - - Note that you **cannot continue training** after doing a replace. The model becomes - effectively read-only = you can only call `most_similar`, `similarity` etc. - - """ - super(FastTextKeyedVectors, self).init_sims(replace) - if getattr(self, 'syn0_ngrams_norm', None) is None or replace: - logger.info("precomputing L2-norms of ngram weight vectors") - if replace: - for i in range(self.syn0_ngrams.shape[0]): - self.syn0_ngrams[i, :] /= sqrt((self.syn0_ngrams[i, :] ** 2).sum(-1)) - self.syn0_ngrams_norm = self.syn0_ngrams - else: - self.syn0_ngrams_norm = \ - (self.syn0_ngrams / sqrt((self.syn0_ngrams ** 2).sum(-1))[..., newaxis]).astype(REAL) - - def __contains__(self, word): - """ - Check if `word` or any character ngrams in `word` are present in the vocabulary. - A vector for the word is guaranteed to exist if `__contains__` returns True. - """ - if word in self.vocab: - return True - else: - char_ngrams = compute_ngrams(word, self.min_n, self.max_n) - return any(ng in self.ngrams for ng in char_ngrams) - - @classmethod - def load_word2vec_format(cls, *args, **kwargs): - """Not suppported. Use gensim.models.KeyedVectors.load_word2vec_format instead.""" - raise NotImplementedError("Not supported. Use gensim.models.KeyedVectors.load_word2vec_format instead.") - - -class FastText(Word2Vec): - """ - Class for word vector training using FastText. Communication between FastText and Python - takes place by working with data files on disk and calling the FastText binary with - subprocess.call(). 
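As a side note on the out-of-vocabulary lookup in `FastTextKeyedVectors.word_vec` above: the vector returned for an unseen word is the average of the vectors of those of its character ngrams that are known to the model. A minimal stand-alone sketch of that idea (the function name and the `ngram_vectors` mapping are illustrative, not part of this module):

.. sourcecode:: python

    import numpy as np

    def oov_vector(word, ngram_vectors, min_n=3, max_n=6):
        """Average the known character-ngram vectors of `word`, including the
        '<' and '>' boundary markers FastText adds before extracting ngrams."""
        extended = '<' + word + '>'
        ngrams = [extended[i:i + n]
                  for n in range(min_n, min(len(extended), max_n) + 1)
                  for i in range(len(extended) - n + 1)]
        known = [ngram_vectors[ng] for ng in ngrams if ng in ngram_vectors]
        if not known:
            raise KeyError('all ngrams for word %s absent from model' % word)
        return np.mean(known, axis=0)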
- Implements functionality similar to [fasttext.py](https://github.com/salestock/fastText.py), - improving speed and scope of functionality like `most_similar`, `similarity` by extracting vectors - into numpy matrix. - - Warnings - -------- - .. deprecated:: 3.2.0 - Use :class:`gensim.models.fasttext.FastText` instead of :class:`gensim.models.wrappers.fasttext.FastText`. - - - """ - - def initialize_word_vectors(self): - self.wv = FastTextKeyedVectors() - - @classmethod - def train(cls, ft_path, corpus_file, output_file=None, model='cbow', size=100, alpha=0.025, window=5, min_count=5, - word_ngrams=1, loss='ns', sample=1e-3, negative=5, iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12): - """ - `ft_path` is the path to the FastText executable, e.g. `/home/kofola/fastText/fasttext`. - - `corpus_file` is the filename of the text file to be used for training the FastText model. - Expects file to contain utf-8 encoded text. - - `model` defines the training algorithm. By default, cbow is used. Accepted values are - 'cbow', 'skipgram'. - - `size` is the dimensionality of the feature vectors. - - `window` is the maximum distance between the current and predicted word within a sentence. - - `alpha` is the initial learning rate. - - `min_count` = ignore all words with total occurrences lower than this. - - `word_ngram` = max length of word ngram - - `loss` = defines training objective. Allowed values are `hs` (hierarchical softmax), - `ns` (negative sampling) and `softmax`. Defaults to `ns` - - `sample` = threshold for configuring which higher-frequency words are randomly downsampled; - default is 1e-3, useful range is (0, 1e-5). - - `negative` = the value for negative specifies how many "noise words" should be drawn - (usually between 5-20). Default is 5. If set to 0, no negative samping is used. - Only relevant when `loss` is set to `ns` - - `iter` = number of iterations (epochs) over the corpus. Default is 5. - - `min_n` = min length of char ngrams to be used for training word representations. Default is 3. - - `max_n` = max length of char ngrams to be used for training word representations. Set `max_n` to be - lesser than `min_n` to avoid char ngrams being used. Default is 6. - - `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before - assigning word indexes. - - `threads` = number of threads to use. Default is 12. - - """ - ft_path = ft_path - output_file = output_file or os.path.join(tempfile.gettempdir(), 'ft_model') - ft_args = { - 'input': corpus_file, - 'output': output_file, - 'lr': alpha, - 'dim': size, - 'ws': window, - 'epoch': iter, - 'minCount': min_count, - 'wordNgrams': word_ngrams, - 'neg': negative, - 'loss': loss, - 'minn': min_n, - 'maxn': max_n, - 'thread': threads, - 't': sample - } - cmd = [ft_path, model] - for option, value in ft_args.items(): - cmd.append("-%s" % option) - cmd.append(str(value)) - - utils.check_output(args=cmd) - model = cls.load_fasttext_format(output_file) - cls.delete_training_files(output_file) - return model - - def save(self, *args, **kwargs): - # don't bother storing the cached normalized vectors - kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'syn0_vocab_norm', 'syn0_ngrams_norm']) - super(FastText, self).save(*args, **kwargs) - - @classmethod - def load_fasttext_format(cls, model_file, encoding='utf8'): - """ - Load the input-hidden weight matrix from the fast text output files. 
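An illustrative call (the path is hypothetical, and the import follows the module docstring above; as noted just below, only the `.bin` file is actually required):

.. sourcecode:: pycon

    >>> from gensim.models.wrappers import FastText
    >>> model = FastText.load_fasttext_format('/path/to/model')  # reads /path/to/model.bin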
- - Note that due to limitations in the FastText API, you cannot continue training - with a model loaded this way, though you can query for word similarity etc. - - `model_file` is the path to the FastText output files. - FastText outputs two model files - `/path/to/model.vec` and `/path/to/model.bin` - - Expected value for this example: `/path/to/model` or `/path/to/model.bin`, - as gensim requires only `.bin` file to load entire fastText model. - - """ - model = cls() - if not model_file.endswith('.bin'): - model_file += '.bin' - model.file_name = model_file - model.load_binary_data(encoding=encoding) - return model - - @classmethod - def load(cls, *args, **kwargs): - model = super(FastText, cls).load(*args, **kwargs) - if hasattr(model.wv, 'syn0_all'): - setattr(model.wv, 'syn0_ngrams', model.wv.syn0_all) - delattr(model.wv, 'syn0_all') - return model - - @classmethod - def delete_training_files(cls, model_file): - """Deletes the files created by FastText training""" - try: - os.remove('%s.vec' % model_file) - os.remove('%s.bin' % model_file) - except FileNotFoundError: - logger.debug('Training files %s not found when attempting to delete', model_file) - pass - - def load_binary_data(self, encoding='utf8'): - """Loads data from the output binary file created by FastText training""" - - # TODO use smart_open again when https://github.com/RaRe-Technologies/smart_open/issues/207 will be fixed - with open(self.file_name, 'rb') as f: - self.load_model_params(f) - self.load_dict(f, encoding=encoding) - self.load_vectors(f) - - def load_model_params(self, file_handle): - magic, version = self.struct_unpack(file_handle, '@2i') - if magic == FASTTEXT_FILEFORMAT_MAGIC: # newer format - self.new_format = True - dim, ws, epoch, min_count, neg, _, loss, model, bucket, minn, maxn, _, t = \ - self.struct_unpack(file_handle, '@12i1d') - else: # older format - self.new_format = False - dim = magic - ws = version - epoch, min_count, neg, _, loss, model, bucket, minn, maxn, _, t = self.struct_unpack(file_handle, '@10i1d') - # Parameters stored by [Args::save](https://github.com/facebookresearch/fastText/blob/master/src/args.cc) - self.vector_size = dim - self.window = ws - self.iter = epoch - self.min_count = min_count - self.negative = neg - self.hs = loss == 1 - self.sg = model == 2 - self.bucket = bucket - self.wv.min_n = minn - self.wv.max_n = maxn - self.sample = t - - def load_dict(self, file_handle, encoding='utf8'): - vocab_size, nwords, nlabels = self.struct_unpack(file_handle, '@3i') - # Vocab stored by [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc) - if nlabels > 0: - raise NotImplementedError("Supervised fastText models are not supported") - logger.info("loading %s words for fastText model from %s", vocab_size, self.file_name) - - self.struct_unpack(file_handle, '@1q') # number of tokens - if self.new_format: - pruneidx_size, = self.struct_unpack(file_handle, '@q') - for i in range(vocab_size): - word_bytes = b'' - char_byte = file_handle.read(1) - # Read vocab word - while char_byte != b'\x00': - word_bytes += char_byte - char_byte = file_handle.read(1) - word = word_bytes.decode(encoding) - count, _ = self.struct_unpack(file_handle, '@qb') - - self.wv.vocab[word] = Vocab(index=i, count=count) - self.wv.index2word.append(word) - - assert len(self.wv.vocab) == nwords, ( - 'mismatch between final vocab size ({} words), ' - 'and expected number of words ({} words)'.format(len(self.wv.vocab), nwords)) - if len(self.wv.vocab) != vocab_size: - # 
expecting to log this warning only for pretrained french vector, wiki.fr - logger.warning( - "mismatch between final vocab size (%s words), and expected vocab size (%s words)", - len(self.wv.vocab), vocab_size - ) - - if self.new_format: - for j in range(pruneidx_size): - self.struct_unpack(file_handle, '@2i') - - def load_vectors(self, file_handle): - if self.new_format: - self.struct_unpack(file_handle, '@?') # bool quant_input in fasttext.cc - num_vectors, dim = self.struct_unpack(file_handle, '@2q') - # Vectors stored by [Matrix::save](https://github.com/facebookresearch/fastText/blob/master/src/matrix.cc) - assert self.vector_size == dim, ( - 'mismatch between vector size in model params ({}) and model vectors ({})' - .format(self.vector_size, dim) - ) - float_size = struct.calcsize('@f') - if float_size == 4: - dtype = np.dtype(np.float32) - elif float_size == 8: - dtype = np.dtype(np.float64) - - self.num_original_vectors = num_vectors - self.wv.syn0_ngrams = np.fromfile(file_handle, dtype=dtype, count=num_vectors * dim) - self.wv.syn0_ngrams = self.wv.syn0_ngrams.reshape((num_vectors, dim)) - assert self.wv.syn0_ngrams.shape == (self.bucket + len(self.wv.vocab), self.vector_size), \ - 'mismatch between actual weight matrix shape {} and expected shape {}'\ - .format( - self.wv.syn0_ngrams.shape, (self.bucket + len(self.wv.vocab), self.vector_size) - ) - - self.init_ngrams() - - def struct_unpack(self, file_handle, fmt): - num_bytes = struct.calcsize(fmt) - return struct.unpack(fmt, file_handle.read(num_bytes)) - - def init_ngrams(self): - """ - Computes ngrams of all words present in vocabulary and stores vectors for only those ngrams. - Vectors for other ngrams are initialized with a random uniform distribution in FastText. These - vectors are discarded here to save space. - - """ - self.wv.ngrams = {} - all_ngrams = [] - self.wv.syn0 = np.zeros((len(self.wv.vocab), self.vector_size), dtype=REAL) - - for w, vocab in self.wv.vocab.items(): - all_ngrams += compute_ngrams(w, self.wv.min_n, self.wv.max_n) - self.wv.syn0[vocab.index] += np.array(self.wv.syn0_ngrams[vocab.index]) - - all_ngrams = set(all_ngrams) - self.num_ngram_vectors = len(all_ngrams) - ngram_indices = [] - for i, ngram in enumerate(all_ngrams): - ngram_hash = ft_hash(ngram) - ngram_indices.append(len(self.wv.vocab) + ngram_hash % self.bucket) - self.wv.ngrams[ngram] = i - self.wv.syn0_ngrams = self.wv.syn0_ngrams.take(ngram_indices, axis=0) - - ngram_weights = self.wv.syn0_ngrams - - logger.info( - "loading weights for %s words for fastText model from %s", - len(self.wv.vocab), self.file_name - ) - - for w, vocab in self.wv.vocab.items(): - word_ngrams = compute_ngrams(w, self.wv.min_n, self.wv.max_n) - for word_ngram in word_ngrams: - self.wv.syn0[vocab.index] += np.array(ngram_weights[self.wv.ngrams[word_ngram]]) - - self.wv.syn0[vocab.index] /= (len(word_ngrams) + 1) - logger.info( - "loaded %s weight matrix for fastText model from %s", - self.wv.syn0.shape, self.file_name - ) - - -def compute_ngrams(word, min_n, max_n): - BOW, EOW = ('<', '>') # Used by FastText to attach to all words as prefix and suffix - extended_word = BOW + word + EOW - ngrams = [] - for ngram_length in range(min_n, min(len(extended_word), max_n) + 1): - for i in range(0, len(extended_word) - ngram_length + 1): - ngrams.append(extended_word[i:i + ngram_length]) - return ngrams - - -def ft_hash(string): - """ - Reproduces [hash method](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc) - used in fastText. 
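The hash in question is the 32-bit FNV-1a function applied to the word's unicode code points. A pure-Python sketch that uses explicit masking instead of numpy's uint32 overflow (the function name is illustrative):

.. sourcecode:: python

    def ft_hash_py(text):
        """FNV-1a over the code points of `text`, truncated to 32 bits."""
        h = 2166136261
        for c in text:
            h ^= ord(c)
            h = (h * 16777619) & 0xFFFFFFFF
        return h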
- - """ - # Runtime warnings for integer overflow are raised, this is expected behaviour. These warnings are suppressed. - old_settings = np.seterr(all='ignore') - h = np.uint32(2166136261) - for c in string: - h = h ^ np.uint32(ord(c)) - h = h * np.uint32(16777619) - np.seterr(**old_settings) - return h diff --git a/gensim/models/deprecated/keyedvectors.py b/gensim/models/deprecated/keyedvectors.py deleted file mode 100644 index a8983909d0..0000000000 --- a/gensim/models/deprecated/keyedvectors.py +++ /dev/null @@ -1,1115 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2016 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -""" -Warnings --------- -.. deprecated:: 3.3.0 - Use :mod:`gensim.models.keyedvectors` instead. - - -Word vector storage and similarity look-ups. -Common code independent of the way the vectors are trained(Word2Vec, FastText, WordRank, VarEmbed etc) - -The word vectors are considered read-only in this class. - -Initialize the vectors by training e.g. Word2Vec: - -.. sourcecode:: pycon - - >>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4) - >>> word_vectors = model.wv - -Persist the word vectors to disk with: - -.. sourcecode:: pycon - - >>> word_vectors.save(fname) - >>> word_vectors = KeyedVectors.load(fname) - -The vectors can also be instantiated from an existing file on disk -in the original Google's word2vec C format as a KeyedVectors instance: - -.. sourcecode:: pycon - - >>> from gensim.models.keyedvectors import KeyedVectors - >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format - >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True) # C binary format - -You can perform various syntactic/semantic NLP word tasks with the vectors. Some of them -are already built-in: - -.. sourcecode:: pycon - - >>> word_vectors.most_similar(positive=['woman', 'king'], negative=['man']) - [('queen', 0.50882536), ...] - - >>> word_vectors.most_similar_cosmul(positive=['woman', 'king'], negative=['man']) - [('queen', 0.71382287), ...] - - >>> word_vectors.doesnt_match("breakfast cereal dinner lunch".split()) - 'cereal' - - >>> word_vectors.similarity('woman', 'man') - 0.73723527 - -Correlation with human opinion on word similarity: - -.. sourcecode:: pycon - - >>> word_vectors.evaluate_word_pairs(os.path.join(module_path, 'test_data','wordsim353.tsv')) - 0.51, 0.62, 0.13 - -And on analogies: - -.. sourcecode:: pycon - - >>> word_vectors.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt')) - -and so on. - -""" -from __future__ import division # py3 "true division" - -import logging - -try: - from queue import Queue, Empty -except ImportError: - from Queue import Queue, Empty # noqa:F401 - -# If pyemd C extension is available, import it. 
-# If pyemd is attempted to be used, but isn't installed, ImportError will be raised in wmdistance -try: - from pyemd import emd - PYEMD_EXT = True -except (ImportError, ValueError): - PYEMD_EXT = False - -from numpy import dot, zeros, dtype, float32 as REAL,\ - double, array, vstack, fromstring, sqrt, newaxis,\ - ndarray, sum as np_sum, prod, ascontiguousarray,\ - argmax -import numpy as np - -from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc -from gensim.corpora.dictionary import Dictionary -from six import string_types, iteritems -from six.moves import range -from scipy import stats - - -logger = logging.getLogger(__name__) - - -class Vocab(object): - """ - A single vocabulary item, used internally for collecting per-word frequency/sampling info, - and for constructing binary trees (incl. both word leaves and inner nodes). - - """ - - def __init__(self, **kwargs): - self.count = 0 - self.__dict__.update(kwargs) - - def __lt__(self, other): # used for sorting in a priority queue - return self.count < other.count - - def __str__(self): - vals = ['%s:%r' % (key, self.__dict__[key]) for key in sorted(self.__dict__) if not key.startswith('_')] - return "%s(%s)" % (self.__class__.__name__, ', '.join(vals)) - - -class KeyedVectorsBase(utils.SaveLoad): - """ - Base class to contain vectors and vocab for any set of vectors which are each associated with a key. - - """ - - def __init__(self): - self.syn0 = [] - self.vocab = {} - self.index2word = [] - self.vector_size = None - - def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None): - """ - Store the input-hidden weight matrix in the same format used by the original - C word2vec-tool, for compatibility. - - `fname` is the file used to save the vectors in - `fvocab` is an optional file used to save the vocabulary - `binary` is an optional boolean indicating whether the data is to be saved - in binary word2vec format (default: False) - `total_vec` is an optional parameter to explicitly specify total no. of vectors - (in case word vectors are appended with document vectors afterwards) - - """ - if total_vec is None: - total_vec = len(self.vocab) - vector_size = self.syn0.shape[1] - if fvocab is not None: - logger.info("storing vocabulary in %s", fvocab) - with utils.open(fvocab, 'wb') as vout: - for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count): - vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count))) - logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname) - assert (len(self.vocab), vector_size) == self.syn0.shape - with utils.open(fname, 'wb') as fout: - fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size))) - # store in sorted order: most frequent words at the top - for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count): - row = self.syn0[vocab.index] - if binary: - fout.write(utils.to_utf8(word) + b" " + row.tostring()) - else: - fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row)))) - - @classmethod - def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', - limit=None, datatype=REAL): - """ - Load the input-hidden weight matrix from the original C word2vec-tool format. - - Note that the information stored in the file is incomplete (the binary tree is missing), - so while you can query for word similarity etc., you cannot continue training - with a model loaded this way. 
- - `binary` is a boolean indicating whether the data is in binary word2vec format. - `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory. - Word counts are read from `fvocab` filename, if set (this is the file generated - by `-save-vocab` flag of the original C tool). - - If you trained the C model using non-utf8 encoding for words, specify that - encoding in `encoding`. - - `unicode_errors`, default 'strict', is a string suitable to be passed as the `errors` - argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source - file may include word tokens truncated in the middle of a multibyte unicode character - (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help. - - `limit` sets a maximum number of word-vectors to read from the file. The default, - None, means read all. - - `datatype` (experimental) can coerce dimensions to a non-default float type (such - as np.float16) to save memory. (Such types may result in much slower bulk operations - or incompatibility with optimized routines.) - - """ - counts = None - if fvocab is not None: - logger.info("loading word counts from %s", fvocab) - counts = {} - with utils.open(fvocab, 'rb') as fin: - for line in fin: - word, count = utils.to_unicode(line).strip().split() - counts[word] = int(count) - - logger.info("loading projection weights from %s", fname) - with utils.open(fname, 'rb') as fin: - header = utils.to_unicode(fin.readline(), encoding=encoding) - vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format - if limit: - vocab_size = min(vocab_size, limit) - result = cls() - result.vector_size = vector_size - result.syn0 = zeros((vocab_size, vector_size), dtype=datatype) - - def add_word(word, weights): - word_id = len(result.vocab) - if word in result.vocab: - logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname) - return - if counts is None: - # most common scenario: no vocab file given. just make up some bogus counts, in descending order - result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id) - elif word in counts: - # use count from the vocab file - result.vocab[word] = Vocab(index=word_id, count=counts[word]) - else: - # vocab file given, but word is missing -- set count to None (TODO: or raise?) 
- logger.warning("vocabulary file is incomplete: '%s' is missing", word) - result.vocab[word] = Vocab(index=word_id, count=None) - result.syn0[word_id] = weights - result.index2word.append(word) - - if binary: - binary_len = dtype(REAL).itemsize * vector_size - for _ in range(vocab_size): - # mixed text and binary: read text first, then binary - word = [] - while True: - ch = fin.read(1) - if ch == b' ': - break - if ch == b'': - raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?") - if ch != b'\n': # ignore newlines in front of words (some binary files have) - word.append(ch) - word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors) - weights = fromstring(fin.read(binary_len), dtype=REAL) - add_word(word, weights) - else: - for line_no in range(vocab_size): - line = fin.readline() - if line == b'': - raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?") - parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ") - if len(parts) != vector_size + 1: - raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no) - word, weights = parts[0], [REAL(x) for x in parts[1:]] - add_word(word, weights) - if result.syn0.shape[0] != len(result.vocab): - logger.info( - "duplicate words detected, shrinking matrix size from %i to %i", - result.syn0.shape[0], len(result.vocab) - ) - result.syn0 = ascontiguousarray(result.syn0[: len(result.vocab)]) - assert (len(result.vocab), vector_size) == result.syn0.shape - - logger.info("loaded %s matrix from %s", result.syn0.shape, fname) - return result - - def similarity(self, w1, w2): - """ - Compute similarity between vectors of two input words. - To be implemented by child class. - - """ - raise NotImplementedError - - def distance(self, w1, w2): - """ - Compute distance between vectors of two input words. - To be implemented by child class. - - """ - raise NotImplementedError - - def distances(self, word_or_vector, other_words=()): - """ - Compute distances from given word or vector to all words in `other_words`. - If `other_words` is empty, return distance between `word_or_vectors` and all words in vocab. - To be implemented by child class. - - """ - raise NotImplementedError - - def word_vec(self, word): - """ - Accept a single word as input. - Returns the word's representations in vector space, as a 1D numpy array. - - Example: - - .. sourcecode:: pycon - - >>> trained_model.word_vec('office') - array([ -1.40128313e-02, ...]) - - """ - if word in self.vocab: - result = self.syn0[self.vocab[word].index] - result.setflags(write=False) - return result - else: - raise KeyError("word '%s' not in vocabulary" % word) - - def __getitem__(self, words): - """ - Accept a single word or a list of words as input. - - If a single word: returns the word's representations in vector space, as - a 1D numpy array. - - Multiple words: return the words' representations in vector space, as a - 2d numpy array: #words x #vector_size. Matrix rows are in the same order - as in input. - - Example: - - .. sourcecode:: pycon - - >>> trained_model['office'] - array([ -1.40128313e-02, ...]) - - >>> trained_model[['office', 'products']] - array([ -1.40128313e-02, ...] - [ -1.70425311e-03, ...] - ...) 
- - """ - if isinstance(words, string_types): - # allow calls like trained_model['office'], as a shorthand for trained_model[['office']] - return self.word_vec(words) - - return vstack([self.word_vec(word) for word in words]) - - def __contains__(self, word): - return word in self.vocab - - def most_similar_to_given(self, w1, word_list): - """Return the word from word_list most similar to w1. - - Args: - w1 (str): a word - word_list (list): list of words containing a word most similar to w1 - - Returns: - the word in word_list with the highest similarity to w1 - - Raises: - KeyError: If w1 or any word in word_list is not in the vocabulary - - Example: - - .. sourcecode:: pycon - - >>> trained_model.most_similar_to_given('music', ['water', 'sound', 'backpack', 'mouse']) - 'sound' - - >>> trained_model.most_similar_to_given('snake', ['food', 'pencil', 'animal', 'phone']) - 'animal' - - """ - return word_list[argmax([self.similarity(w1, word) for word in word_list])] - - def words_closer_than(self, w1, w2): - """ - Returns all words that are closer to `w1` than `w2` is to `w1`. - - Parameters - ---------- - w1 : str - Input word. - w2 : str - Input word. - - Returns - ------- - list (str) - List of words that are closer to `w1` than `w2` is to `w1`. - - Examples - -------- - - .. sourcecode:: pycon - - >>> model.words_closer_than('carnivore.n.01', 'mammal.n.01') - ['dog.n.01', 'canine.n.02'] - - """ - all_distances = self.distances(w1) - w1_index = self.vocab[w1].index - w2_index = self.vocab[w2].index - closer_node_indices = np.where(all_distances < all_distances[w2_index])[0] - return [self.index2word[index] for index in closer_node_indices if index != w1_index] - - def rank(self, w1, w2): - """ - Rank of the distance of `w2` from `w1`, in relation to distances of all words from `w1`. - - Parameters - ---------- - w1 : str - Input word. - w2 : str - Input word. - - Returns - ------- - int - Rank of `w2` from `w1` in relation to all other nodes. - - Examples - -------- - - .. sourcecode:: pycon - - >>> model.rank('mammal.n.01', 'carnivore.n.01') - 3 - - """ - return len(self.words_closer_than(w1, w2)) + 1 - - -class EuclideanKeyedVectors(KeyedVectorsBase): - """ - Class to contain vectors and vocab for the Word2Vec training class and other w2v methods not directly - involved in training such as most_similar() - """ - - def __init__(self): - super(EuclideanKeyedVectors, self).__init__() - self.syn0norm = None - - @property - def wv(self): - return self - - def save(self, *args, **kwargs): - # don't bother storing the cached normalized vectors - kwargs['ignore'] = kwargs.get('ignore', ['syn0norm']) - super(EuclideanKeyedVectors, self).save(*args, **kwargs) - - def word_vec(self, word, use_norm=False): - """ - Accept a single word as input. - Returns the word's representations in vector space, as a 1D numpy array. - - If `use_norm` is True, returns the normalized word vector. - - Example: - - .. sourcecode:: pycon - - >>> trained_model['office'] - array([ -1.40128313e-02, ...]) - - """ - if word in self.vocab: - if use_norm: - result = self.syn0norm[self.vocab[word].index] - else: - result = self.syn0[self.vocab[word].index] - - result.setflags(write=False) - return result - else: - raise KeyError("word '%s' not in vocabulary" % word) - - def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None): - """ - Find the top-N most similar words. Positive words contribute positively towards the - similarity, negative words negatively. 
- - This method computes cosine similarity between a simple mean of the projection - weight vectors of the given words and the vectors for each word in the model. - The method corresponds to the `word-analogy` and `distance` scripts in the original - word2vec implementation. - - If topn is False, most_similar returns the vector of similarity scores. - - `restrict_vocab` is an optional integer which limits the range of vectors which - are searched for most-similar values. For example, restrict_vocab=10000 would - only check the first 10000 word vectors in the vocabulary order. (This may be - meaningful if you've sorted the vocabulary by descending frequency.) - - Example: - - .. sourcecode:: pycon - - >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man']) - [('queen', 0.50882536), ...] - - """ - if positive is None: - positive = [] - if negative is None: - negative = [] - - self.init_sims() - - if isinstance(positive, string_types) and not negative: - # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog']) - positive = [positive] - - # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words - positive = [ - (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word - for word in positive - ] - negative = [ - (word, -1.0) if isinstance(word, string_types + (ndarray,)) else word - for word in negative - ] - - # compute the weighted average of all words - all_words, mean = set(), [] - for word, weight in positive + negative: - if isinstance(word, ndarray): - mean.append(weight * word) - else: - mean.append(weight * self.word_vec(word, use_norm=True)) - if word in self.vocab: - all_words.add(self.vocab[word].index) - if not mean: - raise ValueError("cannot compute similarity with no input") - mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL) - - if indexer is not None: - return indexer.most_similar(mean, topn) - - limited = self.syn0norm if restrict_vocab is None else self.syn0norm[:restrict_vocab] - dists = dot(limited, mean) - if not topn: - return dists - best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True) - # ignore (don't return) words from the input - result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words] - return result[:topn] - - def similar_by_word(self, word, topn=10, restrict_vocab=None): - """ - Find the top-N most similar words. - - If topn is False, similar_by_word returns the vector of similarity scores. - - `restrict_vocab` is an optional integer which limits the range of vectors which - are searched for most-similar values. For example, restrict_vocab=10000 would - only check the first 10000 word vectors in the vocabulary order. (This may be - meaningful if you've sorted the vocabulary by descending frequency.) - - Example: - - .. sourcecode:: pycon - - >>> trained_model.similar_by_word('graph') - [('user', 0.9999163150787354), ...] - - """ - return self.most_similar(positive=[word], topn=topn, restrict_vocab=restrict_vocab) - - def similar_by_vector(self, vector, topn=10, restrict_vocab=None): - """ - Find the top-N most similar words by vector. - - If topn is False, similar_by_vector returns the vector of similarity scores. - - `restrict_vocab` is an optional integer which limits the range of vectors which - are searched for most-similar values. For example, restrict_vocab=10000 would - only check the first 10000 word vectors in the vocabulary order. 
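# Illustrative sketch, not part of the deleted module above: the ranking that
# most_similar() performs -- unit-normalize the query words, average them with
# +1.0 / -1.0 weights, and sort the whole vocabulary by cosine similarity
# against that mean.  `vectors_norm`, `index2word` and `word2index` are
# hypothetical stand-ins for the model's internal structures.
import numpy as np

def most_similar_sketch(vectors_norm, index2word, word2index, positive, negative, topn=10):
    mean = np.mean(
        [vectors_norm[word2index[w]] for w in positive]
        + [-vectors_norm[word2index[w]] for w in negative],
        axis=0)
    mean /= np.linalg.norm(mean)
    dists = vectors_norm @ mean          # cosine similarity, since rows are unit length
    exclude = {word2index[w] for w in positive + negative}
    best = np.argsort(-dists)
    return [(index2word[i], float(dists[i])) for i in best if i not in exclude][:topn]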
(This may be - meaningful if you've sorted the vocabulary by descending frequency.) - - Example:: - - >>> trained_model.similar_by_vector([1,2]) - [('survey', 0.9942699074745178), ...] - - """ - return self.most_similar(positive=[vector], topn=topn, restrict_vocab=restrict_vocab) - - def wmdistance(self, document1, document2): - """ - Compute the Word Mover's Distance between two documents. When using this - code, please consider citing the following papers: - - .. Ofir Pele and Michael Werman, "A linear time histogram metric for improved SIFT matching". - .. Ofir Pele and Michael Werman, "Fast and robust earth mover's distances". - .. Matt Kusner et al. "From Word Embeddings To Document Distances". - - Note that if one of the documents have no words that exist in the - Word2Vec vocab, `float('inf')` (i.e. infinity) will be returned. - - This method only works if `pyemd` is installed (can be installed via pip, but requires a C compiler). - - Example: - - .. sourcecode:: pycon - - >>> # Train word2vec model. - >>> model = Word2Vec(sentences) - - >>> # Some sentences to test. - >>> sentence_obama = 'Obama speaks to the media in Illinois'.lower().split() - >>> sentence_president = 'The president greets the press in Chicago'.lower().split() - - >>> # Remove their stopwords. - >>> from nltk.corpus import stopwords - >>> stopwords = nltk.corpus.stopwords.words('english') - >>> sentence_obama = [w for w in sentence_obama if w not in stopwords] - >>> sentence_president = [w for w in sentence_president if w not in stopwords] - - >>> # Compute WMD. - >>> distance = model.wmdistance(sentence_obama, sentence_president) - """ - - if not PYEMD_EXT: - raise ImportError("Please install pyemd Python package to compute WMD.") - - # Remove out-of-vocabulary words. - len_pre_oov1 = len(document1) - len_pre_oov2 = len(document2) - document1 = [token for token in document1 if token in self] - document2 = [token for token in document2 if token in self] - diff1 = len_pre_oov1 - len(document1) - diff2 = len_pre_oov2 - len(document2) - if diff1 > 0 or diff2 > 0: - logger.info('Removed %d and %d OOV words from document 1 and 2 (respectively).', diff1, diff2) - - if len(document1) == 0 or len(document2) == 0: - logger.info( - "At least one of the documents had no words that werein the vocabulary. " - "Aborting (returning inf)." - ) - return float('inf') - - dictionary = Dictionary(documents=[document1, document2]) - vocab_len = len(dictionary) - - if vocab_len == 1: - # Both documents are composed by a single unique token - return 0.0 - - # Sets for faster look-up. - docset1 = set(document1) - docset2 = set(document2) - - # Compute distance matrix. - distance_matrix = zeros((vocab_len, vocab_len), dtype=double) - for i, t1 in dictionary.items(): - for j, t2 in dictionary.items(): - if t1 not in docset1 or t2 not in docset2: - continue - # Compute Euclidean distance between word vectors. - distance_matrix[i, j] = sqrt(np_sum((self[t1] - self[t2])**2)) - - if np_sum(distance_matrix) == 0.0: - # `emd` gets stuck if the distance matrix contains only zeros. - logger.info('The distance matrix is all zeros. Aborting (returning inf).') - return float('inf') - - def nbow(document): - d = zeros(vocab_len, dtype=double) - nbow = dictionary.doc2bow(document) # Word frequencies. - doc_len = len(document) - for idx, freq in nbow: - d[idx] = freq / float(doc_len) # Normalized word frequencies. - return d - - # Compute nBOW representation of documents. - d1 = nbow(document1) - d2 = nbow(document2) - - # Compute WMD. 
- return emd(d1, d2, distance_matrix) - - def most_similar_cosmul(self, positive=None, negative=None, topn=10): - """ - Find the top-N most similar words, using the multiplicative combination objective - proposed by Omer Levy and Yoav Goldberg in [4]_. Positive words still contribute - positively towards the similarity, negative words negatively, but with less - susceptibility to one large distance dominating the calculation. - - In the common analogy-solving case, of two positive and one negative examples, - this method is equivalent to the "3CosMul" objective (equation (4)) of Levy and Goldberg. - - Additional positive or negative examples contribute to the numerator or denominator, - respectively – a potentially sensible but untested extension of the method. (With - a single positive example, rankings will be the same as in the default most_similar.) - - Example: - - .. sourcecode:: pycon - - >>> trained_model.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london']) - [(u'iraq', 0.8488819003105164), ...] - - .. [4] Omer Levy and Yoav Goldberg. Linguistic Regularities in Sparse and Explicit Word Representations, 2014. - - """ - if positive is None: - positive = [] - if negative is None: - negative = [] - - self.init_sims() - - if isinstance(positive, string_types) and not negative: - # allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog']) - positive = [positive] - - all_words = { - self.vocab[word].index for word in positive + negative - if not isinstance(word, ndarray) and word in self.vocab - } - - positive = [ - self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word - for word in positive - ] - negative = [ - self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word - for word in negative - ] - - if not positive: - raise ValueError("cannot compute similarity with no input") - - # equation (4) of Levy & Goldberg "Linguistic Regularities...", - # with distances shifted to [0,1] per footnote (7) - pos_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in positive] - neg_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in negative] - dists = prod(pos_dists, axis=0) / (prod(neg_dists, axis=0) + 0.000001) - - if not topn: - return dists - best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True) - # ignore (don't return) words from the input - result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words] - return result[:topn] - - def doesnt_match(self, words): - """ - Which word from the given list doesn't go with the others? - - Example:: - - >>> trained_model.doesnt_match("breakfast cereal dinner lunch".split()) - 'cereal' - - """ - self.init_sims() - - used_words = [word for word in words if word in self] - if len(used_words) != len(words): - ignored_words = set(words) - set(used_words) - logger.warning("vectors for words %s are not present in the model, ignoring these words", ignored_words) - if not used_words: - raise ValueError("cannot select a word from an empty list") - vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL) - mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL) - dists = dot(vectors, mean) - return sorted(zip(dists, used_words))[0][1] - - @staticmethod - def cosine_similarities(vector_1, vectors_all): - """ - Return cosine similarities between one vector and a set of other vectors. 
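# Illustrative sketch, not part of the deleted module above: the 3CosMul
# objective used by most_similar_cosmul() (Levy & Goldberg, eq. 4).  Cosine
# similarities are shifted into [0, 1] and combined multiplicatively, so one
# large distance cannot dominate.  `vectors_norm` and the three unit vectors
# for the analogy a : b :: c : ? are hypothetical inputs.
import numpy as np

def cosmul_scores(vectors_norm, a_vec, b_vec, c_vec, eps=1e-6):
    pos = [(1.0 + vectors_norm @ v) / 2.0 for v in (b_vec, c_vec)]
    neg = [(1.0 + vectors_norm @ v) / 2.0 for v in (a_vec,)]
    return np.prod(pos, axis=0) / (np.prod(neg, axis=0) + eps)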
- - Parameters - ---------- - vector_1 : numpy.array - vector from which similarities are to be computed. - expected shape (dim,) - vectors_all : numpy.array - for each row in vectors_all, distance from vector_1 is computed. - expected shape (num_vectors, dim) - - Returns - ------- - numpy.array - Contains cosine distance between vector_1 and each row in vectors_all. - shape (num_vectors,) - - """ - norm = np.linalg.norm(vector_1) - all_norms = np.linalg.norm(vectors_all, axis=1) - dot_products = dot(vectors_all, vector_1) - similarities = dot_products / (norm * all_norms) - return similarities - - def distances(self, word_or_vector, other_words=()): - """ - Compute cosine distances from given word or vector to all words in `other_words`. - If `other_words` is empty, return distance between `word_or_vectors` and all words in vocab. - - Parameters - ---------- - word_or_vector : str or numpy.array - Word or vector from which distances are to be computed. - - other_words : iterable(str) or None - For each word in `other_words` distance from `word_or_vector` is computed. - If None or empty, distance of `word_or_vector` from all words in vocab is computed (including itself). - - Returns - ------- - numpy.array - Array containing distances to all words in `other_words` from input `word_or_vector`, - in the same order as `other_words`. - - Notes - ----- - Raises KeyError if either `word_or_vector` or any word in `other_words` is absent from vocab. - - """ - if isinstance(word_or_vector, string_types): - input_vector = self.word_vec(word_or_vector) - else: - input_vector = word_or_vector - if not other_words: - other_vectors = self.syn0 - else: - other_indices = [self.vocab[word].index for word in other_words] - other_vectors = self.syn0[other_indices] - return 1 - self.cosine_similarities(input_vector, other_vectors) - - def distance(self, w1, w2): - """ - Compute cosine distance between two words. - - Example: - - .. sourcecode:: pycon - - >>> trained_model.distance('woman', 'man') - 0.34 - - >>> trained_model.distance('woman', 'woman') - 0.0 - - """ - return 1 - self.similarity(w1, w2) - - def similarity(self, w1, w2): - """ - Compute cosine similarity between two words. - - Example: - - .. sourcecode:: pycon - - >>> trained_model.similarity('woman', 'man') - 0.73723527 - - >>> trained_model.similarity('woman', 'woman') - 1.0 - - """ - return dot(matutils.unitvec(self[w1]), matutils.unitvec(self[w2])) - - def n_similarity(self, ws1, ws2): - """ - Compute cosine similarity between two sets of words. - - Example: - - .. 
sourcecode:: pycon - - >>> trained_model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant']) - 0.61540466561049689 - - >>> trained_model.n_similarity(['restaurant', 'japanese'], ['japanese', 'restaurant']) - 1.0000000000000004 - - >>> trained_model.n_similarity(['sushi'], ['restaurant']) == trained_model.similarity('sushi', 'restaurant') - True - - """ - if not(len(ws1) and len(ws2)): - raise ZeroDivisionError('At least one of the passed list is empty.') - v1 = [self[word] for word in ws1] - v2 = [self[word] for word in ws2] - return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0))) - - @staticmethod - def log_accuracy(section): - correct, incorrect = len(section['correct']), len(section['incorrect']) - if correct + incorrect > 0: - logger.info( - "%s: %.1f%% (%i/%i)", - section['section'], 100.0 * correct / (correct + incorrect), correct, correct + incorrect - ) - - def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True): - """ - Compute accuracy of the model. `questions` is a filename where lines are - 4-tuples of words, split into sections by ": SECTION NAME" lines. - See questions-words.txt in - https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip - for an example. - - The accuracy is reported (=printed to log and returned as a list) for each - section separately, plus there's one aggregate summary at the end. - - Use `restrict_vocab` to ignore all questions containing a word not in the first `restrict_vocab` - words (default 30,000). This may be meaningful if you've sorted the vocabulary by descending frequency. - In case `case_insensitive` is True, the first `restrict_vocab` words are taken first, and then - case normalization is performed. - - Use `case_insensitive` to convert all words in questions and vocab to their uppercase form before - evaluating the accuracy (default True). Useful in case of case-mismatch between training tokens - and question words. In case of multiple case variants of a single word, the vector for the first - occurrence (also the most frequent if vocabulary is sorted) is taken. - - This method corresponds to the `compute-accuracy` script of the original C word2vec. 
- - """ - ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]] - ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab) - - sections, section = [], None - with utils.open(questions, 'rb') as f: - for line_no, line in enumerate(f): - # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed - line = utils.to_unicode(line) - if line.startswith(': '): - # a new section starts => store the old section - if section: - sections.append(section) - self.log_accuracy(section) - section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []} - else: - if not section: - raise ValueError("missing section header before line #%i in %s" % (line_no, questions)) - try: - if case_insensitive: - a, b, c, expected = [word.upper() for word in line.split()] - else: - a, b, c, expected = [word for word in line.split()] - except ValueError: - logger.info("skipping invalid line #%i in %s", line_no, questions) - continue - if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab: - logger.debug("skipping line #%i with OOV words: %s", line_no, line.strip()) - continue - - original_vocab = self.vocab - self.vocab = ok_vocab - ignore = {a, b, c} # input words to be ignored - predicted = None - # find the most likely prediction, ignoring OOV words and input words - sims = most_similar(self, positive=[b, c], negative=[a], topn=False, restrict_vocab=restrict_vocab) - self.vocab = original_vocab - for index in matutils.argsort(sims, reverse=True): - predicted = self.index2word[index].upper() if case_insensitive else self.index2word[index] - if predicted in ok_vocab and predicted not in ignore: - if predicted != expected: - logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted) - break - if predicted == expected: - section['correct'].append((a, b, c, expected)) - else: - section['incorrect'].append((a, b, c, expected)) - if section: - # store the last section, too - sections.append(section) - self.log_accuracy(section) - - total = { - 'section': 'total', - 'correct': sum((s['correct'] for s in sections), []), - 'incorrect': sum((s['incorrect'] for s in sections), []), - } - self.log_accuracy(total) - sections.append(total) - return sections - - @staticmethod - def log_evaluate_word_pairs(pearson, spearman, oov, pairs): - logger.info('Pearson correlation coefficient against %s: %.4f', pairs, pearson[0]) - logger.info('Spearman rank-order correlation coefficient against %s: %.4f', pairs, spearman[0]) - logger.info('Pairs with unknown words ratio: %.1f%%', oov) - - def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, - case_insensitive=True, dummy4unknown=False): - """ - Compute correlation of the model with human similarity judgments. `pairs` is a filename of a dataset where - lines are 3-tuples, each consisting of a word pair and a similarity value, separated by `delimiter`. - An example dataset is included in Gensim (test/test_data/wordsim353.tsv). More datasets can be found at - http://technion.ac.il/~ira.leviant/MultilingualVSMdata.html or https://www.cl.cam.ac.uk/~fh295/simlex.html. - - The model is evaluated using Pearson correlation coefficient and Spearman rank-order correlation coefficient - between the similarities from the dataset and the similarities produced by the model itself. - The results are printed to log and returned as a triple (pearson, spearman, ratio of pairs with unknown words). 
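# Illustrative sketch, not part of the deleted module above: the
# questions-words.txt layout consumed by accuracy() -- ": <section>" header
# lines followed by four-word analogy lines "a b c expected", each answered by
# the same 3CosAdd ranking as most_similar(positive=[b, c], negative=[a]).
# The sample text mirrors the beginning of the original word2vec file.
sample = """: capital-common-countries
Athens Greece Baghdad Iraq
Athens Greece Bangkok Thailand
"""

def parse_questions(text):
    sections, current = [], None
    for line in text.splitlines():
        if line.startswith(': '):
            current = {'section': line[2:].strip(), 'questions': []}
            sections.append(current)
        elif line.strip():
            a, b, c, expected = line.split()
            current['questions'].append((a, b, c, expected))
    return sections

# parse_questions(sample)[0]['questions'][0] -> ('Athens', 'Greece', 'Baghdad', 'Iraq')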
- - Use `restrict_vocab` to ignore all word pairs containing a word not in the first `restrict_vocab` - words (default 300,000). This may be meaningful if you've sorted the vocabulary by descending frequency. - If `case_insensitive` is True, the first `restrict_vocab` words are taken, and then case normalization - is performed. - - Use `case_insensitive` to convert all words in the pairs and vocab to their uppercase form before - evaluating the model (default True). Useful when you expect case-mismatch between training tokens - and words pairs in the dataset. If there are multiple case variants of a single word, the vector for the first - occurrence (also the most frequent if vocabulary is sorted) is taken. - - Use `dummy4unknown=True` to produce zero-valued similarities for pairs with out-of-vocabulary words. - Otherwise (default False), these pairs are skipped entirely. - """ - ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]] - ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab) - - similarity_gold = [] - similarity_model = [] - oov = 0 - - original_vocab = self.vocab - self.vocab = ok_vocab - - with utils.open(pairs, 'rb') as f: - for line_no, line in enumerate(f): - line = utils.to_unicode(line) - if line.startswith('#'): - # May be a comment - continue - else: - try: - if case_insensitive: - a, b, sim = [word.upper() for word in line.split(delimiter)] - else: - a, b, sim = [word for word in line.split(delimiter)] - sim = float(sim) - except (ValueError, TypeError): - logger.info('skipping invalid line #%d in %s', line_no, pairs) - continue - if a not in ok_vocab or b not in ok_vocab: - oov += 1 - if dummy4unknown: - similarity_model.append(0.0) - similarity_gold.append(sim) - continue - else: - logger.debug('skipping line #%d with OOV words: %s', line_no, line.strip()) - continue - similarity_gold.append(sim) # Similarity from the dataset - similarity_model.append(self.similarity(a, b)) # Similarity from the model - self.vocab = original_vocab - spearman = stats.spearmanr(similarity_gold, similarity_model) - pearson = stats.pearsonr(similarity_gold, similarity_model) - oov_ratio = float(oov) / (len(similarity_gold) + oov) * 100 - - logger.debug('Pearson correlation coefficient against %s: %f with p-value %f', pairs, pearson[0], pearson[1]) - logger.debug( - 'Spearman rank-order correlation coefficient against %s: %f with p-value %f', - pairs, spearman[0], spearman[1] - ) - logger.debug('Pairs with unknown words: %d', oov) - self.log_evaluate_word_pairs(pearson, spearman, oov_ratio, pairs) - return pearson, spearman, oov_ratio - - def init_sims(self, replace=False): - """ - Precompute L2-normalized vectors. - - If `replace` is set, forget the original vectors and only keep the normalized - ones = saves lots of memory! - - Note that you **cannot continue training** after doing a replace. The model becomes - effectively read-only = you can call `most_similar`, `similarity` etc., but not `train`. 
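# Illustrative sketch, not part of the deleted module above: the correlation
# that evaluate_word_pairs() reports -- Pearson and Spearman coefficients
# between gold similarity judgments (tab-separated "word1<TAB>word2<TAB>score"
# lines, e.g. wordsim353.tsv) and the model's cosine similarities.  The two
# lists below are hypothetical stand-ins for those values.
from scipy import stats

similarity_gold = [7.35, 2.10, 5.80, 9.15]   # human judgments from the dataset
similarity_model = [0.61, 0.12, 0.47, 0.83]  # model cosine similarities

pearson = stats.pearsonr(similarity_gold, similarity_model)
spearman = stats.spearmanr(similarity_gold, similarity_model)
# each result holds (correlation, p-value); the deleted method logs both and
# also returns the ratio of pairs skipped for out-of-vocabulary words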
- - """ - if getattr(self, 'syn0norm', None) is None or replace: - logger.info("precomputing L2-norms of word weight vectors") - if replace: - for i in range(self.syn0.shape[0]): - self.syn0[i, :] /= sqrt((self.syn0[i, :] ** 2).sum(-1)) - self.syn0norm = self.syn0 - else: - self.syn0norm = (self.syn0 / sqrt((self.syn0 ** 2).sum(-1))[..., newaxis]).astype(REAL) - - def get_keras_embedding(self, train_embeddings=False): - """ - Return a Keras 'Embedding' layer with weights set as the Word2Vec model's learned word embeddings - """ - try: - from keras.layers import Embedding - except ImportError: - raise ImportError("Please install Keras to use this function") - weights = self.syn0 - - # set `trainable` as `False` to use the pretrained word embedding - # No extra mem usage here as `Embedding` layer doesn't create any new matrix for weights - layer = Embedding( - input_dim=weights.shape[0], output_dim=weights.shape[1], - weights=[weights], trainable=train_embeddings - ) - return layer - - -# For backward compatibility -KeyedVectors = EuclideanKeyedVectors diff --git a/gensim/models/deprecated/old_saveload.py b/gensim/models/deprecated/old_saveload.py deleted file mode 100644 index 750d83ed44..0000000000 --- a/gensim/models/deprecated/old_saveload.py +++ /dev/null @@ -1,398 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2018 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Warnings --------- -.. deprecated:: 3.3.0 - Use :mod:`gensim.utils` instead. - - -Class containing the old SaveLoad class with modeified `unpickle` function is support loading models saved using -an older gensim version. - -""" -from __future__ import with_statement - -import logging - -try: - import cPickle as _pickle -except ImportError: - import pickle as _pickle - -import re -import sys - -import numpy as np -import scipy.sparse - -from six import iteritems - -from gensim import utils - -if sys.version_info[0] >= 3: - unicode = str - -logger = logging.getLogger(__name__) - - -PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE) -RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE) - - -class SaveLoad(object): - """Class which inherit from this class have save/load functions, which un/pickle them to disk. - - Warnings - -------- - This uses pickle for de/serializing, so objects must not contain unpicklable attributes, - such as lambda functions etc. - - """ - @classmethod - def load(cls, fname, mmap=None): - """Load a previously saved object (using :meth:`~gensim.utils.SaveLoad.save`) from file. - - Parameters - ---------- - fname : str - Path to file that contains needed object. - mmap : str, optional - Memory-map option. If the object was saved with large arrays stored separately, you can load these arrays - via mmap (shared memory) using `mmap='r'. - If the file being loaded is compressed (either '.gz' or '.bz2'), then `mmap=None` **must be** set. - - See Also - -------- - :meth:`~gensim.utils.SaveLoad.save` - - Returns - ------- - object - Object loaded from `fname`. - - Raises - ------ - IOError - When methods are called on instance (should be called from class). 
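# Illustrative sketch, not part of the deleted module above: the precomputation
# done by init_sims() -- divide every row of the vector matrix by its L2 norm,
# after which cosine similarity is a plain dot product.  `vectors` is a
# hypothetical stand-in for syn0.
import numpy as np

vectors = np.random.rand(5, 3).astype(np.float32)
norms = np.sqrt((vectors ** 2).sum(axis=-1))[..., np.newaxis]
vectors_norm = (vectors / norms).astype(np.float32)
assert np.allclose(np.linalg.norm(vectors_norm, axis=1), 1.0, atol=1e-5)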
- - """ - logger.info("loading %s object from %s", cls.__name__, fname) - - compress, subname = SaveLoad._adapt_by_suffix(fname) - - obj = unpickle(fname) - obj._load_specials(fname, mmap, compress, subname) - logger.info("loaded %s", fname) - return obj - - def _load_specials(self, fname, mmap, compress, subname): - """Loads any attributes that were stored specially, and gives the same opportunity - to recursively included :class:`~gensim.utils.SaveLoad` instances. - - Parameters - ---------- - fname : str - Path to file that contains needed object. - mmap : str - Memory-map option. - compress : bool - Set to True if file is compressed. - subname : str - ... - - - """ - def mmap_error(obj, filename): - return IOError( - 'Cannot mmap compressed object %s in file %s. ' % (obj, filename) - + 'Use `load(fname, mmap=None)` or uncompress files manually.' - ) - - for attrib in getattr(self, '__recursive_saveloads', []): - cfname = '.'.join((fname, attrib)) - logger.info("loading %s recursively from %s.* with mmap=%s", attrib, cfname, mmap) - getattr(self, attrib)._load_specials(cfname, mmap, compress, subname) - - for attrib in getattr(self, '__numpys', []): - logger.info("loading %s from %s with mmap=%s", attrib, subname(fname, attrib), mmap) - - if compress: - if mmap: - raise mmap_error(attrib, subname(fname, attrib)) - - val = np.load(subname(fname, attrib))['val'] - else: - val = np.load(subname(fname, attrib), mmap_mode=mmap) - - setattr(self, attrib, val) - - for attrib in getattr(self, '__scipys', []): - logger.info("loading %s from %s with mmap=%s", attrib, subname(fname, attrib), mmap) - sparse = unpickle(subname(fname, attrib)) - if compress: - if mmap: - raise mmap_error(attrib, subname(fname, attrib)) - - with np.load(subname(fname, attrib, 'sparse')) as f: - sparse.data = f['data'] - sparse.indptr = f['indptr'] - sparse.indices = f['indices'] - else: - sparse.data = np.load(subname(fname, attrib, 'data'), mmap_mode=mmap) - sparse.indptr = np.load(subname(fname, attrib, 'indptr'), mmap_mode=mmap) - sparse.indices = np.load(subname(fname, attrib, 'indices'), mmap_mode=mmap) - - setattr(self, attrib, sparse) - - for attrib in getattr(self, '__ignoreds', []): - logger.info("setting ignored attribute %s to None", attrib) - setattr(self, attrib, None) - - @staticmethod - def _adapt_by_suffix(fname): - """Give appropriate compress setting and filename formula. - - Parameters - ---------- - fname : str - Input filename. - - Returns - ------- - (bool, function) - First argument will be True if `fname` compressed. - - """ - compress, suffix = (True, 'npz') if fname.endswith('.gz') or fname.endswith('.bz2') else (False, 'npy') - return compress, lambda *args: '.'.join(args + (suffix,)) - - def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2): - """Save the object to file. - - Parameters - ---------- - fname : str - Path to file. - separately : list, optional - Iterable of attributes than need to store distinctly. - sep_limit : int, optional - Limit for separation. - ignore : frozenset, optional - Attributes that shouldn't be store. - pickle_protocol : int, optional - Protocol number for pickle. - - Notes - ----- - If `separately` is None, automatically detect large - numpy/scipy.sparse arrays in the object being stored, and store - them into separate files. This avoids pickle memory errors and - allows mmap'ing large arrays back on load efficiently. 
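# Illustrative sketch, not part of the deleted module above: the
# separate-storage pattern described in the Notes -- large numpy arrays are
# written to their own .npy files at save time so load() can memory-map them
# instead of unpickling them into RAM.  File names are hypothetical.
import numpy as np

big = np.zeros((1000, 100), dtype=np.float32)
np.save('model.syn1neg.npy', big)                     # stored aside at save time
loaded = np.load('model.syn1neg.npy', mmap_mode='r')  # mmap'ed (shared memory) on load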
- - You can also set `separately` manually, in which case it must be - a list of attribute names to be stored in separate files. The - automatic check is not performed in this case. - - See Also - -------- - :meth:`~gensim.utils.SaveLoad.load` - - """ - logger.info("saving %s object under %s, separately %s", self.__class__.__name__, fname, separately) - - compress, subname = SaveLoad._adapt_by_suffix(fname) - - restores = self._save_specials(fname, separately, sep_limit, ignore, pickle_protocol, - compress, subname) - try: - pickle(self, fname, protocol=pickle_protocol) - finally: - # restore attribs handled specially - for obj, asides in restores: - for attrib, val in iteritems(asides): - setattr(obj, attrib, val) - logger.info("saved %s", fname) - - def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname): - """Save aside any attributes that need to be handled separately, including - by recursion any attributes that are themselves :class:`~gensim.utils.SaveLoad` instances. - - Parameters - ---------- - fname : str - Output filename. - separately : list or None - Iterable of attributes than need to store distinctly - sep_limit : int - Limit for separation. - ignore : iterable of str - Attributes that shouldn't be store. - pickle_protocol : int - Protocol number for pickle. - compress : bool - If True - compress output with :func:`numpy.savez_compressed`. - subname : function - Produced by :meth:`~gensim.utils.SaveLoad._adapt_by_suffix` - - Returns - ------- - list of (obj, {attrib: value, ...}) - Settings that the caller should use to restore each object's attributes that were set aside - during the default :func:`~gensim.utils.pickle`. - - """ - asides = {} - sparse_matrices = (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix) - if separately is None: - separately = [] - for attrib, val in iteritems(self.__dict__): - if isinstance(val, np.ndarray) and val.size >= sep_limit: - separately.append(attrib) - elif isinstance(val, sparse_matrices) and val.nnz >= sep_limit: - separately.append(attrib) - - # whatever's in `separately` or `ignore` at this point won't get pickled - for attrib in separately + list(ignore): - if hasattr(self, attrib): - asides[attrib] = getattr(self, attrib) - delattr(self, attrib) - - recursive_saveloads = [] - restores = [] - for attrib, val in iteritems(self.__dict__): - if hasattr(val, '_save_specials'): # better than 'isinstance(val, SaveLoad)' if IPython reloading - recursive_saveloads.append(attrib) - cfname = '.'.join((fname, attrib)) - restores.extend(val._save_specials(cfname, None, sep_limit, ignore, pickle_protocol, compress, subname)) - - try: - numpys, scipys, ignoreds = [], [], [] - for attrib, val in iteritems(asides): - if isinstance(val, np.ndarray) and attrib not in ignore: - numpys.append(attrib) - logger.info("storing np array '%s' to %s", attrib, subname(fname, attrib)) - - if compress: - np.savez_compressed(subname(fname, attrib), val=np.ascontiguousarray(val)) - else: - np.save(subname(fname, attrib), np.ascontiguousarray(val)) - - elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and attrib not in ignore: - scipys.append(attrib) - logger.info("storing scipy.sparse array '%s' under %s", attrib, subname(fname, attrib)) - - if compress: - np.savez_compressed( - subname(fname, attrib, 'sparse'), - data=val.data, - indptr=val.indptr, - indices=val.indices - ) - else: - np.save(subname(fname, attrib, 'data'), val.data) - np.save(subname(fname, attrib, 'indptr'), val.indptr) - 
np.save(subname(fname, attrib, 'indices'), val.indices) - - data, indptr, indices = val.data, val.indptr, val.indices - val.data, val.indptr, val.indices = None, None, None - - try: - # store array-less object - pickle(val, subname(fname, attrib), protocol=pickle_protocol) - finally: - val.data, val.indptr, val.indices = data, indptr, indices - else: - logger.info("not storing attribute %s", attrib) - ignoreds.append(attrib) - - self.__dict__['__numpys'] = numpys - self.__dict__['__scipys'] = scipys - self.__dict__['__ignoreds'] = ignoreds - self.__dict__['__recursive_saveloads'] = recursive_saveloads - except Exception: - # restore the attributes if exception-interrupted - for attrib, val in iteritems(asides): - setattr(self, attrib, val) - raise - return restores + [(self, asides)] - - def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2): - """Save the object to file. - - Parameters - ---------- - fname_or_handle : str or file-like - Path to output file or already opened file-like object. If the object is a file handle, - no special array handling will be performed, all attributes will be saved to the same file. - separately : list of str or None, optional - If None - automatically detect large numpy/scipy.sparse arrays in the object being stored, and store - them into separate files. This avoids pickle memory errors and allows mmap'ing large arrays - back on load efficiently. - If list of str - this attributes will be stored in separate files, the automatic check - is not performed in this case. - sep_limit : int - Limit for automatic separation. - ignore : frozenset of str - Attributes that shouldn't be serialize/store. - pickle_protocol : int - Protocol number for pickle. - - See Also - -------- - :meth:`~gensim.utils.SaveLoad.load` - - """ - try: - _pickle.dump(self, fname_or_handle, protocol=pickle_protocol) - logger.info("saved %s object", self.__class__.__name__) - except TypeError: # `fname_or_handle` does not have write attribute - self._smart_save(fname_or_handle, separately, sep_limit, ignore, pickle_protocol=pickle_protocol) - - -def unpickle(fname): - """Load object from `fname`. - - Parameters - ---------- - fname : str - Path to pickle file. - - Returns - ------- - object - Python object loaded from `fname`. - - """ - with utils.open(fname, 'rb') as f: - file_bytes = f.read() - file_bytes = file_bytes.replace(b'gensim.models.word2vec', b'gensim.models.deprecated.word2vec') - file_bytes = file_bytes.replace(b'gensim.models.keyedvectors', b'gensim.models.deprecated.keyedvectors') - file_bytes = file_bytes.replace(b'gensim.models.doc2vec', b'gensim.models.deprecated.doc2vec') - file_bytes = file_bytes.replace(b'gensim.models.fasttext', b'gensim.models.deprecated.fasttext') - file_bytes = file_bytes.replace( - b'gensim.models.wrappers.fasttext', b'gensim.models.deprecated.fasttext_wrapper') - if sys.version_info > (3, 0): - return _pickle.loads(file_bytes, encoding='latin1') - else: - return _pickle.loads(file_bytes) - - -def pickle(obj, fname, protocol=2): - """Pickle object `obj` to file `fname`. - - Parameters - ---------- - obj : object - Any python object. - fname : str - Path to pickle file. - protocol : int, optional - Pickle protocol number, default is 2 to support compatible across python 2.x and 3.x. 
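# Illustrative sketch, not part of the deleted module above: the trick used by
# unpickle() -- rewrite module paths inside the raw pickle bytes so objects
# saved by an older gensim resolve against the deprecated.* modules.
# 'old_model.pkl' is a hypothetical file.
import pickle

with open('old_model.pkl', 'rb') as f:
    payload = f.read()
payload = payload.replace(b'gensim.models.word2vec', b'gensim.models.deprecated.word2vec')
obj = pickle.loads(payload, encoding='latin1')  # latin1 keeps py2-era numpy arrays loadable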
- - """ - with utils.open(fname, 'wb') as fout: # 'b' for binary, needed on Windows - _pickle.dump(obj, fout, protocol=protocol) diff --git a/gensim/models/deprecated/word2vec.py b/gensim/models/deprecated/word2vec.py deleted file mode 100644 index d57a902c55..0000000000 --- a/gensim/models/deprecated/word2vec.py +++ /dev/null @@ -1,1907 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2013 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - - -""" -Warnings --------- -.. deprecated:: 3.3.0 - Use :mod:`gensim.models.word2vec` instead. - - -Produce word vectors with deep learning via word2vec's "skip-gram and CBOW models", using either -hierarchical softmax or negative sampling [1]_ [2]_. - -NOTE: There are more ways to get word vectors in Gensim than just Word2Vec. -See wrappers for FastText, VarEmbed and WordRank. - -The training algorithms were originally ported from the C package https://code.google.com/p/word2vec/ -and extended with additional functionality. - -For a blog tutorial on gensim word2vec, with an interactive web app trained on GoogleNews, -visit http://radimrehurek.com/2014/02/word2vec-tutorial/ - -**Make sure you have a C compiler before installing gensim, to use optimized (compiled) word2vec training** -(70x speedup compared to plain NumPy implementation [3]_). - -Initialize a model with e.g.: - -.. sourcecode:: pycon - - >>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4) - -Persist a model to disk with: - -.. sourcecode:: pycon - - >>> model.save(fname) - >>> model = Word2Vec.load(fname) # you can continue training with the loaded model! - -The word vectors are stored in a KeyedVectors instance in model.wv. -This separates the read-only word vector lookup operations in KeyedVectors from the training code in Word2Vec: - -.. sourcecode:: pycon - - >>> model.wv['computer'] # numpy vector of a word - array([-0.00449447, -0.00310097, 0.02421786, ...], dtype=float32) - -The word vectors can also be instantiated from an existing file on disk in the word2vec C format -as a KeyedVectors instance:: - - NOTE: It is impossible to continue training the vectors loaded from the C format because hidden weights, - vocabulary frequency and the binary tree is missing: - - .. sourcecode:: pycon - - >>> from gensim.models.keyedvectors import KeyedVectors - >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format - >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True) # C binary format - - -You can perform various NLP word tasks with the model. Some of them -are already built-in: - -.. sourcecode:: pycon - - >>> model.wv.most_similar(positive=['woman', 'king'], negative=['man']) - [('queen', 0.50882536), ...] - - >>> model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man']) - [('queen', 0.71382287), ...] - - >>> model.wv.doesnt_match("breakfast cereal dinner lunch".split()) - 'cereal' - - >>> model.wv.similarity('woman', 'man') - 0.73723527 - -Probability of a text under the model: - -.. sourcecode:: pycon - - >>> model.score(["The fox jumped over a lazy dog".split()]) - 0.2158356 - -Correlation with human opinion on word similarity: - -.. sourcecode:: pycon - - >>> model.wv.evaluate_word_pairs(os.path.join(module_path, 'test_data','wordsim353.tsv')) - 0.51, 0.62, 0.13 - -And on analogies: - -.. 
sourcecode:: pycon - - >>> model.wv.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt')) - -and so on. - -If you're finished training a model (i.e. no more updates, only querying), -then switch to the :mod:`gensim.models.KeyedVectors` instance in wv - -.. sourcecode:: pycon - - >>> word_vectors = model.wv - >>> del model - -to trim unneeded model memory = use much less RAM. - -Note that there is a :mod:`gensim.models.phrases` module which lets you automatically -detect phrases longer than one word. Using phrases, you can learn a word2vec model -where "words" are actually multiword expressions, such as `new_york_times` or `financial_crisis`: - -.. sourcecode:: pycon - - >>> bigram_transformer = gensim.models.Phrases(sentences) - >>> model = Word2Vec(bigram_transformer[sentences], size=100, ...) - -.. [1] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. - Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013. -.. [2] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. - Distributed Representations of Words and Phrases and their Compositionality. In Proceedings of NIPS, 2013. -.. [3] Optimizing word2vec in gensim, http://radimrehurek.com/2013/09/word2vec-in-python-part-two-optimizing/ -""" -from __future__ import division # py3 "true division" - -import logging -import sys -import os -import heapq -from timeit import default_timer -from copy import deepcopy -from collections import defaultdict -import threading -import itertools -import warnings - -from gensim.utils import keep_vocab_item, call_on_class_only -from gensim.models.deprecated.keyedvectors import KeyedVectors, Vocab -from gensim.models.word2vec import Word2Vec as NewWord2Vec -from gensim.models.deprecated.old_saveload import SaveLoad - -try: - from queue import Queue, Empty -except ImportError: - from Queue import Queue, Empty - -from numpy import exp, log, dot, zeros, outer, random, dtype, float32 as REAL,\ - uint32, seterr, array, uint8, vstack, fromstring, sqrt,\ - empty, sum as np_sum, ones, logaddexp - -from scipy.special import expit - -from gensim import utils -from gensim import matutils # utility fnc for pickling, common scipy operations etc -from six import iteritems, itervalues, string_types -from six.moves import range -from types import GeneratorType - -logger = logging.getLogger(__name__) - -MAX_WORDS_IN_BATCH = 10000 - - -def load_old_word2vec(*args, **kwargs): - old_model = Word2Vec.load(*args, **kwargs) - vector_size = getattr(old_model, 'vector_size', old_model.layer1_size) - params = { - 'size': vector_size, - 'alpha': old_model.alpha, - 'window': old_model.window, - 'min_count': old_model.min_count, - 'max_vocab_size': old_model.__dict__.get('max_vocab_size', None), - 'sample': old_model.__dict__.get('sample', 1e-3), - 'seed': old_model.seed, - 'workers': old_model.workers, - 'min_alpha': old_model.min_alpha, - 'sg': old_model.sg, - 'hs': old_model.hs, - 'negative': old_model.negative, - 'cbow_mean': old_model.cbow_mean, - 'hashfxn': old_model.__dict__.get('hashfxn', hash), - 'iter': old_model.__dict__.get('iter', 5), - 'null_word': old_model.__dict__.get('null_word', 0), - 'sorted_vocab': old_model.__dict__.get('sorted_vocab', 1), - 'batch_words': old_model.__dict__.get('batch_words', MAX_WORDS_IN_BATCH), - 'compute_loss': old_model.__dict__.get('compute_loss', None) - } - new_model = NewWord2Vec(**params) - # set trainables attributes - new_model.wv.vectors = old_model.wv.syn0 - if hasattr(old_model.wv, 
'syn0norm'): - new_model.wv.vectors_norm = old_model.wv.syn0norm - if hasattr(old_model, 'syn1'): - new_model.trainables.syn1 = old_model.syn1 - if hasattr(old_model, 'syn1neg'): - new_model.trainables.syn1neg = old_model.syn1neg - if hasattr(old_model, 'syn0_lockf'): - new_model.trainables.vectors_lockf = old_model.syn0_lockf - # set vocabulary attributes - new_model.wv.vocab = old_model.wv.vocab - new_model.wv.index2word = old_model.wv.index2word - new_model.vocabulary.cum_table = old_model.__dict__.get('cum_table', None) - - new_model.train_count = old_model.__dict__.get('train_count', None) - new_model.corpus_count = old_model.__dict__.get('corpus_count', None) - new_model.corpus_total_words = old_model.__dict__.get('corpus_total_words', None) - new_model.running_training_loss = old_model.__dict__.get('running_training_loss', 0) - new_model.total_train_time = old_model.__dict__.get('total_train_time', None) - new_model.min_alpha_yet_reached = old_model.__dict__.get('min_alpha_yet_reached', old_model.alpha) - new_model.model_trimmed_post_training = old_model.__dict__.get('model_trimmed_post_training', None) - - return new_model - - -def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): - """ - Update skip-gram model by training on a sequence of sentences. - - Each sentence is a list of string tokens, which are looked up in the model's - vocab dictionary. Called internally from `Word2Vec.train()`. - - This is the non-optimized, Python version. If you have cython installed, gensim - will use the optimized version from word2vec_inner instead. - - """ - result = 0 - for sentence in sentences: - word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab - and model.wv.vocab[w].sample_int > model.random.rand() * 2**32] - for pos, word in enumerate(word_vocabs): - reduced_window = model.random.randint(model.window) # `b` in the original word2vec code - - # now go over all words from the (reduced) window, predicting each one in turn - start = max(0, pos - model.window + reduced_window) - for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start): - # don't train on the `word` itself - if pos2 != pos: - train_sg_pair( - model, model.wv.index2word[word.index], word2.index, alpha, compute_loss=compute_loss - ) - - result += len(word_vocabs) - return result - - -def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss=False): - """ - Update CBOW model by training on a sequence of sentences. - - Each sentence is a list of string tokens, which are looked up in the model's - vocab dictionary. Called internally from `Word2Vec.train()`. - - This is the non-optimized, Python version. If you have cython installed, gensim - will use the optimized version from word2vec_inner instead. 
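# Illustrative sketch, not part of the deleted module above: the window
# sampling in train_batch_sg() -- for each centre word a reduced window `b` is
# drawn, and only positions within `window - b` of the centre yield
# (centre, context) pairs, so nearer words are sampled more often.
import random

def sg_pairs(tokens, window, seed=0):
    rng = random.Random(seed)
    pairs = []
    for pos, centre in enumerate(tokens):
        b = rng.randint(0, window - 1)   # `b` in the original word2vec C code
        start = max(0, pos - window + b)
        for pos2, context in enumerate(tokens[start:pos + window + 1 - b], start):
            if pos2 != pos:
                pairs.append((centre, context))
    return pairs

# sg_pairs("the quick brown fox".split(), window=2)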
- - """ - result = 0 - for sentence in sentences: - word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab - and model.wv.vocab[w].sample_int > model.random.rand() * 2**32] - for pos, word in enumerate(word_vocabs): - reduced_window = model.random.randint(model.window) # `b` in the original word2vec code - start = max(0, pos - model.window + reduced_window) - window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start) - word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)] - l1 = np_sum(model.wv.syn0[word2_indices], axis=0) # 1 x vector_size - if word2_indices and model.cbow_mean: - l1 /= len(word2_indices) - train_cbow_pair(model, word, word2_indices, l1, alpha, compute_loss=compute_loss) - result += len(word_vocabs) - return result - - -def score_sentence_sg(model, sentence, work=None): - """ - Obtain likelihood score for a single sentence in a fitted skip-gram representaion. - - The sentence is a list of Vocab objects (or None, when the corresponding - word is not in the vocabulary). Called internally from `Word2Vec.score()`. - - This is the non-optimized, Python version. If you have cython installed, gensim - will use the optimized version from word2vec_inner instead. - - """ - log_prob_sentence = 0.0 - if model.negative: - raise RuntimeError("scoring is only available for HS=True") - - word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab] - for pos, word in enumerate(word_vocabs): - if word is None: - continue # OOV word in the input sentence => skip - - # now go over all words from the window, predicting each one in turn - start = max(0, pos - model.window) - for pos2, word2 in enumerate(word_vocabs[start: pos + model.window + 1], start): - # don't train on OOV words and on the `word` itself - if word2 is not None and pos2 != pos: - log_prob_sentence += score_sg_pair(model, word, word2) - - return log_prob_sentence - - -def score_sentence_cbow(model, sentence, work=None, neu1=None): - """ - Obtain likelihood score for a single sentence in a fitted CBOW representaion. - - The sentence is a list of Vocab objects (or None, where the corresponding - word is not in the vocabulary. Called internally from `Word2Vec.score()`. - - This is the non-optimized, Python version. If you have cython installed, gensim - will use the optimized version from word2vec_inner instead. 
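# Illustrative sketch, not part of the deleted module above: the input
# projection built by train_batch_cbow() -- context word vectors are summed
# and, when cbow_mean is set, averaged before being fed to the output layer.
# `syn0` and `word2_indices` are hypothetical stand-ins.
import numpy as np

syn0 = np.random.rand(10, 4).astype(np.float32)  # stand-in input-vector matrix
word2_indices = [1, 3, 7]                        # context words around the centre
cbow_mean = 1

l1 = syn0[word2_indices].sum(axis=0)             # 1 x vector_size
if word2_indices and cbow_mean:
    l1 /= len(word2_indices)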
- - """ - log_prob_sentence = 0.0 - if model.negative: - raise RuntimeError("scoring is only available for HS=True") - - word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab] - for pos, word in enumerate(word_vocabs): - if word is None: - continue # OOV word in the input sentence => skip - - start = max(0, pos - model.window) - window_pos = enumerate(word_vocabs[start:(pos + model.window + 1)], start) - word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)] - l1 = np_sum(model.wv.syn0[word2_indices], axis=0) # 1 x layer1_size - if word2_indices and model.cbow_mean: - l1 /= len(word2_indices) - log_prob_sentence += score_cbow_pair(model, word, l1) - - return log_prob_sentence - - -def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_hidden=True, - context_vectors=None, context_locks=None, compute_loss=False, is_ft=False): - if context_vectors is None: - if is_ft: - context_vectors_vocab = model.wv.syn0_vocab - context_vectors_ngrams = model.wv.syn0_ngrams - else: - context_vectors = model.wv.syn0 - if context_locks is None: - if is_ft: - context_locks_vocab = model.syn0_vocab_lockf - context_locks_ngrams = model.syn0_ngrams_lockf - else: - context_locks = model.syn0_lockf - - if word not in model.wv.vocab: - return - predict_word = model.wv.vocab[word] # target word (NN output) - - if is_ft: - l1_vocab = context_vectors_vocab[context_index[0]] - l1_ngrams = np_sum(context_vectors_ngrams[context_index[1:]], axis=0) - if context_index: - l1 = np_sum([l1_vocab, l1_ngrams], axis=0) / len(context_index) - else: - l1 = context_vectors[context_index] # input word (NN input/projection layer) - lock_factor = context_locks[context_index] - - neu1e = zeros(l1.shape) - - if model.hs: - # work on the entire tree at once, to push as much work into numpy's C routines as possible (performance) - l2a = deepcopy(model.syn1[predict_word.point]) # 2d matrix, codelen x layer1_size - prod_term = dot(l1, l2a.T) - fa = expit(prod_term) # propagate hidden -> output - ga = (1 - predict_word.code - fa) * alpha # vector of error gradients multiplied by the learning rate - if learn_hidden: - model.syn1[predict_word.point] += outer(ga, l1) # learn hidden -> output - neu1e += dot(ga, l2a) # save error - - # loss component corresponding to hierarchical softmax - if compute_loss: - sgn = (-1.0)**predict_word.code # `ch` function, 0 -> 1, 1 -> -1 - lprob = -log(expit(-sgn * prod_term)) - model.running_training_loss += sum(lprob) - - if model.negative: - # use this word (label = 1) + `negative` other random words not from this sentence (label = 0) - word_indices = [predict_word.index] - while len(word_indices) < model.negative + 1: - w = model.cum_table.searchsorted(model.random.randint(model.cum_table[-1])) - if w != predict_word.index: - word_indices.append(w) - l2b = model.syn1neg[word_indices] # 2d matrix, k+1 x layer1_size - prod_term = dot(l1, l2b.T) - fb = expit(prod_term) # propagate hidden -> output - gb = (model.neg_labels - fb) * alpha # vector of error gradients multiplied by the learning rate - if learn_hidden: - model.syn1neg[word_indices] += outer(gb, l1) # learn hidden -> output - neu1e += dot(gb, l2b) # save error - - # loss component corresponding to negative sampling - if compute_loss: - model.running_training_loss -= sum(log(expit(-1 * prod_term[1:]))) # for the sampled words - model.running_training_loss -= log(expit(prod_term[0])) # for the output word - - if learn_vectors: - if is_ft: - 
model.wv.syn0_vocab[context_index[0]] += neu1e * context_locks_vocab[context_index[0]] - for i in context_index[1:]: - model.wv.syn0_ngrams[i] += neu1e * context_locks_ngrams[i] - else: - l1 += neu1e * lock_factor # learn input -> hidden (mutates model.wv.syn0[word2.index], if that is l1) - return neu1e - - -def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True, - compute_loss=False, context_vectors=None, context_locks=None, is_ft=False): - if context_vectors is None: - if is_ft: - context_vectors_vocab = model.wv.syn0_vocab - context_vectors_ngrams = model.wv.syn0_ngrams - else: - context_vectors = model.wv.syn0 - if context_locks is None: - if is_ft: - context_locks_vocab = model.syn0_vocab_lockf - context_locks_ngrams = model.syn0_ngrams_lockf - else: - context_locks = model.syn0_lockf - - neu1e = zeros(l1.shape) - - if model.hs: - l2a = model.syn1[word.point] # 2d matrix, codelen x layer1_size - prod_term = dot(l1, l2a.T) - fa = expit(prod_term) # propagate hidden -> output - ga = (1. - word.code - fa) * alpha # vector of error gradients multiplied by the learning rate - if learn_hidden: - model.syn1[word.point] += outer(ga, l1) # learn hidden -> output - neu1e += dot(ga, l2a) # save error - - # loss component corresponding to hierarchical softmax - if compute_loss: - sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 - model.running_training_loss += sum(-log(expit(-sgn * prod_term))) - - if model.negative: - # use this word (label = 1) + `negative` other random words not from this sentence (label = 0) - word_indices = [word.index] - while len(word_indices) < model.negative + 1: - w = model.cum_table.searchsorted(model.random.randint(model.cum_table[-1])) - if w != word.index: - word_indices.append(w) - l2b = model.syn1neg[word_indices] # 2d matrix, k+1 x layer1_size - prod_term = dot(l1, l2b.T) - fb = expit(prod_term) # propagate hidden -> output - gb = (model.neg_labels - fb) * alpha # vector of error gradients multiplied by the learning rate - if learn_hidden: - model.syn1neg[word_indices] += outer(gb, l1) # learn hidden -> output - neu1e += dot(gb, l2b) # save error - - # loss component corresponding to negative sampling - if compute_loss: - model.running_training_loss -= sum(log(expit(-1 * prod_term[1:]))) # for the sampled words - model.running_training_loss -= log(expit(prod_term[0])) # for the output word - - if learn_vectors: - # learn input -> hidden, here for all words in the window separately - if is_ft: - if not model.cbow_mean and input_word_indices: - neu1e /= (len(input_word_indices[0]) + len(input_word_indices[1])) - for i in input_word_indices[0]: - context_vectors_vocab[i] += neu1e * context_locks_vocab[i] - for i in input_word_indices[1]: - context_vectors_ngrams[i] += neu1e * context_locks_ngrams[i] - else: - if not model.cbow_mean and input_word_indices: - neu1e /= len(input_word_indices) - for i in input_word_indices: - context_vectors[i] += neu1e * context_locks[i] - - return neu1e - - -def score_sg_pair(model, word, word2): - l1 = model.wv.syn0[word2.index] - l2a = deepcopy(model.syn1[word.point]) # 2d matrix, codelen x layer1_size - sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 - lprob = -logaddexp(0, -sgn * dot(l1, l2a.T)) - return sum(lprob) - - -def score_cbow_pair(model, word, l1): - l2a = model.syn1[word.point] # 2d matrix, codelen x layer1_size - sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 - lprob = -logaddexp(0, -sgn * dot(l1, l2a.T)) - return sum(lprob) - - -class Word2Vec(SaveLoad): 
- """ - Class for training, using and evaluating neural networks described in https://code.google.com/p/word2vec/ - - If you're finished training a model (=no more updates, only querying) - then switch to the :mod:`gensim.models.KeyedVectors` instance in wv - - The model can be stored/loaded via its `save()` and `load()` methods, or stored/loaded in a format - compatible with the original word2vec implementation via `wv.save_word2vec_format()` - and `KeyedVectors.load_word2vec_format()`. - - """ - - def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, - max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False): - """ - Initialize the model from an iterable of `sentences`. Each sentence is a - list of words (unicode strings) that will be used for training. - - The `sentences` iterable can be simply a list, but for larger corpora, - consider an iterable that streams the sentences directly from disk/network. - See :class:`BrownCorpus`, :class:`Text8Corpus` or :class:`LineSentence` in - this module for such examples. - - If you don't supply `sentences`, the model is left uninitialized -- use if - you plan to initialize it in some other way. - - `sg` defines the training algorithm. By default (`sg=0`), CBOW is used. - Otherwise (`sg=1`), skip-gram is employed. - - `size` is the dimensionality of the feature vectors. - - `window` is the maximum distance between the current and predicted word within a sentence. - - `alpha` is the initial learning rate (will linearly drop to `min_alpha` as training progresses). - - `seed` = for the random number generator. Initial vectors for each - word are seeded with a hash of the concatenation of word + str(seed). - Note that for a fully deterministically-reproducible run, you must also limit the model to - a single worker thread, to eliminate ordering jitter from OS thread scheduling. (In Python - 3, reproducibility between interpreter launches also requires use of the PYTHONHASHSEED - environment variable to control hash randomization.) - - `min_count` = ignore all words with total frequency lower than this. - - `max_vocab_size` = limit RAM during vocabulary building; if there are more unique - words than this, then prune the infrequent ones. Every 10 million word types - need about 1GB of RAM. Set to `None` for no limit (default). - - `sample` = threshold for configuring which higher-frequency words are randomly downsampled; - default is 1e-3, useful range is (0, 1e-5). - - `workers` = use this many worker threads to train the model (=faster training with multicore machines). - - `hs` = if 1, hierarchical softmax will be used for model training. - If set to 0 (default), and `negative` is non-zero, negative sampling will be used. - - `negative` = if > 0, negative sampling will be used, the int for negative - specifies how many "noise words" should be drawn (usually between 5-20). - Default is 5. If set to 0, no negative samping is used. - - `cbow_mean` = if 0, use the sum of the context word vectors. If 1 (default), use the mean. - Only applies when cbow is used. - - `hashfxn` = hash function to use to randomly initialize weights, for increased - training reproducibility. Default is Python's rudimentary built in hash function. - - `iter` = number of iterations (epochs) over the corpus. Default is 5. 
- - `trim_rule` = vocabulary trimming rule, specifies whether certain words should remain - in the vocabulary, be trimmed away, or handled using the default (discard if word count < min_count). - Can be None (min_count will be used), or a callable that accepts parameters (word, count, min_count) and - returns either `utils.RULE_DISCARD`, `utils.RULE_KEEP` or `utils.RULE_DEFAULT`. - Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part - of the model. - - `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before - assigning word indexes. - - `batch_words` = target size (in words) for batches of examples passed to worker threads (and - thus cython routines). Default is 10000. (Larger batches will be passed if individual - texts are longer than 10000 words, but the standard cython code truncates to that maximum.) - - """ - - self.load = call_on_class_only - - self.initialize_word_vectors() - self.sg = int(sg) - self.cum_table = None # for negative sampling - self.vector_size = int(size) - self.layer1_size = int(size) - if size % 4 != 0: - logger.warning("consider setting layer size to a multiple of 4 for greater performance") - self.alpha = float(alpha) - self.min_alpha_yet_reached = float(alpha) # To warn user if alpha increases - self.window = int(window) - self.max_vocab_size = max_vocab_size - self.seed = seed - self.random = random.RandomState(seed) - self.min_count = min_count - self.sample = sample - self.workers = int(workers) - self.min_alpha = float(min_alpha) - self.hs = hs - self.negative = negative - self.cbow_mean = int(cbow_mean) - self.hashfxn = hashfxn - self.iter = iter - self.null_word = null_word - self.train_count = 0 - self.total_train_time = 0 - self.sorted_vocab = sorted_vocab - self.batch_words = batch_words - self.model_trimmed_post_training = False - self.compute_loss = compute_loss - self.running_training_loss = 0 - if sentences is not None: - if isinstance(sentences, GeneratorType): - raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.") - self.build_vocab(sentences, trim_rule=trim_rule) - self.train( - sentences, total_examples=self.corpus_count, epochs=self.iter, - start_alpha=self.alpha, end_alpha=self.min_alpha - ) - else: - if trim_rule is not None: - logger.warning( - "The rule, if given, is only used to prune vocabulary during build_vocab() " - "and is not stored as part of the model. Model initialized without sentences. " - "trim_rule provided, if any, will be ignored." - ) - - def initialize_word_vectors(self): - self.wv = KeyedVectors() - - def make_cum_table(self, power=0.75, domain=2**31 - 1): - """ - Create a cumulative-distribution table using stored vocabulary word counts for - drawing random words in the negative-sampling training routines. - - To draw a word index, choose a random integer up to the maximum value in the - table (cum_table[-1]), then finding that integer's sorted insertion point - (as if by bisect_left or ndarray.searchsorted()). That insertion point is the - drawn index, coming up in proportion equal to the increment at that slot. - - Called internally from 'build_vocab()'. 
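# [Editor's sketch] A toy illustration of the draw described above: make_cum_table() stores
# cumulative (count**0.75-weighted) totals, and a negative sample is the searchsorted()
# insertion point of a random integer below cum_table[-1]. The three-entry table is invented.
import numpy as np

cum_table = np.array([600, 900, 1000], dtype=np.uint32)  # cumulative scaled counts for 3 words
rng = np.random.RandomState(1)
drawn_index = cum_table.searchsorted(rng.randint(cum_table[-1]))
# word 0 is drawn roughly 60% of the time, word 1 roughly 30%, word 2 roughly 10%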
- """ - vocab_size = len(self.wv.index2word) - self.cum_table = zeros(vocab_size, dtype=uint32) - # compute sum of all power (Z in paper) - train_words_pow = 0.0 - for word_index in range(vocab_size): - train_words_pow += self.wv.vocab[self.wv.index2word[word_index]].count**power - cumulative = 0.0 - for word_index in range(vocab_size): - cumulative += self.wv.vocab[self.wv.index2word[word_index]].count**power - self.cum_table[word_index] = round(cumulative / train_words_pow * domain) - if len(self.cum_table) > 0: - assert self.cum_table[-1] == domain - - def create_binary_tree(self): - """ - Create a binary Huffman tree using stored vocabulary word counts. Frequent words - will have shorter binary codes. Called internally from `build_vocab()`. - - """ - logger.info("constructing a huffman tree from %i words", len(self.wv.vocab)) - - # build the huffman tree - heap = list(itervalues(self.wv.vocab)) - heapq.heapify(heap) - for i in range(len(self.wv.vocab) - 1): - min1, min2 = heapq.heappop(heap), heapq.heappop(heap) - heapq.heappush( - heap, Vocab(count=min1.count + min2.count, index=i + len(self.wv.vocab), left=min1, right=min2) - ) - - # recurse over the tree, assigning a binary code to each vocabulary word - if heap: - max_depth, stack = 0, [(heap[0], [], [])] - while stack: - node, codes, points = stack.pop() - if node.index < len(self.wv.vocab): - # leaf node => store its path from the root - node.code, node.point = codes, points - max_depth = max(len(codes), max_depth) - else: - # inner node => continue recursion - points = array(list(points) + [node.index - len(self.wv.vocab)], dtype=uint32) - stack.append((node.left, array(list(codes) + [0], dtype=uint8), points)) - stack.append((node.right, array(list(codes) + [1], dtype=uint8), points)) - - logger.info("built huffman tree with maximum node depth %i", max_depth) - - def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_per=10000, update=False): - """ - Build vocabulary from a sequence of sentences (can be a once-only generator stream). - Each sentence must be a list of unicode strings. - """ - self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule) # initial survey - # trim by min_count & precalculate downsampling - self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) - self.finalize_vocab(update=update) # build tables & arrays - - def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False): - """ - Build vocabulary from a dictionary of word frequencies. - Build model vocabulary from a passed dictionary that contains (word,word count). - Words must be of type unicode strings. - - Parameters - ---------- - `word_freq` : dict - Word,Word_Count dictionary. - `keep_raw_vocab` : bool - If not true, delete the raw vocabulary after the scaling is done and free up RAM. - `corpus_count`: int - Even if no corpus is provided, this argument can set corpus_count explicitly. - `trim_rule` = vocabulary trimming rule, specifies whether certain words should remain - in the vocabulary, be trimmed away, or handled using the default (discard if word count < min_count). - Can be None (min_count will be used), or a callable that accepts parameters (word, count, min_count) and - returns either `utils.RULE_DISCARD`, `utils.RULE_KEEP` or `utils.RULE_DEFAULT`. - `update`: bool - If true, the new provided words in `word_freq` dict will be added to model's vocab. - - Returns - -------- - None - - Examples - -------- - - .. 
sourcecode:: pycon - - >>> from gensim.models.word2vec import Word2Vec - >>> model = Word2Vec() - >>> model.build_vocab_from_freq({"Word1": 15, "Word2": 20}) - - """ - logger.info("Processing provided word frequencies") - # Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) - # to be directly the raw vocab - raw_vocab = word_freq - logger.info( - "collected %i different raw word, with total frequency of %i", - len(raw_vocab), sum(itervalues(raw_vocab)) - ) - - # Since no sentences are provided, this is to control the corpus_count - self.corpus_count = corpus_count if corpus_count else 0 - self.raw_vocab = raw_vocab - - # trim by min_count & precalculate downsampling - self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) - self.finalize_vocab(update=update) # build tables & arrays - - def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): - """Do an initial scan of all words appearing in sentences.""" - logger.info("collecting all words and their counts") - sentence_no = -1 - total_words = 0 - min_reduce = 1 - vocab = defaultdict(int) - checked_string_types = 0 - for sentence_no, sentence in enumerate(sentences): - if not checked_string_types: - if isinstance(sentence, string_types): - logger.warning( - "Each 'sentences' item should be a list of words (usually unicode strings). " - "First item here is instead plain %s.", - type(sentence) - ) - checked_string_types += 1 - if sentence_no % progress_per == 0: - logger.info( - "PROGRESS: at sentence #%i, processed %i words, keeping %i word types", - sentence_no, total_words, len(vocab) - ) - for word in sentence: - vocab[word] += 1 - total_words += len(sentence) - - if self.max_vocab_size and len(vocab) > self.max_vocab_size: - utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) - min_reduce += 1 - - logger.info( - "collected %i word types from a corpus of %i raw words and %i sentences", - len(vocab), total_words, sentence_no + 1 - ) - self.corpus_count = sentence_no + 1 - self.raw_vocab = vocab - return total_words - - def scale_vocab(self, min_count=None, sample=None, dry_run=False, - keep_raw_vocab=False, trim_rule=None, update=False): - """ - Apply vocabulary settings for `min_count` (discarding less-frequent words) - and `sample` (controlling the downsampling of more-frequent words). - - Calling with `dry_run=True` will only simulate the provided settings and - report the size of the retained vocabulary, effective corpus length, and - estimated memory requirements. Results are both printed via logging and - returned as a dict. - - Delete the raw vocabulary after the scaling is done to free up RAM, - unless `keep_raw_vocab` is set. 
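# [Editor's sketch] The dry_run flag described above can preview the effect of min_count/sample
# settings without modifying the model. The two tiny sentences are invented, and scan_vocab()
# is called directly only to populate the raw counts that the simulation needs.
from gensim.models.word2vec import Word2Vec

model = Word2Vec(size=20, min_count=1)  # no sentences: model left uninitialized
model.scan_vocab([["cat", "dog", "cat"], ["dog", "bird"]])
report = model.scale_vocab(min_count=2, dry_run=True)
# report contains 'drop_unique', 'retain_total', 'downsample_unique', 'downsample_total', 'memory'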
- - """ - min_count = min_count or self.min_count - sample = sample or self.sample - drop_total = drop_unique = 0 - - if not update: - logger.info("Loading a fresh vocabulary") - retain_total, retain_words = 0, [] - # Discard words less-frequent than min_count - if not dry_run: - self.wv.index2word = [] - # make stored settings match these applied settings - self.min_count = min_count - self.sample = sample - self.wv.vocab = {} - - for word, v in iteritems(self.raw_vocab): - if keep_vocab_item(word, v, min_count, trim_rule=trim_rule): - retain_words.append(word) - retain_total += v - if not dry_run: - self.wv.vocab[word] = Vocab(count=v, index=len(self.wv.index2word)) - self.wv.index2word.append(word) - else: - drop_unique += 1 - drop_total += v - original_unique_total = len(retain_words) + drop_unique - retain_unique_pct = len(retain_words) * 100 / max(original_unique_total, 1) - logger.info( - "min_count=%d retains %i unique words (%i%% of original %i, drops %i)", - min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique - ) - original_total = retain_total + drop_total - retain_pct = retain_total * 100 / max(original_total, 1) - logger.info( - "min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)", - min_count, retain_total, retain_pct, original_total, drop_total - ) - else: - logger.info("Updating model with new vocabulary") - new_total = pre_exist_total = 0 - new_words = pre_exist_words = [] - for word, v in iteritems(self.raw_vocab): - if keep_vocab_item(word, v, min_count, trim_rule=trim_rule): - if word in self.wv.vocab: - pre_exist_words.append(word) - pre_exist_total += v - if not dry_run: - self.wv.vocab[word].count += v - else: - new_words.append(word) - new_total += v - if not dry_run: - self.wv.vocab[word] = Vocab(count=v, index=len(self.wv.index2word)) - self.wv.index2word.append(word) - else: - drop_unique += 1 - drop_total += v - original_unique_total = len(pre_exist_words) + len(new_words) + drop_unique - pre_exist_unique_pct = len(pre_exist_words) * 100 / max(original_unique_total, 1) - new_unique_pct = len(new_words) * 100 / max(original_unique_total, 1) - logger.info( - "New added %i unique words (%i%% of original %i) " - "and increased the count of %i pre-existing words (%i%% of original %i)", - len(new_words), new_unique_pct, original_unique_total, len(pre_exist_words), - pre_exist_unique_pct, original_unique_total - ) - retain_words = new_words + pre_exist_words - retain_total = new_total + pre_exist_total - - # Precalculate each vocabulary item's threshold for sampling - if not sample: - # no words downsampled - threshold_count = retain_total - elif sample < 1.0: - # traditional meaning: set parameter as proportion of total - threshold_count = sample * retain_total - else: - # new shorthand: sample >= 1 means downsample all words with higher count than sample - threshold_count = int(sample * (3 + sqrt(5)) / 2) - - downsample_total, downsample_unique = 0, 0 - for w in retain_words: - v = self.raw_vocab[w] - word_probability = (sqrt(v / threshold_count) + 1) * (threshold_count / v) - if word_probability < 1.0: - downsample_unique += 1 - downsample_total += word_probability * v - else: - word_probability = 1.0 - downsample_total += v - if not dry_run: - self.wv.vocab[w].sample_int = int(round(word_probability * 2**32)) - - if not dry_run and not keep_raw_vocab: - logger.info("deleting the raw counts dictionary of %i items", len(self.raw_vocab)) - self.raw_vocab = defaultdict(int) - - logger.info("sample=%g downsamples %i 
most-common words", sample, downsample_unique) - logger.info( - "downsampling leaves estimated %i word corpus (%.1f%% of prior %i)", - downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total - ) - - # return from each step: words-affected, resulting-corpus-size, extra memory estimates - report_values = { - 'drop_unique': drop_unique, 'retain_total': retain_total, 'downsample_unique': downsample_unique, - 'downsample_total': int(downsample_total), 'memory': self.estimate_memory(vocab_size=len(retain_words)) - } - - return report_values - - def finalize_vocab(self, update=False): - """Build tables and model weights based on final vocabulary settings.""" - if not self.wv.index2word: - self.scale_vocab() - if self.sorted_vocab and not update: - self.sort_vocab() - if self.hs: - # add info about each word's Huffman encoding - self.create_binary_tree() - if self.negative: - # build the table for drawing random words (for negative sampling) - self.make_cum_table() - if self.null_word: - # create null pseudo-word for padding when using concatenative L1 (run-of-words) - # this word is only ever input – never predicted – so count, huffman-point, etc doesn't matter - word, v = '\0', Vocab(count=1, sample_int=0) - v.index = len(self.wv.vocab) - self.wv.index2word.append(word) - self.wv.vocab[word] = v - # set initial input/projection and hidden weights - if not update: - self.reset_weights() - else: - self.update_weights() - - def sort_vocab(self): - """Sort the vocabulary so the most frequent words have the lowest indexes.""" - if len(self.wv.syn0): - raise RuntimeError("cannot sort vocabulary after model weights already initialized.") - self.wv.index2word.sort(key=lambda word: self.wv.vocab[word].count, reverse=True) - for i, word in enumerate(self.wv.index2word): - self.wv.vocab[word].index = i - - def reset_from(self, other_model): - """ - Borrow shareable pre-built structures (like vocab) from the other_model. Useful - if testing multiple models in parallel on the same corpus. - """ - self.wv.vocab = other_model.wv.vocab - self.wv.index2word = other_model.wv.index2word - self.cum_table = other_model.cum_table - self.corpus_count = other_model.corpus_count - self.reset_weights() - - def _do_train_job(self, sentences, alpha, inits): - """ - Train a single batch of sentences. Return 2-tuple `(effective word count after - ignoring unknown words and sentence length trimming, total word count)`. - """ - work, neu1 = inits - tally = 0 - if self.sg: - tally += train_batch_sg(self, sentences, alpha, work, self.compute_loss) - else: - tally += train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss) - return tally, self._raw_word_count(sentences) - - def _raw_word_count(self, job): - """Return the number of words in a given job.""" - return sum(len(sentence) for sentence in job) - - def train(self, sentences, total_examples=None, total_words=None, - epochs=None, start_alpha=None, end_alpha=None, word_count=0, - queue_factor=2, report_delay=1.0, compute_loss=None): - """ - Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). - For Word2Vec, each sentence must be a list of unicode strings. (Subclasses may accept other examples.) - - To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate - progres-percentage logging, either total_examples (count of sentences) or total_words (count of - raw words in sentences) MUST be provided. 
(If the corpus is the same as was provided to - `build_vocab()`, the count of examples in that corpus will be available in the model's - `corpus_count` property.) - - To avoid common mistakes around the model's ability to do multiple training passes itself, an - explicit `epochs` argument MUST be provided. In the common and recommended case, where `train()` - is only called once, the model's cached `iter` value should be supplied as `epochs` value. - """ - if self.model_trimmed_post_training: - raise RuntimeError("Parameters for training were discarded using model_trimmed_post_training method") - - if compute_loss: - self.compute_loss = compute_loss - self.running_training_loss = 0 - - logger.info( - "training model with %i workers on %i vocabulary and %i features, " - "using sg=%s hs=%s sample=%s negative=%s window=%s", - self.workers, len(self.wv.vocab), self.layer1_size, self.sg, - self.hs, self.sample, self.negative, self.window - ) - - if not self.wv.vocab: - raise RuntimeError("you must first build vocabulary before training the model") - if not len(self.wv.syn0): - raise RuntimeError("you must first finalize vocabulary before training the model") - - if not hasattr(self, 'corpus_count'): - raise ValueError( - "The number of sentences in the training corpus is missing. " - "Did you load the model via KeyedVectors.load_word2vec_format?" - "Models loaded via load_word2vec_format don't support further training. " - "Instead start with a blank model, scan_vocab on the new corpus, " - "intersect_word2vec_format with the old model, then train." - ) - - if total_words is None and total_examples is None: - raise ValueError( - "You must specify either total_examples or total_words, for proper alpha and progress calculations. " - "The usual value is total_examples=model.corpus_count." - ) - if epochs is None: - raise ValueError("You must specify an explict epochs count. The usual value is epochs=model.iter.") - start_alpha = start_alpha or self.alpha - end_alpha = end_alpha or self.min_alpha - - job_tally = 0 - - if epochs > 1: - sentences = utils.RepeatCorpusNTimes(sentences, epochs) - total_words = total_words and total_words * epochs - total_examples = total_examples and total_examples * epochs - - def worker_loop(): - """Train the model, lifting lists of sentences from the job_queue.""" - work = matutils.zeros_aligned(self.layer1_size, dtype=REAL) # per-thread private work memory - neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) - jobs_processed = 0 - while True: - job = job_queue.get() - if job is None: - progress_queue.put(None) - break # no more jobs => quit this worker - sentences, alpha = job - tally, raw_tally = self._do_train_job(sentences, alpha, (work, neu1)) - progress_queue.put((len(sentences), tally, raw_tally)) # report back progress - jobs_processed += 1 - logger.debug("worker exiting, processed %i jobs", jobs_processed) - - def job_producer(): - """Fill jobs queue using the input `sentences` iterator.""" - job_batch, batch_size = [], 0 - pushed_words, pushed_examples = 0, 0 - next_alpha = start_alpha - if next_alpha > self.min_alpha_yet_reached: - logger.warning("Effective 'alpha' higher than previous training cycles") - self.min_alpha_yet_reached = next_alpha - job_no = 0 - - for sent_idx, sentence in enumerate(sentences): - sentence_length = self._raw_word_count([sentence]) - - # can we fit this sentence into the existing job batch? 
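# [Editor's sketch] The two-step workflow required by the train() docstring above, for a model
# built without sentences: pass an explicit epochs count plus total_examples (or total_words).
# Toy corpus invented for illustration.
from gensim.models.word2vec import Word2Vec

sentences = [["human", "interface", "computer"], ["survey", "user", "computer", "system"]]
model = Word2Vec(size=20, min_count=1)  # defer training
model.build_vocab(sentences)
model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)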
- if batch_size + sentence_length <= self.batch_words: - # yes => add it to the current job - job_batch.append(sentence) - batch_size += sentence_length - else: - # no => submit the existing job - logger.debug( - "queueing job #%i (%i words, %i sentences) at alpha %.05f", - job_no, batch_size, len(job_batch), next_alpha - ) - job_no += 1 - job_queue.put((job_batch, next_alpha)) - - # update the learning rate for the next job - if end_alpha < next_alpha: - if total_examples: - # examples-based decay - pushed_examples += len(job_batch) - progress = 1.0 * pushed_examples / total_examples - else: - # words-based decay - pushed_words += self._raw_word_count(job_batch) - progress = 1.0 * pushed_words / total_words - next_alpha = start_alpha - (start_alpha - end_alpha) * progress - next_alpha = max(end_alpha, next_alpha) - - # add the sentence that didn't fit as the first item of a new job - job_batch, batch_size = [sentence], sentence_length - - # add the last job too (may be significantly smaller than batch_words) - if job_batch: - logger.debug( - "queueing job #%i (%i words, %i sentences) at alpha %.05f", - job_no, batch_size, len(job_batch), next_alpha - ) - job_no += 1 - job_queue.put((job_batch, next_alpha)) - - if job_no == 0 and self.train_count == 0: - logger.warning( - "train() called with an empty iterator (if not intended, " - "be sure to provide a corpus that offers restartable iteration = an iterable)." - ) - - # give the workers heads up that they can finish -- no more work! - for _ in range(self.workers): - job_queue.put(None) - logger.debug("job loop exiting, total %i jobs", job_no) - - # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :( - job_queue = Queue(maxsize=queue_factor * self.workers) - progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers) - - workers = [threading.Thread(target=worker_loop) for _ in range(self.workers)] - unfinished_worker_count = len(workers) - workers.append(threading.Thread(target=job_producer)) - - for thread in workers: - thread.daemon = True # make interrupting the process with ctrl+c easier - thread.start() - - example_count, trained_word_count, raw_word_count = 0, 0, word_count - start, next_report = default_timer() - 0.00001, 1.0 - - while unfinished_worker_count > 0: - report = progress_queue.get() # blocks if workers too slow - if report is None: # a thread reporting that it finished - unfinished_worker_count -= 1 - logger.info("worker thread finished; awaiting finish of %i more threads", unfinished_worker_count) - continue - examples, trained_words, raw_words = report - job_tally += 1 - - # update progress stats - example_count += examples - trained_word_count += trained_words # only words in vocab & sampled - raw_word_count += raw_words - - # log progress once every report_delay seconds - elapsed = default_timer() - start - if elapsed >= next_report: - if total_examples: - # examples-based progress % - logger.info( - "PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i", - 100.0 * example_count / total_examples, trained_word_count / elapsed, - utils.qsize(job_queue), utils.qsize(progress_queue) - ) - else: - # words-based progress % - logger.info( - "PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i", - 100.0 * raw_word_count / total_words, trained_word_count / elapsed, - utils.qsize(job_queue), utils.qsize(progress_queue) - ) - next_report = elapsed + report_delay - - # all done; report the final stats - elapsed = default_timer() - start - 
logger.info( - "training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s", - raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed - ) - if job_tally < 10 * self.workers: - logger.warning( - "under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay" - ) - - # check that the input corpus hasn't changed during iteration - if total_examples and total_examples != example_count: - logger.warning( - "supplied example count (%i) did not equal expected count (%i)", example_count, total_examples - ) - if total_words and total_words != raw_word_count: - logger.warning( - "supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words - ) - - self.train_count += 1 # number of times train() has been called - self.total_train_time += elapsed - self.clear_sims() - return trained_word_count - - # basics copied from the train() function - def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor=2, report_delay=1): - """ - Score the log probability for a sequence of sentences (can be a once-only generator stream). - Each sentence must be a list of unicode strings. - This does not change the fitted model in any way (see Word2Vec.train() for that). - - We have currently only implemented score for the hierarchical softmax scheme, - so you need to have run word2vec with hs=1 and negative=0 for this to work. - - Note that you should specify total_sentences; we'll run into problems if you ask to - score more than this number of sentences but it is inefficient to set the value too high. - - See the article by [#taddy]_ and the gensim demo at [#deepir]_ for examples of - how to use such scores in document classification. - - .. [#taddy] Taddy, Matt. Document Classification by Inversion of Distributed Language Representations, - in Proceedings of the 2015 Conference of the Association of Computational Linguistics. - .. [#deepir] https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/deepir.ipynb - - """ - logger.info( - "scoring sentences with %i workers on %i vocabulary and %i features, " - "using sg=%s hs=%s sample=%s and negative=%s", - self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative - ) - - if not self.wv.vocab: - raise RuntimeError("you must first build vocabulary before scoring new data") - - if not self.hs: - raise RuntimeError( - "We have currently only implemented score for the hierarchical softmax scheme, " - "so you need to have run word2vec with hs=1 and negative=0 for this to work." - ) - - def worker_loop(): - """Compute log probability for each sentence, lifting lists of sentences from the jobs queue.""" - work = zeros(1, dtype=REAL) # for sg hs, we actually only need one memory loc (running sum) - neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) - while True: - job = job_queue.get() - if job is None: # signal to finish - break - ns = 0 - for sentence_id, sentence in job: - if sentence_id >= total_sentences: - break - if self.sg: - score = score_sentence_sg(self, sentence, work) - else: - score = score_sentence_cbow(self, sentence, work, neu1) - sentence_scores[sentence_id] = score - ns += 1 - progress_queue.put(ns) # report progress - - start, next_report = default_timer(), 1.0 - # buffer ahead only a limited number of jobs.. 
this is the reason we can't simply use ThreadPool :( - job_queue = Queue(maxsize=queue_factor * self.workers) - progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers) - - workers = [threading.Thread(target=worker_loop) for _ in range(self.workers)] - for thread in workers: - thread.daemon = True # make interrupting the process with ctrl+c easier - thread.start() - - sentence_count = 0 - sentence_scores = matutils.zeros_aligned(total_sentences, dtype=REAL) - - push_done = False - done_jobs = 0 - jobs_source = enumerate(utils.grouper(enumerate(sentences), chunksize)) - - # fill jobs queue with (id, sentence) job items - while True: - try: - job_no, items = next(jobs_source) - if (job_no - 1) * chunksize > total_sentences: - logger.warning( - "terminating after %i sentences (set higher total_sentences if you want more).", - total_sentences - ) - job_no -= 1 - raise StopIteration() - logger.debug("putting job #%i in the queue", job_no) - job_queue.put(items) - except StopIteration: - logger.info("reached end of input; waiting to finish %i outstanding jobs", job_no - done_jobs + 1) - for _ in range(self.workers): - job_queue.put(None) # give the workers heads up that they can finish -- no more work! - push_done = True - try: - while done_jobs < (job_no + 1) or not push_done: - ns = progress_queue.get(push_done) # only block after all jobs pushed - sentence_count += ns - done_jobs += 1 - elapsed = default_timer() - start - if elapsed >= next_report: - logger.info( - "PROGRESS: at %.2f%% sentences, %.0f sentences/s", - 100.0 * sentence_count, sentence_count / elapsed - ) - next_report = elapsed + report_delay # don't flood log, wait report_delay seconds - else: - # loop ended by job count; really done - break - except Empty: - pass # already out of loop; continue to next push - - elapsed = default_timer() - start - self.clear_sims() - logger.info( - "scoring %i sentences took %.1fs, %.0f sentences/s", - sentence_count, elapsed, sentence_count / elapsed - ) - return sentence_scores[:sentence_count] - - def clear_sims(self): - """ - Removes all L2-normalized vectors for words from the model. - You will have to recompute them using init_sims method. - """ - - self.wv.syn0norm = None - - def update_weights(self): - """ - Copy all the existing weights, and reset the weights for the newly - added vocabulary. - """ - logger.info("updating layer weights") - gained_vocab = len(self.wv.vocab) - len(self.wv.syn0) - newsyn0 = empty((gained_vocab, self.vector_size), dtype=REAL) - - # randomize the remaining words - for i in range(len(self.wv.syn0), len(self.wv.vocab)): - # construct deterministic seed from word AND seed argument - newsyn0[i - len(self.wv.syn0)] = self.seeded_vector(self.wv.index2word[i] + str(self.seed)) - - # Raise an error if an online update is run before initial training on a corpus - if not len(self.wv.syn0): - raise RuntimeError( - "You cannot do an online vocabulary-update of a model which has no prior vocabulary. " - "First build the vocabulary of your model with a corpus before doing an online update." 
- ) - - self.wv.syn0 = vstack([self.wv.syn0, newsyn0]) - - if self.hs: - self.syn1 = vstack([self.syn1, zeros((gained_vocab, self.layer1_size), dtype=REAL)]) - if self.negative: - self.syn1neg = vstack([self.syn1neg, zeros((gained_vocab, self.layer1_size), dtype=REAL)]) - self.wv.syn0norm = None - - # do not suppress learning for already learned words - self.syn0_lockf = ones(len(self.wv.vocab), dtype=REAL) # zeros suppress learning - - def reset_weights(self): - """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary.""" - logger.info("resetting layer weights") - self.wv.syn0 = empty((len(self.wv.vocab), self.vector_size), dtype=REAL) - # randomize weights vector by vector, rather than materializing a huge random matrix in RAM at once - for i in range(len(self.wv.vocab)): - # construct deterministic seed from word AND seed argument - self.wv.syn0[i] = self.seeded_vector(self.wv.index2word[i] + str(self.seed)) - if self.hs: - self.syn1 = zeros((len(self.wv.vocab), self.layer1_size), dtype=REAL) - if self.negative: - self.syn1neg = zeros((len(self.wv.vocab), self.layer1_size), dtype=REAL) - self.wv.syn0norm = None - - self.syn0_lockf = ones(len(self.wv.vocab), dtype=REAL) # zeros suppress learning - - def seeded_vector(self, seed_string): - """Create one 'random' vector (but deterministic by seed_string)""" - # Note: built-in hash() may vary by Python version or even (in Py3.x) per launch - once = random.RandomState(self.hashfxn(seed_string) & 0xffffffff) - return (once.rand(self.vector_size) - 0.5) / self.vector_size - - def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='utf8', unicode_errors='strict'): - """ - Merge the input-hidden weight matrix from the original C word2vec-tool format - given, where it intersects with the current vocabulary. (No words are added to the - existing vocabulary, but intersecting words adopt the file's weights, and - non-intersecting words are left alone.) - - `binary` is a boolean indicating whether the data is in binary word2vec format. - - `lockf` is a lock-factor value to be set for any imported word-vectors; the - default value of 0.0 prevents further updating of the vector during subsequent - training. Use 1.0 to allow further training updates of merged vectors. - """ - overlap_count = 0 - logger.info("loading projection weights from %s", fname) - with utils.open(fname, 'rb') as fin: - header = utils.to_unicode(fin.readline(), encoding=encoding) - vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format - if not vector_size == self.vector_size: - raise ValueError("incompatible vector size %d in file %s" % (vector_size, fname)) - # TOCONSIDER: maybe mismatched vectors still useful enough to merge (truncating/padding)? 
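# [Editor's sketch] Hypothetical use of the merge behaviour documented above. It assumes `model`
# is a Word2Vec whose vocabulary is already built, and that 'pretrained-vectors.bin' (an invented
# path) stores vectors of the same size. lockf=1.0 lets the merged vectors keep updating in any
# further training, while the default 0.0 freezes them.
model.intersect_word2vec_format('pretrained-vectors.bin', binary=True, lockf=1.0)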
- if binary: - binary_len = dtype(REAL).itemsize * vector_size - for _ in range(vocab_size): - # mixed text and binary: read text first, then binary - word = [] - while True: - ch = fin.read(1) - if ch == b' ': - break - if ch != b'\n': # ignore newlines in front of words (some binary files have) - word.append(ch) - word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors) - weights = fromstring(fin.read(binary_len), dtype=REAL) - if word in self.wv.vocab: - overlap_count += 1 - self.wv.syn0[self.wv.vocab[word].index] = weights - self.syn0_lockf[self.wv.vocab[word].index] = lockf # lock-factor: 0.0 stops further changes - else: - for line_no, line in enumerate(fin): - parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ") - if len(parts) != vector_size + 1: - raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no) - word, weights = parts[0], [REAL(x) for x in parts[1:]] - if word in self.wv.vocab: - overlap_count += 1 - self.wv.syn0[self.wv.vocab[word].index] = weights - self.syn0_lockf[self.wv.vocab[word].index] = lockf # lock-factor: 0.0 stops further changes - logger.info("merged %d vectors into %s matrix from %s", overlap_count, self.wv.syn0.shape, fname) - - def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None): - """ - Deprecated. Use self.wv.most_similar() instead. - Refer to the documentation for `gensim.models.KeyedVectors.most_similar` - """ - return self.wv.most_similar(positive, negative, topn, restrict_vocab, indexer) - - def wmdistance(self, document1, document2): - """ - Deprecated. Use self.wv.wmdistance() instead. - Refer to the documentation for `gensim.models.KeyedVectors.wmdistance` - """ - return self.wv.wmdistance(document1, document2) - - def most_similar_cosmul(self, positive=None, negative=None, topn=10): - """ - Deprecated. Use self.wv.most_similar_cosmul() instead. - Refer to the documentation for `gensim.models.KeyedVectors.most_similar_cosmul` - """ - return self.wv.most_similar_cosmul(positive, negative, topn) - - def similar_by_word(self, word, topn=10, restrict_vocab=None): - """ - Deprecated. Use self.wv.similar_by_word() instead. - Refer to the documentation for `gensim.models.KeyedVectors.similar_by_word` - """ - return self.wv.similar_by_word(word, topn, restrict_vocab) - - def similar_by_vector(self, vector, topn=10, restrict_vocab=None): - """ - Deprecated. Use self.wv.similar_by_vector() instead. - Refer to the documentation for `gensim.models.KeyedVectors.similar_by_vector` - """ - return self.wv.similar_by_vector(vector, topn, restrict_vocab) - - def doesnt_match(self, words): - """ - Deprecated. Use self.wv.doesnt_match() instead. - Refer to the documentation for `gensim.models.KeyedVectors.doesnt_match` - """ - return self.wv.doesnt_match(words) - - def __getitem__(self, words): - """ - Deprecated. Use self.wv.__getitem__() instead. - Refer to the documentation for `gensim.models.KeyedVectors.__getitem__` - """ - return self.wv.__getitem__(words) - - def __contains__(self, word): - """ - Deprecated. Use self.wv.__contains__() instead. - Refer to the documentation for `gensim.models.KeyedVectors.__contains__` - """ - return self.wv.__contains__(word) - - def similarity(self, w1, w2): - """ - Deprecated. Use self.wv.similarity() instead. - Refer to the documentation for `gensim.models.KeyedVectors.similarity` - """ - return self.wv.similarity(w1, w2) - - def n_similarity(self, ws1, ws2): - """ - Deprecated. 
Use self.wv.n_similarity() instead. - Refer to the documentation for `gensim.models.KeyedVectors.n_similarity` - """ - return self.wv.n_similarity(ws1, ws2) - - def predict_output_word(self, context_words_list, topn=10): - """Report the probability distribution of the center word given the context words - as input to the trained model.""" - if not self.negative: - raise RuntimeError( - "We have currently only implemented predict_output_word for the negative sampling scheme, " - "so you need to have run word2vec with negative > 0 for this to work." - ) - - if not hasattr(self.wv, 'syn0') or not hasattr(self, 'syn1neg'): - raise RuntimeError("Parameters required for predicting the output words not found.") - - word_vocabs = [self.wv.vocab[w] for w in context_words_list if w in self.wv.vocab] - if not word_vocabs: - warnings.warn("All the input context words are out-of-vocabulary for the current model.") - return None - - word2_indices = [word.index for word in word_vocabs] - - l1 = np_sum(self.wv.syn0[word2_indices], axis=0) - if word2_indices and self.cbow_mean: - l1 /= len(word2_indices) - - prob_values = exp(dot(l1, self.syn1neg.T)) # propagate hidden -> output and take softmax to get probabilities - prob_values /= sum(prob_values) - top_indices = matutils.argsort(prob_values, topn=topn, reverse=True) - # returning the most probable output words with their probabilities - return [(self.wv.index2word[index1], prob_values[index1]) for index1 in top_indices] - - def init_sims(self, replace=False): - """ - init_sims() resides in KeyedVectors because it deals with syn0 mainly, but because syn1 is not an attribute - of KeyedVectors, it has to be deleted in this class, and the normalizing of syn0 happens inside of KeyedVectors - """ - if replace and hasattr(self, 'syn1'): - del self.syn1 - return self.wv.init_sims(replace) - - def estimate_memory(self, vocab_size=None, report=None): - """Estimate required memory for a model using current settings and provided vocabulary size.""" - vocab_size = vocab_size or len(self.wv.vocab) - report = report or {} - report['vocab'] = vocab_size * (700 if self.hs else 500) - report['syn0'] = vocab_size * self.vector_size * dtype(REAL).itemsize - if self.hs: - report['syn1'] = vocab_size * self.layer1_size * dtype(REAL).itemsize - if self.negative: - report['syn1neg'] = vocab_size * self.layer1_size * dtype(REAL).itemsize - report['total'] = sum(report.values()) - logger.info( - "estimated required memory for %i words and %i dimensions: %i bytes", - vocab_size, self.vector_size, report['total'] - ) - return report - - @staticmethod - def log_accuracy(section): - return KeyedVectors.log_accuracy(section) - - def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_insensitive=True): - most_similar = most_similar or KeyedVectors.most_similar - return self.wv.accuracy(questions, restrict_vocab, most_similar, case_insensitive) - - @staticmethod - def log_evaluate_word_pairs(pearson, spearman, oov, pairs): - """ - Deprecated. Use self.wv.log_evaluate_word_pairs() instead. - Refer to the documentation for `gensim.models.KeyedVectors.log_evaluate_word_pairs` - """ - return KeyedVectors.log_evaluate_word_pairs(pearson, spearman, oov, pairs) - - def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, - case_insensitive=True, dummy4unknown=False): - """ - Deprecated. Use self.wv.evaluate_word_pairs() instead. 
- Refer to the documentation for `gensim.models.KeyedVectors.evaluate_word_pairs` - """ - return self.wv.evaluate_word_pairs(pairs, delimiter, restrict_vocab, case_insensitive, dummy4unknown) - - def __str__(self): - return "%s(vocab=%s, size=%s, alpha=%s)" % ( - self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha - ) - - def _minimize_model(self, save_syn1=False, save_syn1neg=False, save_syn0_lockf=False): - warnings.warn( - "This method would be deprecated in the future. " - "Keep just_word_vectors = model.wv to retain just the KeyedVectors instance " - "for read-only querying of word vectors." - ) - if save_syn1 and save_syn1neg and save_syn0_lockf: - return - if hasattr(self, 'syn1') and not save_syn1: - del self.syn1 - if hasattr(self, 'syn1neg') and not save_syn1neg: - del self.syn1neg - if hasattr(self, 'syn0_lockf') and not save_syn0_lockf: - del self.syn0_lockf - self.model_trimmed_post_training = True - - def delete_temporary_training_data(self, replace_word_vectors_with_normalized=False): - """ - Discard parameters that are used in training and score. Use if you're sure you're done training a model. - If `replace_word_vectors_with_normalized` is set, forget the original vectors and only keep the normalized - ones = saves lots of memory! - """ - if replace_word_vectors_with_normalized: - self.init_sims(replace=True) - self._minimize_model() - - def save(self, *args, **kwargs): - # don't bother storing the cached normalized vectors, recalculable table - kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'table', 'cum_table']) - - super(Word2Vec, self).save(*args, **kwargs) - - save.__doc__ = SaveLoad.save.__doc__ - - @classmethod - def load(cls, *args, **kwargs): - model = super(Word2Vec, cls).load(*args, **kwargs) - # update older models - if hasattr(model, 'table'): - delattr(model, 'table') # discard in favor of cum_table - if model.negative and hasattr(model.wv, 'index2word'): - model.make_cum_table() # rebuild cum_table from vocabulary - if not hasattr(model, 'corpus_count'): - model.corpus_count = None - if not hasattr(model, 'corpus_total_words'): - model.corpus_total_words = None - for v in model.wv.vocab.values(): - if hasattr(v, 'sample_int'): - break # already 0.12.0+ style int probabilities - elif hasattr(v, 'sample_probability'): - v.sample_int = int(round(v.sample_probability * 2**32)) - del v.sample_probability - if not hasattr(model, 'syn0_lockf') and hasattr(model, 'syn0'): - model.syn0_lockf = ones(len(model.wv.syn0), dtype=REAL) - if not hasattr(model, 'random'): - model.random = random.RandomState(model.seed) - if not hasattr(model, 'train_count'): - model.train_count = 0 - model.total_train_time = 0 - return model - - def _load_specials(self, *args, **kwargs): - super(Word2Vec, self)._load_specials(*args, **kwargs) - # loading from a pre-KeyedVectors word2vec model - if not hasattr(self, 'wv'): - wv = KeyedVectors() - wv.syn0 = self.__dict__.get('syn0', []) - wv.syn0norm = self.__dict__.get('syn0norm', None) - wv.vocab = self.__dict__.get('vocab', {}) - wv.index2word = self.__dict__.get('index2word', []) - self.wv = wv - - @classmethod - def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', - limit=None, datatype=REAL): - """Deprecated. Use gensim.models.KeyedVectors.load_word2vec_format instead.""" - raise DeprecationWarning("Deprecated. 
Use gensim.models.KeyedVectors.load_word2vec_format instead.") - - def save_word2vec_format(self, fname, fvocab=None, binary=False): - """Deprecated. Use model.wv.save_word2vec_format instead.""" - raise DeprecationWarning("Deprecated. Use model.wv.save_word2vec_format instead.") - - def get_latest_training_loss(self): - return self.running_training_loss - - -class BrownCorpus(object): - """Iterate over sentences from the Brown corpus (part of NLTK data).""" - - def __init__(self, dirname): - self.dirname = dirname - - def __iter__(self): - for fname in os.listdir(self.dirname): - fname = os.path.join(self.dirname, fname) - if not os.path.isfile(fname): - continue - with utils.open(fname, 'rb') as fin: - for line in fin: - line = utils.to_unicode(line) - # each file line is a single sentence in the Brown corpus - # each token is WORD/POS_TAG - token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2] - # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff) - words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()] - if not words: # don't bother sending out empty sentences - continue - yield words - - -class Text8Corpus(object): - """Iterate over sentences from the "text8" corpus, unzipped from http://mattmahoney.net/dc/text8.zip .""" - - def __init__(self, fname, max_sentence_length=MAX_WORDS_IN_BATCH): - self.fname = fname - self.max_sentence_length = max_sentence_length - - def __iter__(self): - # the entire corpus is one gigantic line -- there are no sentence marks at all - # so just split the sequence of tokens arbitrarily: 1 sentence = 1000 tokens - sentence, rest = [], b'' - with utils.open(self.fname, 'rb') as fin: - while True: - text = rest + fin.read(8192) # avoid loading the entire file (=1 line) into RAM - if text == rest: # EOF - words = utils.to_unicode(text).split() - sentence.extend(words) # return the last chunk of words, too (may be shorter/longer) - if sentence: - yield sentence - break - last_token = text.rfind(b' ') # last token may have been split in two... keep for next iteration - words, rest = (utils.to_unicode(text[:last_token]).split(), - text[last_token:].strip()) if last_token >= 0 else ([], text) - sentence.extend(words) - while len(sentence) >= self.max_sentence_length: - yield sentence[:self.max_sentence_length] - sentence = sentence[self.max_sentence_length:] - - -class LineSentence(object): - """ - Simple format: one sentence = one line; words already preprocessed and separated by whitespace. - """ - - def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): - """ - `source` can be either a string or a file object. Clip the file to the first - `limit` lines (or not clipped if limit is None, the default). 
- - Example:: - - sentences = LineSentence('myfile.txt') - - Or for compressed files:: - - sentences = LineSentence('compressed_text.txt.bz2') - sentences = LineSentence('compressed_text.txt.gz') - - """ - self.source = source - self.max_sentence_length = max_sentence_length - self.limit = limit - - def __iter__(self): - """Iterate through the lines in the source.""" - try: - # Assume it is a file-like object and try treating it as such - # Things that don't have seek will trigger an exception - self.source.seek(0) - for line in itertools.islice(self.source, self.limit): - line = utils.to_unicode(line).split() - i = 0 - while i < len(line): - yield line[i: i + self.max_sentence_length] - i += self.max_sentence_length - except AttributeError: - # If it didn't work like a file, use it as a string filename - with utils.open(self.source, 'rb') as fin: - for line in itertools.islice(fin, self.limit): - line = utils.to_unicode(line).split() - i = 0 - while i < len(line): - yield line[i: i + self.max_sentence_length] - i += self.max_sentence_length - - -class PathLineSentences(object): - """ - - Works like word2vec.LineSentence, but will process all files in a directory in alphabetical order by filename. - The directory can only contain files that can be read by LineSentence: .bz2, .gz, and text files. - Any file not ending with .bz2 or .gz is assumed to be a text file. Does not work with subdirectories. - - The format of files (either text, or compressed text files) in the path is one sentence = one line, - with words already preprocessed and separated by whitespace. - - """ - - def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): - """ - `source` should be a path to a directory (as a string) where all files can be opened by the - LineSentence class. Each file will be read up to `limit` lines (or not clipped if limit is None, the default). - - Example:: - - sentences = PathLineSentences(os.getcwd() + '\\corpus\\') - - The files in the directory should be either text files, .bz2 files, or .gz files. 
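# [Editor's sketch] Combining two pieces of this part of the diff: streaming a directory of
# pre-tokenised text files with PathLineSentences, and reading the cumulative loss exposed by
# get_latest_training_loss() when compute_loss=True. '/data/corpus/' is an invented path.
from gensim.models.word2vec import Word2Vec, PathLineSentences

sentences = PathLineSentences('/data/corpus/')  # one sentence per line, whitespace-separated tokens
model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4, compute_loss=True)
print(model.get_latest_training_loss())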
- - """ - self.source = source - self.max_sentence_length = max_sentence_length - self.limit = limit - - if os.path.isfile(self.source): - logger.debug('single file given as source, rather than a directory of files') - logger.debug('consider using models.word2vec.LineSentence for a single file') - self.input_files = [self.source] # force code compatibility with list of files - elif os.path.isdir(self.source): - self.source = os.path.join(self.source, '') # ensures os-specific slash at end of path - logger.info('reading directory %s', self.source) - self.input_files = os.listdir(self.source) - self.input_files = [self.source + filename for filename in self.input_files] # make full paths - self.input_files.sort() # makes sure it happens in filename order - else: # not a file or a directory, then we can't do anything with it - raise ValueError('input is neither a file nor a path') - logger.info('files read into PathLineSentences:%s', '\n'.join(self.input_files)) - - def __iter__(self): - """iterate through the files""" - for file_name in self.input_files: - logger.info('reading file %s', file_name) - with utils.open(file_name, 'rb') as fin: - for line in itertools.islice(fin, self.limit): - line = utils.to_unicode(line).split() - i = 0 - while i < len(line): - yield line[i:i + self.max_sentence_length] - i += self.max_sentence_length - - -# Example: ./word2vec.py -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 \ -# -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3 -if __name__ == "__main__": - import argparse - logging.basicConfig( - format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', - level=logging.INFO - ) - logger.info("running %s", " ".join(sys.argv)) - - # check and process cmdline input - program = os.path.basename(sys.argv[0]) - if len(sys.argv) < 2: - print(globals()['__doc__'] % locals()) - sys.exit(1) - - from gensim.models.word2vec import Word2Vec # noqa:F811 avoid referencing __main__ in pickle - - seterr(all='raise') # don't ignore numpy errors - - parser = argparse.ArgumentParser() - parser.add_argument("-train", help="Use text data from file TRAIN to train the model", required=True) - parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors") - parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5) - parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100) - parser.add_argument( - "-sample", - help="Set threshold for occurrence of words. 
" - "Those that appear with higher frequency in the training data will be randomly down-sampled;" - " default is 1e-3, useful range is (0, 1e-5)", - type=float, default=1e-3 - ) - parser.add_argument( - "-hs", help="Use Hierarchical Softmax; default is 0 (not used)", - type=int, default=0, choices=[0, 1] - ) - parser.add_argument( - "-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", - type=int, default=5 - ) - parser.add_argument("-threads", help="Use THREADS threads (default 12)", type=int, default=12) - parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5) - parser.add_argument( - "-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", - type=int, default=5 - ) - parser.add_argument( - "-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", - type=int, default=1, choices=[0, 1] - ) - parser.add_argument( - "-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", - type=int, default=0, choices=[0, 1] - ) - parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model") - - args = parser.parse_args() - - if args.cbow == 0: - skipgram = 1 - else: - skipgram = 0 - - corpus = LineSentence(args.train) - - model = Word2Vec( - corpus, size=args.size, min_count=args.min_count, workers=args.threads, - window=args.window, sample=args.sample, sg=skipgram, hs=args.hs, - negative=args.negative, cbow_mean=1, iter=args.iter - ) - - if args.output: - outfile = args.output - model.wv.save_word2vec_format(outfile, binary=args.binary) - else: - outfile = args.train - model.save(outfile + '.model') - if args.binary == 1: - model.wv.save_word2vec_format(outfile + '.model.bin', binary=True) - else: - model.wv.save_word2vec_format(outfile + '.model.txt', binary=False) - - if args.accuracy: - model.accuracy(args.accuracy) - - logger.info("finished running %s", program) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index c5cb5b40a2..1a55ad9b5f 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Author: Shiva Manne +# Author: Gensim Contributors # Copyright (C) 2018 RaRe Technologies s.r.o. 
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html @@ -63,7 +63,6 @@ import logging import os -import warnings try: from queue import Queue @@ -73,19 +72,17 @@ from collections import namedtuple, defaultdict from collections.abc import Iterable from timeit import default_timer +from dataclasses import dataclass -from numpy import zeros, float32 as REAL, empty, ones, \ - memmap as np_memmap, vstack, integer, dtype +from numpy import zeros, float32 as REAL, vstack, integer, dtype +import numpy as np -from gensim.utils import call_on_class_only, deprecated from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc -from gensim.models.word2vec import Word2VecKeyedVectors, Word2VecVocab, Word2VecTrainables -from gensim.models.word2vec import train_cbow_pair, train_sg_pair, train_batch_sg # noqa +from gensim.utils import deprecated +from gensim.models import Word2Vec from six.moves import range from six import string_types, integer_types, itervalues -from gensim.models.base_any2vec import BaseWordEmbeddingsModel -from gensim.models.keyedvectors import Doc2VecKeyedVectors -from types import GeneratorType +from gensim.models.keyedvectors import KeyedVectors, pseudorandom_weak_vector logger = logging.getLogger(__name__) @@ -146,73 +143,34 @@ def __str__(self): return '%s(%s, %s)' % (self.__class__.__name__, self.words, self.tags) -# for compatibility -@deprecated("Class will be removed in 4.0.0, use TaggedDocument instead") -class LabeledSentence(TaggedDocument): - """Deprecated, use :class:`~gensim.models.doc2vec.TaggedDocument` instead.""" - pass - - -class Doctag(namedtuple('Doctag', 'offset, word_count, doc_count')): - """A string document tag discovered during the initial vocabulary scan. - The document-vector equivalent of a Vocab object. - - Will not be used if all presented document tags are ints. - - The offset is only the true index into the `doctags_syn0`/`doctags_syn0_lockf` - if-and-only-if no raw-int tags were used. - If any raw-int tags were used, string :class:`~gensim.models.doc2vec.Doctag` vectors begin at index - `(max_rawint + 1)`, so the true index is `(rawint_index + 1 + offset)`. - - See Also - -------- - :meth:`~gensim.models.keyedvectors.Doc2VecKeyedVectors._index_to_doctag` +@dataclass +class Doctag: + """A dataclass shape-compatible with keyedvectors.SimpleVocab, extended to record + details of string document tags discovered during the initial vocabulary scan. + Will not be used if all presented document tags are ints. No longer used in a + completed model: just used during initial scan, and for backward compatibility. """ - __slots__ = () - - def repeat(self, word_count): - return self._replace(word_count=self.word_count + word_count, doc_count=self.doc_count + 1) - - -class Doc2Vec(BaseWordEmbeddingsModel): - """Class for training, using and evaluating neural networks described in - `Distributed Representations of Sentences and Documents `_. - - Some important internal attributes are the following: - - Attributes - ---------- - wv : :class:`~gensim.models.keyedvectors.Word2VecKeyedVectors` - This object essentially contains the mapping between words and embeddings. After training, it can be used - directly to query those embeddings in various ways. See the module level docstring for examples. - - docvecs : :class:`~gensim.models.keyedvectors.Doc2VecKeyedVectors` - This object contains the paragraph vectors learned from the training data. 
There will be one such vector - for each unique document tag supplied during training. They may be individually accessed using the tag - as an indexed-access key. For example, if one of the training documents used a tag of 'doc003': - - .. sourcecode:: pycon + __slots__ = ('doc_count', 'index', 'word_count') + doc_count: int # number of docs where tag appeared + index: int # position in underlying array + word_count: int # number of words in associated docs - >>> model.docvecs['doc003'] + @property + def count(self): + return self.doc_count - vocabulary : :class:`~gensim.models.doc2vec.Doc2VecVocab` - This object represents the vocabulary (sometimes called Dictionary in gensim) of the model. - Besides keeping track of all unique words, this object provides extra functionality, such as - sorting words by frequency, or discarding extremely rare words. + @count.setter + def count(self, new_val): + self.doc_count = new_val - trainables : :class:`~gensim.models.doc2vec.Doc2VecTrainables` - This object represents the inner shallow neural network used to train the embeddings. The semantics of the - network differ slightly in the two available training modes (CBOW or SG) but you can think of it as a NN with - a single projection and hidden layer which we train on the corpus. The weights are then used as our embeddings - The only addition to the underlying NN used in :class:`~gensim.models.word2vec.Word2Vec` is that the input - includes not only the word vectors of each word in the context, but also the paragraph vector. - """ - def __init__(self, documents=None, corpus_file=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, - dm_tag_count=1, docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, callbacks=(), - **kwargs): - """ +class Doc2Vec(Word2Vec): + def __init__(self, documents=None, corpus_file=None, vector_size=100, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, + dm_tag_count=1, dv=None, dv_mapfile=None, comment=None, trim_rule=None, callbacks=(), + window=5, epochs=10, **kwargs): + """Class for training, using and evaluating neural networks described in + `Distributed Representations of Sentences and Documents `_. Parameters ---------- @@ -256,7 +214,7 @@ def __init__(self, documents=None, corpus_file=None, dm_mean=None, dm=1, dbow_wo workers : int, optional Use these many worker threads to train the model (=faster training with multicore machines). epochs : int, optional - Number of iterations (epochs) over the corpus. + Number of iterations (epochs) over the corpus. Defaults to 10 for Doc2Vec. hs : {1,0}, optional If 1, hierarchical softmax will be used for model training. If set to 0, and `negative` is non-zero, negative sampling will be used. @@ -301,28 +259,24 @@ def __init__(self, documents=None, corpus_file=None, dm_mean=None, dm=1, dbow_wo callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`, optional List of callbacks that need to be executed/run at specific stages during training. - """ - if 'sentences' in kwargs: - raise DeprecationWarning( - "Parameter 'sentences' was renamed to 'documents', and will be removed in 4.0.0, " - "use 'documents' instead." - ) + Some important internal attributes are the following: - if 'iter' in kwargs: - warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") - kwargs['epochs'] = kwargs['iter'] + Attributes + ---------- + wv : :class:`~gensim.models.keyedvectors.KeyedVectors` + This object essentially contains the mapping between words and embeddings. 
After training, it can be used + directly to query those embeddings in various ways. See the module level docstring for examples. - if 'size' in kwargs: - warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") - kwargs['vector_size'] = kwargs['size'] + dv : :class:`~gensim.models.keyedvectors.KeyedVectors` + This object contains the paragraph vectors learned from the training data. There will be one such vector + for each unique document tag supplied during training. They may be individually accessed using the tag + as an indexed-access key. For example, if one of the training documents used a tag of 'doc003': - super(Doc2Vec, self).__init__( - sg=(1 + dm) % 2, - null_word=dm_concat, - callbacks=callbacks, - **kwargs) + .. sourcecode:: pycon - self.load = call_on_class_only + >>> model.dv['doc003'] + """ + corpus_iterable = documents if dm_mean is not None: self.cbow_mean = dm_mean @@ -330,34 +284,23 @@ def __init__(self, documents=None, corpus_file=None, dm_mean=None, dm=1, dbow_wo self.dbow_words = int(dbow_words) self.dm_concat = int(dm_concat) self.dm_tag_count = int(dm_tag_count) + if dm and dm_concat: + self.layer1_size = (dm_tag_count + (2 * window)) * vector_size + logger.info("using concatenative %d-dimensional layer1", self.layer1_size) - kwargs['null_word'] = dm_concat - vocabulary_keys = ['max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word', 'ns_exponent'] - vocabulary_kwargs = dict((k, kwargs[k]) for k in vocabulary_keys if k in kwargs) - self.vocabulary = Doc2VecVocab(**vocabulary_kwargs) - - trainables_keys = ['seed', 'hashfxn', 'window'] - trainables_kwargs = dict((k, kwargs[k]) for k in trainables_keys if k in kwargs) - self.trainables = Doc2VecTrainables( - dm=dm, dm_concat=dm_concat, dm_tag_count=dm_tag_count, - vector_size=self.vector_size, **trainables_kwargs) - - self.wv = Word2VecKeyedVectors(self.vector_size) - self.docvecs = docvecs or Doc2VecKeyedVectors(self.vector_size, docvecs_mapfile) - - self.comment = comment - - if documents is not None or corpus_file is not None: - self._check_input_data_sanity(data_iterable=documents, corpus_file=corpus_file) - if corpus_file is not None and not isinstance(corpus_file, string_types): - raise TypeError("You must pass string as the corpus_file argument.") - elif isinstance(documents, GeneratorType): - raise TypeError("You can't pass a generator as the documents argument. 
Try a sequence.") - self.build_vocab(documents=documents, corpus_file=corpus_file, trim_rule=trim_rule) - self.train( - documents=documents, corpus_file=corpus_file, total_examples=self.corpus_count, - total_words=self.corpus_total_words, epochs=self.epochs, start_alpha=self.alpha, - end_alpha=self.min_alpha, callbacks=callbacks) + self.vector_size = vector_size + self.dv = dv or KeyedVectors(self.vector_size, mapfile_path=dv_mapfile) + + super(Doc2Vec, self).__init__( + sentences=corpus_iterable, + corpus_file=corpus_file, + vector_size=self.vector_size, + sg=(1 + dm) % 2, + null_word=self.dm_concat, + callbacks=callbacks, + window=window, + epochs=epochs, + **kwargs) @property def dm(self): @@ -375,41 +318,57 @@ def dbow(self): """ return self.sg # same as SG - def _set_train_params(self, **kwargs): - pass + @property + @deprecated("The `docvecs` property has been renamed `dv`.") + def docvecs(self): + return self.dv - def _clear_post_train(self): - """Alias for :meth:`~gensim.models.doc2vec.Doc2Vec.clear_sims`.""" - self.clear_sims() + @docvecs.setter + @deprecated("The `docvecs` property has been renamed `dv`.") + def docvecs(self, value): + self.dv = value - def clear_sims(self): + def _clear_post_train(self): """Resets the current word vectors. """ - self.wv.vectors_norm = None - self.wv.vectors_docs_norm = None + self.wv.norms = None + self.dv.norms = None + + def reset_weights(self): + super(Doc2Vec, self).reset_weights() + self.dv.resize_vectors() + self.dv.randomly_initialize_vectors() + self.dv.vectors_lockf = np.ones(1, dtype=REAL) # 0.0 values suppress word-backprop-updates; 1.0 allows def reset_from(self, other_model): """Copy shareable data structures from another (possibly pre-trained) model. + This specifically causes some structures to be shared, so is limited to + structures (like those rleated to the known word/tag vocabularies) that + won't change during training or thereafter. Beware vocabulary edits/updates + to either model afterwards: the partial sharing and out-of-band modification + may leave the other model in a broken state. + Parameters ---------- other_model : :class:`~gensim.models.doc2vec.Doc2Vec` Other model whose internal data structures will be copied over to the current object. 
""" - self.wv.vocab = other_model.wv.vocab - self.wv.index2word = other_model.wv.index2word - self.vocabulary.cum_table = other_model.vocabulary.cum_table + self.wv.key_to_index = other_model.wv.key_to_index + self.wv.index_to_key = other_model.wv.index_to_key + self.wv.expandos = other_model.wv.expandos + self.cum_table = other_model.cum_table self.corpus_count = other_model.corpus_count - self.docvecs.count = other_model.docvecs.count - self.docvecs.doctags = other_model.docvecs.doctags - self.docvecs.offset2doctag = other_model.docvecs.offset2doctag - self.trainables.reset_weights(self.hs, self.negative, self.wv, self.docvecs) + self.dv.key_to_index = other_model.dv.key_to_index + self.dv.index_to_key = other_model.dv.index_to_key + self.dv.expandos = other_model.dv.expandos + self.reset_weights() def _do_train_epoch(self, corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch, total_examples=None, total_words=None, offsets=None, start_doctags=None, **kwargs): work, neu1 = thread_private_mem - doctag_vectors = self.docvecs.vectors_docs - doctag_locks = self.trainables.vectors_docs_lockf + doctag_vectors = self.dv.vectors + doctags_lockf = self.dv.vectors_lockf offset = offsets[thread_id] start_doctag = start_doctags[thread_id] @@ -417,18 +376,18 @@ def _do_train_epoch(self, corpus_file, thread_id, offset, cython_vocab, thread_p if self.sg: examples, tally, raw_tally = d2v_train_epoch_dbow( self, corpus_file, offset, start_doctag, cython_vocab, cur_epoch, - total_examples, total_words, work, neu1, self.docvecs.count, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks, train_words=self.dbow_words) + total_examples, total_words, work, neu1, len(self.dv), + doctag_vectors=doctag_vectors, doctags_lockf=doctags_lockf, train_words=self.dbow_words) elif self.dm_concat: examples, tally, raw_tally = d2v_train_epoch_dm_concat( self, corpus_file, offset, start_doctag, cython_vocab, cur_epoch, - total_examples, total_words, work, neu1, self.docvecs.count, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks) + total_examples, total_words, work, neu1, len(self.dv), + doctag_vectors=doctag_vectors, doctags_lockf=doctags_lockf) else: examples, tally, raw_tally = d2v_train_epoch_dm( self, corpus_file, offset, start_doctag, cython_vocab, cur_epoch, - total_examples, total_words, work, neu1, self.docvecs.count, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks) + total_examples, total_words, work, neu1, len(self.dv), + doctag_vectors=doctag_vectors, doctags_lockf=doctags_lockf) return examples, tally, raw_tally @@ -453,29 +412,30 @@ def _do_train_job(self, job, alpha, inits): work, neu1 = inits tally = 0 for doc in job: - doctag_indexes = self.vocabulary.indexed_doctags(doc.tags, self.docvecs) - doctag_vectors = self.docvecs.vectors_docs - doctag_locks = self.trainables.vectors_docs_lockf + doctag_indexes = [self.dv.get_index(tag) for tag in doc.tags if tag in self.dv] + doctag_vectors = self.dv.vectors + doctags_lockf = self.dv.vectors_lockf if self.sg: tally += train_document_dbow( self, doc.words, doctag_indexes, alpha, work, train_words=self.dbow_words, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks + doctag_vectors=doctag_vectors, doctags_lockf=doctags_lockf ) elif self.dm_concat: tally += train_document_dm_concat( self, doc.words, doctag_indexes, alpha, work, neu1, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks + doctag_vectors=doctag_vectors, doctags_lockf=doctags_lockf ) else: tally += train_document_dm( self, doc.words, 
doctag_indexes, alpha, work, neu1, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks + doctag_vectors=doctag_vectors, doctags_lockf=doctags_lockf ) return tally, self._raw_word_count(job) - def train(self, documents=None, corpus_file=None, total_examples=None, total_words=None, + def train(self, corpus_iterable=None, corpus_file=None, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, - word_count=0, queue_factor=2, report_delay=1.0, callbacks=()): + word_count=0, queue_factor=2, report_delay=1.0, callbacks=(), + **kwargs): """Update the model's neural weights. To support linear learning-rate decay from (initial) `alpha` to `min_alpha`, and accurate @@ -491,7 +451,7 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor Parameters ---------- - documents : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional + corpus_iterable : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional Can be simply a list of elements, but for larger corpora,consider an iterable that streams the documents directly from disk/network. If you don't supply `documents` (or `corpus_file`), the model is left uninitialized -- use if you plan to initialize it in some other way. @@ -528,19 +488,17 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor List of callbacks that need to be executed/run at specific stages during training. """ - kwargs = {} - - if corpus_file is None and documents is None: + if corpus_file is None and corpus_iterable is None: raise TypeError("Either one of corpus_file or documents value must be provided") - if corpus_file is not None and documents is not None: - raise TypeError("Both corpus_file and documents must not be provided at the same time") + if corpus_file is not None and corpus_iterable is not None: + raise TypeError("Both corpus_file and corpus_iterable must not be provided at the same time") - if documents is None and not os.path.isfile(corpus_file): + if corpus_iterable is None and not os.path.isfile(corpus_file): raise TypeError("Parameter corpus_file must be a valid path to a file, got %r instead" % corpus_file) - if documents is not None and not isinstance(documents, Iterable): - raise TypeError("documents must be an iterable of list, got %r instead" % documents) + if corpus_iterable is not None and not isinstance(corpus_iterable, Iterable): + raise TypeError("corpus_iterable must be an iterable of TaggedDocument, got %r instead" % corpus_iterable) if corpus_file is not None: # Calculate offsets for each worker along with initial doctags (doctag ~ document/line number in a file) @@ -549,7 +507,8 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor kwargs['start_doctags'] = start_doctags super(Doc2Vec, self).train( - sentences=documents, corpus_file=corpus_file, total_examples=total_examples, total_words=total_words, + corpus_iterable=corpus_iterable, corpus_file=corpus_file, + total_examples=total_examples, total_words=total_words, epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count, queue_factor=queue_factor, report_delay=report_delay, callbacks=callbacks, **kwargs) @@ -622,7 +581,7 @@ def estimated_lookup_memory(self): The estimated RAM required to look up a tag in bytes. 
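The renamed `corpus_iterable` parameter above changes only the keyword in the usual two-step workflow; here is a minimal sketch on a tiny in-memory corpus (documents, sizes and tags are illustrative only).

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

docs = [
    TaggedDocument(words=['human', 'interface', 'computer'], tags=['doc0']),
    TaggedDocument(words=['graph', 'trees', 'minors'], tags=['doc1']),
]

model = Doc2Vec(vector_size=50, min_count=1, epochs=10)   # no corpus given: model stays uninitialized
model.build_vocab(docs)                                   # scan tags and words
model.train(corpus_iterable=docs, total_examples=model.corpus_count, epochs=model.epochs)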
""" - return 60 * len(self.docvecs.offset2doctag) + 140 * len(self.docvecs.doctags) + return 60 * len(self.dv) + 140 * len(self.dv) def infer_vector(self, doc_words, alpha=None, min_alpha=None, epochs=None, steps=None): """Infer a vector for given post-bulk training document. @@ -645,9 +604,6 @@ def infer_vector(self, doc_words, alpha=None, min_alpha=None, epochs=None, steps Number of times to train the new document. Larger values take more time, but may improve quality and run-to-run stability of inferred vectors. If unspecified, the `epochs` value from model initialization will be reused. - steps : int, optional, deprecated - Previous name for `epochs`, still available for now for backward compatibility: if - `epochs` is unspecified but `steps` is, the `steps` value will be used. Returns ------- @@ -660,13 +616,16 @@ def infer_vector(self, doc_words, alpha=None, min_alpha=None, epochs=None, steps alpha = alpha or self.alpha min_alpha = min_alpha or self.min_alpha - epochs = epochs or steps or self.epochs + epochs = epochs or self.epochs + + doctag_vectors = pseudorandom_weak_vector(self.dv.vector_size, seed_string=' '.join(doc_words)) + doctag_vectors = doctag_vectors.reshape(1, self.dv.vector_size) - doctag_vectors, doctag_locks = self.trainables.get_doctag_trainables(doc_words, self.docvecs.vector_size) + doctags_lockf = np.ones(1, dtype=REAL) doctag_indexes = [0] - work = zeros(self.trainables.layer1_size, dtype=REAL) + work = zeros(self.layer1_size, dtype=REAL) if not self.sg: - neu1 = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL) + neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) alpha_delta = (alpha - min_alpha) / max(epochs - 1, 1) @@ -674,17 +633,17 @@ def infer_vector(self, doc_words, alpha=None, min_alpha=None, epochs=None, steps if self.sg: train_document_dbow( self, doc_words, doctag_indexes, alpha, work, - learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks + learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctags_lockf=doctags_lockf ) elif self.dm_concat: train_document_dm_concat( self, doc_words, doctag_indexes, alpha, work, neu1, - learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks + learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctags_lockf=doctags_lockf ) else: train_document_dm( self, doc_words, doctag_indexes, alpha, work, neu1, - learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks + learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctags_lockf=doctags_lockf ) alpha -= alpha_delta @@ -705,8 +664,8 @@ def __getitem__(self, tag): """ if isinstance(tag, string_types + integer_types + (integer,)): - if tag not in self.wv.vocab: - return self.docvecs[tag] + if tag not in self.wv: + return self.dv[tag] return self.wv[tag] return vstack([self[i] for i in tag]) @@ -736,48 +695,21 @@ def __str__(self): segments.append('dm/m') else: segments.append('dm/s') - segments.append('d%d' % self.docvecs.vector_size) # dimensions + segments.append('d%d' % self.dv.vector_size) # dimensions if self.negative: segments.append('n%d' % self.negative) # negative samples if self.hs: segments.append('hs') if not self.sg or (self.sg and self.dbow_words): segments.append('w%d' % self.window) # window size, when relevant - if self.vocabulary.min_count > 1: - segments.append('mc%d' % self.vocabulary.min_count) - if self.vocabulary.sample > 0: - segments.append('s%g' % 
self.vocabulary.sample) + if self.min_count > 1: + segments.append('mc%d' % self.min_count) + if self.sample > 0: + segments.append('s%g' % self.sample) if self.workers > 1: segments.append('t%d' % self.workers) return '%s(%s)' % (self.__class__.__name__, ','.join(segments)) - def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inference=True): - """Discard parameters that are used in training and score. Use if you're sure you're done training a model. - - Parameters - ---------- - keep_doctags_vectors : bool, optional - Set to False if you don't want to save doctags vectors. In this case you will not be able to use - :meth:`~gensim.models.keyedvectors.Doc2VecKeyedVectors.most_similar`, - :meth:`~gensim.models.keyedvectors.Doc2VecKeyedVectors.similarity`, etc methods. - keep_inference : bool, optional - Set to False if you don't want to store parameters that are used for - :meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector` method. - - """ - if not keep_inference: - if hasattr(self.trainables, 'syn1'): - del self.trainables.syn1 - if hasattr(self.trainables, 'syn1neg'): - del self.trainables.syn1neg - if hasattr(self.trainables, 'vectors_lockf'): - del self.trainables.vectors_lockf - self.model_trimmed_post_training = True - if self.docvecs and hasattr(self.docvecs, 'vectors_docs') and not keep_doctags_vectors: - del self.docvecs.vectors_docs - if self.docvecs and hasattr(self.trainables, 'vectors_docs_lockf'): - del self.trainables.vectors_docs_lockf - def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*dt_', fvocab=None, binary=False): """Store the input-hidden weight matrix in the same format used by the original C word2vec-tool. @@ -798,21 +730,24 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='* If True, the data will be saved in binary word2vec format, otherwise - will be saved in plain text. """ - total_vec = len(self.wv.vocab) + len(self.docvecs) - write_first_line = False + total_vec = None # save word vectors if word_vec: - if not doctag_vec: - total_vec = len(self.wv.vocab) + if doctag_vec: + total_vec = len(self.wv) + len(self.dv) self.wv.save_word2vec_format(fname, fvocab, binary, total_vec) # save document vectors if doctag_vec: - if not word_vec: - total_vec = len(self.docvecs) - write_first_line = True - self.docvecs.save_word2vec_format( - fname, prefix=prefix, fvocab=fvocab, total_vec=total_vec, - binary=binary, write_first_line=write_first_line) + write_header = True + append = False + if word_vec: + # simply appending to existing file + write_header = False + append = True + self.dv.save_word2vec_format( + fname, prefix=prefix, fvocab=fvocab, binary=binary, + write_header=write_header, append=append, + sort_attr='doc_count') def init_sims(self, replace=False): """Pre-compute L2-normalized vectors. @@ -824,7 +759,7 @@ def init_sims(self, replace=False): continue training if call it with `replace=True`). """ - self.docvecs.init_sims(replace=replace) + self.dv.init_sims(replace=replace) @classmethod def load(cls, *args, **kwargs): @@ -835,9 +770,9 @@ def load(cls, *args, **kwargs): fname : str Path to the saved file. *args : object - Additional arguments, see `~gensim.models.base_any2vec.BaseWordEmbeddingsModel.load`. + Additional arguments, see `~gensim.models.word2vec.Word2Vec.load`. **kwargs : object - Additional arguments, see `~gensim.models.base_any2vec.BaseWordEmbeddingsModel.load`. + Additional arguments, see `~gensim.models.word2vec.Word2Vec.load`. 
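Continuing that sketch, inference, persistence and the combined word2vec-format export touched above look as follows; `model` is assumed to be the toy Doc2Vec trained earlier and the file names are placeholders.

vec = model.infer_vector(['human', 'computer', 'graph'])        # epochs defaults to model.epochs
print(vec.shape)                                                # (50,) for the toy model above

model.save('toy_doc2vec.model')                                 # native gensim save/load round-trip
reloaded = Doc2Vec.load('toy_doc2vec.model')

model.save_word2vec_format('toy_vectors.txt', doctag_vec=True)  # words plus '*dt_'-prefixed doc-vectors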
See Also -------- @@ -851,11 +786,13 @@ def load(cls, *args, **kwargs): """ try: - return super(Doc2Vec, cls).load(*args, **kwargs) - except AttributeError: - logger.info('Model saved using code from earlier Gensim Version. Re-loading old model in a compatible way.') - from gensim.models.deprecated.doc2vec import load_old_doc2vec - return load_old_doc2vec(*args, **kwargs) + return super(Doc2Vec, cls).load(*args, rethrow=True, **kwargs) + except AttributeError as ae: + logger.error( + "Model load error. Was model saved using code from an older Gensim Version? " + "Try loading older model using gensim-3.8.1, then re-saving, to restore " + "compatibility with current code.") + raise ae def estimate_memory(self, vocab_size=None, report=None): """Estimate required memory for a model using current settings. @@ -878,11 +815,11 @@ def estimate_memory(self, vocab_size=None, report=None): """ report = report or {} report['doctag_lookup'] = self.estimated_lookup_memory() - report['doctag_syn0'] = self.docvecs.count * self.vector_size * dtype(REAL).itemsize + report['doctag_syn0'] = len(self.dv) * self.vector_size * dtype(REAL).itemsize return super(Doc2Vec, self).estimate_memory(vocab_size, report=report) - def build_vocab(self, documents=None, corpus_file=None, update=False, progress_per=10000, keep_raw_vocab=False, - trim_rule=None, **kwargs): + def build_vocab(self, corpus_iterable=None, corpus_file=None, update=False, progress_per=10000, + keep_raw_vocab=False, trim_rule=None, **kwargs): """Build vocabulary from a sequence of documents (can be a once-only generator stream). Parameters @@ -920,19 +857,16 @@ def build_vocab(self, documents=None, corpus_file=None, update=False, progress_p Additional key word arguments passed to the internal vocabulary construction. """ - total_words, corpus_count = self.vocabulary.scan_vocab( - documents=documents, corpus_file=corpus_file, docvecs=self.docvecs, + total_words, corpus_count = self.scan_vocab( + corpus_iterable=corpus_iterable, corpus_file=corpus_file, progress_per=progress_per, trim_rule=trim_rule ) self.corpus_count = corpus_count self.corpus_total_words = total_words - report_values = self.vocabulary.prepare_vocab( - self.hs, self.negative, self.wv, update=update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, - **kwargs) + report_values = self.prepare_vocab(update=update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs) report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words']) - self.trainables.prepare_weights( - self.hs, self.negative, self.wv, self.docvecs, update=update) + self.prepare_weights(update=update) def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False): """Build vocabulary from a dictionary of word frequencies. 
@@ -977,68 +911,14 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No # Since no documents are provided, this is to control the corpus_count self.corpus_count = corpus_count or 0 - self.vocabulary.raw_vocab = raw_vocab + self.raw_vocab = raw_vocab # trim by min_count & precalculate downsampling - report_values = self.vocabulary.prepare_vocab( - self.hs, self.negative, self.wv, keep_raw_vocab=keep_raw_vocab, - trim_rule=trim_rule, update=update) + report_values = self.prepare_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words']) - self.trainables.prepare_weights( - self.hs, self.negative, self.wv, self.docvecs, update=update) - - -def _note_doctag(key, document_length, docvecs): - """Note a document tag during initial corpus scan, for structure sizing.""" - if isinstance(key, integer_types + (integer,)): - docvecs.max_rawint = max(docvecs.max_rawint, key) - else: - if key in docvecs.doctags: - docvecs.doctags[key] = docvecs.doctags[key].repeat(document_length) - else: - docvecs.doctags[key] = Doctag(len(docvecs.offset2doctag), document_length, 1) - docvecs.offset2doctag.append(key) - docvecs.count = docvecs.max_rawint + 1 + len(docvecs.offset2doctag) - - -class Doc2VecVocab(Word2VecVocab): - """Vocabulary used by :class:`~gensim.models.doc2vec.Doc2Vec`. - - This includes a mapping from words found in the corpus to their total frequency count. - - """ - def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0, ns_exponent=0.75): - """ + self.prepare_weights(update=update) - Parameters - ---------- - max_vocab_size : int, optional - Maximum number of words in the Vocabulary. Used to limit the RAM during vocabulary building; - if there are more unique words than this, then prune the infrequent ones. - Every 10 million word types need about 1GB of RAM, set to `None` for no limit. - min_count : int - Words with frequency lower than this limit will be discarded from the vocabulary. - sample : float, optional - The threshold for configuring which higher-frequency words are randomly downsampled, - useful range is (0, 1e-5). - sorted_vocab : bool - If True, sort the vocabulary by descending frequency before assigning word indexes. - null_word : {0, 1} - If True, a null pseudo-word will be created for padding when using concatenative L1 (run-of-words). - This word is only ever input – never predicted – so count, huffman-point, etc doesn't matter. - ns_exponent : float, optional - The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion - to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more - than high-frequency words. The popular default value of 0.75 was chosen by the original Word2Vec paper. - More recently, in https://arxiv.org/abs/1804.04212, Caselles-Dupré, Lesaint, & Royo-Letelier suggest that - other values may perform better for recommendation applications. 
- - """ - super(Doc2VecVocab, self).__init__( - max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, - sorted_vocab=sorted_vocab, null_word=null_word, ns_exponent=ns_exponent) - - def _scan_vocab(self, documents, docvecs, progress_per, trim_rule): + def _scan_vocab(self, corpus_iterable, progress_per, trim_rule): document_no = -1 total_words = 0 min_reduce = 1 @@ -1046,7 +926,10 @@ def _scan_vocab(self, documents, docvecs, progress_per, trim_rule): interval_count = 0 checked_string_types = 0 vocab = defaultdict(int) - for document_no, document in enumerate(documents): + max_rawint = -1 # highest raw int tag seen (-1 for none) + doctags_lookup = {} + doctags_list = [] + for document_no, document in enumerate(corpus_iterable): if not checked_string_types: if isinstance(document.words, string_types): logger.warning( @@ -1059,14 +942,23 @@ def _scan_vocab(self, documents, docvecs, progress_per, trim_rule): interval_rate = (total_words - interval_count) / (default_timer() - interval_start) logger.info( "PROGRESS: at example #%i, processed %i words (%i/s), %i word types, %i tags", - document_no, total_words, interval_rate, len(vocab), docvecs.count + document_no, total_words, interval_rate, len(vocab), len(doctags_list) ) interval_start = default_timer() interval_count = total_words document_length = len(document.words) for tag in document.tags: - _note_doctag(tag, document_length, docvecs) + # Note a document tag during initial corpus scan, for structure sizing. + if isinstance(tag, integer_types + (integer,)): + max_rawint = max(max_rawint, tag) + else: + if tag in doctags_lookup: + doctags_lookup[tag].doc_count += 1 + doctags_lookup[tag].word_count += document_length + else: + doctags_lookup[tag] = Doctag(index=len(doctags_list), word_count=document_length, doc_count=1) + doctags_list.append(tag) for word in document.words: vocab[word] += 1 @@ -1077,10 +969,28 @@ def _scan_vocab(self, documents, docvecs, progress_per, trim_rule): min_reduce += 1 corpus_count = document_no + 1 + if len(doctags_list) > corpus_count: + logger.warning("More unique tags (%i) than documents (%i).", len(doctags_list), corpus_count) + if max_rawint > corpus_count: + logger.warning( + "Highest int doctag (%i) larger than count of documents (%i). This means " + "at least %i excess, unused slots (%i bytes) will be allocated for vectors.", + max_rawint, corpus_count, ((max_rawint - corpus_count) * self.vector_size * 4)) + if max_rawint > -1: + # adjust indexes/list to account for range of pure-int keyed doctags + for key in doctags_list: + doctags_lookup[key].index = doctags_lookup[key].index + max_rawint + 1 + doctags_list = list(range(0, max_rawint + 1)) + doctags_list + + self.dv.index_to_key = doctags_list + for t, dt in doctags_lookup.items(): + self.dv.key_to_index[t] = dt.index + self.dv.set_vecattr(t, 'word_count', dt.word_count) + self.dv.set_vecattr(t, 'doc_count', dt.doc_count) self.raw_vocab = vocab return total_words, corpus_count - def scan_vocab(self, documents=None, corpus_file=None, docvecs=None, progress_per=10000, trim_rule=None): + def scan_vocab(self, corpus_iterable=None, corpus_file=None, progress_per=10000, trim_rule=None): """Create the models Vocabulary: A mapping from unique words in the corpus to their frequency count. Parameters @@ -1091,8 +1001,6 @@ def scan_vocab(self, documents=None, corpus_file=None, docvecs=None, progress_pe Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. 
You may use this argument instead of `documents` to get performance boost. Only one of `documents` or `corpus_file` arguments need to be passed (not both of them). - docvecs : list of :class:`~gensim.models.keyedvectors.Doc2VecKeyedVectors` - The vector representations of the documents in our corpus. Each of them has a size == `vector_size`. progress_per : int Progress will be logged every `progress_per` documents. trim_rule : function, optional @@ -1117,112 +1025,57 @@ def scan_vocab(self, documents=None, corpus_file=None, docvecs=None, progress_pe """ logger.info("collecting all words and their counts") if corpus_file is not None: - documents = TaggedLineDocument(corpus_file) + corpus_iterable = TaggedLineDocument(corpus_file) - total_words, corpus_count = self._scan_vocab(documents, docvecs, progress_per, trim_rule) + total_words, corpus_count = self._scan_vocab(corpus_iterable, progress_per, trim_rule) logger.info( "collected %i word types and %i unique tags from a corpus of %i examples and %i words", - len(self.raw_vocab), docvecs.count, corpus_count, total_words + len(self.raw_vocab), len(self.dv), corpus_count, total_words ) return total_words, corpus_count - def indexed_doctags(self, doctag_tokens, docvecs): - """Get the indexes and backing-arrays used in training examples. - - Parameters - ---------- - doctag_tokens : list of {str, int} - A list of tags for which we want the index. - docvecs : list of :class:`~gensim.models.keyedvectors.Doc2VecKeyedVectors` - Vector representations of the documents in the corpus. Each vector has size == `vector_size` - - Returns - ------- - list of int - Indices of the provided tag keys. - - """ - return [ - Doc2VecKeyedVectors._int_index(index, docvecs.doctags, docvecs.max_rawint) - for index in doctag_tokens if self._tag_seen(index, docvecs)] - - def _tag_seen(self, index, docvecs): - """Whether or not the tag exists in our Vocabulary. + def similarity_unseen_docs(self, doc_words1, doc_words2, alpha=None, min_alpha=None, steps=None): + """Compute cosine similarity between two post-bulk out of training documents. Parameters ---------- - index : {str, int} - The tag to be checked. - docvecs : :class:`~gensim.models.keyedvectors.Doc2VecKeyedVectors` - Vector representations of the documents in the corpus. Each vector has size == `vector_size` + model : :class:`~gensim.models.doc2vec.Doc2Vec` + An instance of a trained `Doc2Vec` model. + doc_words1 : list of str + Input document. + doc_words2 : list of str + Input document. + alpha : float, optional + The initial learning rate. + min_alpha : float, optional + Learning rate will linearly drop to `min_alpha` as training progresses. + steps : int, optional + Number of epoch to train the new document. Returns ------- - bool - Whether or not the passed tag exists in our vocabulary. + float + The cosine similarity between `doc_words1` and `doc_words2`. 
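A short usage sketch for `similarity_unseen_docs` as documented above; the word lists are illustrative and `model` is assumed to be an already-trained Doc2Vec (e.g. the toy model from the earlier sketch).

sim = model.similarity_unseen_docs(['graph', 'trees', 'minors'], ['graph', 'minors', 'survey'])
print(round(float(sim), 3))   # cosine similarity of the two freshly inferred vectors, in [-1.0, 1.0]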
""" - if isinstance(index, integer_types + (integer,)): - return index < docvecs.count - else: - return index in docvecs.doctags + d1 = self.infer_vector(doc_words=doc_words1, alpha=alpha, min_alpha=min_alpha, steps=steps) + d2 = self.infer_vector(doc_words=doc_words2, alpha=alpha, min_alpha=min_alpha, steps=steps) + return np.dot(matutils.unitvec(d1), matutils.unitvec(d2)) -class Doc2VecTrainables(Word2VecTrainables): - """Represents the inner shallow neural network used to train :class:`~gensim.models.doc2vec.Doc2Vec`.""" - def __init__(self, dm=1, dm_concat=0, dm_tag_count=1, vector_size=100, seed=1, hashfxn=hash, window=5): - super(Doc2VecTrainables, self).__init__( - vector_size=vector_size, seed=seed, hashfxn=hashfxn) - if dm and dm_concat: - self.layer1_size = (dm_tag_count + (2 * window)) * vector_size - logger.info("using concatenative %d-dimensional layer1", self.layer1_size) - - def prepare_weights(self, hs, negative, wv, docvecs, update=False): - """Build tables and model weights based on final vocabulary settings.""" - # set initial input/projection and hidden weights - if not update: - self.reset_weights(hs, negative, wv, docvecs) - else: - self.update_weights(hs, negative, wv) - - def reset_weights(self, hs, negative, wv, docvecs, vocabulary=None): - super(Doc2VecTrainables, self).reset_weights(hs, negative, wv) - self.reset_doc_weights(docvecs) - - def reset_doc_weights(self, docvecs): - length = max(len(docvecs.doctags), docvecs.count) - if docvecs.mapfile_path: - docvecs.vectors_docs = np_memmap( - docvecs.mapfile_path + '.vectors_docs', dtype=REAL, mode='w+', shape=(length, docvecs.vector_size) - ) - self.vectors_docs_lockf = np_memmap( - docvecs.mapfile_path + '.vectors_docs_lockf', dtype=REAL, mode='w+', shape=(length,) - ) - self.vectors_docs_lockf.fill(1.0) - else: - docvecs.vectors_docs = empty((length, docvecs.vector_size), dtype=REAL) - self.vectors_docs_lockf = ones((length,), dtype=REAL) # zeros suppress learning +class Doc2VecVocab(utils.SaveLoad): + """Obsolete class retained for now as load-compatibility state capture""" - for i in range(length): - # construct deterministic seed from index AND model seed - seed = "%d %s" % ( - self.seed, Doc2VecKeyedVectors._index_to_doctag(i, docvecs.offset2doctag, docvecs.max_rawint)) - docvecs.vectors_docs[i] = self.seeded_vector(seed, docvecs.vector_size) - def get_doctag_trainables(self, doc_words, vector_size): - doctag_vectors = zeros((1, vector_size), dtype=REAL) - doctag_vectors[0] = self.seeded_vector(' '.join(doc_words), vector_size) - doctag_locks = ones(1, dtype=REAL) - return doctag_vectors, doctag_locks +class Doc2VecTrainables(utils.SaveLoad): + """Obsolete class retained for now as load-compatibility state capture""" class TaggedBrownCorpus(object): - """Reader for the `Brown corpus (part of NLTK data) `_.""" - def __init__(self, dirname): - """ + """Reader for the `Brown corpus (part of NLTK data) `_. Parameters ---------- @@ -1259,14 +1112,11 @@ def __iter__(self): class TaggedLineDocument(object): - """Iterate over a file that contains documents: one line = :class:`~gensim.models.doc2vec.TaggedDocument` object. - - Words are expected to be already preprocessed and separated by whitespace. Document tags are constructed - automatically from the document line number (each document gets a unique integer tag). - - """ def __init__(self, source): - """ + """Iterate over a file that contains documents: one line = :class:`~gensim.models.doc2vec.TaggedDocument` object. 
+ + Words are expected to be already preprocessed and separated by whitespace. Document tags are constructed + automatically from the document line number (each document gets a unique integer tag). Parameters ---------- diff --git a/gensim/models/doc2vec_corpusfile.pyx b/gensim/models/doc2vec_corpusfile.pyx index 13ceb4aa4e..5b8cbeabff 100644 --- a/gensim/models/doc2vec_corpusfile.pyx +++ b/gensim/models/doc2vec_corpusfile.pyx @@ -54,11 +54,13 @@ cdef int ONE = 1 cdef REAL_t ONEF = 1.0 -cdef void prepare_c_structures_for_batch(vector[string] &doc_words, int sample, int hs, int window, long long *total_words, - int *effective_words, unsigned long long *next_random, cvocab_t *vocab, - np.uint32_t *indexes, int *codelens, np.uint8_t **codes, np.uint32_t **points, - np.uint32_t *reduced_windows, int *document_len, int train_words, - int docvecs_count, int doc_tag) nogil: +cdef void prepare_c_structures_for_batch( + vector[string] &doc_words, int sample, int hs, int window, long long *total_words, + int *effective_words, unsigned long long *next_random, cvocab_t *vocab, + np.uint32_t *indexes, int *codelens, np.uint8_t **codes, np.uint32_t **points, + np.uint32_t *reduced_windows, int *document_len, int train_words, + int docvecs_count, int doc_tag, + ) nogil: cdef VocabItem predict_word cdef string token cdef int i = 0 @@ -92,10 +94,12 @@ cdef void prepare_c_structures_for_batch(vector[string] &doc_words, int sample, effective_words[0] += 1 -def d2v_train_epoch_dbow(model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples, - _expected_words, work, neu1, docvecs_count, word_vectors=None, word_locks=None, - train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True, - doctag_vectors=None, doctag_locks=None): +def d2v_train_epoch_dbow( + model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples, + _expected_words, work, neu1, docvecs_count, word_vectors=None, words_lockf=None, + train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True, + doctag_vectors=None, doctags_lockf=None, + ): """Train distributed bag of words model ("PV-DBOW") by training on a corpus file. Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train`. @@ -124,13 +128,13 @@ def d2v_train_epoch_dbow(model, corpus_file, offset, start_doctag, _cython_vocab Whether or not the weights of the hidden layer will be updated. word_vectors : numpy.ndarray, optional The vector representation for each word in the vocabulary. If None, these will be retrieved from the model. - word_locks : numpy.ndarray, optional - A learning lock factor for each weight in the hidden layer for words, value 0 completely blocks updates, - a value of 1 allows to update word-vectors. + words_lockf : numpy.ndarray, optional + EXPERIMENTAL. A learning lock factor for each word-vector, value 0.0 completely blocks updates, a value + of 1.0 allows normal updates to word-vectors. doctag_vectors : numpy.ndarray, optional Vector representations of the tags. If None, these will be retrieved from the model. - doctag_locks : numpy.ndarray, optional - The lock factors for each tag, same as `word_locks`, but for document-vectors. + doctags_lockf : numpy.ndarray, optional + EXPERIMENTAL. The lock factors for each tag, same as `words_lockf`, but for document-vectors. 
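The `words_lockf`/`doctags_lockf` arrays documented above scale each gradient update, and the refactored C code indexes them modulo their length, so the default single-element array of 1.0 leaves every vector trainable; below is a plain-NumPy sketch of that indexing rule (array names here are illustrative, not gensim API).

import numpy as np

vectors = np.zeros((4, 3), dtype=np.float32)            # stand-in for doc/word vectors
update = np.full(3, 0.5, dtype=np.float32)              # stand-in for one backprop update

lockf = np.ones(1, dtype=np.float32)                    # default: one shared 1.0 -> all rows trainable
i = 3
vectors[i] += lockf[i % len(lockf)] * update            # mirrors c.doctags_lockf[_doc_tag % c.doctags_lockf_len]

per_row = np.array([1.0, 0.0, 1.0, 0.0], dtype=np.float32)   # hypothetical per-vector locks
vectors[1] += per_row[1 % len(per_row)] * update             # the 0.0 here freezes row 1 entirely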
Returns ------- @@ -162,8 +166,8 @@ def d2v_train_epoch_dbow(model, corpus_file, offset, start_doctag, _cython_vocab init_d2v_config( &c, model, _alpha, learn_doctags, learn_words, learn_hidden, train_words=train_words, - work=work, neu1=neu1, word_vectors=word_vectors, word_locks=word_locks, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks, docvecs_count=docvecs_count) + work=work, neu1=neu1, word_vectors=word_vectors, words_lockf=words_lockf, + doctag_vectors=doctag_vectors, doctags_lockf=doctags_lockf, docvecs_count=docvecs_count) # release GIL & train on the full corpus, document by document with nogil: @@ -196,27 +200,29 @@ def d2v_train_epoch_dbow(model, corpus_file, offset, start_doctag, _cython_vocab # we reuse the DBOW function, as it is equivalent to skip-gram for this purpose fast_document_dbow_hs( c.points[i], c.codes[i], c.codelens[i], c.word_vectors, c.syn1, c.layer1_size, - c.indexes[j], c.alpha, c.work, c.learn_words, c.learn_hidden, c.word_locks) + c.indexes[j], c.alpha, c.work, c.learn_words, c.learn_hidden, c.words_lockf, + c.words_lockf_len) if c.negative: # we reuse the DBOW function, as it is equivalent to skip-gram for this purpose c.next_random = fast_document_dbow_neg( c.negative, c.cum_table, c.cum_table_len, c.word_vectors, c.syn1neg, c.layer1_size, c.indexes[i], c.indexes[j], c.alpha, c.work, - c.next_random, c.learn_words, c.learn_hidden, c.word_locks) + c.next_random, c.learn_words, c.learn_hidden, c.words_lockf, c.words_lockf_len) # docvec-training if _doc_tag < c.docvecs_count: if c.hs: fast_document_dbow_hs( c.points[i], c.codes[i], c.codelens[i], c.doctag_vectors, c.syn1, c.layer1_size, - _doc_tag, c.alpha, c.work, c.learn_doctags, c.learn_hidden, c.doctag_locks) + _doc_tag, c.alpha, c.work, c.learn_doctags, c.learn_hidden, c.doctags_lockf, + c.doctags_lockf_len) if c.negative: c.next_random = fast_document_dbow_neg( c.negative, c.cum_table, c.cum_table_len, c.doctag_vectors, c.syn1neg, c.layer1_size, c.indexes[i], _doc_tag, c.alpha, c.work, c.next_random, - c.learn_doctags, c.learn_hidden, c.doctag_locks) + c.learn_doctags, c.learn_hidden, c.doctags_lockf, c.doctags_lockf_len) total_documents += 1 total_effective_words += effective_words @@ -229,9 +235,11 @@ def d2v_train_epoch_dbow(model, corpus_file, offset, start_doctag, _cython_vocab return total_documents, total_effective_words, total_words -def d2v_train_epoch_dm(model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples, - _expected_words, work, neu1, docvecs_count, word_vectors=None, word_locks=None, - learn_doctags=True, learn_words=True, learn_hidden=True, doctag_vectors=None, doctag_locks=None): +def d2v_train_epoch_dm( + model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples, + _expected_words, work, neu1, docvecs_count, word_vectors=None, words_lockf=None, + learn_doctags=True, learn_words=True, learn_hidden=True, doctag_vectors=None, doctags_lockf=None, + ): """Train distributed memory model ("PV-DM") by training on a corpus file. This method implements the DM model with a projection (input) layer that is either the sum or mean of the context vectors, depending on the model's `dm_mean` configuration field. @@ -259,13 +267,13 @@ def d2v_train_epoch_dm(model, corpus_file, offset, start_doctag, _cython_vocab, Whether or not the weights of the hidden layer will be updated. word_vectors : numpy.ndarray, optional The vector representation for each word in the vocabulary. If None, these will be retrieved from the model. 
- word_locks : numpy.ndarray, optional - A learning lock factor for each weight in the hidden layer for words, value 0 completely blocks updates, - a value of 1 allows to update word-vectors. + words_lockf : numpy.ndarray, optional + EXPERIMENTAL. A learning lock factor for each word-vector, value 0.0 completely blocks updates, a value + of 1.0 allows normal updates to word-vectors. doctag_vectors : numpy.ndarray, optional Vector representations of the tags. If None, these will be retrieved from the model. - doctag_locks : numpy.ndarray, optional - The lock factors for each tag, same as `word_locks`, but for document-vectors. + doctags_lockf : numpy.ndarray, optional + EXPERIMENTAL. The lock factors for each tag, same as `words_lockf`, but for document-vectors. Returns ------- @@ -298,8 +306,8 @@ def d2v_train_epoch_dm(model, corpus_file, offset, start_doctag, _cython_vocab, init_d2v_config( &c, model, _alpha, learn_doctags, learn_words, learn_hidden, train_words=False, - work=work, neu1=neu1, word_vectors=word_vectors, word_locks=word_locks, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks, docvecs_count=docvecs_count) + work=work, neu1=neu1, word_vectors=word_vectors, words_lockf=words_lockf, + doctag_vectors=doctag_vectors, doctags_lockf=doctags_lockf, docvecs_count=docvecs_count) # release GIL & train on the full corpus, document by document with nogil: @@ -357,30 +365,35 @@ def d2v_train_epoch_dm(model, corpus_file, offset, start_doctag, _cython_vocab, sscal(&c.layer1_size, &inv_count, c.work, &ONE) # (does this need BLAS-variants like saxpy?) # apply accumulated error in work if c.learn_doctags and _doc_tag < c.docvecs_count: - our_saxpy(&c.layer1_size, &c.doctag_locks[_doc_tag], c.work, - &ONE, &c.doctag_vectors[_doc_tag * c.layer1_size], &ONE) + our_saxpy( + &c.layer1_size, &c.doctags_lockf[_doc_tag % c.doctags_lockf_len], c.work, + &ONE, &c.doctag_vectors[_doc_tag * c.layer1_size], &ONE) if c.learn_words: for m in range(j, k): if m == i: continue else: - our_saxpy(&c.layer1_size, &c.word_locks[c.indexes[m]], c.work, &ONE, - &c.word_vectors[c.indexes[m] * c.layer1_size], &ONE) + our_saxpy( + &c.layer1_size, &c.words_lockf[c.indexes[m] % c.words_lockf_len], c.work, &ONE, + &c.word_vectors[c.indexes[m] * c.layer1_size], &ONE) total_documents += 1 total_effective_words += effective_words _doc_tag += 1 - c.alpha = get_next_alpha(start_alpha, end_alpha, total_documents, total_words, expected_examples, - expected_words, cur_epoch, num_epochs) + c.alpha = get_next_alpha( + start_alpha, end_alpha, total_documents, total_words, expected_examples, + expected_words, cur_epoch, num_epochs) return total_documents, total_effective_words, total_words -def d2v_train_epoch_dm_concat(model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples, - _expected_words, work, neu1, docvecs_count, word_vectors=None, word_locks=None, - learn_doctags=True, learn_words=True, learn_hidden=True, doctag_vectors=None, - doctag_locks=None): +def d2v_train_epoch_dm_concat( + model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples, + _expected_words, work, neu1, docvecs_count, word_vectors=None, words_lockf=None, + learn_doctags=True, learn_words=True, learn_hidden=True, doctag_vectors=None, + doctags_lockf=None, + ): """Train distributed memory model ("PV-DM") by training on a corpus file, using a concatenation of the context window word vectors (rather than a sum or average). 
This might be slower since the input at each batch will be significantly larger. @@ -408,13 +421,13 @@ def d2v_train_epoch_dm_concat(model, corpus_file, offset, start_doctag, _cython_ Whether or not the weights of the hidden layer will be updated. word_vectors : numpy.ndarray, optional The vector representation for each word in the vocabulary. If None, these will be retrieved from the model. - word_locks : numpy.ndarray, optional - A learning lock factor for each weight in the hidden layer for words, value 0 completely blocks updates, - a value of 1 allows to update word-vectors. + words_lockf : numpy.ndarray, optional + EXPERIMENTAL. A learning lock factor for each word-vector, value 0.0 completely blocks updates, a value + of 1.0 allows normal updates to word-vectors. doctag_vectors : numpy.ndarray, optional Vector representations of the tags. If None, these will be retrieved from the model. - doctag_locks : numpy.ndarray, optional - The lock factors for each tag, same as `word_locks`, but for document-vectors. + doctags_lockf : numpy.ndarray, optional + EXPERIMENTAL. The lock factors for each tag, same as `words_lockf`, but for document-vectors. Returns ------- @@ -446,8 +459,8 @@ def d2v_train_epoch_dm_concat(model, corpus_file, offset, start_doctag, _cython_ init_d2v_config( &c, model, _alpha, learn_doctags, learn_words, learn_hidden, train_words=False, - work=work, neu1=neu1, word_vectors=word_vectors, word_locks=word_locks, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks, docvecs_count=docvecs_count) + work=work, neu1=neu1, word_vectors=word_vectors, words_lockf=words_lockf, + doctag_vectors=doctag_vectors, doctags_lockf=doctags_lockf, docvecs_count=docvecs_count) # release GIL & train on the full corpus, document by document with nogil: @@ -475,8 +488,7 @@ def d2v_train_epoch_dm_concat(model, corpus_file, offset, start_doctag, _cython_ # compose l1 & clear work if _doc_tag < c.docvecs_count: # doc vector(s) - memcpy(&c.neu1[0], &c.doctag_vectors[_doc_tag * c.vector_size], - c.vector_size * cython.sizeof(REAL_t)) + memcpy(&c.neu1[0], &c.doctag_vectors[_doc_tag * c.vector_size], c.vector_size * cython.sizeof(REAL_t)) n = 0 for m in range(j, k): # word vectors in window @@ -488,8 +500,9 @@ def d2v_train_epoch_dm_concat(model, corpus_file, offset, start_doctag, _cython_ c.window_indexes[n] = c.indexes[m] n += 1 for m in range(2 * c.window): - memcpy(&c.neu1[(c.doctag_len + m) * c.vector_size], &c.word_vectors[c.window_indexes[m] * c.vector_size], - c.vector_size * cython.sizeof(REAL_t)) + memcpy( + &c.neu1[(c.doctag_len + m) * c.vector_size], &c.word_vectors[c.window_indexes[m] * c.vector_size], + c.vector_size * cython.sizeof(REAL_t)) memset(c.work, 0, c.layer1_size * cython.sizeof(REAL_t)) # work to accumulate l1 error if c.hs: @@ -503,19 +516,22 @@ def d2v_train_epoch_dm_concat(model, corpus_file, offset, start_doctag, _cython_ c.indexes[i], c.alpha, c.work, c.layer1_size, c.vector_size, c.learn_hidden) if c.learn_doctags and _doc_tag < c.docvecs_count: - our_saxpy(&c.vector_size, &c.doctag_locks[_doc_tag], &c.work[m * c.vector_size], - &ONE, &c.doctag_vectors[_doc_tag * c.vector_size], &ONE) + our_saxpy( + &c.vector_size, &c.doctags_lockf[_doc_tag % c.doctags_lockf_len], &c.work[m * c.vector_size], + &ONE, &c.doctag_vectors[_doc_tag * c.vector_size], &ONE) if c.learn_words: for m in range(2 * c.window): - our_saxpy(&c.vector_size, &c.word_locks[c.window_indexes[m]], &c.work[(c.doctag_len + m) * c.vector_size], - &ONE, &c.word_vectors[c.window_indexes[m] * c.vector_size], 
&ONE) + our_saxpy( + &c.vector_size, &c.words_lockf[c.window_indexes[m] % c.words_lockf_len], &c.work[(c.doctag_len + m) * c.vector_size], + &ONE, &c.word_vectors[c.window_indexes[m] * c.vector_size], &ONE) total_documents += 1 total_effective_words += effective_words _doc_tag += 1 - c.alpha = get_next_alpha(start_alpha, end_alpha, total_documents, total_words, expected_examples, - expected_words, cur_epoch, num_epochs) + c.alpha = get_next_alpha( + start_alpha, end_alpha, total_documents, total_words, expected_examples, + expected_words, cur_epoch, num_epochs) return total_documents, total_effective_words, total_words diff --git a/gensim/models/doc2vec_inner.pxd b/gensim/models/doc2vec_inner.pxd index c70dc616cc..77da86f449 100644 --- a/gensim/models/doc2vec_inner.pxd +++ b/gensim/models/doc2vec_inner.pxd @@ -26,8 +26,10 @@ cdef struct Doc2VecConfig: REAL_t *word_vectors REAL_t *doctag_vectors - REAL_t *word_locks - REAL_t *doctag_locks + REAL_t *words_lockf + np.uint32_t words_lockf_len + REAL_t *doctags_lockf + np.uint32_t doctags_lockf_len REAL_t *work REAL_t *neu1 REAL_t alpha @@ -54,14 +56,15 @@ cdef void fast_document_dbow_hs( const np.uint32_t *word_point, const np.uint8_t *word_code, const int codelen, REAL_t *context_vectors, REAL_t *syn1, const int size, const np.uint32_t context_index, const REAL_t alpha, REAL_t *work, int learn_context, int learn_hidden, - REAL_t *context_locks) nogil + REAL_t *contexts_lockf, const np.uint32_t contexts_lockf_len) nogil cdef unsigned long long fast_document_dbow_neg( const int negative, np.uint32_t *cum_table, unsigned long long cum_table_len, REAL_t *context_vectors, REAL_t *syn1neg, const int size, const np.uint32_t word_index, const np.uint32_t context_index, const REAL_t alpha, REAL_t *work, - unsigned long long next_random, int learn_context, int learn_hidden, REAL_t *context_locks) nogil + unsigned long long next_random, int learn_context, int learn_hidden, REAL_t *contexts_lockf, + const np.uint32_t contexts_lockf_len) nogil cdef void fast_document_dm_hs( @@ -89,4 +92,4 @@ cdef unsigned long long fast_document_dmc_neg( cdef init_d2v_config(Doc2VecConfig *c, model, alpha, learn_doctags, learn_words, learn_hidden, train_words=*, work=*, - neu1=*, word_vectors=*, word_locks=*, doctag_vectors=*, doctag_locks=*, docvecs_count=*) + neu1=*, word_vectors=*, words_lockf=*, doctag_vectors=*, doctags_lockf=*, docvecs_count=*) diff --git a/gensim/models/doc2vec_inner.pyx b/gensim/models/doc2vec_inner.pyx index 8d9ca4862f..23ede53c90 100644 --- a/gensim/models/doc2vec_inner.pyx +++ b/gensim/models/doc2vec_inner.pyx @@ -38,7 +38,7 @@ cdef void fast_document_dbow_hs( const np.uint32_t *word_point, const np.uint8_t *word_code, const int codelen, REAL_t *context_vectors, REAL_t *syn1, const int size, const np.uint32_t context_index, const REAL_t alpha, REAL_t *work, int learn_context, int learn_hidden, - REAL_t *context_locks) nogil: + REAL_t *contexts_lockf, const np.uint32_t contexts_lockf_len) nogil: cdef long long a, b cdef long long row1 = context_index * size, row2 @@ -56,14 +56,16 @@ cdef void fast_document_dbow_hs( if learn_hidden: our_saxpy(&size, &g, &context_vectors[row1], &ONE, &syn1[row2], &ONE) if learn_context: - our_saxpy(&size, &context_locks[context_index], work, &ONE, &context_vectors[row1], &ONE) + our_saxpy(&size, &contexts_lockf[context_index % contexts_lockf_len], + work, &ONE, &context_vectors[row1], &ONE) cdef unsigned long long fast_document_dbow_neg( const int negative, np.uint32_t *cum_table, unsigned long long cum_table_len, 
REAL_t *context_vectors, REAL_t *syn1neg, const int size, const np.uint32_t word_index, const np.uint32_t context_index, const REAL_t alpha, REAL_t *work, - unsigned long long next_random, int learn_context, int learn_hidden, REAL_t *context_locks) nogil: + unsigned long long next_random, int learn_context, int learn_hidden, REAL_t *contexts_lockf, + const np.uint32_t contexts_lockf_len) nogil: cdef long long a cdef long long row1 = context_index * size, row2 @@ -94,7 +96,8 @@ cdef unsigned long long fast_document_dbow_neg( if learn_hidden: our_saxpy(&size, &g, &context_vectors[row1], &ONE, &syn1neg[row2], &ONE) if learn_context: - our_saxpy(&size, &context_locks[context_index], work, &ONE, &context_vectors[row1], &ONE) + our_saxpy(&size, &contexts_lockf[context_index % contexts_lockf_len], + work, &ONE, &context_vectors[row1], &ONE) return next_random @@ -221,65 +224,67 @@ cdef unsigned long long fast_document_dmc_neg( cdef init_d2v_config(Doc2VecConfig *c, model, alpha, learn_doctags, learn_words, learn_hidden, - train_words=False, work=None, neu1=None, word_vectors=None, word_locks=None, doctag_vectors=None, - doctag_locks=None, docvecs_count=0): + train_words=False, work=None, neu1=None, word_vectors=None, words_lockf=None, + doctag_vectors=None, doctags_lockf=None, docvecs_count=0): c[0].hs = model.hs c[0].negative = model.negative - c[0].sample = (model.vocabulary.sample != 0) + c[0].sample = (model.sample != 0) c[0].cbow_mean = model.cbow_mean c[0].train_words = train_words c[0].learn_doctags = learn_doctags c[0].learn_words = learn_words c[0].learn_hidden = learn_hidden c[0].alpha = alpha - c[0].layer1_size = model.trainables.layer1_size - c[0].vector_size = model.docvecs.vector_size + c[0].layer1_size = model.layer1_size + c[0].vector_size = model.dv.vector_size c[0].workers = model.workers c[0].docvecs_count = docvecs_count c[0].window = model.window c[0].expected_doctag_len = model.dm_tag_count - if '\0' in model.wv.vocab: - c[0].null_word_index = model.wv.vocab['\0'].index + if '\0' in model.wv: + c[0].null_word_index = model.wv.get_index('\0') # default vectors, locks from syn0/doctag_syn0 if word_vectors is None: word_vectors = model.wv.vectors c[0].word_vectors = (np.PyArray_DATA(word_vectors)) if doctag_vectors is None: - doctag_vectors = model.docvecs.vectors_docs + doctag_vectors = model.dv.vectors c[0].doctag_vectors = (np.PyArray_DATA(doctag_vectors)) - if word_locks is None: - word_locks = model.trainables.vectors_lockf - c[0].word_locks = (np.PyArray_DATA(word_locks)) - if doctag_locks is None: - doctag_locks = model.trainables.vectors_docs_lockf - c[0].doctag_locks = (np.PyArray_DATA(doctag_locks)) + if words_lockf is None: + words_lockf = model.wv.vectors_lockf + c[0].words_lockf = (np.PyArray_DATA(words_lockf)) + c[0].words_lockf_len = len(words_lockf) + if doctags_lockf is None: + doctags_lockf = model.dv.vectors_lockf + c[0].doctags_lockf = (np.PyArray_DATA(doctags_lockf)) + c[0].doctags_lockf_len = len(doctags_lockf) if c[0].hs: - c[0].syn1 = (np.PyArray_DATA(model.trainables.syn1)) + c[0].syn1 = (np.PyArray_DATA(model.syn1)) if c[0].negative: - c[0].syn1neg = (np.PyArray_DATA(model.trainables.syn1neg)) - c[0].cum_table = (np.PyArray_DATA(model.vocabulary.cum_table)) - c[0].cum_table_len = len(model.vocabulary.cum_table) + c[0].syn1neg = (np.PyArray_DATA(model.syn1neg)) + c[0].cum_table = (np.PyArray_DATA(model.cum_table)) + c[0].cum_table_len = len(model.cum_table) if c[0].negative or c[0].sample: c[0].next_random = (2**24) * model.random.randint(0, 2**24) + 
model.random.randint(0, 2**24) # convert Python structures to primitive types, so we can release the GIL if work is None: - work = zeros(model.trainables.layer1_size, dtype=REAL) + work = zeros(model.layer1_size, dtype=REAL) c[0].work = np.PyArray_DATA(work) if neu1 is None: - neu1 = zeros(model.trainables.layer1_size, dtype=REAL) + neu1 = zeros(model.layer1_size, dtype=REAL) c[0].neu1 = np.PyArray_DATA(neu1) def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True, - word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): + word_vectors=None, words_lockf=None, doctag_vectors=None, doctags_lockf=None): """Update distributed bag of words model ("PV-DBOW") by training on a single document. Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train` and @@ -310,13 +315,13 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, Whether or not the weights of the hidden layer will be updated. word_vectors : numpy.ndarray, optional The vector representation for each word in the vocabulary. If None, these will be retrieved from the model. - word_locks : numpy.ndarray, optional - A learning lock factor for each weight in the hidden layer for words, value 0 completely blocks updates, - a value of 1 allows to update word-vectors. + words_lockf : numpy.ndarray, optional + EXPERIMENTAL. A learning lock factor for each word-vector; value 0.0 completely blocks updates, a value + of 1.0 allows normal updates to word-vectors. doctag_vectors : numpy.ndarray, optional Vector representations of the tags. If None, these will be retrieved from the model. - doctag_locks : numpy.ndarray, optional - The lock factors for each tag, same as `word_locks`, but for document-vectors. + doctags_lockf : numpy.ndarray, optional + EXPERIMENTAL. The lock factors for each tag, same as `words_lockf`, but for document-vectors. 
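A hedged sketch of how these EXPERIMENTAL lockf arrays can be exercised from Python, assuming the `model.wv.vectors_lockf` / `model.dv.vectors_lockf` attribute names used elsewhere in this change. Because the Cython routines read `lockf[index % lockf_len]`, growing the default 1-element array to one float32 entry per vector gives per-vector control:

.. sourcecode:: pycon

    >>> import numpy as np
    >>> from gensim.models.doc2vec import Doc2Vec, TaggedDocument
    >>> from gensim.test.utils import common_texts
    >>>
    >>> documents = [TaggedDocument(words, [i]) for i, words in enumerate(common_texts)]
    >>> model = Doc2Vec(vector_size=16, min_count=1, epochs=5)
    >>> model.build_vocab(documents)
    >>>
    >>> # Freeze every word-vector: one 0.0 factor per word (dtype must stay float32 / REAL).
    >>> model.wv.vectors_lockf = np.zeros(len(model.wv), dtype=np.float32)
    >>> # model.dv.vectors_lockf keeps its default single 1.0, so doc-vectors still train normally.
    >>> model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)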
Returns ------- @@ -328,26 +333,30 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, cdef int i, j cdef long result = 0 + cdef np.uint32_t *vocab_sample_ints init_d2v_config(&c, model, alpha, learn_doctags, learn_words, learn_hidden, train_words=train_words, work=work, - neu1=None, word_vectors=word_vectors, word_locks=word_locks, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks) - + neu1=None, word_vectors=word_vectors, words_lockf=words_lockf, + doctag_vectors=doctag_vectors, doctags_lockf=doctags_lockf) c.doctag_len = min(MAX_DOCUMENT_LEN, len(doctag_indexes)) + if c.sample: + vocab_sample_ints = np.PyArray_DATA(model.wv.expandos['sample_int']) + if c.hs: + vocab_codes = model.wv.expandos['code'] + vocab_points = model.wv.expandos['point'] - vlookup = model.wv.vocab i = 0 for token in doc_words: - predict_word = vlookup[token] if token in vlookup else None - if predict_word is None: # shrink document to leave out word + word_index = model.wv.key_to_index.get(token, None) + if word_index is None: # shrink document to leave out word continue # leaving i unchanged - if c.sample and predict_word.sample_int < random_int32(&c.next_random): + if c.sample and vocab_sample_ints[word_index] < random_int32(&c.next_random): continue - c.indexes[i] = predict_word.index + c.indexes[i] = word_index if c.hs: - c.codelens[i] = len(predict_word.code) - c.codes[i] = np.PyArray_DATA(predict_word.code) - c.points[i] = np.PyArray_DATA(predict_word.point) + c.codelens[i] = len(vocab_codes[word_index]) + c.codes[i] = np.PyArray_DATA(vocab_codes[word_index]) + c.points[i] = np.PyArray_DATA(vocab_points[word_index]) result += 1 i += 1 if i == MAX_DOCUMENT_LEN: @@ -379,31 +388,33 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, if c.hs: # we reuse the DBOW function, as it is equivalent to skip-gram for this purpose fast_document_dbow_hs(c.points[i], c.codes[i], c.codelens[i], c.word_vectors, c.syn1, c.layer1_size, - c.indexes[j], c.alpha, c.work, c.learn_words, c.learn_hidden, c.word_locks) + c.indexes[j], c.alpha, c.work, c.learn_words, c.learn_hidden, c.words_lockf, + c.words_lockf_len) if c.negative: # we reuse the DBOW function, as it is equivalent to skip-gram for this purpose c.next_random = fast_document_dbow_neg(c.negative, c.cum_table, c.cum_table_len, c.word_vectors, c.syn1neg, c.layer1_size, c.indexes[i], c.indexes[j], c.alpha, c.work, c.next_random, c.learn_words, - c.learn_hidden, c.word_locks) + c.learn_hidden, c.words_lockf, c.words_lockf_len) # docvec-training for j in range(c.doctag_len): if c.hs: fast_document_dbow_hs(c.points[i], c.codes[i], c.codelens[i], c.doctag_vectors, c.syn1, c.layer1_size, - c.doctag_indexes[j], c.alpha, c.work, c.learn_doctags, c.learn_hidden, c.doctag_locks) + c.doctag_indexes[j], c.alpha, c.work, c.learn_doctags, c.learn_hidden, c.doctags_lockf, + c.doctags_lockf_len) if c.negative: c.next_random = fast_document_dbow_neg(c.negative, c.cum_table, c.cum_table_len, c.doctag_vectors, c.syn1neg, c.layer1_size, c.indexes[i], c.doctag_indexes[j], c.alpha, c.work, c.next_random, c.learn_doctags, - c.learn_hidden, c.doctag_locks) + c.learn_hidden, c.doctags_lockf, c.doctags_lockf_len) return result def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, learn_doctags=True, learn_words=True, learn_hidden=True, - word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): + word_vectors=None, words_lockf=None, doctag_vectors=None, doctags_lockf=None): """Update 
distributed memory model ("PV-DM") by training on a single document. This method implements the DM model with a projection (input) layer that is either the sum or mean of the context vectors, depending on the model's `dm_mean` configuration field. @@ -435,13 +446,13 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N Whether or not the weights of the hidden layer will be updated. word_vectors : numpy.ndarray, optional The vector representation for each word in the vocabulary. If None, these will be retrieved from the model. - word_locks : numpy.ndarray, optional - A learning lock factor for each weight in the hidden layer for words, value 0 completely blocks updates, - a value of 1 allows to update word-vectors. + words_lockf : numpy.ndarray, optional + EXPERIMENTAL. A learning lock factor for each word-vector; value 0.0 completely blocks updates, a value + of 1.0 allows normal updates to word-vectors. doctag_vectors : numpy.ndarray, optional Vector representations of the tags. If None, these will be retrieved from the model. - doctag_locks : numpy.ndarray, optional - The lock factors for each tag, same as `word_locks`, but for document-vectors. + doctags_lockf : numpy.ndarray, optional + EXPERIMENTAL. The lock factors for each tag, same as `words_lockf`, but for document-vectors. Returns ------- @@ -454,26 +465,31 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N cdef REAL_t count, inv_count = 1.0 cdef int i, j, k, m cdef long result = 0 + cdef np.uint32_t *vocab_sample_ints init_d2v_config(&c, model, alpha, learn_doctags, learn_words, learn_hidden, train_words=False, - work=work, neu1=neu1, word_vectors=word_vectors, word_locks=word_locks, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks) - + work=work, neu1=neu1, word_vectors=word_vectors, words_lockf=words_lockf, + doctag_vectors=doctag_vectors, doctags_lockf=doctags_lockf) c.doctag_len = min(MAX_DOCUMENT_LEN, len(doctag_indexes)) + if c.sample: + vocab_sample_ints = np.PyArray_DATA(model.wv.expandos['sample_int']) +# vocab_sample_ints = model.wv.expandos['sample_int'] # this variant noticeably slower + if c.hs: + vocab_codes = model.wv.expandos['code'] + vocab_points = model.wv.expandos['point'] - vlookup = model.wv.vocab i = 0 for token in doc_words: - predict_word = vlookup[token] if token in vlookup else None - if predict_word is None: # shrink document to leave out word + word_index = model.wv.key_to_index.get(token, None) + if word_index is None: # shrink document to leave out word continue # leaving i unchanged - if c.sample and predict_word.sample_int < random_int32(&c.next_random): + if c.sample and vocab_sample_ints[word_index] < random_int32(&c.next_random): continue - c.indexes[i] = predict_word.index + c.indexes[i] = word_index if c.hs: - c.codelens[i] = len(predict_word.code) - c.codes[i] = np.PyArray_DATA(predict_word.code) - c.points[i] = np.PyArray_DATA(predict_word.point) + c.codelens[i] = len(vocab_codes[word_index]) + c.codes[i] = np.PyArray_DATA(vocab_codes[word_index]) + c.points[i] = np.PyArray_DATA(vocab_points[word_index]) result += 1 i += 1 if i == MAX_DOCUMENT_LEN: @@ -528,14 +544,14 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N # apply accumulated error in work if c.learn_doctags: for m in range(c.doctag_len): - our_saxpy(&c.layer1_size, &c.doctag_locks[c.doctag_indexes[m]], c.work, + our_saxpy(&c.layer1_size, &c.doctags_lockf[c.doctag_indexes[m] % c.doctags_lockf_len], c.work, &ONE, 
&c.doctag_vectors[c.doctag_indexes[m] * c.layer1_size], &ONE) if c.learn_words: for m in range(j, k): if m == i: continue else: - our_saxpy(&c.layer1_size, &c.word_locks[c.indexes[m]], c.work, &ONE, + our_saxpy(&c.layer1_size, &c.words_lockf[c.indexes[m] % c.words_lockf_len], c.work, &ONE, &c.word_vectors[c.indexes[m] * c.layer1_size], &ONE) return result @@ -543,10 +559,10 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, learn_doctags=True, learn_words=True, learn_hidden=True, - word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): - """Update distributed memory model ("PV-DM") by training on a single document, using a concatenation of the context - window word vectors (rather than a sum or average). - This might be slower since the input at each batch will be significantly larger. + word_vectors=None, words_lockf=None, doctag_vectors=None, doctags_lockf=None): + """Update distributed memory model ("PV-DM") by training on a single document, using a concatenation of the + context window word vectors (rather than a sum or average). + This will be slower since the input at each batch will be significantly larger. Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train` and :meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector`. @@ -575,13 +591,13 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, Whether or not the weights of the hidden layer will be updated. word_vectors : numpy.ndarray, optional The vector representation for each word in the vocabulary. If None, these will be retrieved from the model. - word_locks : numpy.ndarray, optional - A learning lock factor for each weight in the hidden layer for words, value 0 completely blocks updates, - a value of 1 allows to update word-vectors. + words_lockf : numpy.ndarray, optional + EXPERIMENTAL. A learning lock factor for each word-vector, value 0.0 completely blocks updates, a value + of 1.0 allows normal updates to word-vectors. doctag_vectors : numpy.ndarray, optional Vector representations of the tags. If None, these will be retrieved from the model. - doctag_locks : numpy.ndarray, optional - The lock factors for each tag, same as `word_locks`, but for document-vectors. + doctags_lockf : numpy.ndarray, optional + EXPERIMENTAL. The lock factors for each tag, same as `words_lockf`, but for document-vectors.
Returns ------- @@ -593,28 +609,32 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, cdef int i, j, k, m, n cdef long result = 0 + cdef np.uint32_t *vocab_sample_ints init_d2v_config(&c, model, alpha, learn_doctags, learn_words, learn_hidden, train_words=False, work=work, neu1=neu1, - word_vectors=word_vectors, word_locks=word_locks, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks) - + word_vectors=word_vectors, words_lockf=words_lockf, doctag_vectors=doctag_vectors, doctags_lockf=doctags_lockf) c.doctag_len = min(MAX_DOCUMENT_LEN, len(doctag_indexes)) + if c.sample: + vocab_sample_ints = np.PyArray_DATA(model.wv.expandos['sample_int']) + if c.hs: + vocab_codes = model.wv.expandos['code'] + vocab_points = model.wv.expandos['point'] if c.doctag_len != c.expected_doctag_len: return 0 # skip doc without expected number of tags - vlookup = model.wv.vocab i = 0 for token in doc_words: - predict_word = vlookup[token] if token in vlookup else None - if predict_word is None: # shrink document to leave out word + word_index = model.wv.key_to_index.get(token, None) + if word_index is None: # shrink document to leave out word continue # leaving i unchanged - if c.sample and predict_word.sample_int < random_int32(&c.next_random): + if c.sample and vocab_sample_ints[word_index] < random_int32(&c.next_random): continue - c.indexes[i] = predict_word.index + c.indexes[i] = word_index if c.hs: - c.codelens[i] = len(predict_word.code) - c.codes[i] = np.PyArray_DATA(predict_word.code) - c.points[i] = np.PyArray_DATA(predict_word.point) + c.codelens[i] = len(vocab_codes[word_index]) + c.codes[i] = np.PyArray_DATA(vocab_codes[word_index]) + c.points[i] = np.PyArray_DATA(vocab_points[word_index]) result += 1 i += 1 if i == MAX_DOCUMENT_LEN: @@ -662,11 +682,11 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, if c.learn_doctags: for m in range(c.doctag_len): - our_saxpy(&c.vector_size, &c.doctag_locks[c.doctag_indexes[m]], &c.work[m * c.vector_size], + our_saxpy(&c.vector_size, &c.doctags_lockf[c.doctag_indexes[m] % c.doctags_lockf_len], &c.work[m * c.vector_size], &ONE, &c.doctag_vectors[c.doctag_indexes[m] * c.vector_size], &ONE) if c.learn_words: for m in range(2 * c.window): - our_saxpy(&c.vector_size, &c.word_locks[c.window_indexes[m]], &c.work[(c.doctag_len + m) * c.vector_size], + our_saxpy(&c.vector_size, &c.words_lockf[c.window_indexes[m] % c.words_lockf_len], &c.work[(c.doctag_len + m) * c.vector_size], &ONE, &c.word_vectors[c.window_indexes[m] * c.vector_size], &ONE) return result diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 2307b04468..5c07a0b540 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Authors: Shiva Manne , Chinmaya Pancholi +# Authors: Gensim Contributors # Copyright (C) 2018 RaRe Technologies s.r.o. # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html @@ -37,20 +37,20 @@ ['human', 'interface', 'computer'] >>> print(len(common_texts)) 9 - >>> model = FastText(size=4, window=3, min_count=1) # instantiate + >>> model = FastText(vector_size=4, window=3, min_count=1) # instantiate >>> model.build_vocab(sentences=common_texts) >>> model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10) # train Once you have a model, you can access its keyed vectors via the `model.wv` attributes. 
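The snippet below is a small, hedged sketch of such `model.wv` queries on a toy corpus, using the renamed `vector_size` / `corpus_iterable` parameters from this change; `'computation'` is just an arbitrary out-of-vocabulary token:

.. sourcecode:: pycon

    >>> from gensim.models import FastText
    >>> from gensim.test.utils import common_texts
    >>>
    >>> model = FastText(vector_size=4, window=3, min_count=1)
    >>> model.build_vocab(corpus_iterable=common_texts)
    >>> model.train(corpus_iterable=common_texts, total_examples=len(common_texts), epochs=10)
    >>>
    >>> oov_vec = model.wv['computation']  # never seen in training, synthesized from char n-grams
    >>> sims = model.wv.most_similar('computer', topn=3)  # the usual KeyedVectors queries also work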
The keyed vectors instance is quite powerful: it can perform a wide range of NLP tasks. -For a full list of examples, see :class:`~gensim.models.keyedvectors.FastTextKeyedVectors`. +For a full list of examples, see :class:`~gensim.models.keyedvectors.KeyedVectors`. You can also pass all the above parameters to the constructor to do everything in a single line: .. sourcecode:: pycon - >>> model2 = FastText(size=4, window=3, min_count=1, sentences=common_texts, iter=10) + >>> model2 = FastText(vector_size=4, window=3, min_count=1, sentences=common_texts, iter=10) .. Important:: This style of initialize-and-train in a single line is **deprecated**. We include it here @@ -84,7 +84,7 @@ >>> from gensim.test.utils import datapath >>> >>> corpus_file = datapath('lee_background.cor') # absolute path to corpus - >>> model3 = FastText(size=4, window=3, min_count=1) + >>> model3 = FastText(vector_size=4, window=3, min_count=1) >>> model3.build_vocab(corpus_file=corpus_file) # scan over corpus to build the vocabulary >>> >>> total_words = model3.corpus_total_words # number of words in the corpus @@ -116,7 +116,7 @@ ... yield list(tokenize(line)) >>> >>> - >>> model4 = FastText(size=4, window=3, min_count=1) + >>> model4 = FastText(vector_size=4, window=3, min_count=1) >>> model4.build_vocab(sentences=MyIter()) >>> total_examples = model4.corpus_count >>> model4.train(sentences=MyIter(), total_examples=total_examples, epochs=5) @@ -257,12 +257,8 @@ The implementation is split across several submodules: - :mod:`gensim.models.fasttext`: This module. Contains FastText-specific functionality only. -- :mod:`gensim.models.keyedvectors`: Implements both generic and FastText-specific functionality. -- :mod:`gensim.models.word2vec`: Contains implementations for the vocabulary - and the trainables for FastText. -- :mod:`gensim.models.base_any2vec`: Contains implementations for the base. - classes, including functionality such as callbacks, logging. -- :mod:`gensim.models.utils_any2vec`: Wrapper over Cython extensions. +- :mod:`gensim.models.keyedvectors`: Implements generic functionality. +- :mod:`gensim.models.word2vec`: Provides much of the basic scan & train framework. - :mod:`gensim.utils`: Implements model I/O (loading and saving). Our implementation relies heavily on inheritance. @@ -271,7 +267,7 @@ - :class:`~gensim.models.word2vec.Word2VecVocab`: the vocabulary. Keeps track of all the unique words, sometimes discarding the extremely rare ones. This is sometimes called the Dictionary within Gensim. -- :class:`~gensim.models.keyedvectors.FastTextKeyedVectors`: the vectors. +- :class:`~gensim.models.fasttext.FastTextKeyedVectors`: the vectors. Once training is complete, this class is sufficient for calculating embeddings. - :class:`~gensim.models.fasttext.FastTextTrainables`: the underlying neural network. The implementation uses this class to *learn* the word embeddings. 
@@ -288,80 +284,57 @@ from collections.abc import Iterable import gensim.models._fasttext_bin - -from gensim.models.word2vec import Word2VecVocab, Word2VecTrainables, train_sg_pair, train_cbow_pair # noqa -from gensim.models.keyedvectors import FastTextKeyedVectors -from gensim.models.base_any2vec import BaseWordEmbeddingsModel -from gensim.models.utils_any2vec import ft_ngram_hashes - +from gensim.models.word2vec import Word2Vec +from gensim.models.keyedvectors import KeyedVectors from gensim import utils from gensim.utils import deprecated, call_on_class_only + logger = logging.getLogger(__name__) try: from gensim.models.fasttext_inner import ( # noqa: F401 - train_batch_sg, - train_batch_cbow, - FAST_VERSION, + train_batch_any, MAX_WORDS_IN_BATCH, + compute_ngrams, + compute_ngrams_bytes, + ft_hash_bytes, ) from gensim.models.fasttext_corpusfile import train_epoch_sg, train_epoch_cbow except ImportError: raise utils.NO_CYTHON -class FastText(BaseWordEmbeddingsModel): - """Train, use and evaluate word representations learned using the method - described in `Enriching Word Vectors with Subword Information `_, aka FastText. - - The model can be stored/loaded via its :meth:`~gensim.models.fasttext.FastText.save` and - :meth:`~gensim.models.fasttext.FastText.load` methods, or loaded from a format compatible with the original - Fasttext implementation via :func:`~gensim.models.fasttext.load_facebook_model`. +class FastText(Word2Vec): - Attributes - ---------- - wv : :class:`~gensim.models.keyedvectors.FastTextKeyedVectors` - This object essentially contains the mapping between words and embeddings. These are similar to the embeddings - computed in the :class:`~gensim.models.word2vec.Word2Vec`, however here we also include vectors for n-grams. - This allows the model to compute embeddings even for **unseen** words (that do not exist in the vocabulary), - as the aggregate of the n-grams included in the word. After training the model, this attribute can be used - directly to query those embeddings in various ways. Check the module level docstring for some examples. - vocabulary : :class:`~gensim.models.fasttext.FastTextVocab` - This object represents the vocabulary of the model. - Besides keeping track of all unique words, this object provides extra functionality, such as - constructing a huffman tree (frequent words are closer to the root), or discarding extremely rare words. - trainables : :class:`~gensim.models.fasttext.FastTextTrainables` - This object represents the inner shallow neural network used to train the embeddings. This is very - similar to the network of the :class:`~gensim.models.word2vec.Word2Vec` model, but it also trains weights - for the N-Grams (sequences of more than 1 words). The semantics of the network are almost the same as - the one used for the :class:`~gensim.models.word2vec.Word2Vec` model. - You can think of it as a NN with a single projection and hidden layer which we train on the corpus. - The weights are then used as our embeddings. An important difference however between the two models, is the - scoring function used to compute the loss. In the case of FastText, this is modified in word to also account - for the internal structure of words, besides their concurrence counts. 
- - """ - def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, + def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100, alpha=0.025, + window=5, min_count=5, max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, + negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(), - compatible_hash=True, max_final_vocab=None): - """ + max_final_vocab=None): + """Train, use and evaluate word representations learned using the method + described in `Enriching Word Vectors with Subword Information `_, + aka FastText. + + The model can be stored/loaded via its :meth:`~gensim.models.fasttext.FastText.save` and + :meth:`~gensim.models.fasttext.FastText.load` methods, or loaded from a format compatible with the + original Fasttext implementation via :func:`~gensim.models.fasttext.load_facebook_model`. Parameters ---------- sentences : iterable of list of str, optional Can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. - If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it - in some other way. + See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus' + or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such + examples. If you don't supply `sentences`, the model is left uninitialized -- use if you plan to + initialize it in some other way. corpus_file : str, optional Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or - `corpus_file` arguments need to be passed (or none of them, in that case, the model is left uninitialized). + `corpus_file` arguments need to be passed (or none of them, in that case, the model is left + uninitialized). min_count : int, optional The model ignores all words with total frequency lower than this. size : int, optional @@ -433,21 +406,16 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha max_n : int, optional Max length of char ngrams to be used for training word representations. Set `max_n` to be lesser than `min_n` to avoid char ngrams being used. - word_ngrams : {1,0}, optional - If 1, uses enriches word vectors with subword(n-grams) information. - If 0, this is equivalent to :class:`~gensim.models.word2vec.Word2Vec`. + word_ngrams : int, optional + In Facebook's FastText, "max length of word ngram" - but gensim only supports the + default of 1 (regular unigram word handling). bucket : int, optional Character ngrams are hashed into a fixed number of buckets, in order to limit the memory usage of the model. This option specifies the number of buckets used by the model. + The default value of 2000000 consumes as much memory as having 2000000 more in-vocabulary + words in your model. 
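To make that memory note concrete, a rough back-of-the-envelope sketch: the n-gram matrix is float32 with shape `(bucket, vector_size)`, so the bucket count converts directly into bytes (the reduced `bucket=100_000` below is purely illustrative):

.. sourcecode:: pycon

    >>> from gensim.models import FastText
    >>>
    >>> bucket, vector_size = 2_000_000, 100
    >>> bucket * vector_size * 4 / 1024 ** 3  # float32 = 4 bytes; the n-gram matrix alone, in GiB
    0.7450580596923828
    >>> # Shrinking the hash space trades a little n-gram resolution for a lot of RAM.
    >>> small_model = FastText(vector_size=100, min_count=1, bucket=100_000)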
callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`, optional List of callbacks that need to be executed/run at specific stages during training. - - compatible_hash: bool, optional - By default, newer versions of Gensim's FastText use a hash function - that is 100% compatible with Facebook's FastText. - Older versions were not 100% compatible due to a bug. - To use the older, incompatible hash function, set this to False. - max_final_vocab : int, optional Limits the vocab to a target vocab size by automatically selecting ``min_count```. If the specified ``min_count`` is more than the @@ -467,87 +435,89 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha >>> say_vector = model.wv['say'] # get vector for word >>> of_vector = model.wv['of'] # get vector for out-of-vocab word + Attributes + ---------- + wv : :class:`~gensim.models.fasttext.FastTextKeyedVectors` + This object essentially contains the mapping between words and embeddings. These are similar to + the embedding computed in the :class:`~gensim.models.word2vec.Word2Vec`, however here we also + include vectors for n-grams. This allows the model to compute embeddings even for **unseen** + words (that do not exist in the vocabulary), as the aggregate of the n-grams included in the word. + After training the model, this attribute can be used directly to query those embeddings in various + ways. Check the module level docstring for some examples. + """ self.load = call_on_class_only self.load_fasttext_format = call_on_class_only self.callbacks = callbacks - self.word_ngrams = int(word_ngrams) - if self.word_ngrams <= 1 and max_n == 0: + if word_ngrams != 1: + raise NotImplementedError("Gensim's FastText implementation does not yet support word_ngrams != 1.") + self.word_ngrams = word_ngrams + if max_n < min_n: + # with no eligible char-ngram lengths, no buckets need be allocated bucket = 0 - self.wv = FastTextKeyedVectors(size, min_n, max_n, bucket, compatible_hash) - self.vocabulary = FastTextVocab( - max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, - sorted_vocab=bool(sorted_vocab), null_word=null_word, ns_exponent=ns_exponent, - max_final_vocab=max_final_vocab, - ) - self.trainables = FastTextTrainables(vector_size=size, seed=seed, bucket=bucket, hashfxn=hashfxn) - self.trainables.prepare_weights(hs, negative, self.wv, update=False, vocabulary=self.vocabulary) - self.wv.bucket = self.trainables.bucket + self.wv = FastTextKeyedVectors(vector_size, min_n, max_n, bucket) + self.wv.bucket = bucket super(FastText, self).__init__( - sentences=sentences, corpus_file=corpus_file, workers=workers, vector_size=size, epochs=iter, + sentences=sentences, corpus_file=corpus_file, workers=workers, vector_size=vector_size, epochs=epochs, callbacks=callbacks, batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, + max_vocab_size=max_vocab_size, max_final_vocab=max_final_vocab, + min_count=min_count, sample=sample, sorted_vocab=sorted_vocab, + null_word=null_word, ns_exponent=ns_exponent, hashfxn=hashfxn, seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha) - @property - @deprecated("Attribute will be removed in 4.0.0, use wv.min_n instead") - def min_n(self): - return self.wv.min_n - - @property - @deprecated("Attribute will be removed in 4.0.0, use wv.max_n instead") - def max_n(self): - return self.wv.max_n - - @property - @deprecated("Attribute will be removed in 4.0.0, use trainables.bucket instead") - def bucket(self): - 
return self.trainables.bucket - - @property - @deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_vocab_lockf instead") - def syn0_vocab_lockf(self): - return self.trainables.vectors_vocab_lockf - - @syn0_vocab_lockf.setter - @deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_vocab_lockf instead") - def syn0_vocab_lockf(self, value): - self.trainables.vectors_vocab_lockf = value - - @syn0_vocab_lockf.deleter - @deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_vocab_lockf instead") - def syn0_vocab_lockf(self): - del self.trainables.vectors_vocab_lockf - - @property - @deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_ngrams_lockf instead") - def syn0_ngrams_lockf(self): - return self.trainables.vectors_ngrams_lockf - - @syn0_ngrams_lockf.setter - @deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_ngrams_lockf instead") - def syn0_ngrams_lockf(self, value): - self.trainables.vectors_ngrams_lockf = value - - @syn0_ngrams_lockf.deleter - @deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_ngrams_lockf instead") - def syn0_ngrams_lockf(self): - del self.trainables.vectors_ngrams_lockf - - @property - @deprecated("Attribute will be removed in 4.0.0, use self.wv.num_ngram_vectors instead") - def num_ngram_vectors(self): - return self.wv.num_ngram_vectors - - def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_per=10000, keep_raw_vocab=False, - trim_rule=None, **kwargs): + def prepare_weights(self, update=False): + """In addition to superclass allocations, compute ngrams of all words present in vocabulary. + + Parameters + ---------- + update : bool + If True, the new vocab words and their new ngrams word vectors are initialized + with random uniform distribution and updated/added to the existing vocab word and ngram vectors. + """ + super(FastText, self).prepare_weights(update=update) + if not update: + self.wv.init_ngrams_weights(self.seed) + # EXPERIMENTAL lockf feature; create minimal no-op lockf arrays (1 element of 1.0) + # advanced users should directly resize/adjust as necessary + self.wv.vectors_vocab_lockf = ones(1, dtype=REAL) + self.wv.vectors_ngrams_lockf = ones(1, dtype=REAL) + else: + self.wv.update_ngrams_weights(self.seed, self.old_vocab_len) + # EXPERIMENTAL lockf feature; create minimal no-op lockf arrays (1 element of 1.0) + # advanced users should directly resize/adjust as necessary + self.wv.vectors_vocab_lockf = ones(1, dtype=REAL) + self.wv.vectors_ngrams_lockf = ones(1, dtype=REAL) + + def _init_post_load(self, hidden_output): + num_vectors = len(self.wv.vectors) + vocab_size = len(self.wv) + vector_size = self.wv.vector_size + + assert num_vectors > 0, 'expected num_vectors to be initialized already' + assert vocab_size > 0, 'expected vocab_size to be initialized already' + + # EXPERIMENTAL lockf feature; create minimal no-op lockf arrays (1 element of 1.0) + # advanced users should directly resize/adjust as necessary + self.wv.vectors_ngrams_lockf = ones(1, dtype=REAL) + self.wv.vectors_vocab_lockf = ones(1, dtype=REAL) + + if self.hs: + self.syn1 = hidden_output + if self.negative: + self.syn1neg = hidden_output + + self.layer1_size = vector_size + + def build_vocab(self, corpus_iterable=None, corpus_file=None, update=False, progress_per=10000, + keep_raw_vocab=False, trim_rule=None, **kwargs): """Build vocabulary from a sequence of sentences (can be a once-only generator stream). 
Each sentence must be a list of unicode strings. Parameters ---------- - sentences : iterable of list of str, optional + corpus_iterable : iterable of list of str, optional Can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` @@ -578,7 +548,7 @@ def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_p **kwargs Additional key word parameters passed to - :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.build_vocab`. + :meth:`~gensim.models.word2vec.Word2Vec.build_vocab`. Examples -------- @@ -599,8 +569,8 @@ def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_p """ if not update: - self.wv.init_ngrams_weights(self.trainables.seed) - elif not len(self.wv.vocab): + self.wv.init_ngrams_weights(self.seed) + elif not len(self.wv): raise RuntimeError( "You cannot do an online vocabulary-update of a model which has no prior vocabulary. " "First build the vocabulary of your model with a corpus " @@ -608,78 +578,49 @@ def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_p "before doing an online update." ) else: - self.vocabulary.old_vocab_len = len(self.wv.vocab) + self.old_vocab_len = len(self.wv) retval = super(FastText, self).build_vocab( - sentences=sentences, corpus_file=corpus_file, update=update, progress_per=progress_per, + corpus_iterable=corpus_iterable, corpus_file=corpus_file, update=update, progress_per=progress_per, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs) if update: - self.wv.update_ngrams_weights(self.trainables.seed, self.vocabulary.old_vocab_len) + self.wv.update_ngrams_weights(self.seed, self.old_vocab_len) return retval - def _set_train_params(self, **kwargs): - # - # We need the wv.buckets_word member to be initialized in order to - # continue training. The _clear_post_train method destroys this - # variable, so we reinitialize it here, if needed. - # - # The .old_vocab_len member is set only to keep the init_ngrams_weights method happy. 
- # - if self.wv.buckets_word is None: - self.vocabulary.old_vocab_len = len(self.wv.vocab) - self.trainables.init_ngrams_weights(self.wv, update=True, vocabulary=self.vocabulary) - def _clear_post_train(self): """Clear the model's internal structures after training has finished to free up RAM.""" self.wv.vectors_norm = None - self.wv.vectors_vocab_norm = None - self.wv.vectors_ngrams_norm = None - self.wv.buckets_word = None + self.wv.adjust_vectors() # ensure composite-word vecs reflect latest training def estimate_memory(self, vocab_size=None, report=None): - vocab_size = vocab_size or len(self.wv.vocab) + """Estimate memory that will be needed to train a model, and print the estimates to log.""" + vocab_size = vocab_size or len(self.wv) vec_size = self.vector_size * np.dtype(np.float32).itemsize - l1_size = self.trainables.layer1_size * np.dtype(np.float32).itemsize + l1_size = self.layer1_size * np.dtype(np.float32).itemsize report = report or {} - report['vocab'] = len(self.wv.vocab) * (700 if self.hs else 500) - report['syn0_vocab'] = len(self.wv.vocab) * vec_size - num_buckets = self.trainables.bucket + report['vocab'] = len(self.wv) * (700 if self.hs else 500) + report['syn0_vocab'] = len(self.wv) * vec_size + num_buckets = self.wv.bucket if self.hs: - report['syn1'] = len(self.wv.vocab) * l1_size + report['syn1'] = len(self.wv) * l1_size if self.negative: - report['syn1neg'] = len(self.wv.vocab) * l1_size - if self.word_ngrams > 0 and self.wv.vocab: - num_buckets = num_ngrams = 0 - - if self.trainables.bucket: - buckets = set() - num_ngrams = 0 - for word in self.wv.vocab: - hashes = ft_ngram_hashes( - word, - self.wv.min_n, - self.wv.max_n, - self.trainables.bucket, - self.wv.compatible_hash - ) - num_ngrams += len(hashes) - buckets.update(hashes) - num_buckets = len(buckets) - report['syn0_ngrams'] = num_buckets * vec_size - # A tuple (48 bytes) with num_ngrams_word ints (8 bytes) for each word + report['syn1neg'] = len(self.wv) * l1_size + if self.wv.bucket: + report['syn0_ngrams'] = self.wv.bucket * vec_size + num_ngrams = 0 + for word in self.wv.key_to_index: + hashes = ft_ngram_hashes(word, self.wv.min_n, self.wv.max_n, self.wv.bucket) + num_ngrams += len(hashes) + # A list (64 bytes) with one np.array (100 bytes) per key, with a total of + # num_ngrams uint32s (4 bytes) amongst them # Only used during training, not stored with the model - report['buckets_word'] = 48 * len(self.wv.vocab) + 8 * num_ngrams - elif self.word_ngrams > 0: - logger.warn( - 'subword information is enabled, but no vocabulary could be found, estimated required memory might be ' - 'inaccurate!' - ) + report['buckets_word'] = 64 + (100 * len(self.wv)) + (4 * num_ngrams) # FIXME: caching & calc sensible? 
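A short, hedged usage sketch of `estimate_memory` on a toy corpus; the sizes are arbitrary, and the report keys listed are the ones filled in by the code above under the default `hs=0, negative=5` settings:

.. sourcecode:: pycon

    >>> from gensim.models import FastText
    >>> from gensim.test.utils import common_texts
    >>>
    >>> model = FastText(vector_size=50, min_count=1, bucket=50_000)
    >>> model.build_vocab(corpus_iterable=common_texts)  # logs an estimate as a side effect
    >>> report = model.estimate_memory()  # or ask for the raw dict directly
    >>> sorted(report)
    ['buckets_word', 'syn0_ngrams', 'syn0_vocab', 'syn1neg', 'total', 'vocab']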
report['total'] = sum(report.values()) logger.info( "estimated required memory for %i words, %i buckets and %i dimensions: %i bytes", - len(self.wv.vocab), num_buckets, self.vector_size, report['total'] + len(self.wv), num_buckets, self.vector_size, report['total'], ) return report @@ -719,15 +660,11 @@ def _do_train_job(self, sentences, alpha, inits): """ work, neu1 = inits - tally = 0 - if self.sg: - tally += train_batch_sg(self, sentences, alpha, work, neu1) - else: - tally += train_batch_cbow(self, sentences, alpha, work, neu1) + tally = train_batch_any(self, sentences, alpha, work, neu1) return tally, self._raw_word_count(sentences) - def train(self, sentences=None, corpus_file=None, total_examples=None, total_words=None, + def train(self, corpus_iterable=None, corpus_file=None, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0, callbacks=(), **kwargs): """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). @@ -794,20 +731,21 @@ def train(self, sentences=None, corpus_file=None, total_examples=None, total_wor """ - if corpus_file is None and sentences is None: - raise TypeError("Either one of corpus_file or sentences value must be provided") + if corpus_file is None and corpus_iterable is None: + raise TypeError("Either one of corpus_file or corpus_iterable value must be provided") - if corpus_file is not None and sentences is not None: - raise TypeError("Both corpus_file and sentences must not be provided at the same time") + if corpus_file is not None and corpus_iterable is not None: + raise TypeError("Both corpus_file and corpus_iterable must not be provided at the same time") - if sentences is None and not os.path.isfile(corpus_file): + if corpus_iterable is None and not os.path.isfile(corpus_file): raise TypeError("Parameter corpus_file must be a valid path to a file, got %r instead" % corpus_file) - if sentences is not None and not isinstance(sentences, Iterable): - raise TypeError("sentences must be an iterable of list, got %r instead" % sentences) + if corpus_iterable is not None and not isinstance(corpus_iterable, Iterable): + raise TypeError("sentences must be an iterable of list, got %r instead" % corpus_iterable) super(FastText, self).train( - sentences=sentences, corpus_file=corpus_file, total_examples=total_examples, total_words=total_words, + corpus_iterable=corpus_iterable, corpus_file=corpus_file, + total_examples=total_examples, total_words=total_words, epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count, queue_factor=queue_factor, report_delay=report_delay, callbacks=callbacks) self.wv.adjust_vectors() @@ -825,8 +763,8 @@ def init_sims(self, replace=False): # init_sims() resides in KeyedVectors because it deals with input layer mainly, but because the # hidden layer is not an attribute of KeyedVectors, it has to be deleted in this class. # The normalizing of input layer happens inside of KeyedVectors. - if replace and hasattr(self.trainables, 'syn1'): - del self.trainables.syn1 + if replace and hasattr(self, 'syn1'): + del self.syn1 self.wv.init_sims(replace) def clear_sims(self): @@ -837,24 +775,6 @@ def clear_sims(self): """ self._clear_post_train() - @deprecated("Method will be removed in 4.0.0, use self.wv.__getitem__() instead") - def __getitem__(self, words): - """Deprecated. Use self.wv.__getitem__() instead. 
- - Refer to the documentation for :meth:`gensim.models.keyedvectors.KeyedVectors.__getitem__` - - """ - return self.wv.__getitem__(words) - - @deprecated("Method will be removed in 4.0.0, use self.wv.__contains__() instead") - def __contains__(self, word): - """Deprecated. Use self.wv.__contains__() instead. - - Refer to the documentation for :meth:`gensim.models.keyedvectors.KeyedVectors.__contains__` - - """ - return self.wv.__contains__(word) - @classmethod @deprecated( 'use load_facebook_vectors (to use pretrained embeddings) or load_facebook_model ' @@ -902,8 +822,7 @@ def save(self, *args, **kwargs): Load :class:`~gensim.models.fasttext.FastText` model. """ - kwargs['ignore'] = kwargs.get( - 'ignore', ['vectors_norm', 'vectors_vocab_norm', 'vectors_ngrams_norm', 'buckets_word']) + kwargs['ignore'] = kwargs.get('ignore', []) + ['buckets_word', ] super(FastText, self).save(*args, **kwargs) @classmethod @@ -926,153 +845,43 @@ def load(cls, *args, **kwargs): Save :class:`~gensim.models.fasttext.FastText` model. """ - try: - model = super(FastText, cls).load(*args, **kwargs) - - if not hasattr(model.trainables, 'vectors_vocab_lockf') and hasattr(model.wv, 'vectors_vocab'): - model.trainables.vectors_vocab_lockf = ones(model.wv.vectors_vocab.shape, dtype=REAL) - if not hasattr(model.trainables, 'vectors_ngrams_lockf') and hasattr(model.wv, 'vectors_ngrams'): - model.trainables.vectors_ngrams_lockf = ones(model.wv.vectors_ngrams.shape, dtype=REAL) - - if not hasattr(model.wv, 'bucket'): - model.wv.bucket = model.trainables.bucket - except AttributeError: - logger.info('Model saved using code from earlier Gensim Version. Re-loading old model in a compatible way.') - from gensim.models.deprecated.fasttext import load_old_fasttext - model = load_old_fasttext(*args, **kwargs) - - gensim.models.keyedvectors._try_upgrade(model.wv) + model = super(FastText, cls).load(*args, rethrow=True, **kwargs) + + if not hasattr(model.wv, 'vectors_vocab_lockf') and hasattr(model.wv, 'vectors_vocab'): + # TODO: try trainables-location + model.wv.vectors_vocab_lockf = ones(1, dtype=REAL) + if not hasattr(model, 'vectors_ngrams_lockf') and hasattr(model.wv, 'vectors_ngrams'): + # TODO: try trainables-location + model.wv.vectors_ngrams_lockf = ones(1, dtype=REAL) + # fixup mistakenly overdimensioned gensim-3.x lockf arrays + if len(model.wv.vectors_vocab_lockf.shape) > 1: + model.wv.vectors_vocab_lockf = ones(1, dtype=REAL) + if len(model.wv.vectors_ngrams_lockf.shape) > 1: + model.wv.vectors_ngrams_lockf = ones(1, dtype=REAL) + if hasattr(model, 'bucket'): + del model.bucket # should only exist in one place: the wv subcomponent + if not hasattr(model.wv, 'buckets_word') or not model.wv.buckets_word: + model.wv.recalc_char_ngram_buckets() return model - @deprecated("Method will be removed in 4.0.0, use self.wv.accuracy() instead") - def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_insensitive=True): - most_similar = most_similar or FastTextKeyedVectors.most_similar - return self.wv.accuracy(questions, restrict_vocab, most_similar, case_insensitive) - -class FastTextVocab(Word2VecVocab): +class FastTextVocab(utils.SaveLoad): """This is a redundant class. It exists only to maintain backwards compatibility with older gensim versions.""" - pass - - -class FastTextTrainables(Word2VecTrainables): - """Represents the inner shallow neural network used to train :class:`~gensim.models.fasttext.FastText`. - - Mostly inherits from its parent (:class:`~gensim.models.word2vec.Word2VecTrainables`). 
- Adds logic for calculating and maintaining ngram weights. - - Attributes - ---------- - hashfxn : function - Used for randomly initializing weights. Defaults to the built-in hash() - layer1_size : int - The size of the inner layer of the NN. Equal to the vector dimensionality. - Set in the :class:`~gensim.models.word2vec.Word2VecTrainables` constructor. - seed : float - The random generator seed used in reset_weights and update_weights. - syn1 : numpy.array - The inner layer of the NN. Each row corresponds to a term in the vocabulary. - Columns correspond to weights of the inner layer. - There are layer1_size such weights. - Set in the reset_weights and update_weights methods, only if hierarchical sampling is used. - syn1neg : numpy.array - Similar to syn1, but only set if negative sampling is used. - vectors_lockf : numpy.array - A one-dimensional array with one element for each term in the vocab. Set in reset_weights to an array of ones. - vectors_vocab_lockf : numpy.array - Similar to vectors_vocab_lockf, ones(len(model.trainables.vectors), dtype=REAL) - vectors_ngrams_lockf : numpy.array - np.ones((self.bucket, wv.vector_size), dtype=REAL) - - """ - def __init__(self, vector_size=100, seed=1, hashfxn=hash, bucket=2000000): - super(FastTextTrainables, self).__init__( - vector_size=vector_size, seed=seed, hashfxn=hashfxn) - self.bucket = int(bucket) - - # - # There are also two "hidden" attributes that get initialized outside - # this constructor: - # - # 1. vectors_vocab_lockf - # 2. vectors_ngrams_lockf - # - # These are both 2D matrices of shapes equal to the shapes of - # wv.vectors_vocab and wv.vectors_ngrams. So, each row corresponds to - # a vector, and each column corresponds to a dimension within that - # vector. - # - # Lockf stands for "lock factor": zero values suppress learning, one - # values enable it. Interestingly, the vectors_vocab_lockf and - # vectors_ngrams_lockf seem to be used only by the C code in - # fasttext_inner.pyx. - # - # The word2vec implementation also uses vectors_lockf: in that case, - # it's a 1D array, with a real number for each vector. The FastText - # implementation inherits this vectors_lockf attribute but doesn't - # appear to use it. - # - - def prepare_weights(self, hs, negative, wv, update=False, vocabulary=None): - super(FastTextTrainables, self).prepare_weights(hs, negative, wv, update=update, vocabulary=vocabulary) - self.init_ngrams_weights(wv, update=update, vocabulary=vocabulary) - - def init_ngrams_weights(self, wv, update=False, vocabulary=None): - """Compute ngrams of all words present in vocabulary and stores vectors for only those ngrams. - Vectors for other ngrams are initialized with a random uniform distribution in FastText. - - Parameters - ---------- - wv : :class:`~gensim.models.keyedvectors.FastTextKeyedVectors` - Contains the mapping between the words and embeddings. - The vectors for the computed ngrams will go here. - update : bool - If True, the new vocab words and their new ngrams word vectors are initialized - with random uniform distribution and updated/added to the existing vocab word and ngram vectors. - vocabulary : :class:`~gensim.models.fasttext.FastTextVocab` - This object represents the vocabulary of the model. - If update is True, then vocabulary may not be None. 
- - """ - if not update: - wv.init_ngrams_weights(self.seed) - self.vectors_vocab_lockf = ones(wv.vectors_vocab.shape, dtype=REAL) - self.vectors_ngrams_lockf = ones(wv.vectors_ngrams.shape, dtype=REAL) - else: - wv.update_ngrams_weights(self.seed, vocabulary.old_vocab_len) - self.vectors_vocab_lockf = _pad_ones(self.vectors_vocab_lockf, wv.vectors_vocab.shape) - self.vectors_ngrams_lockf = _pad_ones(self.vectors_ngrams_lockf, wv.vectors_ngrams.shape) - - def init_post_load(self, model, hidden_output): - num_vectors = len(model.wv.vectors) - vocab_size = len(model.wv.vocab) - vector_size = model.wv.vector_size - - assert num_vectors > 0, 'expected num_vectors to be initialized already' - assert vocab_size > 0, 'expected vocab_size to be initialized already' - self.vectors_ngrams_lockf = ones(model.wv.vectors_ngrams.shape, dtype=REAL) - self.vectors_vocab_lockf = ones(model.wv.vectors_vocab.shape, dtype=REAL) - - if model.hs: - self.syn1 = hidden_output - if model.negative: - self.syn1neg = hidden_output - self.layer1_size = vector_size +class FastTextTrainables(utils.SaveLoad): + """Obsolete class retained for backward-compatible load()s""" -def _pad_ones(m, new_shape): - """Pad a matrix with additional rows filled with ones.""" - assert m.shape[0] <= new_shape[0], 'the new number of rows must be greater' - assert m.shape[1] == new_shape[1], 'the number of columns must match' - new_rows = new_shape[0] - m.shape[0] - if new_rows == 0: - return m - suffix = ones((new_rows, m.shape[1]), dtype=REAL) - return vstack([m, suffix]) +def _pad_ones(m, new_len): + """Pad array with additional entries filled with ones.""" + if len(m) > new_len: + raise ValueError('the new number of rows %i must be greater than old %i' % (new_len, len(m))) + new_arr = np.ones(new_len, dtype=REAL) + new_arr[:len(m)] = m + return new_arr def load_facebook_model(path, encoding='utf-8'): @@ -1165,7 +974,7 @@ def load_facebook_vectors(path, encoding='utf-8'): Returns ------- - gensim.models.keyedvectors.FastTextKeyedVectors + gensim.models.fasttext.FastTextKeyedVectors The word embeddings. Examples @@ -1193,8 +1002,8 @@ def load_facebook_vectors(path, encoding='utf-8'): model training. """ - model_wrapper = _load_fasttext_format(path, encoding=encoding, full_model=False) - return model_wrapper.wv + full_model = _load_fasttext_format(path, encoding=encoding, full_model=False) + return full_model.wv def _load_fasttext_format(model_file, encoding='utf-8', full_model=True): @@ -1220,9 +1029,9 @@ def _load_fasttext_format(model_file, encoding='utf-8', full_model=True): m = gensim.models._fasttext_bin.load(fin, encoding=encoding, full_model=full_model) model = FastText( - size=m.dim, + vector_size=m.dim, window=m.ws, - iter=m.epoch, + epochs=m.epoch, negative=m.neg, hs=int(m.loss == 1), sg=int(m.model == 2), @@ -1233,9 +1042,9 @@ def _load_fasttext_format(model_file, encoding='utf-8', full_model=True): max_n=m.maxn, ) model.corpus_total_words = m.ntokens - model.vocabulary.raw_vocab = m.raw_vocab - model.vocabulary.nwords = m.nwords - model.vocabulary.vocab_size = m.vocab_size + model.raw_vocab = m.raw_vocab + model.nwords = m.nwords + model.vocab_size = m.vocab_size # # This is here to fix https://github.com/RaRe-Technologies/gensim/pull/2373. @@ -1249,15 +1058,13 @@ def _load_fasttext_format(model_file, encoding='utf-8', full_model=True): # Native models trained _without_ pretrained vectors already contain the # trimmed raw_vocab, so this change does not affect them. 
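For orientation, a hedged sketch contrasting the two public loaders built on this helper; it reuses the `crime-and-punishment.bin` test fixture referenced in the docstrings below:

.. sourcecode:: pycon

    >>> from gensim.models.fasttext import load_facebook_model, load_facebook_vectors
    >>> from gensim.test.utils import datapath
    >>>
    >>> cap_path = datapath("crime-and-punishment.bin")
    >>> wv = load_facebook_vectors(cap_path)  # lightweight: FastTextKeyedVectors only, no further training
    >>> vec = wv['landlord']  # in- or out-of-vocabulary, a vector comes back
    >>> full_model = load_facebook_model(cap_path)  # heavyweight: a trainable FastText, incl. hidden weights
    >>> full_model.wv.vector_size == wv.vector_size
    True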
# - model.vocabulary.prepare_vocab( - model.hs, model.negative, model.wv, - update=True, min_count=1, - ) + model.prepare_vocab(update=True, min_count=1) model.num_original_vectors = m.vectors_ngrams.shape[0] model.wv.init_post_load(m.vectors_ngrams) - model.trainables.init_post_load(model, m.hidden_output) + model._init_post_load(m.hidden_output) + _check_model(model) logger.info("loaded %s weight matrix for fastText model from %s", m.vectors_ngrams.shape, fin.name) @@ -1265,35 +1072,34 @@ def _load_fasttext_format(model_file, encoding='utf-8', full_model=True): def _check_model(m): - # - # These checks only make sense after everything has been completely initialized. - # - assert m.wv.vector_size == m.wv.vectors_ngrams.shape[1], ( - 'mismatch between vector size in model params ({}) and model vectors ({})' - .format(m.wv.vector_size, m.wv.vectors_ngrams) - ) + """Model sanity checks. Run after everything has been completely initialized.""" + if m.wv.vector_size != m.wv.vectors_ngrams.shape[1]: + raise ValueError( + 'mismatch between vector size in model params (%s) and model vectors (%s)' % ( + m.wv.vector_size, m.wv.vectors_ngrams, + ) + ) - try: - syn1neg = m.trainables.syn1neg - except AttributeError: - syn1neg = None + if hasattr(m, 'syn1neg') and m.syn1neg is not None: + if m.wv.vector_size != m.syn1neg.shape[1]: + raise ValueError( + 'mismatch between vector size in model params (%s) and trainables (%s)' % ( + m.wv.vector_size, m.wv.vectors_ngrams, + ) + ) - if syn1neg is not None: - assert m.wv.vector_size == m.trainables.syn1neg.shape[1], ( - 'mismatch between vector size in model params ({}) and trainables ({})' - .format(m.wv.vector_size, m.wv.vectors_ngrams) + if len(m.wv) != m.nwords: + raise ValueError( + 'mismatch between final vocab size (%s words), and expected number of words (%s words)' % ( + len(m.wv), m.nwords, + ) ) - assert len(m.wv.vocab) == m.vocabulary.nwords, ( - 'mismatch between final vocab size ({} words), ' - 'and expected number of words ({} words)'.format(len(m.wv.vocab), m.vocabulary.nwords) - ) - - if len(m.wv.vocab) != m.vocabulary.vocab_size: + if len(m.wv) != m.vocab_size: # expecting to log this warning only for pretrained french vector, wiki.fr logger.warning( "mismatch between final vocab size (%s words), and expected vocab size (%s words)", - len(m.wv.vocab), m.vocabulary.vocab_size + len(m.wv), m.vocab_size, ) @@ -1332,3 +1138,429 @@ def save_facebook_model(model, path, encoding="utf-8", lr_update_rate=100, word_ """ fb_fasttext_parameters = {"lr_update_rate": lr_update_rate, "word_ngrams": word_ngrams} gensim.models._fasttext_bin.save(model, path, fb_fasttext_parameters, encoding) + + +class FastTextKeyedVectors(KeyedVectors): + def __init__(self, vector_size, min_n, max_n, bucket): + """Vectors and vocab for :class:`~gensim.models.fasttext.FastText`. + + Implements significant parts of the FastText algorithm. For example, + the :func:`word_vec` calculates vectors for out-of-vocabulary (OOV) + entities. FastText achieves this by keeping vectors for ngrams: + adding the vectors for the ngrams of an entity yields the vector for the + entity. + + Similar to a hashmap, this class keeps a fixed number of buckets, and + maps all ngrams to buckets using a hash function. + + Parameters + ---------- + vector_size : int + The dimensionality of all vectors. + min_n : int + The minimum number of characters in an ngram + max_n : int + The maximum number of characters in an ngram + bucket : int + The number of buckets. 
+ + Attributes + ---------- + vectors_vocab : np.array + Each row corresponds to a vector for an entity in the vocabulary. + Columns correspond to vector dimensions. When embedded in a full + FastText model, these are the full-word-token vectors updated + by training, whereas the inherited vectors are the actual per-word + vectors synthesized from the full-word-token and all subword (ngram) + vectors. + vectors_ngrams : np.array + A vector for each ngram across all entities in the vocabulary. + Each row is a vector that corresponds to a bucket. + Columns correspond to vector dimensions. + buckets_word : list of np.array + For each key (by its index), report bucket slots their subwords map to. + + When used in training, FastTextKeyedVectors may be decorated with + extra attributes that closely associate with its core attributes, + such as the experimental vectors_vocab_lockf and vectors_ngrams_lockf + training-update-dampening factors. + + """ + super(FastTextKeyedVectors, self).__init__(vector_size=vector_size) + self.vectors_vocab = None # fka syn0_vocab + self.vectors_ngrams = None # fka syn0_ngrams + self.buckets_word = None + self.min_n = min_n + self.max_n = max_n + self.bucket = bucket # count of buckets, fka num_ngram_vectors + self.compatible_hash = True + + @classmethod + def load(cls, fname_or_handle, **kwargs): + model = super(FastTextKeyedVectors, cls).load(fname_or_handle, **kwargs) + if isinstance(model, FastTextKeyedVectors): + if not hasattr(model, 'compatible_hash') or model.compatible_hash is False: + raise TypeError("Pre-gensim-3.8.x Fasttext models with nonstandard hashing are no longer compatible." + "Loading into gensim-3.8.3 & re-saving may create a compatible model.") + return model + + def __contains__(self, word): + """Check if `word` or any character ngrams in `word` are present in the vocabulary. + A vector for the word is guaranteed to exist if current method returns True. + + Parameters + ---------- + word : str + Input word. + + Returns + ------- + bool + True if `word` or any character ngrams in `word` are present in the vocabulary, False otherwise. + + Note + ---- + This method **always** returns True, because of the way FastText works. + + If you want to check if a word is an in-vocabulary term, use this instead: + + .. pycon: + + >>> from gensim.test.utils import datapath + >>> from gensim.models import FastText + >>> cap_path = datapath("crime-and-punishment.bin") + >>> model = FastText.load_fasttext_format(cap_path, full_model=False) + >>> 'steamtrain' in model.wv.key_to_index # If False, is an OOV term + False + + """ + return True + + def save(self, *args, **kwargs): + """Save object. + + Parameters + ---------- + fname : str + Path to the output file. + + See Also + -------- + :meth:`~gensim.models.fasttext.FastTextKeyedVectors.load` + Load object. + + """ + # don't bother storing the cached normalized vectors + ignore_attrs = [ + 'buckets_word', + 'hash2index', + ] + kwargs['ignore'] = kwargs.get('ignore', ignore_attrs) + super(FastTextKeyedVectors, self).save(*args, **kwargs) + + def get_vector(self, word, use_norm=False): + """Get `word` representations in vector space, as a 1D numpy array. + + Parameters + ---------- + word : str + Input word + use_norm : bool, optional + If True - resulting vector will be L2-normalized (unit euclidean length). + + Returns + ------- + numpy.ndarray + Vector representation of `word`. + + Raises + ------ + KeyError + If word and all ngrams not in vocabulary. 
+ + """ + if word in self.key_to_index: + return super(FastTextKeyedVectors, self).get_vector(word, use_norm) + elif self.bucket == 0: + raise KeyError('cannot calculate vector for OOV word without ngrams') + else: + word_vec = np.zeros(self.vectors_ngrams.shape[1], dtype=np.float32) + ngram_weights = self.vectors_ngrams + ngram_hashes = ft_ngram_hashes(word, self.min_n, self.max_n, self.bucket) + if len(ngram_hashes) == 0: + # + # If it is impossible to extract _any_ ngrams from the input + # word, then the best we can do is return a vector that points + # to the origin. The reference FB implementation does this, + # too. + # + # https://github.com/RaRe-Technologies/gensim/issues/2402 + # + logger.warning('could not extract any ngrams from %r, returning origin vector', word) + return word_vec + for nh in ngram_hashes: + word_vec += ngram_weights[nh] + word_vec /= len(ngram_hashes) + if use_norm: + return word_vec / np.linalg.norm(word_vec) + else: + return word_vec + + def init_ngrams_weights(self, seed): + """Initialize the vocabulary and ngrams weights prior to training. + + Creates the weight matrices and initializes them with uniform random values. + + Parameters + ---------- + seed : float + The seed for the PRNG. + + Note + ---- + Call this **after** the vocabulary has been fully initialized. + + """ + self.recalc_char_ngram_buckets() + + rand_obj = np.random.default_rng(seed=seed) # use new instance of numpy's recommended generator/algorithm + + lo, hi = -1.0 / self.vector_size, 1.0 / self.vector_size + vocab_shape = (len(self), self.vector_size) + ngrams_shape = (self.bucket, self.vector_size) + self.vectors_vocab = rand_obj.uniform(lo, hi, vocab_shape).astype(REAL) + + # + # We could have initialized vectors_ngrams at construction time, but we + # do it here for two reasons: + # + # 1. The constructor does not have access to the random seed + # 2. We want to use the same rand_obj to fill vectors_vocab _and_ + # vectors_ngrams, and vectors_vocab cannot happen at construction + # time because the vocab is not initialized at that stage. + # + self.vectors_ngrams = rand_obj.uniform(lo, hi, ngrams_shape).astype(REAL) + + def update_ngrams_weights(self, seed, old_vocab_len): + """Update the vocabulary weights for training continuation. + + Parameters + ---------- + seed : float + The seed for the PRNG. + old_vocab_length : int + The length of the vocabulary prior to its update. + + Note + ---- + Call this **after** the vocabulary has been updated. + + """ + self.recalc_char_ngram_buckets() + + rand_obj = np.random + rand_obj.seed(seed) + + new_vocab = len(self) - old_vocab_len + self.vectors_vocab = _pad_random(self.vectors_vocab, new_vocab, rand_obj) + + def init_post_load(self, fb_vectors): + """Perform initialization after loading a native Facebook model. + + Expects that the vocabulary (self.key_to_index) has already been initialized. + + Parameters + ---------- + fb_vectors : np.array + A matrix containing vectors for all the entities, including words + and ngrams. This comes directly from the binary model. + The order of the vectors must correspond to the indices in + the vocabulary. + + """ + vocab_words = len(self) + assert fb_vectors.shape[0] == vocab_words + self.bucket, 'unexpected number of vectors' + assert fb_vectors.shape[1] == self.vector_size, 'unexpected vector dimensionality' + + # + # The incoming vectors contain vectors for both words AND + # ngrams. We split them into two separate matrices, because our + # implementation treats them differently. 
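Because the whole OOV mechanism now lives inside `get_vector`, a rough pure-Python restatement of that branch may help when reading the compiled-free path above. It assumes a populated `FastTextKeyedVectors` instance `wv`, and that `ft_ngram_hashes` is importable from `gensim.models.fasttext`, where it is (re)defined later in this patch; the `oov_vector` function itself is illustrative only:

    import numpy as np
    from gensim.models.fasttext import ft_ngram_hashes  # added later in this patch

    def oov_vector(wv, word):
        # mirrors the OOV branch of FastTextKeyedVectors.get_vector (sketch, unnormalized)
        hashes = ft_ngram_hashes(word, wv.min_n, wv.max_n, wv.bucket)
        vec = np.zeros(wv.vectors_ngrams.shape[1], dtype=np.float32)
        if not hashes:
            return vec  # no extractable ngrams: return the origin vector, like the FB reference code
        for nh in hashes:
            vec += wv.vectors_ngrams[nh]
        return vec / len(hashes)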
+ # + self.vectors_vocab = np.array(fb_vectors[:vocab_words, :]) + self.vectors_ngrams = np.array(fb_vectors[vocab_words:, :]) + self.recalc_char_ngram_buckets() + self.adjust_vectors() # calculate composite full-word vectors + + def adjust_vectors(self): + """Adjust the vectors for words in the vocabulary. + + The adjustment composes the trained full-word-token vectors with + the vectors of the subword ngrams, matching the Facebook reference + implementation behavior. + + """ + if self.bucket == 0: + return + + self.vectors = self.vectors_vocab[:].copy() + for i, _ in enumerate(self.index_to_key): + ngram_buckets = self.buckets_word[i] + for nh in ngram_buckets: + self.vectors[i] += self.vectors_ngrams[nh] + self.vectors[i] /= len(ngram_buckets) + 1 + + def recalc_char_ngram_buckets(self): + """ + Scan the vocabulary, calculate ngrams and their hashes, and cache the list of ngrams for each known word. + + """ + # FIXME: evaluate if precaching even necessary, compared to recalculating as needed + if self.bucket == 0: + self.buckets_word = [np.array([], dtype=np.uint32)] * len(self.index_to_key) + return + + self.buckets_word = [None] * len(self.index_to_key) + + for i, word in enumerate(self.index_to_key): + self.buckets_word[i] = np.array( + ft_ngram_hashes(word, self.min_n, self.max_n, self.bucket), + dtype=np.uint32, + ) + + +def _pad_random(m, new_rows, rand): + """Pad a matrix with additional rows filled with random values.""" + _, columns = m.shape + low, high = -1.0 / columns, 1.0 / columns + suffix = rand.uniform(low, high, (new_rows, columns)).astype(REAL) + return vstack([m, suffix]) + + +def _unpack(m, num_rows, hash2index, seed=1, fill=None): + """Restore the array to its natural shape, undoing the optimization. + + A packed matrix contains contiguous vectors for ngrams, as well as a hashmap. + The hash map maps the ngram hash to its index in the packed matrix. + To unpack the matrix, we need to do several things: + + 1. Restore the matrix to its "natural" shape, where the number of rows + equals the number of buckets. + 2. Rearrange the existing rows such that the hashmap becomes the identity + function and is thus redundant. + 3. Fill the new rows with random values. + + Parameters + ---------- + + m : np.ndarray + The matrix to restore. + num_rows : int + The number of rows that this array should have. + hash2index : dict + the product of the optimization we are undoing. + seed : float, optional + The seed for the PRNG. Will be used to initialize new rows. + fill : float or array or None, optional + Value for new rows. If None (the default), randomly initialize. + Returns + ------- + np.array + The unpacked matrix. + + Notes + ----- + + The unpacked matrix will reference some rows in the input matrix to save memory. + Throw away the old matrix after calling this function, or use np.copy. + + """ + orig_rows, *more_dims = m.shape + if orig_rows == num_rows: + # + # Nothing to do. + # + return m + assert num_rows > orig_rows + + if fill is None: + rand_obj = np.random + rand_obj.seed(seed) + + # + # Rows at the top of the matrix (the first orig_rows) will contain "packed" learned vectors. + # Rows at the bottom of the matrix will be "free": initialized to random values. + # + m = _pad_random(m, num_rows - orig_rows, rand_obj) + else: + m = np.concatenate([m, [fill] * (num_rows - orig_rows)]) + + # + # Swap rows to transform hash2index into the identify function. + # There are two kinds of swaps. 
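For readers skimming the diff, the per-word composition performed by `adjust_vectors` can be written out for a single key roughly as follows (a sketch over the attributes defined above; `i` is the key's position in `index_to_key`, and the helper name is illustrative):

    import numpy as np

    def composite_word_vector(wv, i):
        # full word vector = (full-word-token vector + all subword bucket vectors) / (n_buckets + 1)
        buckets = wv.buckets_word[i]       # cached by recalc_char_ngram_buckets()
        vec = wv.vectors_vocab[i].astype(np.float32).copy()
        for nh in buckets:
            vec += wv.vectors_ngrams[nh]
        return vec / (len(buckets) + 1)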
+ # First, rearrange the rows that belong entirely within the original matrix dimensions. + # Second, swap out rows from the original matrix dimensions, replacing them with + # randomly initialized values. + # + # N.B. We only do the swap in one direction, because doing it in both directions + # nullifies the effect. + # + swap = {h: i for (h, i) in hash2index.items() if h < i < orig_rows} + swap.update({h: i for (h, i) in hash2index.items() if h >= orig_rows}) + for h, i in swap.items(): + assert h != i + m[[h, i]] = m[[i, h]] # swap rows i and h + + return m + + +# +# UTF-8 bytes that begin with 10 are subsequent bytes of a multi-byte sequence, +# as opposed to a new character. +# +_MB_MASK = 0xC0 +_MB_START = 0x80 + + +def _byte_to_int_py3(b): + return b + + +def _byte_to_int_py2(b): + return ord(b) + + +_byte_to_int = _byte_to_int_py2 if six.PY2 else _byte_to_int_py3 + + +def _is_utf8_continue(b): + return _byte_to_int(b) & _MB_MASK == _MB_START + + +def ft_ngram_hashes(word, minn, maxn, num_buckets): + """Calculate the ngrams of the word and hash them. + + Parameters + ---------- + word : str + The word to calculate ngram hashes for. + minn : int + Minimum ngram length + maxn : int + Maximum ngram length + num_buckets : int + The number of buckets + + Returns + ------- + A list of hashes (integers), one per each detected ngram. + + """ + encoded_ngrams = compute_ngrams_bytes(word, minn, maxn) + hashes = [ft_hash_bytes(n) % num_buckets for n in encoded_ngrams] + return hashes + + +# BACKWARD COMPATIBILITY FOR OLDER PICKLES +from gensim.models import keyedvectors # noqa: E402 +keyedvectors.FastTextKeyedVectors = FastTextKeyedVectors diff --git a/gensim/models/fasttext_inner.pxd b/gensim/models/fasttext_inner.pxd index 927f1b0978..31a1b1d35f 100644 --- a/gensim/models/fasttext_inner.pxd +++ b/gensim/models/fasttext_inner.pxd @@ -46,7 +46,7 @@ cdef struct FastTextConfig: # # Model parameters. These get copied as-is from the Python model. # - int hs, negative, sample, size, window, cbow_mean, workers + int sg, hs, negative, sample, size, window, cbow_mean, workers REAL_t alpha # @@ -59,13 +59,17 @@ cdef struct FastTextConfig: REAL_t *syn0_ngrams # + # EXPERIMENTAL # The arrays below selectively enable/disable training for specific vocab - # terms and ngrams. If word_locks_vocab[i] is 0, training is disabled; - # if it is 1, training is enabled. + # terms and ngrams. If vocab_locks[i] is 0.0, training is disabled; + # if it is 1.0, normal training is enabled. Other values scale updates. + # If undersized for vocab/ngrams, (index % actual_size) is used - + # so that a minimal single-element `lockf` can apply to all slots. # - REAL_t *word_locks_vocab - REAL_t *word_locks_ngrams - + REAL_t *vocab_lockf + np.uint32_t vocab_lockf_len + REAL_t *ngrams_lockf + np.uint32_t ngrams_lockf_len # # Working memory. These are typically large enough to hold a single # vector each. 
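The modulo indexing described in the struct comment means a `lockf` array does not have to match the vocab or ngram count: a single-element array applies one factor to every slot, while a full-size array scales each vector individually. A small illustrative sketch in plain numpy (variable names are not from the patch):

    import numpy as np

    vocab_lockf = np.ones(1, dtype=np.float32)          # minimal single-element lockf
    word_index = 12345
    scale = vocab_lockf[word_index % len(vocab_lockf)]  # every index maps onto the one factor

    per_word_lockf = np.ones(5, dtype=np.float32)       # full-size variant: freeze word 2 only
    per_word_lockf[2] = 0.0
    gradient = np.full(4, 0.1, dtype=np.float32)        # hypothetical update for one vector
    scaled = per_word_lockf[2 % len(per_word_lockf)] * gradient  # all zeros: training suppressed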
@@ -142,4 +146,4 @@ cdef void fasttext_fast_sentence_cbow_neg(FastTextConfig *c, int i, int j, int k cdef void fasttext_fast_sentence_cbow_hs(FastTextConfig *c, int i, int j, int k) nogil -cdef void fasttext_train_any(FastTextConfig *c, int num_sentences, int sg) nogil +cdef void fasttext_train_any(FastTextConfig *c, int num_sentences) nogil diff --git a/gensim/models/fasttext_inner.pyx b/gensim/models/fasttext_inner.pyx index a413db8460..e71ed6f31d 100644 --- a/gensim/models/fasttext_inner.pyx +++ b/gensim/models/fasttext_inner.pyx @@ -7,16 +7,16 @@ """Optimized Cython functions for training a :class:`~gensim.models.fasttext.FastText` model. -The main entry points are :func:`~gensim.models.fasttext_inner.train_batch_sg` -and :func:`~gensim.models.fasttext_inner.train_batch_cbow`. They may be -called directly from Python code. +The main entry point is :func:`~gensim.models.fasttext_inner.train_batch_any` +which may be called directly from Python code. Notes ----- The implementation of the above functions heavily depends on the FastTextConfig struct defined in :file:`gensim/models/fasttext_inner.pxd`. -The FAST_VERSION constant determines what flavor of BLAS we're currently using: +The gensim.models.word2vec.FAST_VERSION value reports what flavor of BLAS +we're currently using: 0: double 1: float @@ -36,12 +36,6 @@ from libc.math cimport exp from libc.math cimport log from libc.string cimport memset -# scipy <= 0.15 -try: - from scipy.linalg.blas import fblas -except ImportError: - # in scipy > 0.15, fblas function has been removed - import scipy.linalg.blas as fblas # # We make use of the following BLAS functions (or their analogs, if BLAS is @@ -59,20 +53,16 @@ except ImportError: # # The increments (inc_x and inc_y) are usually 1 in our case. # +# The versions are as chosen in word2vec_inner.pyx, and aliased to `our_` functions -# -# FIXME: why are we importing EXP_TABLE and then redefining it? -# -from word2vec_inner cimport bisect_left, random_int32, scopy, saxpy, dsdot, sscal, \ - REAL_t, EXP_TABLE, our_dot, our_saxpy, our_dot_double, our_dot_float, our_dot_noblas, our_saxpy_noblas - -REAL = np.float32 +from word2vec_inner cimport bisect_left, random_int32, scopy, sscal, \ + REAL_t, our_dot, our_saxpy DEF MAX_SENTENCE_LEN = 10000 DEF MAX_SUBWORDS = 1000 -DEF EXP_TABLE_SIZE = 1000 -DEF MAX_EXP = 6 +DEF EXP_TABLE_SIZE = 512 +DEF MAX_EXP = 8 cdef REAL_t[EXP_TABLE_SIZE] EXP_TABLE cdef REAL_t[EXP_TABLE_SIZE] LOG_TABLE @@ -101,41 +91,22 @@ cdef void fasttext_fast_sentence_sg_neg(FastTextConfig *c, int i, int j) nogil: """ - # - # Unpack the struct, extracting only the required parts into separate - # variables. This is here for historical reasons. We could bypass these - # declarations and use parts of the struct directly, but that would be - # somewhat more verbose. 
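The smaller 512-slot, ±8 sigmoid table can be sketched in plain numpy. The construction below follows the usual word2vec-style precomputation (the table-filling line itself is not shown in this hunk, so treat it as an approximation of intent), and `fast_sigmoid` uses the same lookup-index expression as the kernels:

    import numpy as np

    EXP_TABLE_SIZE = 512   # matches Facebook's fastText, per the new DEF above
    MAX_EXP = 8

    x = (np.arange(EXP_TABLE_SIZE) / EXP_TABLE_SIZE * 2 - 1) * MAX_EXP   # points in (-8, 8)
    EXP_TABLE = np.exp(x)
    EXP_TABLE = EXP_TABLE / (EXP_TABLE + 1)                              # sigmoid(x)

    def fast_sigmoid(f_dot):
        if f_dot <= -MAX_EXP or f_dot >= MAX_EXP:
            return None  # the kernels either skip the update or clamp to 0.0/1.0 here
        return EXP_TABLE[int((f_dot + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]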
- # cdef: - int negative = c.negative - np.uint32_t *cum_table = c.cum_table - unsigned long long cum_table_len = c.cum_table_len - REAL_t *syn0_vocab = c.syn0_vocab - REAL_t *syn0_ngrams = c.syn0_ngrams - REAL_t *syn1neg = c.syn1neg - int size = c.size np.uint32_t word_index = c.indexes[j] np.uint32_t word2_index = c.indexes[i] np.uint32_t *subwords_index = c.subwords_idx[i] np.uint32_t subwords_len = c.subwords_idx_len[i] - REAL_t alpha = c.alpha - REAL_t *work = c.work - REAL_t *l1 = c.neu1 - unsigned long long next_random = c.next_random - REAL_t *word_locks_vocab = c.word_locks_vocab - REAL_t *word_locks_ngrams = c.word_locks_ngrams - - cdef long long row1 = word2_index * size, row2 + + cdef long long row1 = word2_index * c.size, row2 cdef unsigned long long modulo = 281474976710655ULL cdef REAL_t f, g, label, f_dot cdef np.uint32_t target_index cdef int d - memset(work, 0, size * cython.sizeof(REAL_t)) - memset(l1, 0, size * cython.sizeof(REAL_t)) + memset(c.work, 0, c.size * cython.sizeof(REAL_t)) + memset(c.neu1, 0, c.size * cython.sizeof(REAL_t)) - scopy(&size, &syn0_vocab[row1], &ONE, l1, &ONE) + scopy(&c.size, &c.syn0_vocab[row1], &ONE, c.neu1, &ONE) # # Avoid division by zero. @@ -143,34 +114,34 @@ cdef void fasttext_fast_sentence_sg_neg(FastTextConfig *c, int i, int j) nogil: cdef REAL_t norm_factor if subwords_len: for d in range(subwords_len): - our_saxpy(&size, &ONEF, &syn0_ngrams[subwords_index[d] * size], &ONE, l1, &ONE) + our_saxpy(&c.size, &ONEF, &c.syn0_ngrams[subwords_index[d] * c.size], &ONE, c.neu1, &ONE) norm_factor = ONEF / subwords_len - sscal(&size, &norm_factor, l1 , &ONE) + sscal(&c.size, &norm_factor, c.neu1, &ONE) - for d in range(negative+1): + for d in range(c.negative+1): if d == 0: target_index = word_index label = ONEF else: - target_index = bisect_left(cum_table, (next_random >> 16) % cum_table[cum_table_len-1], 0, cum_table_len) - next_random = (next_random * 25214903917ULL + 11) & modulo + target_index = bisect_left( + c.cum_table, (c.next_random >> 16) % c.cum_table[c.cum_table_len-1], 0, c.cum_table_len) + c.next_random = (c.next_random * 25214903917ULL + 11) & modulo if target_index == word_index: continue label = 0.0 - row2 = target_index * size - f_dot = our_dot(&size, l1, &ONE, &syn1neg[row2], &ONE) + row2 = target_index * c.size + f_dot = our_dot(&c.size, c.neu1, &ONE, &c.syn1neg[row2], &ONE) if f_dot <= -MAX_EXP or f_dot >= MAX_EXP: continue f = EXP_TABLE[((f_dot + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] - g = (label - f) * alpha - our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE) - our_saxpy(&size, &g, l1, &ONE, &syn1neg[row2], &ONE) - our_saxpy(&size, &word_locks_vocab[word2_index], work, &ONE, &syn0_vocab[row1], &ONE) + g = (label - f) * c.alpha + our_saxpy(&c.size, &g, &c.syn1neg[row2], &ONE, c.work, &ONE) + our_saxpy(&c.size, &g, c.neu1, &ONE, &c.syn1neg[row2], &ONE) + our_saxpy(&c.size, &c.vocab_lockf[word2_index % c.vocab_lockf_len], c.work, &ONE, &c.syn0_vocab[row1], &ONE) for d in range(subwords_len): - our_saxpy(&size, &word_locks_ngrams[subwords_index[d]], work, &ONE, &syn0_ngrams[subwords_index[d]*size], &ONE) - - c.next_random = next_random + our_saxpy(&c.size, &c.ngrams_lockf[subwords_index[d] % c.ngrams_lockf_len], + c.work, &ONE, &c.syn0_ngrams[subwords_index[d]*c.size], &ONE) cdef void fasttext_fast_sentence_sg_hs(FastTextConfig *c, int i, int j) nogil: @@ -192,18 +163,9 @@ cdef void fasttext_fast_sentence_sg_hs(FastTextConfig *c, int i, int j) nogil: np.uint32_t *word_point = c.points[j] np.uint8_t *word_code = 
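A loose pure-Python paraphrase of the skip-gram negative-sampling kernel above may help connect the struct fields to the algorithm. BLAS calls and the EXP_TABLE lookup are replaced by plain numpy, the `wv.*_lockf` attribute names follow this patch, and the function itself plus its arguments are illustrative rather than part of the library:

    import numpy as np

    def sg_neg_update(wv, syn1neg, cum_table, word_index, context_index, alpha, negative, rng):
        # hidden layer: full-word-token vector plus its subword bucket vectors, averaged
        buckets = wv.buckets_word[context_index]
        l1 = wv.vectors_vocab[context_index].copy()
        for nh in buckets:
            l1 += wv.vectors_ngrams[nh]
        if len(buckets):
            l1 /= len(buckets)

        work = np.zeros_like(l1)
        for d in range(negative + 1):
            if d == 0:
                target, label = word_index, 1.0   # the true word to predict
            else:
                target = int(np.searchsorted(cum_table, rng.integers(int(cum_table[-1]))))
                if target == word_index:
                    continue
                label = 0.0                       # a sampled negative
            f = 1.0 / (1.0 + np.exp(-np.dot(l1, syn1neg[target])))  # exact sigmoid, not EXP_TABLE
            g = (label - f) * alpha
            work += g * syn1neg[target]
            syn1neg[target] += g * l1

        # apply the accumulated gradient, scaled by the (possibly modulo-indexed) lock factors
        wv.vectors_vocab[context_index] += wv.vectors_vocab_lockf[context_index % len(wv.vectors_vocab_lockf)] * work
        for nh in buckets:
            wv.vectors_ngrams[nh] += wv.vectors_ngrams_lockf[nh % len(wv.vectors_ngrams_lockf)] * work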
c.codes[j] int codelen = c.codelens[j] - REAL_t *syn0_vocab = c.syn0_vocab - REAL_t *syn0_ngrams = c.syn0_ngrams - REAL_t *syn1 = c.syn1 - int size = c.size np.uint32_t word2_index = c.indexes[i] np.uint32_t *subwords_index = c.subwords_idx[i] np.uint32_t subwords_len = c.subwords_idx_len[i] - REAL_t alpha = c.alpha - REAL_t *work = c.work - REAL_t *l1 = c.neu1 - REAL_t *word_locks_vocab = c.word_locks_vocab - REAL_t *word_locks_ngrams = c.word_locks_ngrams # # b : long long @@ -220,13 +182,13 @@ cdef void fasttext_fast_sentence_sg_hs(FastTextConfig *c, int i, int j) nogil: # ? # cdef long long b - cdef long long row1 = word2_index * size, row2 + cdef long long row1 = word2_index * c.size, row2 cdef REAL_t f, g, f_dot - memset(work, 0, size * cython.sizeof(REAL_t)) - memset(l1, 0, size * cython.sizeof(REAL_t)) + memset(c.work, 0, c.size * cython.sizeof(REAL_t)) + memset(c.neu1, 0, c.size * cython.sizeof(REAL_t)) - scopy(&size, &syn0_vocab[row1], &ONE, l1, &ONE) + scopy(&c.size, &c.syn0_vocab[row1], &ONE, c.neu1, &ONE) # # Avoid division by zero. @@ -234,26 +196,28 @@ cdef void fasttext_fast_sentence_sg_hs(FastTextConfig *c, int i, int j) nogil: cdef REAL_t norm_factor if subwords_len: for d in range(subwords_len): - row2 = subwords_index[d] * size - our_saxpy(&size, &ONEF, &syn0_ngrams[row2], &ONE, l1, &ONE) + row2 = subwords_index[d] * c.size + our_saxpy(&c.size, &ONEF, &c.syn0_ngrams[row2], &ONE, c.neu1, &ONE) norm_factor = ONEF / subwords_len - sscal(&size, &norm_factor, l1 , &ONE) + sscal(&c.size, &norm_factor, c.neu1, &ONE) for b in range(codelen): - row2 = word_point[b] * size - f_dot = our_dot(&size, l1, &ONE, &syn1[row2], &ONE) + row2 = word_point[b] * c.size + f_dot = our_dot(&c.size, c.neu1, &ONE, &c.syn1[row2], &ONE) if f_dot <= -MAX_EXP or f_dot >= MAX_EXP: continue f = EXP_TABLE[((f_dot + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] - g = (1 - word_code[b] - f) * alpha + g = (1 - word_code[b] - f) * c.alpha - our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE) - our_saxpy(&size, &g, l1, &ONE, &syn1[row2], &ONE) + our_saxpy(&c.size, &g, &c.syn1[row2], &ONE, c.work, &ONE) + our_saxpy(&c.size, &g, c.neu1, &ONE, &c.syn1[row2], &ONE) - our_saxpy(&size, &word_locks_vocab[word2_index], work, &ONE, &syn0_vocab[row1], &ONE) + our_saxpy(&c.size, &c.vocab_lockf[word2_index % c.vocab_lockf_len], c.work, &ONE, &c.syn0_vocab[row1], &ONE) for d in range(subwords_len): - row2 = subwords_index[d] * size - our_saxpy(&size, &word_locks_ngrams[subwords_index[d]], work, &ONE, &syn0_ngrams[row2], &ONE) + row2 = subwords_index[d] * c.size + our_saxpy( + &c.size, &c.ngrams_lockf[subwords_index[d] % c.ngrams_lockf_len], c.work, &ONE, + &c.syn0_ngrams[row2], &ONE) cdef void fasttext_fast_sentence_cbow_neg(FastTextConfig *c, int i, int j, int k) nogil: @@ -276,84 +240,69 @@ cdef void fasttext_fast_sentence_cbow_neg(FastTextConfig *c, int i, int j, int k """ - cdef: - int negative = c.negative - np.uint32_t *cum_table = c.cum_table - unsigned long long cum_table_len = c.cum_table_len - # int *codelens = c.codelens - REAL_t *neu1 = c.neu1 - REAL_t *syn0_vocab = c.syn0_vocab - REAL_t *syn0_ngrams = c.syn0_ngrams - REAL_t *syn1neg = c.syn1neg - int size = c.size - np.uint32_t *indexes = c.indexes - np.uint32_t **subwords_idx = c.subwords_idx - int *subwords_idx_len = c.subwords_idx_len - REAL_t alpha = c.alpha - REAL_t *work = c.work - int cbow_mean = c.cbow_mean - unsigned long long next_random = c.next_random - REAL_t *word_locks_vocab = c.word_locks_vocab - REAL_t *word_locks_ngrams = c.word_locks_ngrams 
- cdef long long row2 cdef unsigned long long modulo = 281474976710655ULL cdef REAL_t f, g, count, inv_count = 1.0, label, f_dot cdef np.uint32_t target_index, word_index cdef int d, m - word_index = indexes[i] + word_index = c.indexes[i] - memset(neu1, 0, size * cython.sizeof(REAL_t)) + memset(c.neu1, 0, c.size * cython.sizeof(REAL_t)) count = 0.0 for m in range(j, k): if m == i: continue count += ONEF - our_saxpy(&size, &ONEF, &syn0_vocab[indexes[m] * size], &ONE, neu1, &ONE) - for d in range(subwords_idx_len[m]): + our_saxpy(&c.size, &ONEF, &c.syn0_vocab[c.indexes[m] * c.size], &ONE, c.neu1, &ONE) + for d in range(c.subwords_idx_len[m]): count += ONEF - our_saxpy(&size, &ONEF, &syn0_ngrams[subwords_idx[m][d] * size], &ONE, neu1, &ONE) + our_saxpy(&c.size, &ONEF, &c.syn0_ngrams[c.subwords_idx[m][d] * c.size], &ONE, c.neu1, &ONE) if count > (0.5): inv_count = ONEF / count - if cbow_mean: - sscal(&size, &inv_count, neu1, &ONE) + if c.cbow_mean: + sscal(&c.size, &inv_count, c.neu1, &ONE) - memset(work, 0, size * cython.sizeof(REAL_t)) + memset(c.work, 0, c.size * cython.sizeof(REAL_t)) - for d in range(negative+1): + for d in range(c.negative+1): if d == 0: target_index = word_index label = ONEF else: - target_index = bisect_left(cum_table, (next_random >> 16) % cum_table[cum_table_len-1], 0, cum_table_len) - next_random = (next_random * 25214903917ULL + 11) & modulo + target_index = bisect_left(c.cum_table, (c.next_random >> 16) % c.cum_table[c.cum_table_len-1], 0, c.cum_table_len) + c.next_random = (c.next_random * 25214903917ULL + 11) & modulo if target_index == word_index: continue label = 0.0 - row2 = target_index * size - f_dot = our_dot(&size, neu1, &ONE, &syn1neg[row2], &ONE) - if f_dot <= -MAX_EXP or f_dot >= MAX_EXP: - continue - f = EXP_TABLE[((f_dot + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] - g = (label - f) * alpha + row2 = target_index * c.size + f_dot = our_dot(&c.size, c.neu1, &ONE, &c.syn1neg[row2], &ONE) + if f_dot <= -MAX_EXP: + f = 0.0 + elif f_dot >= MAX_EXP: + f = 1.0 + else: + f = EXP_TABLE[((f_dot + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] + g = (label - f) * c.alpha - our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE) - our_saxpy(&size, &g, neu1, &ONE, &syn1neg[row2], &ONE) + our_saxpy(&c.size, &g, &c.syn1neg[row2], &ONE, c.work, &ONE) + our_saxpy(&c.size, &g, c.neu1, &ONE, &c.syn1neg[row2], &ONE) - if not cbow_mean: # divide error over summed window vectors - sscal(&size, &inv_count, work, &ONE) + if not c.cbow_mean: # divide error over summed window vectors + sscal(&c.size, &inv_count, c.work, &ONE) for m in range(j,k): if m == i: continue - our_saxpy(&size, &word_locks_vocab[indexes[m]], work, &ONE, &syn0_vocab[indexes[m]*size], &ONE) - for d in range(subwords_idx_len[m]): - our_saxpy(&size, &word_locks_ngrams[subwords_idx[m][d]], work, &ONE, &syn0_ngrams[subwords_idx[m][d]*size], &ONE) - - c.next_random = next_random + our_saxpy( + &c.size, &c.vocab_lockf[c.indexes[m] % c.vocab_lockf_len], c.work, &ONE, + &c.syn0_vocab[c.indexes[m]*c.size], &ONE) + for d in range(c.subwords_idx_len[m]): + our_saxpy( + &c.size, &c.ngrams_lockf[c.subwords_idx[m][d] % c.ngrams_lockf_len], c.work, &ONE, + &c.syn0_ngrams[c.subwords_idx[m][d]*c.size], &ONE) cdef void fasttext_fast_sentence_cbow_hs(FastTextConfig *c, int i, int j, int k) nogil: @@ -375,62 +324,52 @@ cdef void fasttext_fast_sentence_cbow_hs(FastTextConfig *c, int i, int j, int k) cdef: np.uint32_t *word_point = c.points[i] np.uint8_t *word_code = c.codes[i] - int *codelens = c.codelens - REAL_t *neu1 = c.neu1 
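For the CBOW variants, the input layer is the (optionally averaged) sum over all context words and all of their subword buckets. A compact sketch of just that construction, under the same attribute-name assumptions as the skip-gram sketch above (the helper is illustrative):

    import numpy as np

    def cbow_hidden(wv, context_indexes, cbow_mean=1):
        # context_indexes: positions of every window word except the center one
        neu1 = np.zeros(wv.vector_size, dtype=np.float32)
        count = 0
        for m in context_indexes:
            count += 1
            neu1 += wv.vectors_vocab[m]
            for nh in wv.buckets_word[m]:   # each subword bucket counts as one more input
                count += 1
                neu1 += wv.vectors_ngrams[nh]
        if cbow_mean and count:
            neu1 /= count
        return neu1, count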
- REAL_t *syn0_vocab = c.syn0_vocab - REAL_t *syn0_ngrams = c.syn0_ngrams - REAL_t *syn1 = c.syn1 - int size = c.size - np.uint32_t *indexes = c.indexes - np.uint32_t **subwords_idx = c.subwords_idx - int *subwords_idx_len = c.subwords_idx_len - REAL_t alpha = c.alpha - REAL_t *work = c.work - int cbow_mean = c.cbow_mean - REAL_t *word_locks_vocab = c.word_locks_vocab - REAL_t *word_locks_ngrams = c.word_locks_ngrams cdef long long b cdef long long row2 cdef REAL_t f, g, count, inv_count = 1.0, f_dot cdef int m - memset(neu1, 0, size * cython.sizeof(REAL_t)) + memset(c.neu1, 0, c.size * cython.sizeof(REAL_t)) count = 0.0 for m in range(j, k): if m == i: continue count += ONEF - our_saxpy(&size, &ONEF, &syn0_vocab[indexes[m] * size], &ONE, neu1, &ONE) - for d in range(subwords_idx_len[m]): + our_saxpy(&c.size, &ONEF, &c.syn0_vocab[c.indexes[m] * c.size], &ONE, c.neu1, &ONE) + for d in range(c.subwords_idx_len[m]): count += ONEF - our_saxpy(&size, &ONEF, &syn0_ngrams[subwords_idx[m][d] * size], &ONE, neu1, &ONE) + our_saxpy(&c.size, &ONEF, &c.syn0_ngrams[c.subwords_idx[m][d] * c.size], &ONE, c.neu1, &ONE) if count > (0.5): inv_count = ONEF / count - if cbow_mean: - sscal(&size, &inv_count, neu1, &ONE) + if c.cbow_mean: + sscal(&c.size, &inv_count, c.neu1, &ONE) - memset(work, 0, size * cython.sizeof(REAL_t)) - for b in range(codelens[i]): - row2 = word_point[b] * size - f_dot = our_dot(&size, neu1, &ONE, &syn1[row2], &ONE) + memset(c.work, 0, c.size * cython.sizeof(REAL_t)) + for b in range(c.codelens[i]): + row2 = word_point[b] * c.size + f_dot = our_dot(&c.size, c.neu1, &ONE, &c.syn1[row2], &ONE) if f_dot <= -MAX_EXP or f_dot >= MAX_EXP: continue f = EXP_TABLE[((f_dot + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] - g = (1 - word_code[b] - f) * alpha + g = (1 - word_code[b] - f) * c.alpha - our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE) - our_saxpy(&size, &g, neu1, &ONE, &syn1[row2], &ONE) + our_saxpy(&c.size, &g, &c.syn1[row2], &ONE, c.work, &ONE) + our_saxpy(&c.size, &g, c.neu1, &ONE, &c.syn1[row2], &ONE) - if not cbow_mean: # divide error over summed window vectors - sscal(&size, &inv_count, work, &ONE) + if not c.cbow_mean: # divide error over summed window vectors + sscal(&c.size, &inv_count, c.work, &ONE) for m in range(j,k): if m == i: continue - our_saxpy(&size, &word_locks_vocab[indexes[m]], work, &ONE, &syn0_vocab[indexes[m]*size], &ONE) - for d in range(subwords_idx_len[m]): - our_saxpy(&size, &word_locks_ngrams[subwords_idx[m][d]], work, &ONE, &syn0_ngrams[subwords_idx[m][d]*size], &ONE) + our_saxpy( + &c.size, &c.vocab_lockf[c.indexes[m] % c.vocab_lockf_len], c.work, &ONE, + &c.syn0_vocab[c.indexes[m]*c.size], &ONE) + for d in range(c.subwords_idx_len[m]): + our_saxpy( + &c.size, &c.ngrams_lockf[c.subwords_idx[m][d] % c.ngrams_lockf_len], c.work, &ONE, + &c.syn0_ngrams[c.subwords_idx[m][d]*c.size], &ONE) cdef void init_ft_config(FastTextConfig *c, model, alpha, _work, _neu1): @@ -452,28 +391,33 @@ cdef void init_ft_config(FastTextConfig *c, model, alpha, _work, _neu1): Private working memory for each worker. 
""" + c.sg = model.sg c.hs = model.hs c.negative = model.negative - c.sample = (model.vocabulary.sample != 0) + c.sample = (model.sample != 0) c.cbow_mean = model.cbow_mean c.window = model.window c.workers = model.workers c.syn0_vocab = (np.PyArray_DATA(model.wv.vectors_vocab)) - c.word_locks_vocab = (np.PyArray_DATA(model.trainables.vectors_vocab_lockf)) c.syn0_ngrams = (np.PyArray_DATA(model.wv.vectors_ngrams)) - c.word_locks_ngrams = (np.PyArray_DATA(model.trainables.vectors_ngrams_lockf)) + + # EXPERIMENTAL lockf scaled suppression/enablement of training + c.vocab_lockf = (np.PyArray_DATA(model.wv.vectors_vocab_lockf)) + c.vocab_lockf_len = len(model.wv.vectors_vocab_lockf) + c.ngrams_lockf = (np.PyArray_DATA(model.wv.vectors_ngrams_lockf)) + c.ngrams_lockf_len = len(model.wv.vectors_ngrams_lockf) c.alpha = alpha c.size = model.wv.vector_size if c.hs: - c.syn1 = (np.PyArray_DATA(model.trainables.syn1)) + c.syn1 = (np.PyArray_DATA(model.syn1)) if c.negative: - c.syn1neg = (np.PyArray_DATA(model.trainables.syn1neg)) - c.cum_table = (np.PyArray_DATA(model.vocabulary.cum_table)) - c.cum_table_len = len(model.vocabulary.cum_table) + c.syn1neg = (np.PyArray_DATA(model.syn1neg)) + c.cum_table = (np.PyArray_DATA(model.cum_table)) + c.cum_table_len = len(model.cum_table) if c.negative or c.sample: c.next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24) @@ -482,7 +426,7 @@ cdef void init_ft_config(FastTextConfig *c, model, alpha, _work, _neu1): c.neu1 = np.PyArray_DATA(_neu1) -cdef object populate_ft_config(FastTextConfig *c, vocab, buckets_word, sentences): +cdef object populate_ft_config(FastTextConfig *c, wv, buckets_word, sentences): """Prepare C structures so we can go "full C" and release the Python GIL. We create indices over the sentences. We also perform some calculations for @@ -494,10 +438,10 @@ cdef object populate_ft_config(FastTextConfig *c, vocab, buckets_word, sentences ---------- c : FastTextConfig* A pointer to the struct that will contain the populated indices. 
- vocab : dict + wv : FastTextKeyedVectors The vocabulary - buckets_word : dict - A map containing the buckets each word appears in + buckets_word : list + A list containing the buckets each word appears in sentences : iterable The sentences to read @@ -520,25 +464,35 @@ cdef object populate_ft_config(FastTextConfig *c, vocab, buckets_word, sentences """ cdef int effective_words = 0 cdef int effective_sentences = 0 + cdef np.uint32_t *vocab_sample_ints c.sentence_idx[0] = 0 # indices of the first sentence always start at 0 + + if c.sample: + vocab_sample_ints = np.PyArray_DATA(wv.expandos['sample_int']) + if c.hs: + vocab_codes = wv.expandos['code'] + vocab_points = wv.expandos['point'] for sent in sentences: if not sent: continue # ignore empty sentences; leave effective_sentences unchanged for token in sent: - word = vocab[token] if token in vocab else None - if word is None: + word_index = wv.key_to_index.get(token, None) + if word_index is None: continue # leaving `effective_words` unchanged = shortening the sentence = expanding the window - if c.sample and word.sample_int < random_int32(&c.next_random): + if c.sample and vocab_sample_ints[word_index] < random_int32(&c.next_random): continue - c.indexes[effective_words] = word.index + c.indexes[effective_words] = word_index - c.subwords_idx_len[effective_words] = (len(buckets_word[word.index])) - c.subwords_idx[effective_words] = np.PyArray_DATA(buckets_word[word.index]) + if wv.bucket: + c.subwords_idx_len[effective_words] = (len(buckets_word[word_index])) + c.subwords_idx[effective_words] = np.PyArray_DATA(buckets_word[word_index]) + else: + c.subwords_idx_len[effective_words] = 0 if c.hs: - c.codelens[effective_words] = len(word.code) - c.codes[effective_words] = np.PyArray_DATA(word.code) - c.points[effective_words] = np.PyArray_DATA(word.point) + c.codelens[effective_words] = len(vocab_codes[word_index]) + c.codes[effective_words] = np.PyArray_DATA(vocab_codes[word_index]) + c.points[effective_words] = np.PyArray_DATA(vocab_points[word_index]) effective_words += 1 if effective_words == MAX_SENTENCE_LEN: @@ -555,7 +509,7 @@ cdef object populate_ft_config(FastTextConfig *c, vocab, buckets_word, sentences return effective_words, effective_sentences -cdef void fasttext_train_any(FastTextConfig *c, int num_sentences, int sg) nogil: +cdef void fasttext_train_any(FastTextConfig *c, int num_sentences) nogil: """Performs training on a fully initialized and populated configuration. Parameters @@ -564,8 +518,6 @@ cdef void fasttext_train_any(FastTextConfig *c, int num_sentences, int sg) nogil A pointer to the configuration struct. num_sentences : int The number of sentences to train. - sg : int - 1 for skipgram, 0 for CBOW. """ cdef: @@ -598,7 +550,7 @@ cdef void fasttext_train_any(FastTextConfig *c, int num_sentences, int sg) nogil # window_start = max(sentence_start, i - c.window + c.reduced_windows[i]) # window_end = min(sentence_end, i + c.window + 1 - c.reduced_windows[i]) # - if sg == 0: + if c.sg == 0: if c.hs: fasttext_fast_sentence_cbow_hs(c, i, window_start, window_end) if c.negative: @@ -606,10 +558,7 @@ cdef void fasttext_train_any(FastTextConfig *c, int num_sentences, int sg) nogil else: for j in range(window_start, window_end): if j == i: - # - # TODO: why do we ignore the token at the "center" of - # the window? 
- # + # no reason to train a center word as predicting itself continue if c.hs: fasttext_fast_sentence_sg_hs(c, i, j) @@ -617,8 +566,8 @@ cdef void fasttext_train_any(FastTextConfig *c, int num_sentences, int sg) nogil fasttext_fast_sentence_sg_neg(c, i, j) -def train_batch_sg(model, sentences, alpha, _work, _l1): - """Update skip-gram model by training on a sequence of sentences. +def train_batch_any(model, sentences, alpha, _work, _neu1): + """Update the model by training on a sequence of sentences. Each sentence is a list of string tokens, which are looked up in the model's vocab dictionary. Called internally from :meth:`~gensim.models.fasttext.FastText.train`. @@ -633,7 +582,7 @@ def train_batch_sg(model, sentences, alpha, _work, _l1): Learning rate. _work : np.ndarray Private working memory for each worker. - _l1 : np.ndarray + _neu1 : np.ndarray Private working memory for each worker. Returns @@ -647,87 +596,133 @@ def train_batch_sg(model, sentences, alpha, _work, _l1): int num_words = 0 int num_sentences = 0 - init_ft_config(&c, model, alpha, _work, _l1) + init_ft_config(&c, model, alpha, _work, _neu1) - num_words, num_sentences = populate_ft_config(&c, model.wv.vocab, model.wv.buckets_word, sentences) + num_words, num_sentences = populate_ft_config(&c, model.wv, model.wv.buckets_word, sentences) # precompute "reduced window" offsets in a single randint() call for i, randint in enumerate(model.random.randint(0, c.window, num_words)): c.reduced_windows[i] = randint + # release GIL & train on all sentences in the batch with nogil: - fasttext_train_any(&c, num_sentences, 1) + fasttext_train_any(&c, num_sentences) return num_words -def train_batch_cbow(model, sentences, alpha, _work, _neu1): - """Update the CBOW model by training on a sequence of sentences. +cpdef ft_hash_bytes(bytes bytez): + """Calculate hash based on `bytez`. + Reproduce `hash method from Facebook fastText implementation + `_. - Each sentence is a list of string tokens, which are looked up in the model's - vocab dictionary. Called internally from :meth:`~gensim.models.fasttext.FastText.train`. + Parameters + ---------- + bytez : bytes + The string whose hash needs to be calculated, encoded as UTF-8. + + Returns + ------- + unsigned int + The hash of the string. + + """ + cdef np.uint32_t h = 2166136261 + cdef char b + + for b in bytez: + h = h ^ (b) + h = h * 16777619 + return h + + +cpdef compute_ngrams(word, unsigned int min_n, unsigned int max_n): + """Get the list of all possible ngrams for a given word. Parameters ---------- - model : :class:`~gensim.models.fasttext.FastText` - Model to be trained. - sentences : iterable of list of str - A single batch: part of the corpus streamed directly from disk/network. - alpha : float - Learning rate. - _work : np.ndarray - Private working memory for each worker. - _neu1 : np.ndarray - Private working memory for each worker. + word : str + The word whose ngrams need to be computed. + min_n : unsigned int + Minimum character length of the ngrams. + max_n : unsigned int + Maximum character length of the ngrams. Returns ------- - int - Effective number of words trained. + list of str + Sequence of character ngrams. 
""" - cdef: - FastTextConfig c - int num_words = 0 - int num_sentences = 0 + cdef unicode extended_word = f'<{word}>' + ngrams = [] + for ngram_length in range(min_n, min(len(extended_word), max_n) + 1): + for i in range(0, len(extended_word) - ngram_length + 1): + ngrams.append(extended_word[i:i + ngram_length]) + return ngrams - init_ft_config(&c, model, alpha, _work, _neu1) +# +# UTF-8 bytes that begin with 10 are subsequent bytes of a multi-byte sequence, +# as opposed to a new character. +# +cdef unsigned char _MB_MASK = 0xC0 +cdef unsigned char _MB_START = 0x80 - num_words, num_sentences = populate_ft_config(&c, model.wv.vocab, model.wv.buckets_word, sentences) - # precompute "reduced window" offsets in a single randint() call - for i, randint in enumerate(model.random.randint(0, c.window, num_words)): - c.reduced_windows[i] = randint +cpdef compute_ngrams_bytes(word, unsigned int min_n, unsigned int max_n): + """Computes ngrams for a word. - # release GIL & train on all sentences in the batch - with nogil: - fasttext_train_any(&c, num_sentences, 0) + Ported from the original FB implementation. - return num_words + Parameters + ---------- + word : str + A unicode string. + min_n : unsigned int + The minimum ngram length. + max_n : unsigned int + The maximum ngram length. + + Returns: + -------- + list of str + A list of ngrams, where each ngram is a list of **bytes**. + + See Also + -------- + `Original implementation `__ + + """ + cdef bytes utf8_word = ('<%s>' % word).encode("utf-8") + cdef const unsigned char *bytez = utf8_word + cdef size_t num_bytes = len(utf8_word) + cdef size_t j, i, n + + ngrams = [] + for i in range(num_bytes): + if bytez[i] & _MB_MASK == _MB_START: + continue + + j, n = i, 1 + while j < num_bytes and n <= max_n: + j += 1 + while j < num_bytes and (bytez[j] & _MB_MASK) == _MB_START: + j += 1 + if n >= min_n and not (n == 1 and (i == 0 or j == num_bytes)): + ngram = bytes(bytez[i:j]) + ngrams.append(ngram) + n += 1 + return ngrams def init(): """Precompute function `sigmoid(x) = 1 / (1 + exp(-x))`, for x values discretized into table EXP_TABLE. Also calculate log(sigmoid(x)) into LOG_TABLE. - Returns - ------- - {0, 1, 2} - Enumeration to signify underlying data type returned by the BLAS dot product calculation. - 0 signifies double, 1 signifies double, and 2 signifies that custom cython loops were used - instead of BLAS. - + We recalc, rather than re-use the table from word2vec_inner, because Facebook's FastText + code uses a 512-slot table rather than the 1000 precedent of word2vec.c. 
""" - global our_dot - global our_saxpy - cdef int i - cdef float *x = [10.0] - cdef float *y = [0.01] - cdef float expected = 0.1 - cdef int size = 1 - cdef double d_res - cdef float *p_res # build the sigmoid table for i in range(EXP_TABLE_SIZE): @@ -735,23 +730,6 @@ def init(): EXP_TABLE[i] = (EXP_TABLE[i] / (EXP_TABLE[i] + 1)) LOG_TABLE[i] = log( EXP_TABLE[i] ) - # check whether sdot returns double or float - d_res = dsdot(&size, x, &ONE, y, &ONE) - p_res = &d_res - if abs(d_res - expected) < 0.0001: - our_dot = our_dot_double - our_saxpy = saxpy - return 0 # double - elif abs(p_res[0] - expected) < 0.0001: - our_dot = our_dot_float - our_saxpy = saxpy - return 1 # float - else: - # neither => use cython loops, no BLAS - # actually, the BLAS is so messed up we'll probably have segfaulted above and never even reach here - our_dot = our_dot_noblas - our_saxpy = our_saxpy_noblas - return 2 - -FAST_VERSION = init() # initialize the module + +init() # initialize the module MAX_WORDS_IN_BATCH = MAX_SENTENCE_LEN diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index ac5ad9dd4f..7c386ac038 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -1,24 +1,29 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Author: Shiva Manne +# Author: Gensim Contributors # Copyright (C) 2018 RaRe Technologies s.r.o. # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""This module implements word vectors and their similarity look-ups. +"""This module implements word vectors, and more generally sets of vectors keyed by lookup tokens/ints, + and various similarity look-ups. Since trained word vectors are independent from the way they were trained (:class:`~gensim.models.word2vec.Word2Vec`, :class:`~gensim.models.fasttext.FastText`, :class:`~gensim.models.wrappers.wordrank.WordRank`, :class:`~gensim.models.wrappers.varembed.VarEmbed` etc), they can be represented by a standalone structure, as implemented in this module. -The structure is called "KeyedVectors" and is essentially a mapping between *entities* -and *vectors*. Each entity is identified by its string id, so this is a mapping between {str => 1D numpy array}. +The structure is called "KeyedVectors" and is essentially a mapping between *keys* +and *vectors*. Each vector is identified by its lookup key, most often a short string token, so this is usually +a mapping between {str => 1D numpy array}. -The entity typically corresponds to a word (so the mapping maps words to 1D vectors), -but for some models, the key can also correspond to a document, a graph node etc. To generalize -over different use-cases, this module calls the keys **entities**. Each entity is -always represented by its string id, no matter whether the entity is a word, a document or a graph node. +The key is, in the original motivating case, a word (so the mapping maps words to 1D vectors), +but for some models, the key can also correspond to a document, a graph node etc. + +(Because some applications may maintain their own integral identifiers, compact and contiguous +starting at zero, this class also supports use of plain ints as keys – in that case using them as literal +pointers to the position of the desired vector in the underlying array, and saving the overhead of +a lookup map entry.) Why use KeyedVectors instead of a full model? 
============================================= @@ -35,7 +40,7 @@ | fasttext/word2vec format | ✅ | ❌ | do not support further training, but you can still load | | | | | them into KeyedVectors. | +---------------------------+--------------+------------+-------------------------------------------------------------+ -| append new vectors | ✅ | ✅ | Add new entity-vector entries to the mapping dynamically. | +| append new vectors | ✅ | ✅ | Add new-vector entries to the mapping dynamically. | +---------------------------+--------------+------------+-------------------------------------------------------------+ | concurrency | ✅ | ✅ | Thread-safe, allows concurrent vector queries. | +---------------------------+--------------+------------+-------------------------------------------------------------+ @@ -59,22 +64,20 @@ .. sourcecode:: pycon - >>> from gensim.test.utils import common_texts + >>> from gensim.test.utils import lee_corpus_list >>> from gensim.models import Word2Vec >>> - >>> model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4) + >>> model = Word2Vec(lee_corpus_list, size=24, epochs=100) >>> word_vectors = model.wv Persist the word vectors to disk with .. sourcecode:: pycon - >>> from gensim.test.utils import get_tmpfile >>> from gensim.models import KeyedVectors >>> - >>> fname = get_tmpfile("vectors.kv") - >>> word_vectors.save(fname) - >>> word_vectors = KeyedVectors.load(fname, mmap='r') + >>> word_vectors.save('vectors.kv') + >>> reloaded_word_vectors = KeyedVectors.load('vectors.kv') The vectors can also be instantiated from an existing file on disk in the original Google's word2vec C format as a KeyedVectors instance @@ -136,7 +139,7 @@ >>> vector.shape (100,) >>> - >>> vector = word_vectors.wv.word_vec('office', use_norm=True) + >>> vector = word_vectors.wv.get_vector('office', use_norm=True) >>> vector.shape (100,) @@ -158,264 +161,429 @@ """ -from __future__ import division # py3 "true division" - -from itertools import chain import logging +import sys +import itertools +import warnings +from itertools import chain from numbers import Integral -try: - from queue import Queue, Empty -except ImportError: - from Queue import Queue, Empty # noqa:F401 - -from numpy import dot, float32 as REAL, memmap as np_memmap, \ - double, array, zeros, vstack, sqrt, newaxis, integer, \ - ndarray, sum as np_sum, prod, argmax +from numpy import dot, float32 as REAL, \ + double, array, zeros, vstack, \ + ndarray, sum as np_sum, prod, argmax, dtype, ascontiguousarray, \ + frombuffer import numpy as np from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc from gensim.corpora.dictionary import Dictionary -from six import string_types, integer_types -from six.moves import zip, range -from scipy import stats from gensim.utils import deprecated -from gensim.models.utils_any2vec import ( - _save_word2vec_format, - _load_word2vec_format, - ft_ngram_hashes, -) -from gensim.similarities.termsim import TermSimilarityIndex, SparseTermSimilarityMatrix +from scipy import stats -# -# For backwards compatibility, see https://github.com/RaRe-Technologies/gensim/issues/2201 -# -from gensim.models.deprecated.keyedvectors import EuclideanKeyedVectors # noqa logger = logging.getLogger(__name__) -class Vocab(object): - """A single vocabulary item, used internally for collecting per-word frequency/sampling info, - and for constructing binary trees (incl. both word leaves and inner nodes). 
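The int-as-key support described in the updated module docstring can be seen on a freshly pre-allocated instance; a small hedged example (only shapes are shown, since the vectors start zeroed):

.. sourcecode:: pycon

    >>> from gensim.models import KeyedVectors
    >>> kv = KeyedVectors(vector_size=4, count=10)  # pre-allocate ten vector slots
    >>> kv[3].shape                                 # a bare int is taken as a position directly
    (4,)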
+KEY_TYPES = (str, int, np.integer) - """ - def __init__(self, **kwargs): - self.count = 0 - self.__dict__.update(kwargs) - def __lt__(self, other): # used for sorting in a priority queue - return self.count < other.count +class KeyedVectors(utils.SaveLoad): + def __init__(self, vector_size, count=0, dtype=REAL, mapfile_path=None): + """Mapping between keys (such as words) and vectors for :class:`~gensim.models.Word2Vec` + and related models. - def __str__(self): - vals = ['%s:%r' % (key, self.__dict__[key]) for key in sorted(self.__dict__) if not key.startswith('_')] - return "%s(%s)" % (self.__class__.__name__, ', '.join(vals)) + Used to perform operations on the vectors such as vector lookup, distance, similarity etc. + To support the needs of specific models and other downstream uses, each key may also have + additional attributes set and read via the `set_vecattr(key, attr, value)` and `get_vecattr(key, attr)` + methods. Note that all such attributes under the same `attr` name must have compatible `numpy` + types, as the type and storage array for such attributes is established by the 1st time such + `attr` is set. -class BaseKeyedVectors(utils.SaveLoad): - """Abstract base class / interface for various types of word vectors.""" - def __init__(self, vector_size): - self.vectors = zeros((0, vector_size), dtype=REAL) - self.vocab = {} + """ self.vector_size = vector_size - self.index2entity = [] + # pre-allocating `index_to_key` to full size helps avoid redundant re-allocations, esp for `expandos` + self.index_to_key = [None] * count # fka index2entity or index2word + self.next_index = 0 # pointer to where next new entry will land + self.key_to_index = {} - def save(self, fname_or_handle, **kwargs): - super(BaseKeyedVectors, self).save(fname_or_handle, **kwargs) + self.vectors = zeros((count, vector_size), dtype=dtype) # fka (formerly known as) syn0 + self.norms = None - @classmethod - def load(cls, fname_or_handle, **kwargs): - return super(BaseKeyedVectors, cls).load(fname_or_handle, **kwargs) + self.expandos = {} # dynamically-expandable per-vector named, numpy-typed attributes + + self.mapfile_path = mapfile_path + + def _load_specials(self, *args, **kwargs): + """Handle special requirements of `.load()` protocol, usually up-converting older versions.""" + super(KeyedVectors, self)._load_specials(*args, **kwargs) + if hasattr(self, 'doctags'): + self._upconvert_old_d2vkv() + # fixup rename/consolidation into index_to_key of older index2word, index2entity + if not hasattr(self, 'index_to_key'): + self.index_to_key = self.__dict__.pop('index2word', self.__dict__.pop('index2word', None)) + # fixup rename into vectors of older syn0 + if not hasattr(self, 'vectors'): + self.vectors = self.__dict__.pop('syn0', None) + self.vector_size = self.vectors.shape[1] + # ensure at least a 'None' in 'norms' to force recalc + if not hasattr(self, 'norms'): + self.norms = None + # ensure at least an empty 'expandos' + if not hasattr(self, 'expandos'): + self.expandos = {} + # fixup rename of vocab into map + if 'key_to_index' not in self.__dict__: + self._upconvert_old_vocab() + + def _upconvert_old_vocab(self): + """Convert a loaded, pre-gensim-4.0.0 version instance that had a 'vocab' dict of data objects""" + old_vocab = self.__dict__.pop('vocab', None) + self.key_to_index = {} + for k in old_vocab.keys(): + old_v = old_vocab[k] + self.key_to_index[k] = old_v.index + for attr in old_v.__dict__.keys(): + self.set_vecattr(old_v.index, attr, old_v.__dict__[attr]) + # special case to enforce required 
type on `sample_int` + if 'sample_int' in self.expandos: + self.expandos['sample_int'] = self.expandos['sample_int'].astype(np.uint32) + + def allocate_vecattrs(self, attrs=None, types=None): + """Ensure arrays for given per-vector extra-attribute names & types exist, at right size. + + The length of the index_to_key list is canonical 'intended size' of KeyedVectors, + even if other properties (vectors array) hasn't yet been allocated or expanded. + So this allocation targets that size. + """ + # with no arguments, adjust lengths of existing vecattr arrays to match length of index_to_key + if attrs is None: + attrs = list(self.expandos.keys()) + types = [self.expandos[attr].dtype for attr in attrs] + target_size = len(self.index_to_key) + for attr, t in zip(attrs, types): + if t is int: + t = np.int64 # ensure 'int' type 64-bit (numpy-on-Windows https://github.com/numpy/numpy/issues/9464) + if attr not in self.expandos: + self.expandos[attr] = np.zeros(target_size, dtype=t) + continue + prev_expando = self.expandos[attr] + if not np.issubdtype(t, prev_expando.dtype): + raise TypeError("can't allocate {0} for existing {1}".format(t, prev_expando.dtype)) + if len(prev_expando) == target_size: + continue # no resizing necessary + prev_count = len(prev_expando) + self.expandos[attr] = np.zeros(target_size, dtype=prev_expando.dtype) + self.expandos[attr][0:min(prev_count, target_size), ] = \ + prev_expando[0:min(prev_count, target_size), ] + + def set_vecattr(self, key, attr, val): + """Set attribute associated with given key to value. TODO: param docs""" + self.allocate_vecattrs(attrs=[attr], types=[type(val)]) + index = self.get_index(key) + self.expandos[attr][index] = val + + def get_vecattr(self, key, attr): + """Get attribute value associate with given key. TODO: param docs""" + index = self.get_index(key) + return self.expandos[attr][index] + + def resize_vectors(self): + """Make underlying vectors match index_to_key size.""" + target_count = len(self.index_to_key) + prev_count = len(self.vectors) + if prev_count == target_count: + return () + prev_vectors = self.vectors + if hasattr(self, 'mapfile_path') and self.mapfile_path: + self.vectors = np.memmap(self.mapfile_path, shape=(target_count, self.vector_size), mode='w+', dtype=REAL) + else: + self.vectors = np.zeros((target_count, self.vector_size), dtype=REAL) + self.vectors[0:min(prev_count, target_count), ] = prev_vectors[0:min(prev_count, target_count), ] + self.allocate_vecattrs() + self.norms = None + return range(prev_count, target_count) + + def randomly_initialize_vectors(self, indexes=None, seed=0): + """Initialize vectors with low-magnitude random vectors, as is typical for pre-trained + Word2Vec and related models. + + """ + if indexes is None: + indexes = range(0, len(self.vectors)) + for i in indexes: + self.vectors[i] = pseudorandom_weak_vector(self.vectors.shape[1], + seed_string=(str(self.index_to_key[i]) + str(seed))) + self.norms = None + + def __len__(self): + return len(self.index_to_key) + + def __getitem__(self, key_or_keys): + """Get vector representation of `key_or_keys`. - def similarity(self, entity1, entity2): - """Compute cosine similarity between two entities, specified by their string id.""" - raise NotImplementedError() + Parameters + ---------- + key_or_keys : {str, list of str, int, list of int} + Requested key or list-of-keys - def most_similar(self, **kwargs): - """Find the top-N most similar entities. - Possibly have `positive` and `negative` list of entities in `**kwargs`. 
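The `set_vecattr`/`get_vecattr` pair above replaces the per-word `Vocab` objects that `_upconvert_old_vocab` migrates. For a `wv` that contains the key 'office', the new access pattern looks roughly like this (the 'my_tag' attribute is purely illustrative):

.. sourcecode:: pycon

    >>> # pre-4.0: wv.vocab['office'].count   ->   now: wv.get_vecattr('office', 'count')
    >>> wv.set_vecattr('office', 'my_tag', 7)    # first use of 'my_tag' fixes its numpy dtype
    >>> wv.get_vecattr('office', 'my_tag')
    7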
+ Returns + ------- + numpy.ndarray + Vector representation for `key_or_keys` (1D if `key_or_keys` is single key, otherwise - 2D). """ - return NotImplementedError() + if isinstance(key_or_keys, KEY_TYPES): + return self.get_vector(key_or_keys) - def distance(self, entity1, entity2): - """Compute distance between vectors of two input entities, specified by their string id.""" - raise NotImplementedError() + return vstack([self.get_vector(key) for key in key_or_keys]) - def distances(self, entity1, other_entities=()): - """Compute distances from a given entity (its string id) to all entities in `other_entity`. - If `other_entities` is empty, return the distance between `entity1` and all entities in vocab. + def get_index(self, key, default=None): + """Return the integer index (slot/position) where the given key's vector is stored in the + backing vectors array. """ - raise NotImplementedError() + val = self.key_to_index.get(key, -1) + if val >= 0: + return val + elif isinstance(key, (int, np.integer)) and 0 <= key < len(self.index_to_key): + return key + elif default is not None: + return default + else: + raise KeyError("Key '%s' not present" % key) - def get_vector(self, entity): - """Get the entity's representations in vector space, as a 1D numpy array. + def get_vector(self, key, use_norm=False): + """Get the key's vector, as a 1D numpy array. Parameters ---------- - entity : str - Identifier of the entity to return the vector for. + key : str or int + Key for vector to return, or int slot + use_norm : bool, optional + If True - resulting vector will be L2-normalized (unit euclidean length). Returns ------- numpy.ndarray - Vector for the specified entity. + Vector for the specified key. Raises ------ KeyError - If the given entity identifier doesn't exist. + If the given key doesn't exist. """ - if entity in self.vocab: - result = self.vectors[self.vocab[entity].index] - result.setflags(write=False) - return result + index = self.get_index(key) + if use_norm: + self.fill_norms() + result = self.vectors[index] / self.norms[index] + else: + result = self.vectors[index] + + result.setflags(write=False) # disallow direct tampering that would invalidate `norms` etc + return result + + @deprecated("Use get_vector instead") + def word_vec(self, *args, **kwargs): + """Compatibility alias for get_vector(); must exist so subclass calls reach subclass get_vector()""" + return self.get_vector(*args, **kwargs) + + def add_one(self, key, vector): + """Add one new vector at the given key, into existing slot if available. + + Warning: using this repeatedly is inefficient, requiring a full reallocation & copy, + if this instance hasn't been preallocated to be ready fro such incremental additions. + + returns: actual index used TODO: other param docs + """ + + target_index = self.next_index + if target_index >= len(self) or self.index_to_key[target_index] is not None: + # must append at end by expanding existing structures + target_index = len(self) + warnings.warn( + "Adding single vectors to a KeyedVectors which grows by one each time can be costly. 
" + "Consider adding in batches or preallocating to the required size.", + UserWarning) + self.add([key], [vector]) + self.allocate_vecattrs() # grow any adjunct arrays + self.next_index = target_index + 1 else: - raise KeyError("'%s' not in vocabulary" % entity) + # can add to existing slot + self.index_to_key[target_index] = key + self.key_to_index[key] = target_index + self.vectors[target_index] = vector + self.next_index += 1 + return target_index - def add(self, entities, weights, replace=False): - """Append entities and theirs vectors in a manual way. - If some entity is already in the vocabulary, the old vector is kept unless `replace` flag is True. + def add(self, keys, weights, extras=None, replace=False): + """Append keys and their vectors in a manual way. + If some key is already in the vocabulary, the old vector is kept unless `replace` flag is True. Parameters ---------- - entities : list of str - Entities specified by string ids. + keys : list of (str or int) + keys specified by string or int ids. weights: list of numpy.ndarray or numpy.ndarray List of 1D np.array vectors or a 2D np.array of vectors. replace: bool, optional - Flag indicating whether to replace vectors for entities which already exist in the vocabulary, + Flag indicating whether to replace vectors for keys which already exist in the map; if True - replace vectors, otherwise - keep old vectors. """ - if isinstance(entities, string_types): - entities = [entities] + if isinstance(keys, KEY_TYPES): + keys = [keys] weights = np.array(weights).reshape(1, -1) elif isinstance(weights, list): weights = np.array(weights) + if extras is None: + extras = {} - in_vocab_mask = np.zeros(len(entities), dtype=np.bool) - for idx, entity in enumerate(entities): - if entity in self.vocab: + # TODO? warn if not matching extras already present? + # initially allocate extras, check type compatibility + self.allocate_vecattrs(extras.keys(), [extras[k].dtype for k in extras.keys()]) + + in_vocab_mask = np.zeros(len(keys), dtype=np.bool) + for idx, key in enumerate(keys): + if key in self: in_vocab_mask[idx] = True # add new entities to the vocab for idx in np.nonzero(~in_vocab_mask)[0]: - entity = entities[idx] - self.vocab[entity] = Vocab(index=len(self.vocab), count=1) - self.index2entity.append(entity) + key = keys[idx] + self.key_to_index[key] = len(self.index_to_key) + self.index_to_key.append(key) - # add vectors for new entities + # add vectors, extras for new entities self.vectors = vstack((self.vectors, weights[~in_vocab_mask].astype(self.vectors.dtype))) + for attr, extra in extras: + self.expandos[attr] = np.vstack((self.expandos[attr], extra[~in_vocab_mask])) - # change vectors for in_vocab entities if `replace` flag is specified + # change vectors, extras for in_vocab entities if `replace` flag is specified if replace: - in_vocab_idxs = [self.vocab[entities[idx]].index for idx in np.nonzero(in_vocab_mask)[0]] + in_vocab_idxs = [self.get_index(keys[idx]) for idx in np.nonzero(in_vocab_mask)[0]] self.vectors[in_vocab_idxs] = weights[in_vocab_mask] + for attr, extra in extras: + self.expandos[attr][in_vocab_idxs] = extra[in_vocab_mask] - def __setitem__(self, entities, weights): - """Add entities and theirs vectors in a manual way. - If some entity is already in the vocabulary, old vector is replaced with the new one. - This method is alias for :meth:`~gensim.models.keyedvectors.BaseKeyedVectors.add` with `replace=True`. + def __setitem__(self, keys, weights): + """Add keys and theirs vectors in a manual way. 
+ If some key is already in the vocabulary, old vector is replaced with the new one. + This method is alias for :meth:`~gensim.models.keyedvectors.KeyedVectors.add` with `replace=True`. Parameters ---------- - entities : {str, list of str} - Entities specified by their string ids. + keys : {str, int, list of (str or int)} + keys specified by their string or int ids. weights: list of numpy.ndarray or numpy.ndarray List of 1D np.array vectors or 2D np.array of vectors. """ - if not isinstance(entities, list): - entities = [entities] + if not isinstance(keys, list): + keys = [keys] weights = weights.reshape(1, -1) - self.add(entities, weights, replace=True) - - def __getitem__(self, entities): - """Get vector representation of `entities`. + self.add(keys, weights, replace=True) - Parameters - ---------- - entities : {str, list of str} - Input entity/entities. + def has_index_for(self, key): + """Can this model return a single index for this key? - Returns - ------- - numpy.ndarray - Vector representation for `entities` (1D if `entities` is string, otherwise - 2D). + Subclasses that synthesize vectors for out-of-vocabulary words (like + :class:`~gensim.models.fasttext.FastText`) may respond True for a + simple `word in wv` (`__contains__()`) check but False for this + more-specific check. """ - if isinstance(entities, string_types): - # allow calls like trained_model['office'], as a shorthand for trained_model[['office']] - return self.get_vector(entities) - - return vstack([self.get_vector(entity) for entity in entities]) + return self.get_index(key, -1) >= 0 - def __contains__(self, entity): - return entity in self.vocab + def __contains__(self, key): + return self.has_index_for(key) - def most_similar_to_given(self, entity1, entities_list): - """Get the `entity` from `entities_list` most similar to `entity1`.""" - return entities_list[argmax([self.similarity(entity1, entity) for entity in entities_list])] + def most_similar_to_given(self, key1, keys_list): + """Get the `key` from `keys_list` most similar to `key1`.""" + return keys_list[argmax([self.similarity(key1, key) for key in keys_list])] - def closer_than(self, entity1, entity2): - """Get all entities that are closer to `entity1` than `entity2` is to `entity1`.""" - all_distances = self.distances(entity1) - e1_index = self.vocab[entity1].index - e2_index = self.vocab[entity2].index + def closer_than(self, key1, key2): + """Get all keys that are closer to `key1` than `key2` is to `key1`.""" + all_distances = self.distances(key1) + e1_index = self.get_index(key1) + e2_index = self.get_index(key2) closer_node_indices = np.where(all_distances < all_distances[e2_index])[0] - return [self.index2entity[index] for index in closer_node_indices if index != e1_index] + return [self.index_to_key[index] for index in closer_node_indices if index != e1_index] - def rank(self, entity1, entity2): - """Rank of the distance of `entity2` from `entity1`, in relation to distances of all entities from `entity1`.""" - return len(self.closer_than(entity1, entity2)) + 1 + @deprecated("Use closer_than instead") + def words_closer_than(self, word1, word2): + return self.closer_than(word1, word2) + def rank(self, key1, key2): + """Rank of the distance of `key2` from `key1`, in relation to distances of all keys from `key1`.""" + return len(self.closer_than(key1, key2)) + 1 -class WordEmbeddingsKeyedVectors(BaseKeyedVectors): - """Class containing common methods for operations over word vectors.""" - def __init__(self, vector_size): - 
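
Lookups now accept either string keys or plain int positions, and `add_one()` documents the cost of growing the map one key at a time. A short sketch (toy data; sizes and key names are illustrative):

.. code-block:: python

    import numpy as np
    from gensim.models import KeyedVectors

    kv = KeyedVectors(vector_size=4, count=2)      # preallocate 2 slots up front
    kv.add_one("apple", np.random.rand(4))         # fills slot 0, no reallocation
    kv.add_one("orange", np.random.rand(4))        # fills slot 1

    vec = kv["apple"]                              # __getitem__ -> get_vector()
    unit = kv.get_vector("apple", use_norm=True)   # L2-normalized lookup
    idx = kv.get_index("orange")                   # int slot into kv.vectors

    # `in` delegates to has_index_for(); for plain KeyedVectors both agree,
    # while FastText subclasses may answer `in` = True for OOV words whose
    # vectors are only synthesized from ngrams.
    print("apple" in kv, kv.has_index_for("apple"))
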
super(WordEmbeddingsKeyedVectors, self).__init__(vector_size=vector_size) - self.vectors_norm = None - self.index2word = [] - + # backward compatibility; some would be annotated `@deprecated` if that stacked with @property/.setter @property - @deprecated("Attribute will be removed in 4.0.0, use self instead") - def wv(self): - return self + def vectors_norm(self): + self.fill_norms() + return self.vectors / self.norms[..., np.newaxis] + + def fill_norms(self, force=False): + """ + Ensure per-vector norms are available. + + Any code which modifies vectors should ensure the accompanying norms are + either recalculated or 'None', to trigger full recalc later. + + """ + if self.norms is None or force: + self.norms = np.linalg.norm(self.vectors, axis=1) + + @vectors_norm.setter + def vectors_norm(self, _): + pass # no-op; shouldn't be set @property def index2entity(self): - return self.index2word + return self.index_to_key @index2entity.setter def index2entity(self, value): - self.index2word = value + self.index_to_key = value @property - @deprecated("Attribute will be removed in 4.0.0, use self.vectors instead") - def syn0(self): - return self.vectors + def index2word(self): + return self.index_to_key - @syn0.setter - @deprecated("Attribute will be removed in 4.0.0, use self.vectors instead") - def syn0(self, value): - self.vectors = value + @index2word.setter + def index2word(self, value): + self.index_to_key = value @property - @deprecated("Attribute will be removed in 4.0.0, use self.vectors_norm instead") - def syn0norm(self): - return self.vectors_norm - - @syn0norm.setter - @deprecated("Attribute will be removed in 4.0.0, use self.vectors_norm instead") - def syn0norm(self, value): - self.vectors_norm = value + def vocab(self): + raise NotImplementedError( + "The .vocab dict of 'Vocab' propery objects, one per key, has been removed.\n" + "See the KeyedVectors .key_to_index dict, .index_to_key list, and methods\n" + ".get_vecattr(key, attr)/.set_vecattr(key, attr, new_val) for replacement\n" + "functionality." + ) - def __contains__(self, word): - return word in self.vocab + @vocab.setter + def vocab(self, value): + self.vocab() # trigger above NotImplementedError + + def sort_by_descending_frequency(self): + """Sort the vocabulary so the most frequent words have the lowest indexes.""" + if not len(self): + return # noop if empty + count_sorted_indexes = np.argsort(self.expandos['count'])[::-1] + self.index_to_key = list(np.array(self.index_to_key)[count_sorted_indexes]) + self.allocate_vecattrs() + for k in self.expandos: + # Use numpy's "fancy indexing" to permutate the entire array in one step. + self.expandos[k] = self.expandos[k][count_sorted_indexes] + if len(self.vectors): + logger.warning("sorting after vectors have been allocated is expensive & error-prone") + self.vectors = self.vectors[count_sorted_indexes] + for i, word in enumerate(self.index_to_key): + self.key_to_index[word] = i def save(self, *args, **kwargs): """Save KeyedVectors. @@ -427,96 +595,47 @@ def save(self, *args, **kwargs): See Also -------- - :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.load` + :meth:`~gensim.models.keyedvectors.KeyedVectors.load` Load saved model. """ - # don't bother storing the cached normalized vectors - kwargs['ignore'] = kwargs.get('ignore', ['vectors_norm']) - super(WordEmbeddingsKeyedVectors, self).save(*args, **kwargs) - - def word_vec(self, word, use_norm=False): - """Get `word` representations in vector space, as a 1D numpy array. 
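
The removed `.vocab` dict now raises with a pointer to its replacements; a hedged migration sketch (continuing the same toy-data pattern):

.. code-block:: python

    import numpy as np
    from gensim.models import KeyedVectors

    kv = KeyedVectors(vector_size=2)
    kv.add(["night", "day"], np.random.rand(2, 2))
    kv.set_vecattr("night", "count", 10)
    kv.set_vecattr("day", "count", 25)

    # Replacements for the removed dict of Vocab objects:
    idx = kv.key_to_index["day"]             # was: kv.vocab["day"].index
    freq = kv.get_vecattr("day", "count")    # was: kv.vocab["day"].count

    kv.sort_by_descending_frequency()        # "day" moves to index 0
                                             # (logs a warning because vectors are already allocated)
    kv.fill_norms()                          # norms computed lazily, cached in kv.norms
    print(kv.index_to_key, kv.norms)
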
- - Parameters - ---------- - word : str - Input word - use_norm : bool, optional - If True - resulting vector will be L2-normalized (unit euclidean length). - - Returns - ------- - numpy.ndarray - Vector representation of `word`. - - Raises - ------ - KeyError - If word not in vocabulary. - - """ - if word in self.vocab: - if use_norm: - result = self.vectors_norm[self.vocab[word].index] - else: - result = self.vectors[self.vocab[word].index] - - result.setflags(write=False) - return result - else: - raise KeyError("word '%s' not in vocabulary" % word) - - def get_vector(self, word): - return self.word_vec(word) - - def words_closer_than(self, w1, w2): - """Get all words that are closer to `w1` than `w2` is to `w1`. - - Parameters - ---------- - w1 : str - Input word. - w2 : str - Input word. - - Returns - ------- - list (str) - List of words that are closer to `w1` than `w2` is to `w1`. - - """ - return super(WordEmbeddingsKeyedVectors, self).closer_than(w1, w2) + super(KeyedVectors, self).save(*args, **kwargs) - def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None): - """Find the top-N most similar words. - Positive words contribute positively towards the similarity, negative words negatively. + def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip_end=None, + restrict_vocab=None, indexer=None): + """Find the top-N most similar keys. + Positive keys contribute positively towards the similarity, negative keys negatively. This method computes cosine similarity between a simple mean of the projection - weight vectors of the given words and the vectors for each word in the model. + weight vectors of the given keys and the vectors for each key in the model. The method corresponds to the `word-analogy` and `distance` scripts in the original word2vec implementation. Parameters ---------- - positive : list of str, optional - List of words that contribute positively. - negative : list of str, optional - List of words that contribute negatively. + positive : list of (str or int or ndarray), optional + List of keys that contribute positively. + negative : list of (str or int or ndarray), optional + List of keys that contribute negatively. topn : int or None, optional - Number of top-N similar words to return, when `topn` is int. When `topn` is None, - then similarities for all words are returned. + Number of top-N similar keys to return, when `topn` is int. When `topn` is None, + then similarities for all keys are returned. + clip_start : int + Start clipping index. + clip_end : int + End clipping index. restrict_vocab : int, optional Optional integer which limits the range of vectors which are searched for most-similar values. For example, restrict_vocab=10000 would - only check the first 10000 word vectors in the vocabulary order. (This may be - meaningful if you've sorted the vocabulary by descending frequency.) + only check the first 10000 key vectors in the vocabulary order. (This may be + meaningful if you've sorted the vocabulary by descending frequency.) If + specified, overrides any values of ``clip_start`` or ``clip_end``. Returns ------- list of (str, float) or numpy.array - When `topn` is int, a sequence of (word, similarity) is returned. - When `topn` is None, then similarities for all words are returned as a + When `topn` is int, a sequence of (key, similarity) is returned. + When `topn` is None, then similarities for all keys are returned as a one-dimensional numpy array with the size of the vocabulary. 
""" @@ -528,31 +647,36 @@ def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=Non if negative is None: negative = [] - self.init_sims() + self.fill_norms() + clip_end = clip_end or len(self.vectors) + + if restrict_vocab: + clip_start = 0 + clip_end = restrict_vocab - if isinstance(positive, string_types) and not negative: + if isinstance(positive, KEY_TYPES) and not negative: # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog']) positive = [positive] - # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words + # add weights for each key, if not already present; default to 1.0 for positive and -1.0 for negative keys positive = [ - (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word - for word in positive + (item, 1.0) if isinstance(item, KEY_TYPES + (ndarray,)) + else item for item in positive ] negative = [ - (word, -1.0) if isinstance(word, string_types + (ndarray,)) else word - for word in negative + (item, -1.0) if isinstance(item, KEY_TYPES + (ndarray,)) + else item for item in negative ] - # compute the weighted average of all words - all_words, mean = set(), [] - for word, weight in positive + negative: - if isinstance(word, ndarray): - mean.append(weight * word) + # compute the weighted average of all keys + all_keys, mean = set(), [] + for key, weight in positive + negative: + if isinstance(key, ndarray): + mean.append(weight * key) else: - mean.append(weight * self.word_vec(word, use_norm=True)) - if word in self.vocab: - all_words.add(self.vocab[word].index) + mean.append(weight * self.get_vector(key, use_norm=True)) + if self.has_index_for(key): + all_keys.add(self.get_index(key)) if not mean: raise ValueError("cannot compute similarity with no input") mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL) @@ -560,120 +684,71 @@ def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=Non if indexer is not None and isinstance(topn, int): return indexer.most_similar(mean, topn) - limited = self.vectors_norm if restrict_vocab is None else self.vectors_norm[:restrict_vocab] - dists = dot(limited, mean) + dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end] if not topn: return dists - best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True) - # ignore (don't return) words from the input - result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words] + best = matutils.argsort(dists, topn=topn + len(all_keys), reverse=True) + # ignore (don't return) keys from the input + result = [(self.index_to_key[sim + clip_start], float(dists[sim])) + for sim in best if (sim + clip_start) not in all_keys] return result[:topn] def similar_by_word(self, word, topn=10, restrict_vocab=None): - """Find the top-N most similar words. + """Compatibility alias for similar_by_key()""" + return self.similar_by_key(word, topn, restrict_vocab) + + def similar_by_key(self, key, topn=10, restrict_vocab=None): + """Find the top-N most similar keys. Parameters ---------- - word : str - Word + key : str + Key topn : int or None, optional - Number of top-N similar words to return. If topn is None, similar_by_word returns + Number of top-N similar keys to return. If topn is None, similar_by_key returns the vector of similarity scores. restrict_vocab : int, optional Optional integer which limits the range of vectors which are searched for most-similar values. 
For example, restrict_vocab=10000 would - only check the first 10000 word vectors in the vocabulary order. (This may be + only check the first 10000 key vectors in the vocabulary order. (This may be meaningful if you've sorted the vocabulary by descending frequency.) Returns ------- list of (str, float) or numpy.array - When `topn` is int, a sequence of (word, similarity) is returned. - When `topn` is None, then similarities for all words are returned as a + When `topn` is int, a sequence of (key, similarity) is returned. + When `topn` is None, then similarities for all keys are returned as a one-dimensional numpy array with the size of the vocabulary. """ - return self.most_similar(positive=[word], topn=topn, restrict_vocab=restrict_vocab) + return self.most_similar(positive=[key], topn=topn, restrict_vocab=restrict_vocab) def similar_by_vector(self, vector, topn=10, restrict_vocab=None): - """Find the top-N most similar words by vector. + """Find the top-N most similar keys by vector. Parameters ---------- vector : numpy.array Vector from which similarities are to be computed. topn : int or None, optional - Number of top-N similar words to return, when `topn` is int. When `topn` is None, - then similarities for all words are returned. + Number of top-N similar keys to return, when `topn` is int. When `topn` is None, + then similarities for all keys are returned. restrict_vocab : int, optional Optional integer which limits the range of vectors which are searched for most-similar values. For example, restrict_vocab=10000 would - only check the first 10000 word vectors in the vocabulary order. (This may be + only check the first 10000 key vectors in the vocabulary order. (This may be meaningful if you've sorted the vocabulary by descending frequency.) Returns ------- list of (str, float) or numpy.array - When `topn` is int, a sequence of (word, similarity) is returned. - When `topn` is None, then similarities for all words are returned as a + When `topn` is int, a sequence of (key, similarity) is returned. + When `topn` is None, then similarities for all keys are returned as a one-dimensional numpy array with the size of the vocabulary. """ return self.most_similar(positive=[vector], topn=topn, restrict_vocab=restrict_vocab) - @deprecated( - "Method will be removed in 4.0.0, use " - "gensim.models.keyedvectors.WordEmbeddingSimilarityIndex instead") - def similarity_matrix(self, dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100, dtype=REAL): - """Construct a term similarity matrix for computing Soft Cosine Measure. - - This creates a sparse term similarity matrix in the :class:`scipy.sparse.csc_matrix` format for computing - Soft Cosine Measure between documents. - - Parameters - ---------- - dictionary : :class:`~gensim.corpora.dictionary.Dictionary` - A dictionary that specifies the considered terms. - tfidf : :class:`gensim.models.tfidfmodel.TfidfModel` or None, optional - A model that specifies the relative importance of the terms in the dictionary. The - columns of the term similarity matrix will be build in a decreasing order of importance - of terms, or in the order of term identifiers if None. - threshold : float, optional - Only embeddings more similar than `threshold` are considered when retrieving word - embeddings closest to a given word embedding. - exponent : float, optional - Take the word embedding similarities larger than `threshold` to the power of `exponent`. 
- nonzero_limit : int, optional - The maximum number of non-zero elements outside the diagonal in a single column of the - sparse term similarity matrix. - dtype : numpy.dtype, optional - Data-type of the sparse term similarity matrix. - - Returns - ------- - :class:`scipy.sparse.csc_matrix` - Term similarity matrix. - - See Also - -------- - :func:`gensim.matutils.softcossim` - The Soft Cosine Measure. - :class:`~gensim.similarities.docsim.SoftCosineSimilarity` - A class for performing corpus-based similarity queries with Soft Cosine Measure. - - Notes - ----- - The constructed matrix corresponds to the matrix Mrel defined in section 2.1 of - `Delphine Charlet and Geraldine Damnati, "SimBow at SemEval-2017 Task 3: Soft-Cosine Semantic Similarity - between Questions for Community Question Answering", 2017 - `_. - - """ - index = WordEmbeddingSimilarityIndex(self, threshold=threshold, exponent=exponent) - similarity_matrix = SparseTermSimilarityMatrix( - index, dictionary, tfidf=tfidf, nonzero_limit=nonzero_limit, dtype=dtype) - return similarity_matrix.matrix - def wmdistance(self, document1, document2): """Compute the Word Mover's Distance between two documents. @@ -754,8 +829,9 @@ def wmdistance(self, document1, document2): if t2 not in docset2 or distance_matrix[i, j] != 0.0: continue - # Compute Euclidean distance between word vectors. - distance_matrix[i, j] = distance_matrix[j, i] = sqrt(np_sum((self[t1] - self[t2])**2)) + # Compute Euclidean distance between unit-normed word vectors. + distance_matrix[i, j] = distance_matrix[j, i] = np.sqrt( + np_sum((self.get_vector(t1, use_norm=True) - self.get_vector(t2, use_norm=True))**2)) if np_sum(distance_matrix) == 0.0: # `emd` gets stuck if the distance matrix contains only zeros. @@ -788,7 +864,7 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10): Additional positive or negative examples contribute to the numerator or denominator, respectively - a potentially sensible but untested extension of the method. With a single positive example, rankings will be the same as in the default - :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.most_similar`. + :meth:`~gensim.models.keyedvectors.KeyedVectors.most_similar`. Parameters ---------- @@ -808,6 +884,7 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10): one-dimensional numpy array with the size of the vocabulary. 
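
`wmdistance()` keeps its interface but now measures distances between unit-normed word vectors. A hedged sketch, assuming the optional `pyemd` package (the EMD solver used here) is installed and using random toy vectors:

.. code-block:: python

    import numpy as np
    from gensim.models import KeyedVectors

    tokens = "obama speaks to the media in illinois president greets press chicago".split()
    kv = KeyedVectors(vector_size=5)
    kv.add(tokens, np.random.rand(len(tokens), 5))

    # Distances inside the EMD problem are Euclidean distances between
    # unit-normed vectors, so overall vector magnitude no longer matters.
    d = kv.wmdistance("obama speaks to the media in illinois".split(),
                      "the president greets the press in chicago".split())
    print(d)
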
""" + # FIXME: Update to better match & share code with most_similar() if isinstance(topn, Integral) and topn < 1: return [] @@ -816,23 +893,23 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10): if negative is None: negative = [] - self.init_sims() + self.fill_norms() - if isinstance(positive, string_types) and not negative: + if isinstance(positive, str) and not negative: # allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog']) positive = [positive] all_words = { - self.vocab[word].index for word in positive + negative - if not isinstance(word, ndarray) and word in self.vocab + self.get_index(word) for word in positive + negative + if not isinstance(word, ndarray) and word in self.key_to_index } positive = [ - self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word + self.get_vector(word, use_norm=True) if isinstance(word, str) else word for word in positive ] negative = [ - self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word + self.get_vector(word, use_norm=True) if isinstance(word, str) else word for word in negative ] @@ -841,32 +918,34 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10): # equation (4) of Levy & Goldberg "Linguistic Regularities...", # with distances shifted to [0,1] per footnote (7) - pos_dists = [((1 + dot(self.vectors_norm, term)) / 2) for term in positive] - neg_dists = [((1 + dot(self.vectors_norm, term)) / 2) for term in negative] + pos_dists = [((1 + dot(self.vectors, term) / self.norms) / 2) for term in positive] + neg_dists = [((1 + dot(self.vectors, term) / self.norms) / 2) for term in negative] dists = prod(pos_dists, axis=0) / (prod(neg_dists, axis=0) + 0.000001) if not topn: return dists best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True) # ignore (don't return) words from the input - result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words] + result = [(self.index_to_key[sim], float(dists[sim])) for sim in best if sim not in all_words] return result[:topn] - def doesnt_match(self, words): - """Which word from the given list doesn't go with the others? + def rank_by_centrality(self, words, use_norm=True): + """Rank the given words by similarity to the centroid of all the words. Parameters ---------- words : list of str - List of words. + List of keys. + use_norm : bool, optional + Whether to calculate centroid using unit-normed vectors; default True. Returns ------- - str - The word further away from the mean of all words. + list of (float, str) + Ranked list of (similarity, key), most-similar to the centroid first. """ - self.init_sims() + self.fill_norms() used_words = [word for word in words if word in self] if len(used_words) != len(words): @@ -874,10 +953,26 @@ def doesnt_match(self, words): logger.warning("vectors for words %s are not present in the model, ignoring these words", ignored_words) if not used_words: raise ValueError("cannot select a word from an empty list") - vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL) + vectors = vstack([self.get_vector(word, use_norm=use_norm) for word in used_words]).astype(REAL) mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL) dists = dot(vectors, mean) - return sorted(zip(dists, used_words))[0][1] + return sorted(zip(dists, used_words), reverse=True) + + def doesnt_match(self, words): + """Which key from the given list doesn't go with the others? 
+ + Parameters + ---------- + words : list of str + List of keys. + + Returns + ------- + str + The key further away from the mean of all keys. + + """ + return self.rank_by_centrality(words)[-1][1] @staticmethod def cosine_similarities(vector_1, vectors_all): @@ -925,27 +1020,27 @@ def distances(self, word_or_vector, other_words=()): If either `word_or_vector` or any word in `other_words` is absent from vocab. """ - if isinstance(word_or_vector, string_types): - input_vector = self.word_vec(word_or_vector) + if isinstance(word_or_vector, KEY_TYPES): + input_vector = self.get_vector(word_or_vector) else: input_vector = word_or_vector if not other_words: other_vectors = self.vectors else: - other_indices = [self.vocab[word].index for word in other_words] + other_indices = [self.get_index(word) for word in other_words] other_vectors = self.vectors[other_indices] return 1 - self.cosine_similarities(input_vector, other_vectors) def distance(self, w1, w2): - """Compute cosine distance between two words. - Calculate 1 - :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity`. + """Compute cosine distance between two keys. + Calculate 1 - :meth:`~gensim.models.keyedvectors.KeyedVectors.similarity`. Parameters ---------- w1 : str - Input word. + Input key. w2 : str - Input word. + Input key. Returns ------- @@ -956,14 +1051,14 @@ def distance(self, w1, w2): return 1 - self.similarity(w1, w2) def similarity(self, w1, w2): - """Compute cosine similarity between two words. + """Compute cosine similarity between two keys. Parameters ---------- w1 : str - Input word. + Input key. w2 : str - Input word. + Input key. Returns ------- @@ -974,14 +1069,14 @@ def similarity(self, w1, w2): return dot(matutils.unitvec(self[w1]), matutils.unitvec(self[w2])) def n_similarity(self, ws1, ws2): - """Compute cosine similarity between two sets of words. + """Compute cosine similarity between two sets of keys. Parameters ---------- ws1 : list of str - Sequence of words. + Sequence of keys. ws2: list of str - Sequence of words. + Sequence of keys. Returns ------- @@ -991,14 +1086,14 @@ def n_similarity(self, ws1, ws2): """ if not(len(ws1) and len(ws2)): raise ZeroDivisionError('At least one of the passed list is empty.') - v1 = [self[word] for word in ws1] - v2 = [self[word] for word in ws2] + v1 = [self[key] for key in ws1] + v2 = [self[key] for key in ws2] return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0))) @staticmethod def _log_evaluate_word_analogies(section): """Calculate score by section, helper for - :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.evaluate_word_analogies`. + :meth:`~gensim.models.keyedvectors.KeyedVectors.evaluate_word_analogies`. Parameters ---------- @@ -1020,7 +1115,7 @@ def _log_evaluate_word_analogies(section): def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): """Compute performance of the model on an analogy test set. - This is modern variant of :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.accuracy`, see + This is modern variant of :meth:`~gensim.models.keyedvectors.KeyedVectors.accuracy`, see `discussion on GitHub #1935 `_. The accuracy is reported (printed to log and returned as a score) for each section separately, @@ -1057,8 +1152,11 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi keys 'correct' and 'incorrect'. 
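
The pairwise helpers keep their shapes but are now phrased in terms of keys; a minimal sketch:

.. code-block:: python

    import numpy as np
    from gensim.models import KeyedVectors

    kv = KeyedVectors(vector_size=3)
    kv.add(["coffee", "tea", "water", "laptop"], np.random.rand(4, 3))

    print(kv.similarity("coffee", "tea"))             # cosine similarity
    print(kv.distance("coffee", "tea"))               # 1 - similarity
    print(kv.distances("coffee", ["tea", "water"]))   # one vectorised pass
    print(kv.n_similarity(["coffee", "tea"], ["water", "laptop"]))
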
""" - ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]] - ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab) + ok_keys = self.index_to_key[:restrict_vocab] + if case_insensitive: + ok_vocab = {k.upper(): self.get_index(k) for k in reversed(ok_keys)} + else: + ok_vocab = {k: self.get_index(k) for k in reversed(ok_keys)} oov = 0 logger.info("Evaluating word analogies for top %i words in the model on %s", restrict_vocab, analogies) sections, section = [], None @@ -1092,14 +1190,14 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi else: logger.debug("Skipping line #%i with OOV words: %s", line_no, line.strip()) continue - original_vocab = self.vocab - self.vocab = ok_vocab + original_key_to_index = self.key_to_index + self.key_to_index = ok_vocab ignore = {a, b, c} # input words to be ignored predicted = None # find the most likely prediction using 3CosAdd (vector offset) method # TODO: implement 3CosMul and set-based methods for solving analogies sims = self.most_similar(positive=[b, c], negative=[a], topn=5, restrict_vocab=restrict_vocab) - self.vocab = original_vocab + self.key_to_index = original_key_to_index for element in sims: predicted = element[0].upper() if case_insensitive else element[0] if predicted in ok_vocab and predicted not in ignore: @@ -1142,95 +1240,6 @@ def log_accuracy(section): section['section'], 100.0 * correct / (correct + incorrect), correct, correct + incorrect ) - @deprecated("Method will be removed in 4.0.0, use self.evaluate_word_analogies() instead") - def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True): - """Compute accuracy of the model. - - The accuracy is reported (=printed to log and returned as a list) for each - section separately, plus there's one aggregate summary at the end. - - Parameters - ---------- - questions : str - Path to file, where lines are 4-tuples of words, split into sections by ": SECTION NAME" lines. - See `gensim/test/test_data/questions-words.txt` as example. - restrict_vocab : int, optional - Ignore all 4-tuples containing a word not in the first `restrict_vocab` words. - This may be meaningful if you've sorted the model vocabulary by descending frequency (which is standard - in modern word embedding models). - most_similar : function, optional - Function used for similarity calculation. - case_insensitive : bool, optional - If True - convert all words to their uppercase form before evaluating the performance. - Useful to handle case-mismatch between training tokens and words in the test set. - In case of multiple case variants of a single word, the vector for the first occurrence - (also the most frequent if vocabulary is sorted) is taken. - - Returns - ------- - list of dict of (str, (str, str, str) - Full lists of correct and incorrect predictions divided by sections. 
- - """ - ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]] - ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab) - - sections, section = [], None - with utils.open(questions, 'rb') as fin: - for line_no, line in enumerate(fin): - # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed - line = utils.to_unicode(line) - if line.startswith(': '): - # a new section starts => store the old section - if section: - sections.append(section) - self.log_accuracy(section) - section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []} - else: - if not section: - raise ValueError("Missing section header before line #%i in %s" % (line_no, questions)) - try: - if case_insensitive: - a, b, c, expected = [word.upper() for word in line.split()] - else: - a, b, c, expected = [word for word in line.split()] - except ValueError: - logger.info("Skipping invalid line #%i in %s", line_no, questions) - continue - if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab: - logger.debug("Skipping line #%i with OOV words: %s", line_no, line.strip()) - continue - original_vocab = self.vocab - self.vocab = ok_vocab - ignore = {a, b, c} # input words to be ignored - predicted = None - # find the most likely prediction, ignoring OOV words and input words - sims = most_similar(self, positive=[b, c], negative=[a], topn=None, restrict_vocab=restrict_vocab) - self.vocab = original_vocab - for index in matutils.argsort(sims, reverse=True): - predicted = self.index2word[index].upper() if case_insensitive else self.index2word[index] - if predicted in ok_vocab and predicted not in ignore: - if predicted != expected: - logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted) - break - if predicted == expected: - section['correct'].append((a, b, c, expected)) - else: - section['incorrect'].append((a, b, c, expected)) - if section: - # store the last section, too - sections.append(section) - self.log_accuracy(section) - - total = { - 'section': 'total', - 'correct': list(chain.from_iterable(s['correct'] for s in sections)), - 'incorrect': list(chain.from_iterable(s['incorrect'] for s in sections)), - } - self.log_accuracy(total) - sections.append(total) - return sections - @staticmethod def log_evaluate_word_pairs(pearson, spearman, oov, pairs): logger.info('Pearson correlation coefficient against %s: %.4f', pairs, pearson[0]) @@ -1278,15 +1287,18 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, The ratio of pairs with unknown words. 
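
`evaluate_word_pairs()` gets the same `key_to_index` treatment; a sketch using the `wordsim353.tsv` file shipped with gensim's test data (illustrative choice of dataset and model):

.. code-block:: python

    import gensim.downloader as api
    from gensim.test.utils import datapath

    kv = api.load("glove-wiki-gigaword-50")    # any trained KeyedVectors will do

    pearson, spearman, oov_ratio = kv.evaluate_word_pairs(datapath("wordsim353.tsv"))
    print("Pearson r=%.3f  Spearman rho=%.3f  OOV=%.1f%%"
          % (pearson[0], spearman[0], oov_ratio))
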
""" - ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]] - ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab) + ok_keys = self.index_to_key[:restrict_vocab] + if case_insensitive: + ok_vocab = {k.upper(): self.get_index(k) for k in reversed(ok_keys)} + else: + ok_vocab = {k: self.get_index(k) for k in reversed(ok_keys)} similarity_gold = [] similarity_model = [] oov = 0 - original_vocab = self.vocab - self.vocab = ok_vocab + original_key_to_index = self.key_to_index + self.key_to_index = ok_vocab with utils.open(pairs, 'rb') as fin: for line_no, line in enumerate(fin): @@ -1316,7 +1328,7 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, continue similarity_gold.append(sim) # Similarity from the dataset similarity_model.append(self.similarity(a, b)) # Similarity from the model - self.vocab = original_vocab + self.key_to_index = original_key_to_index spearman = stats.spearmanr(similarity_gold, similarity_model) pearson = stats.pearsonr(similarity_gold, similarity_model) if dummy4unknown: @@ -1333,25 +1345,37 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, self.log_evaluate_word_pairs(pearson, spearman, oov_ratio, pairs) return pearson, spearman, oov_ratio + @deprecated("use fill_norms instead") def init_sims(self, replace=False): - """Precompute L2-normalized vectors. + """Precompute data helpful for bulk similarity calculations. + + :meth:`~gensim.models.keyedvectors.KeyedVectors.fill_norms` now preferred for this purpose. Parameters ---------- replace : bool, optional - If True - forget the original vectors and only keep the normalized ones = saves lots of memory! + If True - forget the original vectors and only keep the normalized ones. Warnings -------- - You **cannot continue training** after doing a replace. - The model becomes effectively read-only: you can call - :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.most_similar`, - :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity`, etc., but not train. + You **cannot sensibly continue training** after doing a replace on a model's + internal KeyedVectors, and a replace is no longer necessary to save RAM. + + """ + self.fill_norms() + if replace: + logger.warning("destructive init_sims(replace=True) deprecated & no longer required for space-efficiency") + self.unit_normalize_all() + + def unit_normalize_all(self): + """Destructively scale all vectors to unit-length. + + (You cannot sensibly continue training after such a step.) """ - if getattr(self, 'vectors_norm', None) is None or replace: - logger.info("precomputing L2-norms of word weight vectors") - self.vectors_norm = _l2_norm(self.vectors, replace=replace) + self.fill_norms() + self.vectors /= self.norms[..., np.newaxis] + self.norms = np.ones((len(self.vectors),)) def relative_cosine_similarity(self, wa, wb, topn=10): """Compute the relative cosine similarity between two words given top-n similar words, @@ -1383,136 +1407,74 @@ def relative_cosine_similarity(self, wa, wb, topn=10): return rcs - def get_keras_embedding(self, train_embeddings=False, word_index=None): - """Get a Keras 'Embedding' layer with weights set as the Word2Vec model's learned word embeddings. 
+ def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None, write_header=True, + prefix='', append=False, sort_attr='count'): + """Store the input-hidden weight matrix in the same format used by the original + C word2vec-tool, for compatibility. Parameters ---------- - train_embeddings : bool - If False, the weights are frozen and stopped from being updated. - If True, the weights can/will be further trained/updated. - - word_index : {str : int} - A mapping from tokens to their indices the way they will be provided in the input to the embedding layer. - The embedding of each token will be placed at the corresponding index in the returned matrix. - Tokens not in the index are ignored. - This is useful when the token indices are produced by a process that is not coupled with the embedding - model, e.x. an Keras Tokenizer object. - If None, the embedding matrix in the embedding layer will be indexed according to self.vocab - - Returns - ------- - `keras.layers.Embedding` - Embedding layer. + fname : str + The file path used to save the vectors in. + fvocab : str, optional + File path used to save the vocabulary. + binary : bool, optional + If True, the data wil be saved in binary word2vec format, else it will be saved in plain text. + total_vec : int, optional + Explicitly specify total number of vectors + (in case word vectors are appended with document vectors afterwards). + write_header : bool, optional + If False, don't write the 1st line declaring the count of vectors and dimensions. + TODO: doc prefix, append, sort_attr + """ + if total_vec is None: + total_vec = len(self.index_to_key) + mode = 'wb' if not append else 'ab' + if 'count' in self.expandos: + # if frequency-info available, store in most-to-least-frequent order + store_order_vocab_keys = sorted(self.key_to_index.keys(), key=lambda k: -self.get_vecattr(k, sort_attr)) + else: + store_order_vocab_keys = self.index_to_key + + if fvocab is not None: + logger.info("storing vocabulary in %s", fvocab) + with utils.open(fvocab, mode) as vout: + for word in store_order_vocab_keys: + vout.write(utils.to_utf8("%s%s %s\n" % (prefix, word, self.get_vecattr(word, sort_attr)))) + + logger.info("storing %sx%s projection weights into %s", total_vec, self.vector_size, fname) + assert (len(self.index_to_key), self.vector_size) == self.vectors.shape + + # after (possibly-empty) initial range of int-only keys, + # store in sorted order: most frequent keys at the top + index_id_count = 0 + for i, val in enumerate(self.index_to_key): + if not (i == val): + break + index_id_count += 1 + keys_to_write = chain(range(0, index_id_count), store_order_vocab_keys) + + with utils.open(fname, mode) as fout: + if write_header: + fout.write(utils.to_utf8("%s %s\n" % (total_vec, self.vector_size))) + for key in keys_to_write: + row = self[key] + if binary: + row = row.astype(REAL) + fout.write(utils.to_utf8(prefix + str(key)) + b" " + row.tostring()) + else: + fout.write(utils.to_utf8("%s%s %s\n" % (prefix, str(key), ' '.join(repr(val) for val in row)))) - Raises - ------ - ImportError - If `Keras `_ not installed. + @classmethod + def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', + limit=None, datatype=REAL, no_header=False): + """Load the input-hidden weight matrix from the original C word2vec-tool format. Warnings -------- - Current method works only if `Keras `_ installed. 
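
A sketch of the extended `save_word2vec_format()` (file paths are illustrative; the new `write_header`/`prefix`/`append` parameters support appending further vectors, e.g. doc-vectors after word-vectors):

.. code-block:: python

    import numpy as np
    from gensim.models import KeyedVectors

    kv = KeyedVectors(vector_size=3)
    kv.add(["night", "day"], np.random.rand(2, 3))
    kv.set_vecattr("night", "count", 5)
    kv.set_vecattr("day", "count", 9)

    # Plain-text word2vec format with the usual "<count> <dims>" header line;
    # when a 'count' vecattr exists, keys are written most-frequent first.
    kv.save_word2vec_format("/tmp/vectors.txt", binary=False)

    # Header-less variant, e.g. when more vectors will be appended later.
    kv.save_word2vec_format("/tmp/vectors_nh.txt", binary=False, write_header=False)
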
- - """ - try: - from keras.layers import Embedding - except ImportError: - raise ImportError("Please install Keras to use this function") - if word_index is None: - weights = self.vectors - else: - max_index = max(word_index.values()) - weights = np.random.normal(size=(max_index + 1, self.vectors.shape[1])) - for word, index in word_index.items(): - if word in self.vocab: - weights[index] = self.get_vector(word) - - layer = Embedding( - input_dim=weights.shape[0], output_dim=weights.shape[1], - weights=[weights], trainable=train_embeddings - ) - return layer - - -class WordEmbeddingSimilarityIndex(TermSimilarityIndex): - """ - Computes cosine similarities between word embeddings and retrieves the closest word embeddings - by cosine similarity for a given word embedding. - - Parameters - ---------- - keyedvectors : :class:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors` - The word embeddings. - threshold : float, optional - Only embeddings more similar than `threshold` are considered when retrieving word embeddings - closest to a given word embedding. - exponent : float, optional - Take the word embedding similarities larger than `threshold` to the power of `exponent`. - kwargs : dict or None - A dict with keyword arguments that will be passed to the `keyedvectors.most_similar` method - when retrieving the word embeddings closest to a given word embedding. - - See Also - -------- - :class:`~gensim.similarities.termsim.SparseTermSimilarityMatrix` - Build a term similarity matrix and compute the Soft Cosine Measure. - - """ - def __init__(self, keyedvectors, threshold=0.0, exponent=2.0, kwargs=None): - assert isinstance(keyedvectors, WordEmbeddingsKeyedVectors) - self.keyedvectors = keyedvectors - self.threshold = threshold - self.exponent = exponent - self.kwargs = kwargs or {} - super(WordEmbeddingSimilarityIndex, self).__init__() - - def most_similar(self, t1, topn=10): - if t1 not in self.keyedvectors.vocab: - logger.debug('an out-of-dictionary term "%s"', t1) - else: - most_similar = self.keyedvectors.most_similar(positive=[t1], topn=topn, **self.kwargs) - for t2, similarity in most_similar: - if similarity > self.threshold: - yield (t2, similarity**self.exponent) - - -class Word2VecKeyedVectors(WordEmbeddingsKeyedVectors): - """Mapping between words and vectors for the :class:`~gensim.models.Word2Vec` model. - Used to perform operations on the vectors such as vector lookup, distance, similarity etc. - - """ - def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None): - """Store the input-hidden weight matrix in the same format used by the original - C word2vec-tool, for compatibility. - - Parameters - ---------- - fname : str - The file path used to save the vectors in - fvocab : str, optional - Optional file path used to save the vocabulary - binary : bool, optional - If True, the data will be saved in binary word2vec format, else it will be saved in plain text. - total_vec : int, optional - Optional parameter to explicitly specify total no. of vectors - (in case word vectors are appended with document vectors afterwards). - - """ - # from gensim.models.word2vec import save_word2vec_format - _save_word2vec_format( - fname, self.vocab, self.vectors, fvocab=fvocab, binary=binary, total_vec=total_vec) - - @classmethod - def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', - limit=None, datatype=REAL): - """Load the input-hidden weight matrix from the original C word2vec-tool format. 
- - Warnings - -------- - The information stored in the file is incomplete (the binary tree is missing), - so while you can query for word similarity etc., you cannot continue training - with a model loaded this way. + The information stored in the file is incomplete (the binary tree is missing), + so while you can query for word similarity etc., you cannot continue training + with a model loaded this way. Parameters ---------- @@ -1536,991 +1498,359 @@ def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', datatype : type, optional (Experimental) Can coerce dimensions to a non-default float type (such as `np.float16`) to save memory. Such types may result in much slower bulk operations or incompatibility with optimized routines.) + no_header : bool, optional + Default False means a usual word2vec-format file, with a 1st line declaring the count of + following vectors & number of dimensions. If True, the file is assumed to lack a declaratory + (vocab_size, vector_size) header and instead start with the 1st vector, and an extra + reading-pass will be used to discover the number of vectors. Works only with `binary=False`. Returns ------- - :class:`~gensim.models.keyedvectors.Word2VecKeyedVectors` + :class:`~gensim.models.keyedvectors.KeyedVectors` Loaded model. """ - # from gensim.models.word2vec import load_word2vec_format return _load_word2vec_format( cls, fname, fvocab=fvocab, binary=binary, encoding=encoding, unicode_errors=unicode_errors, - limit=limit, datatype=datatype) - - @classmethod - def load(cls, fname_or_handle, **kwargs): - model = super(WordEmbeddingsKeyedVectors, cls).load(fname_or_handle, **kwargs) - if isinstance(model, FastTextKeyedVectors): - if not hasattr(model, 'compatible_hash'): - model.compatible_hash = False - - return model - - -KeyedVectors = Word2VecKeyedVectors # alias for backward compatibility - - -class Doc2VecKeyedVectors(BaseKeyedVectors): - - def __init__(self, vector_size, mapfile_path): - super(Doc2VecKeyedVectors, self).__init__(vector_size=vector_size) - self.doctags = {} # string -> Doctag (only filled if necessary) - self.max_rawint = -1 # highest rawint-indexed doctag - self.offset2doctag = [] # int offset-past-(max_rawint+1) -> String (only filled if necessary) - self.count = 0 - self.vectors_docs = [] - self.mapfile_path = mapfile_path - self.vector_size = vector_size - self.vectors_docs_norm = None - - @property - def index2entity(self): - return self.offset2doctag - - @index2entity.setter - def index2entity(self, value): - self.offset2doctag = value - - @property - @deprecated("Attribute will be removed in 4.0.0, use docvecs.vectors_docs instead") - def doctag_syn0(self): - return self.vectors_docs - - @property - @deprecated("Attribute will be removed in 4.0.0, use docvecs.vectors_docs_norm instead") - def doctag_syn0norm(self): - return self.vectors_docs_norm - - def __getitem__(self, index): - """Get vector representation of `index`. - - Parameters - ---------- - index : {str, list of str} - Doctag or sequence of doctags. - - Returns - ------- - numpy.ndarray - Vector representation for `index` (1D if `index` is string, otherwise - 2D). 
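
And the matching load side, including the new `no_header` mode (the first path continues the save sketch above; the GloVe filename is hypothetical):

.. code-block:: python

    from gensim.models import KeyedVectors

    # Standard C word2vec text/binary formats load as before.
    kv = KeyedVectors.load_word2vec_format("/tmp/vectors.txt", binary=False)

    # Files lacking the leading "<count> <dims>" line -- such as raw GloVe .txt
    # dumps -- can be read directly; an extra pass first counts the vectors.
    glove = KeyedVectors.load_word2vec_format("glove.6B.100d.txt",
                                              binary=False, no_header=True)
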
- - """ - if index in self: - if isinstance(index, string_types + integer_types + (integer,)): - return self.vectors_docs[self._int_index(index, self.doctags, self.max_rawint)] - return vstack([self[i] for i in index]) - raise KeyError("tag '%s' not seen in training corpus/invalid" % index) - - def __contains__(self, index): - if isinstance(index, integer_types + (integer,)): - return index < self.count - else: - return index in self.doctags - - def __len__(self): - return self.count - - def save(self, *args, **kwargs): - """Save object. - - Parameters - ---------- - fname : str - Path to the output file. - - See Also - -------- - :meth:`~gensim.models.keyedvectors.Doc2VecKeyedVectors.load` - Load object. - - """ - # don't bother storing the cached normalized vectors - kwargs['ignore'] = kwargs.get('ignore', ['vectors_docs_norm']) - super(Doc2VecKeyedVectors, self).save(*args, **kwargs) - - def init_sims(self, replace=False): - """Precompute L2-normalized vectors. - - Parameters - ---------- - replace : bool, optional - If True - forget the original vectors and only keep the normalized ones = saves lots of memory! - - Warnings - -------- - You **cannot continue training** after doing a replace. - The model becomes effectively read-only: you can call - :meth:`~gensim.models.keyedvectors.Doc2VecKeyedVectors.most_similar`, - :meth:`~gensim.models.keyedvectors.Doc2VecKeyedVectors.similarity`, etc., but not train and infer_vector. - - """ - if getattr(self, 'vectors_docs_norm', None) is None or replace: - logger.info("precomputing L2-norms of doc weight vectors") - if not replace and self.mapfile_path: - self.vectors_docs_norm = np_memmap( - self.mapfile_path + '.vectors_docs_norm', dtype=REAL, - mode='w+', shape=self.vectors_docs.shape) - else: - self.vectors_docs_norm = _l2_norm(self.vectors_docs, replace=replace) - - def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip_end=None, indexer=None): - """Find the top-N most similar docvecs from the training set. - Positive docvecs contribute positively towards the similarity, negative docvecs negatively. - - This method computes cosine similarity between a simple mean of the projection - weight vectors of the given docs. Docs may be specified as vectors, integer indexes - of trained docvecs, or if the documents were originally presented with string tags, - by the corresponding tags. - - TODO: Accept vectors of out-of-training-set docs, as if from inference. - - Parameters - ---------- - positive : list of {str, int}, optional - List of doctags/indexes that contribute positively. - negative : list of {str, int}, optional - List of doctags/indexes that contribute negatively. - topn : int or None, optional - Number of top-N similar docvecs to return, when `topn` is int. When `topn` is None, - then similarities for all docvecs are returned. - clip_start : int - Start clipping index. - clip_end : int - End clipping index. - - Returns - ------- - list of ({str, int}, float) - Sequence of (doctag/index, similarity). 
- - """ - if isinstance(topn, Integral) and topn < 1: - return [] - - if positive is None: - positive = [] - if negative is None: - negative = [] - - self.init_sims() - clip_end = clip_end or len(self.vectors_docs_norm) - - if isinstance(positive, string_types + integer_types + (integer,)) and not negative: - # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog']) - positive = [positive] - - # add weights for each doc, if not already present; default to 1.0 for positive and -1.0 for negative docs - positive = [ - (doc, 1.0) if isinstance(doc, string_types + integer_types + (ndarray, integer)) - else doc for doc in positive - ] - negative = [ - (doc, -1.0) if isinstance(doc, string_types + integer_types + (ndarray, integer)) - else doc for doc in negative - ] - - # compute the weighted average of all docs - all_docs, mean = set(), [] - for doc, weight in positive + negative: - if isinstance(doc, ndarray): - mean.append(weight * doc) - elif doc in self.doctags or doc < self.count: - mean.append(weight * self.vectors_docs_norm[self._int_index(doc, self.doctags, self.max_rawint)]) - all_docs.add(self._int_index(doc, self.doctags, self.max_rawint)) - else: - raise KeyError("doc '%s' not in trained set" % doc) - if not mean: - raise ValueError("cannot compute similarity with no input") - mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL) - - if indexer is not None and isinstance(topn, int): - return indexer.most_similar(mean, topn) - - dists = dot(self.vectors_docs_norm[clip_start:clip_end], mean) - if not topn: - return dists - best = matutils.argsort(dists, topn=topn + len(all_docs), reverse=True) - # ignore (don't return) docs from the input - result = [ - (self._index_to_doctag(sim + clip_start, self.offset2doctag, self.max_rawint), float(dists[sim])) - for sim in best - if (sim + clip_start) not in all_docs - ] - return result[:topn] - - def doesnt_match(self, docs): - """Which document from the given list doesn't go with the others from the training set? - - TODO: Accept vectors of out-of-training-set docs, as if from inference. - - Parameters - ---------- - docs : list of {str, int} - Sequence of doctags/indexes. - - Returns - ------- - {str, int} - Doctag/index of the document farthest away from the mean of all the documents. - - """ - self.init_sims() - - docs = [doc for doc in docs if doc in self.doctags or 0 <= doc < self.count] # filter out unknowns - logger.debug("using docs %s", docs) - if not docs: - raise ValueError("cannot select a doc from an empty list") - vectors = vstack( - self.vectors_docs_norm[self._int_index(doc, self.doctags, self.max_rawint)] for doc in docs).astype(REAL) - mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL) - dists = dot(vectors, mean) - return sorted(zip(dists, docs))[0][1] - - def similarity(self, d1, d2): - """Compute cosine similarity between two docvecs from the training set. - - TODO: Accept vectors of out-of-training-set docs, as if from inference. - - Parameters - ---------- - d1 : {int, str} - Doctag/index of document. - d2 : {int, str} - Doctag/index of document. - - Returns - ------- - float - The cosine similarity between the vectors of the two documents. - - """ - return dot(matutils.unitvec(self[d1]), matutils.unitvec(self[d2])) - - def n_similarity(self, ds1, ds2): - """Compute cosine similarity between two sets of docvecs from the trained set. - - TODO: Accept vectors of out-of-training-set docs, as if from inference. 
- - Parameters - ---------- - ds1 : list of {str, int} - Set of document as sequence of doctags/indexes. - ds2 : list of {str, int} - Set of document as sequence of doctags/indexes. - - Returns - ------- - float - The cosine similarity between the means of the documents in each of the two sets. - - """ - v1 = [self[doc] for doc in ds1] - v2 = [self[doc] for doc in ds2] - return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0))) - - def distance(self, d1, d2): - """ - Compute cosine distance between two documents. - - """ - return 1 - self.similarity(d1, d2) - - # required by base keyed vectors class - def distances(self, d1, other_docs=()): - """Compute cosine distances from given `d1` to all documents in `other_docs`. - - TODO: Accept vectors of out-of-training-set docs, as if from inference. - - Parameters - ---------- - d1 : {str, numpy.ndarray} - Doctag/index of document. - other_docs : iterable of {str, int} - Sequence of doctags/indexes. - If None or empty, distance of `d1` from all doctags in vocab is computed (including itself). - - Returns - ------- - numpy.array - Array containing distances to all documents in `other_docs` from input `d1`. - - """ - input_vector = self[d1] - if not other_docs: - other_vectors = self.vectors_docs - else: - other_vectors = self[other_docs] - return 1 - WordEmbeddingsKeyedVectors.cosine_similarities(input_vector, other_vectors) + limit=limit, datatype=datatype, no_header=no_header) - def similarity_unseen_docs(self, model, doc_words1, doc_words2, alpha=None, min_alpha=None, steps=None): - """Compute cosine similarity between two post-bulk out of training documents. + def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='utf8', unicode_errors='strict'): + """Merge in an input-hidden weight matrix loaded from the original C word2vec-tool format, + where it intersects with the current vocabulary. - Parameters - ---------- - model : :class:`~gensim.models.doc2vec.Doc2Vec` - An instance of a trained `Doc2Vec` model. - doc_words1 : list of str - Input document. - doc_words2 : list of str - Input document. - alpha : float, optional - The initial learning rate. - min_alpha : float, optional - Learning rate will linearly drop to `min_alpha` as training progresses. - steps : int, optional - Number of epoch to train the new document. - - Returns - ------- - float - The cosine similarity between `doc_words1` and `doc_words2`. - - """ - d1 = model.infer_vector(doc_words=doc_words1, alpha=alpha, min_alpha=min_alpha, steps=steps) - d2 = model.infer_vector(doc_words=doc_words2, alpha=alpha, min_alpha=min_alpha, steps=steps) - return dot(matutils.unitvec(d1), matutils.unitvec(d2)) - - def save_word2vec_format(self, fname, prefix='*dt_', fvocab=None, - total_vec=None, binary=False, write_first_line=True): - """Store the input-hidden weight matrix in the same format used by the original - C word2vec-tool, for compatibility. + No words are added to the existing vocabulary, but intersecting words adopt the file's weights, and + non-intersecting words are left alone. Parameters ---------- fname : str - The file path used to save the vectors in. - prefix : str, optional - Uniquely identifies doctags from word vocab, and avoids collision - in case of repeated string in doctag and word vocab. - fvocab : str, optional - UNUSED. - total_vec : int, optional - Explicitly specify total no. of vectors - (in case word vectors are appended with document vectors afterwards) + The file path to load the vectors from. 
+ lockf : float, optional + Lock-factor value to be set for any imported word-vectors; the + default value of 0.0 prevents further updating of the vector during subsequent + training. Use 1.0 to allow further training updates of merged vectors. binary : bool, optional - If True, the data will be saved in binary word2vec format, else it will be saved in plain text. - write_first_line : bool, optional - Whether to print the first line in the file. Useful when saving doc-vectors after word-vectors. - - """ - total_vec = total_vec or len(self) - with utils.open(fname, 'ab') as fout: - if write_first_line: - logger.info("storing %sx%s projection weights into %s", total_vec, self.vectors_docs.shape[1], fname) - fout.write(utils.to_utf8("%s %s\n" % (total_vec, self.vectors_docs.shape[1]))) - # store as in input order - for i in range(len(self)): - doctag = u"%s%s" % (prefix, self._index_to_doctag(i, self.offset2doctag, self.max_rawint)) - row = self.vectors_docs[i] - if binary: - fout.write(utils.to_utf8(doctag) + b" " + row.tostring()) - else: - fout.write(utils.to_utf8("%s %s\n" % (doctag, ' '.join("%f" % val for val in row)))) - - @staticmethod - def _int_index(index, doctags, max_rawint): - """Get int index for either string or int index.""" - if isinstance(index, integer_types + (integer,)): - return index - else: - return max_rawint + 1 + doctags[index].offset - - @staticmethod - def _index_to_doctag(i_index, offset2doctag, max_rawint): - """Get string key for given `i_index`, if available. Otherwise return raw int doctag (same int).""" - candidate_offset = i_index - max_rawint - 1 - if 0 <= candidate_offset < len(offset2doctag): - return offset2doctag[candidate_offset] - else: - return i_index - - # for backward compatibility - def index_to_doctag(self, i_index): - """Get string key for given `i_index`, if available. Otherwise return raw int doctag (same int).""" - candidate_offset = i_index - self.max_rawint - 1 - if 0 <= candidate_offset < len(self.offset2doctag): - return self.offset2doctag[candidate_offset] - else: - return i_index - - # for backward compatibility - def int_index(self, index, doctags, max_rawint): - """Get int index for either string or int index""" - if isinstance(index, integer_types + (integer,)): - return index - else: - return max_rawint + 1 + doctags[index].offset - - -class FastTextKeyedVectors(WordEmbeddingsKeyedVectors): - """Vectors and vocab for :class:`~gensim.models.fasttext.FastText`. - - Implements significant parts of the FastText algorithm. For example, - the :func:`word_vec` calculates vectors for out-of-vocabulary (OOV) - entities. FastText achieves this by keeping vectors for ngrams: - adding the vectors for the ngrams of an entity yields the vector for the - entity. - - Similar to a hashmap, this class keeps a fixed number of buckets, and - maps all ngrams to buckets using a hash function. - - This class also provides an abstraction over the hash functions used by - Gensim's FastText implementation over time. The hash function connects - ngrams to buckets. Originally, the hash function was broken and - incompatible with Facebook's implementation. The current hash is fully - compatible. - - Parameters - ---------- - vector_size : int - The dimensionality of all vectors. - min_n : int - The minimum number of characters in an ngram - max_n : int - The maximum number of characters in an ngram - bucket : int - The number of buckets. 
- compatible_hash : boolean - If True, uses the Facebook-compatible hash function instead of the - Gensim backwards-compatible hash function. - - Attributes - ---------- - vectors_vocab : np.array - Each row corresponds to a vector for an entity in the vocabulary. - Columns correspond to vector dimensions. - vectors_vocab_norm : np.array - Same as vectors_vocab, but the vectors are L2 normalized. - vectors_ngrams : np.array - A vector for each ngram across all entities in the vocabulary. - Each row is a vector that corresponds to a bucket. - Columns correspond to vector dimensions. - vectors_ngrams_norm : np.array - Same as vectors_ngrams, but the vectors are L2 normalized. - Under some conditions, may actually be the same matrix as - vectors_ngrams, e.g. if :func:`init_sims` was called with - replace=True. - buckets_word : dict - Maps vocabulary items (by their index) to the buckets they occur in. - - """ - def __init__(self, vector_size, min_n, max_n, bucket, compatible_hash): - super(FastTextKeyedVectors, self).__init__(vector_size=vector_size) - self.vectors_vocab = None - self.vectors_vocab_norm = None - self.vectors_ngrams = None - self.vectors_ngrams_norm = None - self.buckets_word = None - self.min_n = min_n - self.max_n = max_n - self.bucket = bucket - self.compatible_hash = compatible_hash - - @classmethod - def load(cls, fname_or_handle, **kwargs): - model = super(WordEmbeddingsKeyedVectors, cls).load(fname_or_handle, **kwargs) - _try_upgrade(model) - return model - - @property - @deprecated("Attribute will be removed in 4.0.0, use self.vectors_vocab instead") - def syn0_vocab(self): - return self.vectors_vocab - - @property - @deprecated("Attribute will be removed in 4.0.0, use self.vectors_vocab_norm instead") - def syn0_vocab_norm(self): - return self.vectors_vocab_norm - - @property - @deprecated("Attribute will be removed in 4.0.0, use self.vectors_ngrams instead") - def syn0_ngrams(self): - return self.vectors_ngrams - - @property - @deprecated("Attribute will be removed in 4.0.0, use self.vectors_ngrams_norm instead") - def syn0_ngrams_norm(self): - return self.vectors_ngrams_norm - - def __contains__(self, word): - """Check if `word` or any character ngrams in `word` are present in the vocabulary. - A vector for the word is guaranteed to exist if current method returns True. - - Parameters - ---------- - word : str - Input word. - - Returns - ------- - bool - True if `word` or any character ngrams in `word` are present in the vocabulary, False otherwise. - - Note - ---- - This method **always** returns True, because of the way FastText works. - - If you want to check if a word is an in-vocabulary term, use this instead: - - .. pycon: - - >>> from gensim.test.utils import datapath - >>> from gensim.models import FastText - >>> cap_path = datapath("crime-and-punishment.bin") - >>> model = FastText.load_fasttext_format(cap_path, full_model=False) - >>> 'steamtrain' in model.wv.vocab # If False, is an OOV term - False - - """ - return True - - def save(self, *args, **kwargs): - """Save object. - - Parameters - ---------- - fname : str - Path to the output file. - - See Also - -------- - :meth:`~gensim.models.keyedvectors.FastTextKeyedVectors.load` - Load object. + If True, `fname` is in the binary word2vec C format. + encoding : str, optional + Encoding of `text` for `unicode` function (python2 only). + unicode_errors : str, optional + Error handling behaviour, used as parameter for `unicode` function (python2 only). 
""" - # don't bother storing the cached normalized vectors - ignore_attrs = [ - 'vectors_norm', - 'vectors_vocab_norm', - 'vectors_ngrams_norm', - 'buckets_word', - 'hash2index', - ] - kwargs['ignore'] = kwargs.get('ignore', ignore_attrs) - super(FastTextKeyedVectors, self).save(*args, **kwargs) - - def word_vec(self, word, use_norm=False): - """Get `word` representations in vector space, as a 1D numpy array. + overlap_count = 0 + logger.info("loading projection weights from %s", fname) + with utils.open(fname, 'rb') as fin: + header = utils.to_unicode(fin.readline(), encoding=encoding) + vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format + if not vector_size == self.vector_size: + raise ValueError("incompatible vector size %d in file %s" % (vector_size, fname)) + # TOCONSIDER: maybe mismatched vectors still useful enough to merge (truncating/padding)? + if binary: + binary_len = dtype(REAL).itemsize * vector_size + for _ in range(vocab_size): + # mixed text and binary: read text first, then binary + word = [] + while True: + ch = fin.read(1) + if ch == b' ': + break + if ch != b'\n': # ignore newlines in front of words (some binary files have) + word.append(ch) + word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors) + weights = np.fromstring(fin.read(binary_len), dtype=REAL) + if word in self.key_to_index: + overlap_count += 1 + self.vectors[self.get_index(word)] = weights + self.vectors_lockf[self.get_index(word)] = lockf # lock-factor: 0.0=no changes + else: + for line_no, line in enumerate(fin): + parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ") + if len(parts) != vector_size + 1: + raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no) + word, weights = parts[0], [REAL(x) for x in parts[1:]] + if word in self.key_to_index: + overlap_count += 1 + self.vectors[self.get_index(word)] = weights + self.vectors_lockf[self.get_index(word)] = lockf # lock-factor: 0.0=no changes + logger.info("merged %d vectors into %s matrix from %s", overlap_count, self.wv.vectors.shape, fname) + + def get_keras_embedding(self, train_embeddings=False): + """Get a Keras 'Embedding' layer with weights set as the Word2Vec model's learned word embeddings. Parameters ---------- - word : str - Input word - use_norm : bool, optional - If True - resulting vector will be L2-normalized (unit euclidean length). + train_embeddings : bool + If False, the weights are frozen and stopped from being updated. + If True, the weights can/will be further trained/updated. Returns ------- - numpy.ndarray - Vector representation of `word`. + `keras.layers.Embedding` + Embedding layer. Raises ------ - KeyError - If word and all ngrams not in vocabulary. - - """ - if word in self.vocab: - return super(FastTextKeyedVectors, self).word_vec(word, use_norm) - elif self.bucket == 0: - raise KeyError('cannot calculate vector for OOV word without ngrams') - else: - word_vec = np.zeros(self.vectors_ngrams.shape[1], dtype=REAL) - ngram_hashes = ft_ngram_hashes(word, self.min_n, self.max_n, self.bucket, self.compatible_hash) - if len(ngram_hashes) == 0: - # - # If it is impossible to extract _any_ ngrams from the input - # word, then the best we can do is return a vector that points - # to the origin. The reference FB implementation does this, - # too. 
- # - # https://github.com/RaRe-Technologies/gensim/issues/2402 - # - logger.warning('could not extract any ngrams from %r, returning origin vector', word) - return word_vec - for nh in ngram_hashes: - word_vec += self.vectors_ngrams[nh] - result = word_vec / len(ngram_hashes) - if use_norm: - result /= sqrt(sum(result ** 2)) - return result - - def init_sims(self, replace=False): - """Precompute L2-normalized vectors. - - Parameters - ---------- - replace : bool, optional - If True - forget the original vectors and only keep the normalized ones = saves lots of memory! + ImportError + If `Keras `_ not installed. Warnings -------- - You **cannot continue training** after doing a replace. - The model becomes effectively read-only: you can call - :meth:`~gensim.models.keyedvectors.FastTextKeyedVectors.most_similar`, - :meth:`~gensim.models.keyedvectors.FastTextKeyedVectors.similarity`, etc., but not train. + Current method work only if `Keras `_ installed. """ - super(FastTextKeyedVectors, self).init_sims(replace) - if getattr(self, 'vectors_ngrams_norm', None) is None or replace: - logger.info("precomputing L2-norms of ngram weight vectors") - self.vectors_ngrams_norm = _l2_norm(self.vectors_ngrams, replace=replace) - - def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None): - """Store the input-hidden weight matrix in the same format used by the original - C word2vec-tool, for compatibility. - - Parameters - ---------- - fname : str - The file path used to save the vectors in - fvocab : str, optional - Optional file path used to save the vocabulary - binary : bool, optional - If True, the data wil be saved in binary word2vec format, else it will be saved in plain text. - total_vec : int, optional - Optional parameter to explicitly specify total no. of vectors - (in case word vectors are appended with document vectors afterwards). - - """ - # from gensim.models.word2vec import save_word2vec_format - _save_word2vec_format( - fname, self.vocab, self.vectors, fvocab=fvocab, binary=binary, total_vec=total_vec) - - def init_ngrams_weights(self, seed): - """Initialize the vocabulary and ngrams weights prior to training. - - Creates the weight matrices and initializes them with uniform random values. - - Parameters - ---------- - seed : float - The seed for the PRNG. - - Note - ---- - Call this **after** the vocabulary has been fully initialized. + try: + from keras.layers import Embedding + except ImportError: + raise ImportError("Please install Keras to use this function") + weights = self.vectors - """ - self.buckets_word = _process_fasttext_vocab( - self.vocab.items(), - self.min_n, - self.max_n, - self.bucket, - self.compatible_hash, + # set `trainable` as `False` to use the pretrained word embedding + # No extra mem usage here as `Embedding` layer doesn't create any new matrix for weights + layer = Embedding( + input_dim=weights.shape[0], output_dim=weights.shape[1], + weights=[weights], trainable=train_embeddings ) + return layer - rand_obj = np.random - rand_obj.seed(seed) - - lo, hi = -1.0 / self.vector_size, 1.0 / self.vector_size - vocab_shape = (len(self.vocab), self.vector_size) - ngrams_shape = (self.bucket, self.vector_size) - self.vectors_vocab = rand_obj.uniform(lo, hi, vocab_shape).astype(REAL) - - # - # We could have initialized vectors_ngrams at construction time, but we - # do it here for two reasons: - # - # 1. The constructor does not have access to the random seed - # 2. 
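A short sketch of using the resulting layer (assumes Keras is installed; the surrounding network is left out):

.. sourcecode:: pycon

    >>> from gensim.test.utils import common_texts
    >>> from gensim.models import Word2Vec
    >>>
    >>> model = Word2Vec(common_texts, vector_size=24, min_count=1)
    >>> layer = model.wv.get_keras_embedding(train_embeddings=False)  # frozen pretrained weights
    >>> # `layer` can now serve as the first layer of a Keras model, e.g. keras.Sequential([layer, ...])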
We want to use the same rand_obj to fill vectors_vocab _and_ - # vectors_ngrams, and vectors_vocab cannot happen at construction - # time because the vocab is not initialized at that stage. - # - self.vectors_ngrams = rand_obj.uniform(lo, hi, ngrams_shape).astype(REAL) - - def update_ngrams_weights(self, seed, old_vocab_len): - """Update the vocabulary weights for training continuation. - - Parameters - ---------- - seed : float - The seed for the PRNG. - old_vocab_length : int - The length of the vocabulary prior to its update. - - Note - ---- - Call this **after** the vocabulary has been updated. - - """ - self.buckets_word = _process_fasttext_vocab( - self.vocab.items(), - self.min_n, - self.max_n, - self.bucket, - self.compatible_hash, - ) + def _upconvert_old_d2vkv(self): + """Convert a deserialized older Doc2VecKeyedVectors instance to latest generic KeyedVectors""" + + self.vocab = self.doctags + self._upconvert_old_vocab() # destroys 'vocab', fills 'key_to_index' & 'extras' + for k in self.key_to_index.keys(): + old_offset = self.get_vecattr(k, 'offset') + true_index = old_offset + self.max_rawint + 1 + self.key_to_index[k] = true_index + del self.expandos['offset'] # no longer needed + if(self.max_rawint > -1): + self.index_to_key = list(range(0, self.max_rawint + 1)) + self.offset2doctag + else: + self.index_to_key = self.offset2doctag + self.vectors = self.vectors_docs + del self.doctags + del self.vectors_docs + del self.count + del self.max_rawint + del self.offset2doctag - rand_obj = np.random - rand_obj.seed(seed) + def similarity_unseen_docs(self, *args, **kwargs): + raise NotImplementedError("Call similarity_unseen_docs on a Doc2Vec model instead.") - new_vocab = len(self.vocab) - old_vocab_len - self.vectors_vocab = _pad_random(self.vectors_vocab, new_vocab, rand_obj) - def init_post_load(self, vectors): - """Perform initialization after loading a native Facebook model. +# to help 3.8.1 & older pickles load properly +Word2VecKeyedVectors = KeyedVectors +Doc2VecKeyedVectors = KeyedVectors +EuclideanKeyedVectors = KeyedVectors - Expects that the vocabulary (self.vocab) has already been initialized. - Parameters - ---------- - vectors : np.array - A matrix containing vectors for all the entities, including words - and ngrams. This comes directly from the binary model. - The order of the vectors must correspond to the indices in - the vocabulary. - match_gensim : boolean, optional - No longer supported. +class CompatVocab(object): + def __init__(self, **kwargs): + """A single vocabulary item, used internally for collecting per-word frequency/sampling info, + and for constructing binary trees (incl. both word leaves and inner nodes). + Retained for now to ease the loading of older models. """ - vocab_words = len(self.vocab) - assert vectors.shape[0] == vocab_words + self.bucket, 'unexpected number of vectors' - assert vectors.shape[1] == self.vector_size, 'unexpected vector dimensionality' + self.count = 0 + self.__dict__.update(kwargs) - # - # The incoming vectors contain vectors for both words AND - # ngrams. We split them into two separate matrices, because our - # implementation treats them differently. 
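A toy walk-through (illustrative values only) of the index layout this conversion assumes: plain-int doctags keep slots `0..max_rawint`, string doctags follow in offset order, so a stored offset maps to `offset + max_rawint + 1`:

.. sourcecode:: pycon

    >>> max_rawint = 1                          # raw int doctags 0 and 1 were used during training
    >>> offset2doctag = ['doc_a', 'doc_b']      # string doctags, in offset order
    >>> index_to_key = list(range(0, max_rawint + 1)) + offset2doctag
    >>> index_to_key
    [0, 1, 'doc_a', 'doc_b']
    >>> offset = 1                              # 'doc_b' was stored with offset 1
    >>> offset + max_rawint + 1                 # its slot in the unified KeyedVectors
    3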
- # - self.vectors = np.array(vectors[:vocab_words, :]) - self.vectors_vocab = np.array(vectors[:vocab_words, :]) - self.vectors_ngrams = np.array(vectors[vocab_words:, :]) - self.buckets_word = None # This can get initialized later + def __lt__(self, other): # used for sorting in a priority queue + return self.count < other.count - self.adjust_vectors() + def __str__(self): + vals = ['%s:%r' % (key, self.__dict__[key]) for key in sorted(self.__dict__) if not key.startswith('_')] + return "%s(%s)" % (self.__class__.__name__, ', '.join(vals)) - def adjust_vectors(self): - """Adjust the vectors for words in the vocabulary. - The adjustment relies on the vectors of the ngrams making up each - individual word. +# compatibility alias, allowing older pickle-based `.save()`s to load +Vocab = CompatVocab - """ - if self.bucket == 0: - return - for w, v in self.vocab.items(): - word_vec = np.copy(self.vectors_vocab[v.index]) - ngram_hashes = ft_ngram_hashes(w, self.min_n, self.max_n, self.bucket, self.compatible_hash) - for nh in ngram_hashes: - word_vec += self.vectors_ngrams[nh] - word_vec /= len(ngram_hashes) + 1 - self.vectors[v.index] = word_vec +# Functions for internal use by _load_word2vec_format function - @property - @deprecated("Attribute will be removed in 4.0.0, use self.bucket instead") - def num_ngram_vectors(self): - return self.bucket +def _add_word_to_kv(kv, counts, word, weights, vocab_size): + if kv.has_index_for(word): + logger.warning("duplicate word '%s' in word2vec file, ignoring all but first", word) + return + word_id = kv.add_one(word, weights) -def _process_fasttext_vocab(iterable, min_n, max_n, num_buckets, compatible_hash): - """ - Performs a common operation for FastText weight initialization and - updates: scan the vocabulary, calculate ngrams and their hashes, keep - track of new ngrams, the buckets that each word relates to via its - ngrams, etc. + if counts is None: + # most common scenario: no vocab file given. 
just make up some bogus counts, in descending order + # FIXME(someday): make this faking optional, include more realistic (Zipf-based) fake numbers + word_count = vocab_size - word_id + elif word in counts: + # use count from the vocab file + word_count = counts[word] + else: + logger.warning("vocabulary file is incomplete: '%s' is missing", word) + word_count = None + kv.set_vecattr(word, 'count', word_count) + + +def _add_bytes_to_kv(kv, counts, chunk, vocab_size, vector_size, datatype, unicode_errors): + start = 0 + processed_words = 0 + bytes_per_vector = vector_size * dtype(REAL).itemsize + max_words = vocab_size - kv.next_index # don't read more than kv preallocated to hold + assert max_words > 0 + for _ in range(max_words): + i_space = chunk.find(b' ', start) + i_vector = i_space + 1 + + if i_space == -1 or (len(chunk) - i_vector) < bytes_per_vector: + break + + word = chunk[start:i_space].decode("utf-8", errors=unicode_errors) + # Some binary files are reported to have obsolete new line in the beginning of word, remove it + word = word.lstrip('\n') + vector = frombuffer(chunk, offset=i_vector, count=vector_size, dtype=REAL).astype(datatype) + _add_word_to_kv(kv, counts, word, vector, vocab_size) + start = i_vector + bytes_per_vector + processed_words += 1 + + return processed_words, chunk[start:] + + +def _word2vec_read_binary(fin, kv, counts, vocab_size, vector_size, datatype, unicode_errors, binary_chunk_size): + chunk = b'' + tot_processed_words = 0 + + while tot_processed_words < vocab_size: + new_chunk = fin.read(binary_chunk_size) + chunk += new_chunk + processed_words, chunk = _add_bytes_to_kv( + kv, counts, chunk, vocab_size, vector_size, datatype, unicode_errors) + tot_processed_words += processed_words + if len(new_chunk) < binary_chunk_size: + break + if tot_processed_words != vocab_size: + raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?") + + +def _word2vec_read_text(fin, kv, counts, vocab_size, vector_size, datatype, unicode_errors, encoding): + for line_no in range(vocab_size): + line = fin.readline() + if line == b'': + raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?") + word, weights = _word2vec_line_to_vector(line, datatype, unicode_errors, encoding) + _add_word_to_kv(kv, counts, word, weights, vocab_size) + + +def _word2vec_line_to_vector(line, datatype, unicode_errors, encoding): + parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ") + word, weights = parts[0], [datatype(x) for x in parts[1:]] + return word, weights + + +def _word2vec_detect_sizes_text(fin, limit, datatype, unicode_errors, encoding): + vector_size = None + for vocab_size in itertools.count(): + line = fin.readline() + if line == b'' or vocab_size == limit: # EOF/max: return what we've got + break + if vector_size: + continue # don't bother parsing lines past the 1st + word, weights = _word2vec_line_to_vector(line, datatype, unicode_errors, encoding) + vector_size = len(weights) + return vocab_size, vector_size + + +def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', + limit=sys.maxsize, datatype=REAL, no_header=False, binary_chunk_size=100 * 1024): + """Load the input-hidden weight matrix from the original C word2vec-tool format. 
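For reference, the shape of one line in the text format these readers parse, mirroring `_word2vec_line_to_vector` above (toy values):

.. sourcecode:: pycon

    >>> line = b'king 0.125 -0.5 0.25\n'        # one token followed by vector_size floats
    >>> parts = line.rstrip().decode('utf8').split(' ')
    >>> word, weights = parts[0], [float(x) for x in parts[1:]]
    >>> word, weights
    ('king', [0.125, -0.5, 0.25])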
+
+    Note that the information stored in the file is incomplete (the binary tree is missing),
+    so while you can query for word similarity etc., you cannot continue training
+    with a model loaded this way.
 
     Parameters
     ----------
-    iterable : list
-        A list of (word, :class:`Vocab`) tuples.
-    min_n : int
-        The minimum length of ngrams.
-    max_n : int
-        The maximum length of ngrams.
-    num_buckets : int
-        The number of buckets used by the model.
-    compatible_hash : boolean
-        True for compatibility with the Facebook implementation.
-        False for compatibility with the old Gensim implementation.
+    fname : str
+        The file path to the saved word2vec-format file.
+    fvocab : str, optional
+        File path to the vocabulary. Word counts are read from `fvocab` filename, if set
+        (this is the file generated by the `-save-vocab` flag of the original C tool).
+    binary : bool, optional
+        If True, the data is in binary word2vec format; otherwise it is in plain text.
+    encoding : str, optional
+        If you trained the C model using non-utf8 encoding for words, specify that encoding in `encoding`.
+    unicode_errors : str, optional
+        default 'strict', is a string suitable to be passed as the `errors`
+        argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source
+        file may include word tokens truncated in the middle of a multibyte unicode character
+        (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.
+    limit : int, optional
+        Sets a maximum number of word-vectors to read from the file. By default,
+        all vectors are read.
+    datatype : type, optional
+        (Experimental) Can coerce dimensions to a non-default float type (such as `np.float16`) to save memory.
+        Such types may result in much slower bulk operations or incompatibility with optimized routines.
+    no_header : bool, optional
+        If True, the file starts directly with the vectors, without the usual
+        "<vocab_size> <vector_size>" header line (as in GloVe-style text dumps); both sizes are then
+        deduced from a first pass over the file (text format only).
+    binary_chunk_size : int, optional
+        Read input file in chunks of this many bytes, for performance reasons.
 
     Returns
     -------
-    dict
-        Keys are indices of entities in the vocabulary (words). Values are
-        arrays containing indices into vectors_ngrams for each ngram of the
-        word.
+    object
+        The loaded model, as an instance of :class:`cls`.
""" - word_indices = {} - if num_buckets == 0: - return {v.index: np.array([], dtype=np.uint32) for w, v in iterable} - - for word, vocab in iterable: - wi = [] - for ngram_hash in ft_ngram_hashes(word, min_n, max_n, num_buckets, compatible_hash): - wi.append(ngram_hash) - word_indices[vocab.index] = np.array(wi, dtype=np.uint32) + counts = None + if fvocab is not None: + logger.info("loading word counts from %s", fvocab) + counts = {} + with utils.open(fvocab, 'rb') as fin: + for line in fin: + word, count = utils.to_unicode(line, errors=unicode_errors).strip().split() + counts[word] = int(count) + + logger.info("loading projection weights from %s", fname) + with utils.open(fname, 'rb') as fin: + if no_header: + # deduce both vocab_size & vector_size from 1st pass over file + if binary: + raise NotImplementedError("no_header only available for text-format files") + else: # text + vocab_size, vector_size = _word2vec_detect_sizes_text(fin, limit, datatype, unicode_errors, encoding) + fin.close() + fin = utils.open(fname, 'rb') + else: + header = utils.to_unicode(fin.readline(), encoding=encoding) + vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format + if limit: + vocab_size = min(vocab_size, limit) + kv = cls(vector_size, vocab_size, dtype=datatype) + + if binary: + _word2vec_read_binary(fin, kv, counts, + vocab_size, vector_size, datatype, unicode_errors, binary_chunk_size) + else: + _word2vec_read_text(fin, kv, counts, vocab_size, vector_size, datatype, unicode_errors, encoding) + if kv.vectors.shape[0] != len(kv): + logger.info( + "duplicate words detected, shrinking matrix size from %i to %i", + kv.vectors.shape[0], len(kv) + ) + kv.vectors = ascontiguousarray(kv.vectors[: len(kv)]) + assert (len(kv), vector_size) == kv.vectors.shape - return word_indices + logger.info("loaded %s matrix from %s", kv.vectors.shape, fname) + return kv -def _pad_random(m, new_rows, rand): - """Pad a matrix with additional rows filled with random values.""" - rows, columns = m.shape - low, high = -1.0 / columns, 1.0 / columns - suffix = rand.uniform(low, high, (new_rows, columns)).astype(REAL) - return vstack([m, suffix]) +def load_word2vec_format(*args, **kwargs): + """Alias for `KeyedVectors.load_word2vec_format(...)`""" + return KeyedVectors.load_word2vec_format(*args, **kwargs) -def _l2_norm(m, replace=False): - """Return an L2-normalized version of a matrix. - - Parameters - ---------- - m : np.array - The matrix to normalize. - replace : boolean, optional - If True, modifies the existing matrix. - - Returns - ------- - The normalized matrix. If replace=True, this will be the same as m. +def pseudorandom_weak_vector(size, seed_string=None, hashfxn=hash): + """Get a 'random' vector (but deterministically derived from seed_string if supplied). + Useful for initializing KeyedVectors that will be the starting + projection/input layers of _2Vec models. """ - dist = sqrt((m ** 2).sum(-1))[..., newaxis] - if replace: - m /= dist - return m + if seed_string: + once = np.random.Generator(np.random.SFC64(hashfxn(seed_string) & 0xffffffff)) else: - return (m / dist).astype(REAL) - - -def _rollback_optimization(kv): - """Undo the optimization that pruned buckets. - - This unfortunate optimization saves memory and CPU cycles, but breaks - compatibility with Facebook's model by introducing divergent behavior - for OOV words. - - """ - logger.warning( - "This saved FastText model was trained with an optimization we no longer support. 
" - "The current Gensim version automatically reverses this optimization during loading. " - "Save the loaded model to a new file and reload to suppress this message." - ) - assert hasattr(kv, 'hash2index') - assert hasattr(kv, 'num_ngram_vectors') - - kv.vectors_ngrams = _unpack(kv.vectors_ngrams, kv.bucket, kv.hash2index) - - # - # We have replaced num_ngram_vectors with a property and deprecated it. - # We can't delete it because the new attribute masks the member. - # - del kv.hash2index - - -def _unpack_copy(m, num_rows, hash2index, seed=1): - """Same as _unpack, but makes a copy of the matrix. - - Simpler implementation, but uses more RAM. - - """ - rows, columns = m.shape - if rows == num_rows: - # - # Nothing to do. - # - return m - assert num_rows > rows - - rand_obj = np.random - rand_obj.seed(seed) - - n = np.empty((0, columns), dtype=m.dtype) - n = _pad_random(n, num_rows, rand_obj) - - for src, dst in hash2index.items(): - n[src] = m[dst] - - return n - - -def _unpack(m, num_rows, hash2index, seed=1): - """Restore the array to its natural shape, undoing the optimization. - - A packed matrix contains contiguous vectors for ngrams, as well as a hashmap. - The hash map maps the ngram hash to its index in the packed matrix. - To unpack the matrix, we need to do several things: - - 1. Restore the matrix to its "natural" shape, where the number of rows - equals the number of buckets. - 2. Rearrange the existing rows such that the hashmap becomes the identity - function and is thus redundant. - 3. Fill the new rows with random values. - - Parameters - ---------- - - m : np.ndarray - The matrix to restore. - num_rows : int - The number of rows that this array should have. - hash2index : dict - the product of the optimization we are undoing. - seed : float, optional - The seed for the PRNG. Will be used to initialize new rows. - - Returns - ------- - np.array - The unpacked matrix. - - Notes - ----- - - The unpacked matrix will reference some rows in the input matrix to save memory. - Throw away the old matrix after calling this function, or use np.copy. - - """ - orig_rows, orig_columns = m.shape - if orig_rows == num_rows: - # - # Nothing to do. - # - return m - assert num_rows > orig_rows - - rand_obj = np.random - rand_obj.seed(seed) - - # - # Rows at the top of the matrix (the first orig_rows) will contain "packed" learned vectors. - # Rows at the bottom of the matrix will be "free": initialized to random values. - # - m = _pad_random(m, num_rows - orig_rows, rand_obj) - - # - # Swap rows to transform hash2index into the identify function. - # There are two kinds of swaps. - # First, rearrange the rows that belong entirely within the original matrix dimensions. - # Second, swap out rows from the original matrix dimensions, replacing them with - # randomly initialized values. - # - # N.B. We only do the swap in one direction, because doing it in both directions - # nullifies the effect. - # - swap = {h: i for (h, i) in hash2index.items() if h < i < orig_rows} - swap.update({h: i for (h, i) in hash2index.items() if h >= orig_rows}) - for h, i in swap.items(): - assert h != i - m[[h, i]] = m[[i, h]] # swap rows i and h - - return m - - -def _try_upgrade(wv): - if hasattr(wv, 'hash2index'): - _rollback_optimization(wv) - - if not hasattr(wv, 'compatible_hash'): - logger.warning( - "This older model was trained with a buggy hash function. " - "The model will continue to work, but consider training it " - "from scratch." 
- ) - wv.compatible_hash = False + once = utils.default_prng + return (once.random(size).astype(REAL) - 0.5) / size diff --git a/gensim/models/poincare.py b/gensim/models/poincare.py index a5c4539e34..1133c52061 100644 --- a/gensim/models/poincare.py +++ b/gensim/models/poincare.py @@ -50,15 +50,13 @@ import numpy as np from collections import defaultdict, Counter -from numpy import random as np_random +from numpy import random as np_random, float32 as REAL from scipy.stats import spearmanr from six import string_types from six.moves import zip, range from gensim import utils, matutils -from gensim.models.keyedvectors import Vocab, BaseKeyedVectors -from gensim.models.utils_any2vec import _save_word2vec_format, _load_word2vec_format -from numpy import float32 as REAL +from gensim.models.keyedvectors import KeyedVectors try: from autograd import grad # Only required for optionally verifying gradients while training @@ -151,7 +149,7 @@ def __init__(self, train_data, size=50, alpha=0.1, negative=10, workers=1, epsil """ self.train_data = train_data - self.kv = PoincareKeyedVectors(size) + self.kv = PoincareKeyedVectors(size, 0) self.all_relations = [] self.node_relations = defaultdict(set) self._negatives_buffer = NegativesBuffer([]) @@ -208,51 +206,49 @@ def build_vocab(self, relations, update=False): >>> model.train(epochs=50) """ - old_index2word_len = len(self.kv.index2word) + old_index_to_key_len = len(self.kv.index_to_key) logger.info("loading relations from train data..") for relation in relations: if len(relation) != 2: raise ValueError('Relation pair "%s" should have exactly two items' % repr(relation)) for item in relation: - if item in self.kv.vocab: - self.kv.vocab[item].count += 1 + if item in self.kv.key_to_index: + self.kv.set_vecattr(item, 'count', self.kv.get_vecattr(item, 'count') + 1) else: - self.kv.vocab[item] = Vocab(count=1, index=len(self.kv.index2word)) - self.kv.index2word.append(item) + self.kv.key_to_index[item] = len(self.kv.index_to_key) + self.kv.index_to_key.append(item) + self.kv.set_vecattr(item, 'count', 1) + node_1, node_2 = relation - node_1_index, node_2_index = self.kv.vocab[node_1].index, self.kv.vocab[node_2].index + node_1_index, node_2_index = self.kv.key_to_index[node_1], self.kv.key_to_index[node_2] self.node_relations[node_1_index].add(node_2_index) relation = (node_1_index, node_2_index) self.all_relations.append(relation) - logger.info("loaded %d relations from train data, %d nodes", len(self.all_relations), len(self.kv.vocab)) - self.indices_set = set(range(len(self.kv.index2word))) # Set of all node indices - self.indices_array = np.fromiter(range(len(self.kv.index2word)), dtype=int) # Numpy array of all node indices + logger.info("loaded %d relations from train data, %d nodes", len(self.all_relations), len(self.kv)) + self.indices_set = set(range(len(self.kv.index_to_key))) # Set of all node indices + self.indices_array = np.fromiter(range(len(self.kv.index_to_key)), dtype=int) # Numpy array of all node indices self._init_node_probabilities() if not update: self._init_embeddings() else: - self._update_embeddings(old_index2word_len) + self._update_embeddings(old_index_to_key_len) def _init_embeddings(self): """Randomly initialize vectors for the items in the vocab.""" - shape = (len(self.kv.index2word), self.size) - self.kv.syn0 = self._np_random.uniform(self.init_range[0], self.init_range[1], shape).astype(self.dtype) + shape = (len(self.kv.index_to_key), self.size) + self.kv.vectors = self._np_random.uniform(self.init_range[0], 
self.init_range[1], shape).astype(self.dtype) - def _update_embeddings(self, old_index2word_len): + def _update_embeddings(self, old_index_to_key_len): """Randomly initialize vectors for the items in the additional vocab.""" - shape = (len(self.kv.index2word) - old_index2word_len, self.size) + shape = (len(self.kv.index_to_key) - old_index_to_key_len, self.size) v = self._np_random.uniform(self.init_range[0], self.init_range[1], shape).astype(self.dtype) - self.kv.syn0 = np.concatenate([self.kv.syn0, v]) + self.kv.vectors = np.concatenate([self.kv.vectors, v]) def _init_node_probabilities(self): """Initialize a-priori probabilities.""" - counts = np.fromiter(( - self.kv.vocab[self.kv.index2word[i]].count - for i in range(len(self.kv.index2word)) - ), - dtype=np.float64, count=len(self.kv.index2word)) + counts = self.kv.expandos['count'].astype(np.float64) self._node_counts_cumsum = np.cumsum(counts) self._node_probabilities = counts / counts.sum() @@ -290,14 +286,14 @@ def _sample_negatives(self, node_index): """ node_relations = self.node_relations[node_index] - num_remaining_nodes = len(self.kv.vocab) - len(node_relations) + num_remaining_nodes = len(self.kv) - len(node_relations) if num_remaining_nodes < self.negative: raise ValueError( 'Cannot sample %d negative nodes from a set of %d negative nodes for %s' % - (self.negative, num_remaining_nodes, self.kv.index2word[node_index]) + (self.negative, num_remaining_nodes, self.kv.index_to_key[node_index]) ) - positive_fraction = float(len(node_relations)) / len(self.kv.vocab) + positive_fraction = float(len(node_relations)) / len(self.kv) if positive_fraction < 0.01: # If number of positive relations is a small fraction of total nodes # re-sample till no positively connected nodes are chosen @@ -461,8 +457,8 @@ def _prepare_training_batch(self, relations, all_negatives, check_gradients=Fals indices_v.append(v) indices_v.extend(negatives) - vectors_u = self.kv.syn0[indices_u] - vectors_v = self.kv.syn0[indices_v].reshape((batch_size, 1 + self.negative, self.size)) + vectors_u = self.kv.vectors[indices_u] + vectors_v = self.kv.vectors[indices_v].reshape((batch_size, 1 + self.negative, self.size)) vectors_v = vectors_v.swapaxes(0, 1).swapaxes(1, 2) batch = PoincareBatch(vectors_u, vectors_v, indices_u, indices_v, self.regularization_coeff) batch.compute_all() @@ -499,7 +495,7 @@ def _check_gradients(self, relations, all_negatives, batch, tol=1e-8): for i, (relation, negatives) in enumerate(zip(relations, all_negatives)): u, v = relation auto_gradients = self._loss_grad( - np.vstack((self.kv.syn0[u], self.kv.syn0[[v] + negatives])), self.regularization_coeff) + np.vstack((self.kv.vectors[u], self.kv.vectors[[v] + negatives])), self.regularization_coeff) computed_gradients = np.vstack((batch.gradients_u[:, i], batch.gradients_v[:, :, i])) diff = np.abs(auto_gradients - computed_gradients).max() if diff > max_diff: @@ -594,16 +590,16 @@ def _update_vectors_batch(self, batch): u_updates = (self.alpha * (batch.alpha ** 2) / 4 * grad_u).T self._handle_duplicates(u_updates, indices_u) - self.kv.syn0[indices_u] -= u_updates - self.kv.syn0[indices_u] = self._clip_vectors(self.kv.syn0[indices_u], self.epsilon) + self.kv.vectors[indices_u] -= u_updates + self.kv.vectors[indices_u] = self._clip_vectors(self.kv.vectors[indices_u], self.epsilon) v_updates = self.alpha * (batch.beta ** 2)[:, np.newaxis] / 4 * grad_v v_updates = v_updates.swapaxes(1, 2).swapaxes(0, 1) v_updates = v_updates.reshape(((1 + self.negative) * batch_size, self.size)) 
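To illustrate the new-style access used above (per-key attributes via `get_vecattr`, positions via `key_to_index`), a small sketch that echoes the examples already in these docstrings:

.. sourcecode:: pycon

    >>> from gensim.test.utils import datapath
    >>> from gensim.models.poincare import PoincareModel, PoincareRelations
    >>>
    >>> relations = PoincareRelations(file_path=datapath('poincare_hypernyms_large.tsv'))
    >>> model = PoincareModel(train_data=relations)
    >>> model.train(epochs=50)
    >>>
    >>> count = model.kv.get_vecattr('kangaroo.n.01', 'count')  # formerly kv.vocab['kangaroo.n.01'].count
    >>> idx = model.kv.key_to_index['kangaroo.n.01']            # formerly kv.vocab['kangaroo.n.01'].index
    >>> model.kv.get_vector('kangaroo.n.01').shape              # default size=50
    (50,)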
self._handle_duplicates(v_updates, indices_v) - self.kv.syn0[indices_v] -= v_updates - self.kv.syn0[indices_v] = self._clip_vectors(self.kv.syn0[indices_v], self.epsilon) + self.kv.vectors[indices_v] -= v_updates + self.kv.vectors[indices_v] = self._clip_vectors(self.kv.vectors[indices_v], self.epsilon) def train(self, epochs, batch_size=10, print_every=1000, check_gradients_every=None): """Train Poincare embeddings using loaded data and model parameters. @@ -860,148 +856,38 @@ def compute_loss(self): self._loss_computed = True -class PoincareKeyedVectors(BaseKeyedVectors): +class PoincareKeyedVectors(KeyedVectors): """Vectors and vocab for the :class:`~gensim.models.poincare.PoincareModel` training class. Used to perform operations on the vectors such as vector lookup, distance calculations etc. - """ - def __init__(self, vector_size): - super(PoincareKeyedVectors, self).__init__(vector_size) - self.max_distance = 0 - self.index2word = [] - self.vocab = {} - - @property - def vectors(self): - return self.syn0 - - @vectors.setter - def vectors(self, value): - self.syn0 = value - - @property - def index2entity(self): - return self.index2word - - @index2entity.setter - def index2entity(self, value): - self.index2word = value - - def word_vec(self, word): - """Get the word's representations in vector space, as a 1D numpy array. - - Examples - -------- - .. sourcecode:: pycon - - >>> from gensim.test.utils import datapath - >>> - >>> # Read the sample relations file and train the model - >>> relations = PoincareRelations(file_path=datapath('poincare_hypernyms_large.tsv')) - >>> model = PoincareModel(train_data=relations) - >>> model.train(epochs=50) - >>> - >>> # Query the trained model. - >>> wv = model.kv.word_vec('kangaroo.n.01') - - """ - return super(PoincareKeyedVectors, self).get_vector(word) - - def words_closer_than(self, w1, w2): - """Get all words that are closer to `w1` than `w2` is to `w1`. - - Parameters - ---------- - w1 : str - Input word. - w2 : str - Input word. - - Returns - ------- - list (str) - List of words that are closer to `w1` than `w2` is to `w1`. - - Examples - -------- - .. sourcecode:: pycon - - >>> from gensim.test.utils import datapath - >>> - >>> # Read the sample relations file and train the model - >>> relations = PoincareRelations(file_path=datapath('poincare_hypernyms_large.tsv')) - >>> model = PoincareModel(train_data=relations) - >>> model.train(epochs=50) - >>> - >>> # Which term is closer to 'kangaroo' than 'metatherian' is to 'kangaroo'? - >>> model.kv.words_closer_than('kangaroo.n.01', 'metatherian.n.01') - [u'marsupial.n.01', u'phalanger.n.01'] - - """ - return super(PoincareKeyedVectors, self).closer_than(w1, w2) - - def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None): - """Store the input-hidden weight matrix in the same format used by the original - C word2vec-tool, for compatibility, using :func:`~gensim.models.utils_any2vec._save_word2vec_format`. - - Parameters - ---------- - fname : str - Path to file that will be used for storing. - fvocab : str, optional - File path used to save the vocabulary. - binary : bool, optional - If True, the data wil be saved in binary word2vec format, else it will be saved in plain text. - total_vec : int, optional - Explicitly specify total number of vectors - (in case word vectors are appended with document vectors afterwards). 
- - """ - _save_word2vec_format(fname, self.vocab, self.syn0, fvocab=fvocab, binary=binary, total_vec=total_vec) + (May be used to save/load final vectors in the plain word2vec format, via the inherited + methods save_word2vec_format() and load_word2vec_format().) - @classmethod - def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', - limit=None, datatype=REAL): - """Load the input-hidden weight matrix from the original C word2vec-tool format. - Use :func:`~gensim.models.utils_any2vec._load_word2vec_format`. + Examples + -------- + .. sourcecode:: pycon - Note that the information stored in the file is incomplete (the binary tree is missing), - so while you can query for word similarity etc., you cannot continue training - with a model loaded this way. + >>> from gensim.test.utils import datapath + >>> + >>> # Read the sample relations file and train the model + >>> relations = PoincareRelations(file_path=datapath('poincare_hypernyms_large.tsv')) + >>> model = PoincareModel(train_data=relations) + >>> model.train(epochs=50) + >>> + >>> # Query the trained model. + >>> wv = model.kv.get_vector('kangaroo.n.01') - Parameters - ---------- - fname : str - The file path to the saved word2vec-format file. - fvocab : str, optional - File path to the vocabulary.Word counts are read from `fvocab` filename, if set - (this is the file generated by `-save-vocab` flag of the original C tool). - binary : bool, optional - If True, indicates whether the data is in binary word2vec format. - encoding : str, optional - If you trained the C model using non-utf8 encoding for words, specify that encoding in `encoding`. - unicode_errors : str, optional - default 'strict', is a string suitable to be passed as the `errors` - argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source - file may include word tokens truncated in the middle of a multibyte unicode character - (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help. - limit : int, optional - Sets a maximum number of word-vectors to read from the file. The default, - None, means read all. - datatype : type, optional - (Experimental) Can coerce dimensions to a non-default float type (such as `np.float16`) to save memory. - Such types may result in much slower bulk operations or incompatibility with optimized routines.) - - Returns - ------- - :class:`~gensim.models.poincare.PoincareModel` - Loaded Poincare model. 
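Since plain word2vec-format I/O now comes from the inherited KeyedVectors methods (as the class docstring notes), a round-trip sketch, continuing with the trained `model` from the example above (output path hypothetical):

.. sourcecode:: pycon

    >>> from gensim.models import KeyedVectors
    >>>
    >>> model.kv.save_word2vec_format('poincare_vectors.txt', binary=False)    # hypothetical path
    >>> reloaded = KeyedVectors.load_word2vec_format('poincare_vectors.txt', binary=False)
    >>> len(reloaded) == len(model.kv)
    True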
+ """ + def __init__(self, vector_size, vector_count, dtype=REAL): + super(PoincareKeyedVectors, self).__init__(vector_size, vector_count, dtype=dtype) + self.max_distance = 0 - """ - return _load_word2vec_format( - cls, fname, fvocab=fvocab, binary=binary, encoding=encoding, unicode_errors=unicode_errors, - limit=limit, datatype=datatype) + def _load_specials(self, *args, **kwargs): + super(PoincareKeyedVectors, self)._load_specials(*args, **kwargs) + # fixup rename of syn0 + if not hasattr(self, 'vectors'): + self.vectors = self.__dict__.pop('syn0') @staticmethod def vector_distance(vector_1, vector_2): @@ -1064,14 +950,14 @@ def closest_child(self, node): """ all_distances = self.distances(node) - all_norms = np.linalg.norm(self.syn0, axis=1) - node_norm = all_norms[self.vocab[node].index] + all_norms = np.linalg.norm(self.vectors, axis=1) + node_norm = all_norms[self.get_index(node)] mask = node_norm >= all_norms if mask.all(): # No nodes lower in the hierarchy return None all_distances = np.ma.array(all_distances, mask=mask) closest_child_index = np.ma.argmin(all_distances) - return self.index2word[closest_child_index] + return self.index_to_key[closest_child_index] def closest_parent(self, node): """Get the node closest to `node` that is higher in the hierarchy than `node`. @@ -1089,14 +975,14 @@ def closest_parent(self, node): """ all_distances = self.distances(node) - all_norms = np.linalg.norm(self.syn0, axis=1) - node_norm = all_norms[self.vocab[node].index] + all_norms = np.linalg.norm(self.vectors, axis=1) + node_norm = all_norms[self.get_index(node)] mask = node_norm <= all_norms if mask.all(): # No nodes higher in the hierarchy return None all_distances = np.ma.array(all_distances, mask=mask) closest_child_index = np.ma.argmin(all_distances) - return self.index2word[closest_child_index] + return self.index_to_key[closest_child_index] def descendants(self, node, max_depth=5): """Get the list of recursively closest children from the given node, up to a max depth of `max_depth`. @@ -1181,8 +1067,8 @@ def distance(self, w1, w2): If either of `w1` and `w2` is absent from vocab. 
""" - vector_1 = self.word_vec(w1) - vector_2 = self.word_vec(w2) + vector_1 = self.get_vector(w1) + vector_2 = self.get_vector(w2) return self.vector_distance(vector_1, vector_2) def similarity(self, w1, w2): @@ -1267,11 +1153,11 @@ def most_similar(self, node_or_vector, topn=10, restrict_vocab=None): if not restrict_vocab: all_distances = self.distances(node_or_vector) else: - nodes_to_use = self.index2word[:restrict_vocab] + nodes_to_use = self.index_to_key[:restrict_vocab] all_distances = self.distances(node_or_vector, nodes_to_use) if isinstance(node_or_vector, string_types + (int,)): - node_index = self.vocab[node_or_vector].index + node_index = self.get_index(node_or_vector) else: node_index = None if not topn: @@ -1279,7 +1165,7 @@ def most_similar(self, node_or_vector, topn=10, restrict_vocab=None): else: closest_indices = matutils.argsort(all_distances, topn=1 + topn) result = [ - (self.index2word[index], float(all_distances[index])) + (self.index_to_key[index], float(all_distances[index])) for index in closest_indices if (not node_index or index != node_index) # ignore the input node ] if topn: @@ -1329,14 +1215,14 @@ def distances(self, node_or_vector, other_nodes=()): """ if isinstance(node_or_vector, string_types): - input_vector = self.word_vec(node_or_vector) + input_vector = self.get_vector(node_or_vector) else: input_vector = node_or_vector if not other_nodes: - other_vectors = self.syn0 + other_vectors = self.vectors else: - other_indices = [self.vocab[node].index for node in other_nodes] - other_vectors = self.syn0[other_indices] + other_indices = [self.get_index(node) for node in other_nodes] + other_vectors = self.vectors[other_indices] return self.vector_distance_batch(input_vector, other_vectors) def norm(self, node_or_vector): @@ -1374,7 +1260,7 @@ def norm(self, node_or_vector): """ if isinstance(node_or_vector, string_types): - input_vector = self.word_vec(node_or_vector) + input_vector = self.get_vector(node_or_vector) else: input_vector = node_or_vector return np.linalg.norm(input_vector) @@ -1536,14 +1422,13 @@ def __init__(self, file_path, embedding): """ items = set() - embedding_vocab = embedding.vocab relations = defaultdict(set) with utils.open(file_path, 'r') as f: reader = csv.reader(f, delimiter='\t') for row in reader: assert len(row) == 2, 'Hypernym pair has more than two items' - item_1_index = embedding_vocab[row[0]].index - item_2_index = embedding_vocab[row[1]].index + item_1_index = embedding.get_index(row[0]) + item_2_index = embedding.get_index(row[1]) relations[item_1_index].add(item_2_index) items.update([item_1_index, item_2_index]) self.items = items @@ -1614,7 +1499,7 @@ def evaluate_mean_rank_and_map(self, max_n=None): if item not in self.relations: continue item_relations = list(self.relations[item]) - item_term = self.embedding.index2word[item] + item_term = self.embedding.index_to_key[item] item_distances = self.embedding.distances(item_term) positive_relation_ranks, avg_precision = \ self.get_positive_relation_ranks_and_avg_prec(item_distances, item_relations) @@ -1642,7 +1527,6 @@ def __init__(self, train_path, test_path, embedding): """ items = set() - embedding_vocab = embedding.vocab relations = {'known': defaultdict(set), 'unknown': defaultdict(set)} data_files = {'known': train_path, 'unknown': test_path} for relation_type, data_file in data_files.items(): @@ -1650,8 +1534,8 @@ def __init__(self, train_path, test_path, embedding): reader = csv.reader(f, delimiter='\t') for row in reader: assert len(row) == 2, 'Hypernym pair has 
more than two items' - item_1_index = embedding_vocab[row[0]].index - item_2_index = embedding_vocab[row[1]].index + item_1_index = embedding.get_index(row[0]) + item_2_index = embedding.get_index(row[1]) relations[relation_type][item_1_index].add(item_2_index) items.update([item_1_index, item_2_index]) self.items = items @@ -1726,7 +1610,7 @@ def evaluate_mean_rank_and_map(self, max_n=None): continue unknown_relations = list(self.relations['unknown'][item]) known_relations = list(self.relations['known'][item]) - item_term = self.embedding.index2word[item] + item_term = self.embedding.index_to_key[item] item_distances = self.embedding.distances(item_term) unknown_relation_ranks, avg_precision = \ self.get_unknown_relation_ranks_and_avg_prec(item_distances, unknown_relations, known_relations) @@ -1792,7 +1676,7 @@ def score_function(self, embedding, trie, term_1, term_2): min_term_1, min_term_2 = term_1, term_2 min_distance = distance assert min_term_1 is not None and min_term_2 is not None - vector_1, vector_2 = embedding.word_vec(min_term_1), embedding.word_vec(min_term_2) + vector_1, vector_2 = embedding.get_vector(min_term_1), embedding.get_vector(min_term_2) norm_1, norm_2 = np.linalg.norm(vector_1), np.linalg.norm(vector_2) return -1 * (1 + self.alpha * (norm_2 - norm_1)) * min_distance @@ -1839,7 +1723,7 @@ def create_vocab_trie(embedding): 'pygtrie could not be imported, please install pygtrie in order to use LexicalEntailmentEvaluation') vocab_trie = Trie() - for key in embedding.vocab: + for key in embedding.key_to_index: vocab_trie[key] = True return vocab_trie diff --git a/gensim/models/translation_matrix.py b/gensim/models/translation_matrix.py index 5aa9b42184..54b21416e3 100644 --- a/gensim/models/translation_matrix.py +++ b/gensim/models/translation_matrix.py @@ -84,7 +84,7 @@ .. sourcecode:: pycon - >>> result = model_trans.infer_vector(dst_model.docvecs[data[3].tags]) + >>> result = model_trans.infer_vector(dst_model.dv[data[3].tags]) References @@ -151,12 +151,12 @@ def build(cls, lang_vec, lexicon=None): # if the lexicon is not provided, using the all the Keyedvectors's words as default for item in lexicon: words.append(item) - mat.append(lang_vec.vectors[lang_vec.vocab[item].index]) + mat.append(lang_vec.vectors[lang_vec.get_index(item)]) else: - for item in lang_vec.vocab.keys(): + for item in lang_vec.index_to_key: words.append(item) - mat.append(lang_vec.vectors[lang_vec.vocab[item].index]) + mat.append(lang_vec.vectors[lang_vec.get_index(item)]) return Space(mat, words) @@ -392,7 +392,7 @@ class BackMappingTranslationMatrix(utils.SaveLoad): >>> model_trans = BackMappingTranslationMatrix(src_model, dst_model) >>> trans_matrix = model_trans.train(data) >>> - >>> result = model_trans.infer_vector(dst_model.docvecs[data[3].tags]) + >>> result = model_trans.infer_vector(dst_model.dv[data[3].tags]) """ def __init__(self, source_lang_vec, target_lang_vec, tagged_docs=None, random_state=None): @@ -436,8 +436,8 @@ def train(self, tagged_docs): Translation matrix that mapping from the source model's vector to target model's vector. 
""" - m1 = [self.source_lang_vec.docvecs[item.tags].flatten() for item in tagged_docs] - m2 = [self.target_lang_vec.docvecs[item.tags].flatten() for item in tagged_docs] + m1 = [self.source_lang_vec.dv[item.tags].flatten() for item in tagged_docs] + m2 = [self.target_lang_vec.dv[item.tags].flatten() for item in tagged_docs] self.translation_matrix = np.linalg.lstsq(m2, m1, -1)[0] return self.translation_matrix diff --git a/gensim/models/utils_any2vec.py b/gensim/models/utils_any2vec.py deleted file mode 100644 index afc25c772b..0000000000 --- a/gensim/models/utils_any2vec.py +++ /dev/null @@ -1,298 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Author: Shiva Manne -# Copyright (C) 2019 RaRe Technologies s.r.o. - -"""General functions used for any2vec models. - -One of the goals of this module is to provide an abstraction over the Cython -extensions for FastText. If they are not available, then the module substitutes -slower Python versions in their place. - -Another related set of FastText functionality is computing ngrams for a word. -The :py:func:`compute_ngrams` and :py:func:`compute_ngrams_bytes` hashes achieve that. - -Closely related is the functionality for hashing ngrams, implemented by the -:py:func:`ft_hash` and :py:func:`ft_hash_broken` functions. -The module exposes "working" and "broken" hash functions in order to maintain -backwards compatibility with older versions of Gensim. - -For compatibility with older Gensim, use :py:func:`compute_ngrams` and -:py:func:`ft_hash_broken` to has each ngram. For compatibility with the -current Facebook implementation, use :py:func:`compute_ngrams_bytes` and -:py:func:`ft_hash_bytes`. - -""" - -import logging -from gensim import utils -import gensim.models.keyedvectors - -from numpy import zeros, dtype, float32 as REAL, ascontiguousarray, frombuffer - -from six.moves import range -from six import iteritems, PY2 - -logger = logging.getLogger(__name__) - - -# -# UTF-8 bytes that begin with 10 are subsequent bytes of a multi-byte sequence, -# as opposed to a new character. -# -_MB_MASK = 0xC0 -_MB_START = 0x80 - - -def _byte_to_int_py3(b): - return b - - -def _byte_to_int_py2(b): - return ord(b) - - -_byte_to_int = _byte_to_int_py2 if PY2 else _byte_to_int_py3 - - -def _is_utf8_continue(b): - return _byte_to_int(b) & _MB_MASK == _MB_START - - -try: - from gensim.models._utils_any2vec import ( - compute_ngrams, - compute_ngrams_bytes, - ft_hash_broken, - ft_hash_bytes, - ) -except ImportError: - raise utils.NO_CYTHON - - -def ft_ngram_hashes(word, minn, maxn, num_buckets, fb_compatible=True): - """Calculate the ngrams of the word and hash them. - - Parameters - ---------- - word : str - The word to calculate ngram hashes for. - minn : int - Minimum ngram length - maxn : int - Maximum ngram length - num_buckets : int - The number of buckets - fb_compatible : boolean, optional - True for compatibility with the Facebook implementation. - False for compatibility with the old Gensim implementation. - - Returns - ------- - A list of hashes (integers), one per each detected ngram. 
- - """ - if fb_compatible: - encoded_ngrams = compute_ngrams_bytes(word, minn, maxn) - hashes = [ft_hash_bytes(n) % num_buckets for n in encoded_ngrams] - else: - text_ngrams = compute_ngrams(word, minn, maxn) - hashes = [ft_hash_broken(n) % num_buckets for n in text_ngrams] - return hashes - - -def _save_word2vec_format(fname, vocab, vectors, fvocab=None, binary=False, total_vec=None): - """Store the input-hidden weight matrix in the same format used by the original - C word2vec-tool, for compatibility. - - Parameters - ---------- - fname : str - The file path used to save the vectors in. - vocab : dict - The vocabulary of words. - vectors : numpy.array - The vectors to be stored. - fvocab : str, optional - File path used to save the vocabulary. - binary : bool, optional - If True, the data wil be saved in binary word2vec format, else it will be saved in plain text. - total_vec : int, optional - Explicitly specify total number of vectors - (in case word vectors are appended with document vectors afterwards). - - """ - if not (vocab or vectors): - raise RuntimeError("no input") - if total_vec is None: - total_vec = len(vocab) - vector_size = vectors.shape[1] - if fvocab is not None: - logger.info("storing vocabulary in %s", fvocab) - with utils.open(fvocab, 'wb') as vout: - for word, vocab_ in sorted(iteritems(vocab), key=lambda item: -item[1].count): - vout.write(utils.to_utf8("%s %s\n" % (word, vocab_.count))) - logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname) - assert (len(vocab), vector_size) == vectors.shape - with utils.open(fname, 'wb') as fout: - fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size))) - # store in sorted order: most frequent words at the top - for word, vocab_ in sorted(iteritems(vocab), key=lambda item: -item[1].count): - row = vectors[vocab_.index] - if binary: - row = row.astype(REAL) - fout.write(utils.to_utf8(word) + b" " + row.tostring()) - else: - fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join(repr(val) for val in row)))) - - -# Functions for internal use by _load_word2vec_format function - - -def _add_word_to_result(result, counts, word, weights, vocab_size): - - word_id = len(result.vocab) - if word in result.vocab: - logger.warning("duplicate word '%s' in word2vec file, ignoring all but first", word) - return - if counts is None: - # most common scenario: no vocab file given. 
just make up some bogus counts, in descending order - word_count = vocab_size - word_id - elif word in counts: - # use count from the vocab file - word_count = counts[word] - else: - logger.warning("vocabulary file is incomplete: '%s' is missing", word) - word_count = None - - result.vocab[word] = gensim.models.keyedvectors.Vocab(index=word_id, count=word_count) - result.vectors[word_id] = weights - result.index2word.append(word) - - -def _add_bytes_to_result(result, counts, chunk, vocab_size, vector_size, datatype, unicode_errors): - start = 0 - processed_words = 0 - bytes_per_vector = vector_size * dtype(REAL).itemsize - max_words = vocab_size - len(result.vocab) - for _ in range(max_words): - i_space = chunk.find(b' ', start) - i_vector = i_space + 1 - - if i_space == -1 or (len(chunk) - i_vector) < bytes_per_vector: - break - - word = chunk[start:i_space].decode("utf-8", errors=unicode_errors) - # Some binary files are reported to have obsolete new line in the beginning of word, remove it - word = word.lstrip('\n') - vector = frombuffer(chunk, offset=i_vector, count=vector_size, dtype=REAL).astype(datatype) - _add_word_to_result(result, counts, word, vector, vocab_size) - start = i_vector + bytes_per_vector - processed_words += 1 - - return processed_words, chunk[start:] - - -def _word2vec_read_binary(fin, result, counts, vocab_size, vector_size, datatype, unicode_errors, binary_chunk_size): - chunk = b'' - tot_processed_words = 0 - - while tot_processed_words < vocab_size: - new_chunk = fin.read(binary_chunk_size) - chunk += new_chunk - processed_words, chunk = _add_bytes_to_result( - result, counts, chunk, vocab_size, vector_size, datatype, unicode_errors) - tot_processed_words += processed_words - if len(new_chunk) < binary_chunk_size: - break - if tot_processed_words != vocab_size: - raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?") - - -def _word2vec_read_text(fin, result, counts, vocab_size, vector_size, datatype, unicode_errors, encoding): - for line_no in range(vocab_size): - line = fin.readline() - if line == b'': - raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?") - parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ") - if len(parts) != vector_size + 1: - raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no) - word, weights = parts[0], [datatype(x) for x in parts[1:]] - _add_word_to_result(result, counts, word, weights, vocab_size) - - -def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', - limit=None, datatype=REAL, binary_chunk_size=100 * 1024): - """Load the input-hidden weight matrix from the original C word2vec-tool format. - - Note that the information stored in the file is incomplete (the binary tree is missing), - so while you can query for word similarity etc., you cannot continue training - with a model loaded this way. - - Parameters - ---------- - fname : str - The file path to the saved word2vec-format file. - fvocab : str, optional - File path to the vocabulary.Word counts are read from `fvocab` filename, if set - (this is the file generated by `-save-vocab` flag of the original C tool). - binary : bool, optional - If True, indicates whether the data is in binary word2vec format. - encoding : str, optional - If you trained the C model using non-utf8 encoding for words, specify that encoding in `encoding`. 
- unicode_errors : str, optional - default 'strict', is a string suitable to be passed as the `errors` - argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source - file may include word tokens truncated in the middle of a multibyte unicode character - (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help. - limit : int, optional - Sets a maximum number of word-vectors to read from the file. The default, - None, means read all. - datatype : type, optional - (Experimental) Can coerce dimensions to a non-default float type (such as `np.float16`) to save memory. - Such types may result in much slower bulk operations or incompatibility with optimized routines.) - binary_chunk_size : int, optional - Read input file in chunks of this many bytes for performance reasons. - - Returns - ------- - object - Returns the loaded model as an instance of :class:`cls`. - - """ - - counts = None - if fvocab is not None: - logger.info("loading word counts from %s", fvocab) - counts = {} - with utils.open(fvocab, 'rb') as fin: - for line in fin: - word, count = utils.to_unicode(line, errors=unicode_errors).strip().split() - counts[word] = int(count) - - logger.info("loading projection weights from %s", fname) - with utils.open(fname, 'rb') as fin: - header = utils.to_unicode(fin.readline(), encoding=encoding) - vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format - if limit: - vocab_size = min(vocab_size, limit) - result = cls(vector_size) - result.vector_size = vector_size - result.vectors = zeros((vocab_size, vector_size), dtype=datatype) - - if binary: - _word2vec_read_binary(fin, result, counts, - vocab_size, vector_size, datatype, unicode_errors, binary_chunk_size) - else: - _word2vec_read_text(fin, result, counts, vocab_size, vector_size, datatype, unicode_errors, encoding) - if result.vectors.shape[0] != len(result.vocab): - logger.info( - "duplicate words detected, shrinking matrix size from %i to %i", - result.vectors.shape[0], len(result.vocab) - ) - result.vectors = ascontiguousarray(result.vectors[: len(result.vocab)]) - assert (len(result.vocab), vector_size) == result.vectors.shape - - logger.info("loaded %s matrix from %s", result.vectors.shape, fname) - return result diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index b34b419f45..a6523babdf 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Author: Shiva Manne +# Author: Gensim Contributors # Copyright (C) 2018 RaRe Technologies s.r.o. 
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html @@ -125,29 +125,24 @@ import os import heapq from timeit import default_timer -from copy import deepcopy -from collections import defaultdict +from collections import defaultdict, namedtuple +from types import GeneratorType import threading import itertools -import warnings +import copy -from gensim.utils import keep_vocab_item, call_on_class_only -from gensim.models.keyedvectors import Vocab, Word2VecKeyedVectors -from gensim.models.base_any2vec import BaseWordEmbeddingsModel +from gensim.utils import keep_vocab_item, call_on_class_only, deprecated +from gensim.models.keyedvectors import KeyedVectors, pseudorandom_weak_vector try: from queue import Queue, Empty except ImportError: from Queue import Queue, Empty -from numpy import exp, dot, zeros, random, dtype, float32 as REAL,\ - uint32, seterr, array, uint8, vstack, fromstring, sqrt,\ - empty, sum as np_sum, ones, logaddexp, log, outer - -from scipy.special import expit +from numpy import float32 as REAL +import numpy as np from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc -from gensim.utils import deprecated from six import iteritems, itervalues, string_types from six.moves import range @@ -180,306 +175,24 @@ def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _exp raise RuntimeError("Training with corpus_file argument is not supported") -def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_hidden=True, - context_vectors=None, context_locks=None, compute_loss=False, is_ft=False): - """Train the passed model instance on a word and its context, using the Skip-gram algorithm. - - Parameters - ---------- - model : :class:`~gensim.models.word2vec.Word2Vec` - The model to be trained. - word : str - The label (predicted) word. - context_index : list of int - The vocabulary indices of the words in the context. - alpha : float - Learning rate. - learn_vectors : bool, optional - Whether the vectors should be updated. - learn_hidden : bool, optional - Whether the weights of the hidden layer should be updated. - context_vectors : list of list of float, optional - Vector representations of the words in the context. If None, these will be retrieved from the model. - context_locks : list of float, optional - The lock factors for each word in the context. - compute_loss : bool, optional - Whether or not the training loss should be computed. - is_ft : bool, optional - If True, weights will be computed using `model.wv.syn0_vocab` and `model.wv.syn0_ngrams` - instead of `model.wv.syn0`. - - Returns - ------- - numpy.ndarray - Error vector to be back-propagated. 
- - """ - if context_vectors is None: - if is_ft: - context_vectors_vocab = model.wv.syn0_vocab - context_vectors_ngrams = model.wv.syn0_ngrams - else: - context_vectors = model.wv.syn0 - if context_locks is None: - if is_ft: - context_locks_vocab = model.syn0_vocab_lockf - context_locks_ngrams = model.syn0_ngrams_lockf - else: - context_locks = model.syn0_lockf - - if word not in model.wv.vocab: - return - predict_word = model.wv.vocab[word] # target word (NN output) - - if is_ft: - l1_vocab = context_vectors_vocab[context_index[0]] - l1_ngrams = np_sum(context_vectors_ngrams[context_index[1:]], axis=0) - if context_index: - l1 = np_sum([l1_vocab, l1_ngrams], axis=0) / len(context_index) - else: - l1 = context_vectors[context_index] # input word (NN input/projection layer) - lock_factor = context_locks[context_index] - - neu1e = zeros(l1.shape) - - if model.hs: - # work on the entire tree at once, to push as much work into numpy's C routines as possible (performance) - l2a = deepcopy(model.syn1[predict_word.point]) # 2d matrix, codelen x layer1_size - prod_term = dot(l1, l2a.T) - fa = expit(prod_term) # propagate hidden -> output - ga = (1 - predict_word.code - fa) * alpha # vector of error gradients multiplied by the learning rate - if learn_hidden: - model.syn1[predict_word.point] += outer(ga, l1) # learn hidden -> output - neu1e += dot(ga, l2a) # save error - - # loss component corresponding to hierarchical softmax - if compute_loss: - sgn = (-1.0) ** predict_word.code # `ch` function, 0 -> 1, 1 -> -1 - lprob = -log(expit(-sgn * prod_term)) - model.running_training_loss += sum(lprob) - - if model.negative: - # use this word (label = 1) + `negative` other random words not from this sentence (label = 0) - word_indices = [predict_word.index] - while len(word_indices) < model.negative + 1: - w = model.cum_table.searchsorted(model.random.randint(model.cum_table[-1])) - if w != predict_word.index: - word_indices.append(w) - l2b = model.syn1neg[word_indices] # 2d matrix, k+1 x layer1_size - prod_term = dot(l1, l2b.T) - fb = expit(prod_term) # propagate hidden -> output - gb = (model.neg_labels - fb) * alpha # vector of error gradients multiplied by the learning rate - if learn_hidden: - model.syn1neg[word_indices] += outer(gb, l1) # learn hidden -> output - neu1e += dot(gb, l2b) # save error - - # loss component corresponding to negative sampling - if compute_loss: - model.running_training_loss -= sum(log(expit(-1 * prod_term[1:]))) # for the sampled words - model.running_training_loss -= log(expit(prod_term[0])) # for the output word - - if learn_vectors: - if is_ft: - model.wv.syn0_vocab[context_index[0]] += neu1e * context_locks_vocab[context_index[0]] - for i in context_index[1:]: - model.wv.syn0_ngrams[i] += neu1e * context_locks_ngrams[i] - else: - l1 += neu1e * lock_factor # learn input -> hidden (mutates model.wv.syn0[word2.index], if that is l1) - return neu1e - - -def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True, - compute_loss=False, context_vectors=None, context_locks=None, is_ft=False): - """Train the passed model instance on a word and its context, using the CBOW algorithm. - - Parameters - ---------- - model : :class:`~gensim.models.word2vec.Word2Vec` - The model to be trained. - word : str - The label (predicted) word. - input_word_indices : list of int - The vocabulary indices of the words in the context. - l1 : list of float - Vector representation of the label word. - alpha : float - Learning rate. 
- learn_vectors : bool, optional - Whether the vectors should be updated. - learn_hidden : bool, optional - Whether the weights of the hidden layer should be updated. - compute_loss : bool, optional - Whether or not the training loss should be computed. - context_vectors : list of list of float, optional - Vector representations of the words in the context. If None, these will be retrieved from the model. - context_locks : list of float, optional - The lock factors for each word in the context. - is_ft : bool, optional - If True, weights will be computed using `model.wv.syn0_vocab` and `model.wv.syn0_ngrams` - instead of `model.wv.syn0`. - - Returns - ------- - numpy.ndarray - Error vector to be back-propagated. - - """ - if context_vectors is None: - if is_ft: - context_vectors_vocab = model.wv.syn0_vocab - context_vectors_ngrams = model.wv.syn0_ngrams - else: - context_vectors = model.wv.syn0 - if context_locks is None: - if is_ft: - context_locks_vocab = model.syn0_vocab_lockf - context_locks_ngrams = model.syn0_ngrams_lockf - else: - context_locks = model.syn0_lockf - - neu1e = zeros(l1.shape) - - if model.hs: - l2a = model.syn1[word.point] # 2d matrix, codelen x layer1_size - prod_term = dot(l1, l2a.T) - fa = expit(prod_term) # propagate hidden -> output - ga = (1. - word.code - fa) * alpha # vector of error gradients multiplied by the learning rate - if learn_hidden: - model.syn1[word.point] += outer(ga, l1) # learn hidden -> output - neu1e += dot(ga, l2a) # save error - - # loss component corresponding to hierarchical softmax - if compute_loss: - sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 - model.running_training_loss += sum(-log(expit(-sgn * prod_term))) - - if model.negative: - # use this word (label = 1) + `negative` other random words not from this sentence (label = 0) - word_indices = [word.index] - while len(word_indices) < model.negative + 1: - w = model.cum_table.searchsorted(model.random.randint(model.cum_table[-1])) - if w != word.index: - word_indices.append(w) - l2b = model.syn1neg[word_indices] # 2d matrix, k+1 x layer1_size - prod_term = dot(l1, l2b.T) - fb = expit(prod_term) # propagate hidden -> output - gb = (model.neg_labels - fb) * alpha # vector of error gradients multiplied by the learning rate - if learn_hidden: - model.syn1neg[word_indices] += outer(gb, l1) # learn hidden -> output - neu1e += dot(gb, l2b) # save error - - # loss component corresponding to negative sampling - if compute_loss: - model.running_training_loss -= sum(log(expit(-1 * prod_term[1:]))) # for the sampled words - model.running_training_loss -= log(expit(prod_term[0])) # for the output word - - if learn_vectors: - # learn input -> hidden, here for all words in the window separately - if is_ft: - if not model.cbow_mean and input_word_indices: - neu1e /= (len(input_word_indices[0]) + len(input_word_indices[1])) - for i in input_word_indices[0]: - context_vectors_vocab[i] += neu1e * context_locks_vocab[i] - for i in input_word_indices[1]: - context_vectors_ngrams[i] += neu1e * context_locks_ngrams[i] - else: - if not model.cbow_mean and input_word_indices: - neu1e /= len(input_word_indices) - for i in input_word_indices: - context_vectors[i] += neu1e * context_locks[i] - - return neu1e - - -def score_sg_pair(model, word, word2): - """Score the trained Skip-gram model on a pair of words. - - Parameters - ---------- - model : :class:`~gensim.models.word2vec.Word2Vec` - The trained model. 
- word : :class:`~gensim.models.keyedvectors.Vocab` - Vocabulary representation of the first word. - word2 : :class:`~gensim.models.keyedvectors.Vocab` - Vocabulary representation of the second word. - - Returns - ------- - float - Logarithm of the sum of exponentiations of input words. - - """ - l1 = model.wv.syn0[word2.index] - l2a = deepcopy(model.syn1[word.point]) # 2d matrix, codelen x layer1_size - sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 - lprob = -logaddexp(0, -sgn * dot(l1, l2a.T)) - return sum(lprob) - - -def score_cbow_pair(model, word, l1): - """Score the trained CBOW model on a pair of words. - - Parameters - ---------- - model : :class:`~gensim.models.word2vec.Word2Vec` - The trained model. - word : :class:`~gensim.models.keyedvectors.Vocab` - Vocabulary representation of the first word. - l1 : list of float - Vector representation of the second word. - - Returns - ------- - float - Logarithm of the sum of exponentiations of input words. - - """ - l2a = model.syn1[word.point] # 2d matrix, codelen x layer1_size - sgn = (-1.0) ** word.code # ch function, 0-> 1, 1 -> -1 - lprob = -logaddexp(0, -sgn * dot(l1, l2a.T)) - return sum(lprob) - - -class Word2Vec(BaseWordEmbeddingsModel): - """Train, use and evaluate neural networks described in https://code.google.com/p/word2vec/. - - Once you're finished training a model (=no more updates, only querying) - store and use only the :class:`~gensim.models.keyedvectors.KeyedVectors` instance in `self.wv` to reduce memory. - - The model can be stored/loaded via its :meth:`~gensim.models.word2vec.Word2Vec.save` and - :meth:`~gensim.models.word2vec.Word2Vec.load` methods. - - The trained word vectors can also be stored/loaded from a format compatible with the - original word2vec implementation via `self.wv.save_word2vec_format` - and :meth:`gensim.models.keyedvectors.KeyedVectors.load_word2vec_format`. - - Some important attributes are the following: - - Attributes - ---------- - wv : :class:`~gensim.models.keyedvectors.Word2VecKeyedVectors` - This object essentially contains the mapping between words and embeddings. After training, it can be used - directly to query those embeddings in various ways. See the module level docstring for examples. - - vocabulary : :class:`~gensim.models.word2vec.Word2VecVocab` - This object represents the vocabulary (sometimes called Dictionary in gensim) of the model. - Besides keeping track of all unique words, this object provides extra functionality, such as - constructing a huffman tree (frequent words are closer to the root), or discarding extremely rare words. +class Word2Vec(utils.SaveLoad): + def __init__(self, sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5, + max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, + sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, + trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(), + comment=None, max_final_vocab=None): + """Train, use and evaluate neural networks described in https://code.google.com/p/word2vec/. - trainables : :class:`~gensim.models.word2vec.Word2VecTrainables` - This object represents the inner shallow neural network used to train the embeddings. The semantics of the - network differ slightly in the two available training modes (CBOW or SG) but you can think of it as a NN with - a single projection and hidden layer which we train on the corpus. 
The weights are then used as our embeddings - (which means that the size of the hidden layer is equal to the number of features `self.size`). + Once you're finished training a model (=no more updates, only querying) + store and use only the :class:`~gensim.models.keyedvectors.KeyedVectors` instance in ``self.wv`` + to reduce memory. - """ + The full model can be stored/loaded via its :meth:`~gensim.models.word2vec.Word2Vec.save` and + :meth:`~gensim.models.word2vec.Word2Vec.load` methods. - def __init__(self, sentences=None, corpus_file=None, size=100, alpha=0.025, window=5, min_count=5, - max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(), - max_final_vocab=None): - """ + The trained word vectors can also be stored/loaded from a format compatible with the + original word2vec implementation via `self.wv.save_word2vec_format` + and :meth:`gensim.models.keyedvectors.KeyedVectors.load_word2vec_format`. Parameters ---------- @@ -496,7 +209,7 @@ def __init__(self, sentences=None, corpus_file=None, size=100, alpha=0.025, wind Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or `corpus_file` arguments need to be passed (or none of them, in that case, the model is left uninitialized). - size : int, optional + vector_size : int, optional Dimensionality of the word vectors. window : int, optional Maximum distance between the current and predicted word within a sentence. @@ -544,8 +257,8 @@ def __init__(self, sentences=None, corpus_file=None, size=100, alpha=0.025, wind useful range is (0, 1e-5). hashfxn : function, optional Hash function to use to randomly initialize weights, for increased training reproducibility. - iter : int, optional - Number of iterations (epochs) over the corpus. + epochs : int, optional + Number of iterations (epochs) over the corpus. (Formerly: `iter`) trim_rule : function, optional Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, be trimmed away, or handled using the default (discard if word count < min_count). @@ -561,7 +274,7 @@ def __init__(self, sentences=None, corpus_file=None, size=100, alpha=0.025, wind * `min_count` (int) - the minimum count threshold. sorted_vocab : {0, 1}, optional If 1, sort the vocabulary by descending frequency before assigning word indexes. - See :meth:`~gensim.models.word2vec.Word2VecVocab.sort_vocab()`. + See :meth:`~gensim.models.keyedvectors.KeyedVectors.sort_by_descending_frequency()`. batch_words : int, optional Target size (in words) for batches of examples passed to worker threads (and thus cython routines).(Larger batches will be passed if individual @@ -582,233 +295,1357 @@ def __init__(self, sentences=None, corpus_file=None, size=100, alpha=0.025, wind >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] >>> model = Word2Vec(sentences, min_count=1) + Attributes + ---------- + wv : :class:`~gensim.models.keyedvectors.KeyedVectors` + This object essentially contains the mapping between words and embeddings. After training, it can be used + directly to query those embeddings in various ways. See the module level docstring for examples. 
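A short illustration of the pattern this docstring recommends (train once, then keep and persist only the lightweight ``wv``); the file name below is a placeholder:

.. sourcecode:: pycon

    >>> from gensim.models import Word2Vec, KeyedVectors
    >>>
    >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
    >>> model = Word2Vec(sentences, min_count=1, vector_size=10)
    >>>
    >>> word_vectors = model.wv                 # KeyedVectors: all that is needed for querying
    >>> word_vectors.save("toy.wordvectors")    # placeholder path
    >>> reloaded = KeyedVectors.load("toy.wordvectors")
    >>> similar = reloaded.most_similar("cat", topn=1)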
+ """ - self.max_final_vocab = max_final_vocab + corpus_iterable = sentences - self.callbacks = callbacks - self.load = call_on_class_only + self.vector_size = int(vector_size) + self.workers = int(workers) + self.epochs = epochs + self.train_count = 0 + self.total_train_time = 0 + self.batch_words = batch_words - self.wv = Word2VecKeyedVectors(size) - self.vocabulary = Word2VecVocab( - max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, sorted_vocab=bool(sorted_vocab), - null_word=null_word, max_final_vocab=max_final_vocab, ns_exponent=ns_exponent) - self.trainables = Word2VecTrainables(seed=seed, vector_size=size, hashfxn=hashfxn) + self.sg = int(sg) + self.alpha = float(alpha) + self.min_alpha = float(min_alpha) - super(Word2Vec, self).__init__( - sentences=sentences, corpus_file=corpus_file, workers=workers, vector_size=size, epochs=iter, - callbacks=callbacks, batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, - seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, compute_loss=compute_loss) + self.window = int(window) + self.random = np.random.RandomState(seed) - def _do_train_epoch(self, corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch, - total_examples=None, total_words=None, **kwargs): - work, neu1 = thread_private_mem + self.hs = int(hs) + self.negative = int(negative) + self.ns_exponent = ns_exponent + self.cbow_mean = int(cbow_mean) + self.compute_loss = bool(compute_loss) + self.running_training_loss = 0 + self.min_alpha_yet_reached = float(alpha) + self.corpus_count = 0 + self.corpus_total_words = 0 - if self.sg: - examples, tally, raw_tally = train_epoch_sg(self, corpus_file, offset, cython_vocab, cur_epoch, - total_examples, total_words, work, neu1, self.compute_loss) - else: - examples, tally, raw_tally = train_epoch_cbow(self, corpus_file, offset, cython_vocab, cur_epoch, - total_examples, total_words, work, neu1, self.compute_loss) + self.max_final_vocab = max_final_vocab + self.max_vocab_size = max_vocab_size + self.min_count = min_count + self.sample = sample + self.sorted_vocab = sorted_vocab + self.null_word = null_word + self.cum_table = None # for negative sampling + self.raw_vocab = None - return examples, tally, raw_tally + if not hasattr(self, 'wv'): # set unless subclass already set (eg: FastText) + self.wv = KeyedVectors(vector_size) - def _do_train_job(self, sentences, alpha, inits): - """Train the model on a single batch of sentences. + self.hashfxn = hashfxn + self.seed = seed + if not hasattr(self, 'layer1_size'): # set unless subclass already set (as for Doc2Vec dm_concat mode) + self.layer1_size = vector_size - Parameters - ---------- - sentences : iterable of list of str - Corpus chunk to be used in this training batch. - alpha : float - The learning rate used in this batch. - inits : (np.ndarray, np.ndarray) - Each worker threads private work memory. + self.comment = comment - Returns - ------- - (int, int) - 2-tuple (effective word count after ignoring unknown words and sentence length trimming, total word count). 
+ self.load = call_on_class_only - """ - work, neu1 = inits - tally = 0 - if self.sg: - tally += train_batch_sg(self, sentences, alpha, work, self.compute_loss) + if corpus_iterable is not None or corpus_file is not None: + self.build_vocab_and_train(corpus_iterable=corpus_iterable, corpus_file=corpus_file, + trim_rule=trim_rule, callbacks=callbacks) else: - tally += train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss) - return tally, self._raw_word_count(sentences) - - def _clear_post_train(self): - """Remove all L2-normalized word vectors from the model.""" - self.wv.vectors_norm = None - - def _set_train_params(self, **kwargs): - if 'compute_loss' in kwargs: - self.compute_loss = kwargs['compute_loss'] - self.running_training_loss = 0 - - def train(self, sentences=None, corpus_file=None, total_examples=None, total_words=None, - epochs=None, start_alpha=None, end_alpha=None, word_count=0, - queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=()): - """Update the model's neural weights from a sequence of sentences. - - Notes - ----- - To support linear learning-rate decay from (initial) `alpha` to `min_alpha`, and accurate - progress-percentage logging, either `total_examples` (count of sentences) or `total_words` (count of - raw words in sentences) **MUST** be provided. If `sentences` is the same corpus - that was provided to :meth:`~gensim.models.word2vec.Word2Vec.build_vocab` earlier, - you can simply use `total_examples=self.corpus_count`. - - Warnings - -------- - To avoid common mistakes around the model's ability to do multiple training passes itself, an - explicit `epochs` argument **MUST** be provided. In the common and recommended case - where :meth:`~gensim.models.word2vec.Word2Vec.train` is only called once, you can set `epochs=self.iter`. + if trim_rule is not None: + logger.warning( + "The rule, if given, is only used to prune vocabulary during build_vocab() " + "and is not stored as part of the model. Model initialized without sentences. " + "trim_rule provided, if any, will be ignored.") + if callbacks: + logger.warning( + "Callbacks are no longer retained by the model, so must be provided whenever " + "training is triggered, as in initialization with a corpus or calling `train()`. " + "The callbacks provided in this initialization without triggering train will " + "be ignored.") + + def build_vocab_and_train(self, corpus_iterable=None, corpus_file=None, trim_rule=None, callbacks=None): + if not (corpus_iterable is None) ^ (corpus_file is None): + raise ValueError("You must provide only one of corpus_iterable or corpus_file arguments.") + if corpus_file is not None and not isinstance(corpus_file, string_types): + raise TypeError("You must pass string as the corpus_file argument.") + elif isinstance(corpus_iterable, GeneratorType): + raise TypeError("You can't pass a generator as the sentences argument. Try a sequence.") + # TODO: test for restartable? 
+ self.build_vocab(corpus_iterable=corpus_iterable, corpus_file=corpus_file, trim_rule=trim_rule) + self.train( + corpus_iterable=corpus_iterable, corpus_file=corpus_file, total_examples=self.corpus_count, + total_words=self.corpus_total_words, epochs=self.epochs, start_alpha=self.alpha, + end_alpha=self.min_alpha, compute_loss=self.compute_loss, callbacks=callbacks) + + def build_vocab(self, corpus_iterable=None, corpus_file=None, update=False, progress_per=10000, + keep_raw_vocab=False, trim_rule=None, **kwargs): + """Build vocabulary from a sequence of sentences (can be a once-only generator stream). Parameters ---------- - sentences : iterable of list of str - The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, + corpus_iterable : iterable of list of str + Can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. - See also the `tutorial on data streaming in Python - `_. + or :class:`~gensim.models.word2vec.LineSentence` module for such examples. corpus_file : str, optional Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or `corpus_file` arguments need to be passed (not both of them). - total_examples : int - Count of sentences. - total_words : int - Count of raw words in sentences. - epochs : int - Number of iterations (epochs) over the corpus. - start_alpha : float, optional - Initial learning rate. If supplied, replaces the starting `alpha` from the constructor, - for this one call to`train()`. - Use only if making multiple calls to `train()`, when you want to manage the alpha learning-rate yourself - (not recommended). - end_alpha : float, optional - Final learning rate. Drops linearly from `start_alpha`. - If supplied, this replaces the final `min_alpha` from the constructor, for this one call to `train()`. - Use only if making multiple calls to `train()`, when you want to manage the alpha learning-rate yourself - (not recommended). - word_count : int, optional - Count of words already trained. Set this to 0 for the usual - case of training on all words in sentences. - queue_factor : int, optional - Multiplier for size of queue (number of workers * queue_factor). - report_delay : float, optional - Seconds to wait before reporting progress. - compute_loss: bool, optional - If True, computes and stores loss value which can be retrieved using - :meth:`~gensim.models.word2vec.Word2Vec.get_latest_training_loss`. - callbacks : iterable of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional - Sequence of callbacks to be executed at specific stages during training. + update : bool + If true, the new words in `sentences` will be added to model's vocab. + progress_per : int, optional + Indicates how many words to process before showing/updating the progress. + keep_raw_vocab : bool, optional + If False, the raw vocabulary will be deleted after the scaling is done to free up RAM. + trim_rule : function, optional + Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, + be trimmed away, or handled using the default (discard if word count < min_count). 
+ Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), + or a callable that accepts parameters (word, count, min_count) and returns either + :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. + The rule, if given, is only used to prune vocabulary during current method call and is not stored as part + of the model. - Examples - -------- - .. sourcecode:: pycon + The input parameters are of the following types: + * `word` (str) - the word we are examining + * `count` (int) - the word's frequency count in the corpus + * `min_count` (int) - the minimum count threshold. - >>> from gensim.models import Word2Vec - >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] - >>> - >>> model = Word2Vec(min_count=1) - >>> model.build_vocab(sentences) # prepare the model vocabulary - >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) # train word vectors - (1, 30) + **kwargs : object + Key word arguments propagated to `self.prepare_vocab` """ - return super(Word2Vec, self).train( - sentences=sentences, corpus_file=corpus_file, total_examples=total_examples, total_words=total_words, - epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count, - queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss, callbacks=callbacks) + total_words, corpus_count = self.scan_vocab( + corpus_iterable=corpus_iterable, corpus_file=corpus_file, progress_per=progress_per, trim_rule=trim_rule) + self.corpus_count = corpus_count + self.corpus_total_words = total_words + report_values = self.prepare_vocab(update=update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs) + report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words']) + self.prepare_weights(update=update) - def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor=2, report_delay=1): - """Score the log probability for a sequence of sentences. - This does not change the fitted model in any way (see :meth:`~gensim.models.word2vec.Word2Vec.train` for that). - - Gensim has currently only implemented score for the hierarchical softmax scheme, - so you need to have run word2vec with `hs=1` and `negative=0` for this to work. - - Note that you should specify `total_sentences`; you'll run into problems if you ask to - score more than this number of sentences but it is inefficient to set the value too high. - - See the `article by Matt Taddy: "Document Classification by Inversion of Distributed Language Representations" - `_ and the - `gensim demo `_ for examples of - how to use such scores in document classification. + def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False): + """Build vocabulary from a dictionary of word frequencies. Parameters ---------- - sentences : iterable of list of str - The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, - consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. - total_sentences : int, optional - Count of sentences. - chunksize : int, optional - Chunksize of jobs - queue_factor : int, optional - Multiplier for size of queue (number of workers * queue_factor). 
- report_delay : float, optional - Seconds to wait before reporting progress. + word_freq : dict of (str, int) + A mapping from a word in the vocabulary to its frequency count. + keep_raw_vocab : bool, optional + If False, delete the raw vocabulary after the scaling is done to free up RAM. + corpus_count : int, optional + Even if no corpus is provided, this argument can set corpus_count explicitly. + trim_rule : function, optional + Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, + be trimmed away, or handled using the default (discard if word count < min_count). + Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), + or a callable that accepts parameters (word, count, min_count) and returns either + :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. + The rule, if given, is only used to prune vocabulary during current method call and is not stored as part + of the model. + + The input parameters are of the following types: + * `word` (str) - the word we are examining + * `count` (int) - the word's frequency count in the corpus + * `min_count` (int) - the minimum count threshold. + + update : bool, optional + If true, the new provided words in `word_freq` dict will be added to model's vocab. """ + logger.info("Processing provided word frequencies") + # Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) + # to be directly the raw vocab + raw_vocab = word_freq logger.info( - "scoring sentences with %i workers on %i vocabulary and %i features, " - "using sg=%s hs=%s sample=%s and negative=%s", - self.workers, len(self.wv.vocab), self.trainables.layer1_size, self.sg, self.hs, - self.vocabulary.sample, self.negative + "collected %i different raw word, with total frequency of %i", + len(raw_vocab), sum(itervalues(raw_vocab)) ) - if not self.wv.vocab: - raise RuntimeError("you must first build vocabulary before scoring new data") - - if not self.hs: - raise RuntimeError( - "We have currently only implemented score for the hierarchical softmax scheme, " - "so you need to have run word2vec with hs=1 and negative=0 for this to work." - ) - - def worker_loop(): - """Compute log probability for each sentence, lifting lists of sentences from the jobs queue.""" - work = zeros(1, dtype=REAL) # for sg hs, we actually only need one memory loc (running sum) - neu1 = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL) - while True: - job = job_queue.get() - if job is None: # signal to finish - break - ns = 0 - for sentence_id, sentence in job: - if sentence_id >= total_sentences: - break - if self.sg: - score = score_sentence_sg(self, sentence, work) - else: - score = score_sentence_cbow(self, sentence, work, neu1) - sentence_scores[sentence_id] = score - ns += 1 - progress_queue.put(ns) # report progress - - start, next_report = default_timer(), 1.0 - # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :( - job_queue = Queue(maxsize=queue_factor * self.workers) - progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers) - - workers = [threading.Thread(target=worker_loop) for _ in range(self.workers)] - for thread in workers: - thread.daemon = True # make interrupting the process with ctrl+c easier - thread.start() + # Since no sentences are provided, this is to control the corpus_count. 
+ self.corpus_count = corpus_count or 0 + self.raw_vocab = raw_vocab - sentence_count = 0 - sentence_scores = matutils.zeros_aligned(total_sentences, dtype=REAL) + # trim by min_count & precalculate downsampling + report_values = self.prepare_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) + report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words']) + self.prepare_weights(update=update) # build tables & arrays - push_done = False - done_jobs = 0 + def _scan_vocab(self, sentences, progress_per, trim_rule): + sentence_no = -1 + total_words = 0 + min_reduce = 1 + vocab = defaultdict(int) + checked_string_types = 0 + for sentence_no, sentence in enumerate(sentences): + if not checked_string_types: + if isinstance(sentence, string_types): + logger.warning( + "Each 'sentences' item should be a list of words (usually unicode strings). " + "First item here is instead plain %s.", + type(sentence) + ) + checked_string_types += 1 + if sentence_no % progress_per == 0: + logger.info( + "PROGRESS: at sentence #%i, processed %i words, keeping %i word types", + sentence_no, total_words, len(vocab) + ) + for word in sentence: + vocab[word] += 1 + total_words += len(sentence) + + if self.max_vocab_size and len(vocab) > self.max_vocab_size: + utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) + min_reduce += 1 + + corpus_count = sentence_no + 1 + self.raw_vocab = vocab + return total_words, corpus_count + + def scan_vocab(self, corpus_iterable=None, corpus_file=None, progress_per=10000, workers=None, trim_rule=None): + logger.info("collecting all words and their counts") + if corpus_file: + corpus_iterable = LineSentence(corpus_file) + + total_words, corpus_count = self._scan_vocab(corpus_iterable, progress_per, trim_rule) + + logger.info( + "collected %i word types from a corpus of %i raw words and %i sentences", + len(self.raw_vocab), total_words, corpus_count + ) + + return total_words, corpus_count + + def prepare_vocab( + self, update=False, keep_raw_vocab=False, trim_rule=None, + min_count=None, sample=None, dry_run=False): + """Apply vocabulary settings for `min_count` (discarding less-frequent words) + and `sample` (controlling the downsampling of more-frequent words). + + Calling with `dry_run=True` will only simulate the provided settings and + report the size of the retained vocabulary, effective corpus length, and + estimated memory requirements. Results are both printed via logging and + returned as a dict. + + Delete the raw vocabulary after the scaling is done to free up RAM, + unless `keep_raw_vocab` is set. 
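A hedged sketch of the ``dry_run`` behaviour described above: keep the raw counts around, then simulate a stricter ``min_count`` without applying it (the report keys come from the dict assembled at the end of this method):

.. sourcecode:: pycon

    >>> from gensim.models import Word2Vec
    >>>
    >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"], ["cat", "chase", "dog"]]
    >>> model = Word2Vec(vector_size=10, min_count=1)          # no corpus yet
    >>> model.build_vocab(sentences, keep_raw_vocab=True)      # keep raw counts so settings can be re-simulated
    >>> report = model.prepare_vocab(min_count=2, dry_run=True)
    >>> sorted(report)
    ['downsample_total', 'downsample_unique', 'drop_unique', 'num_retained_words', 'retain_total']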
+ + """ + min_count = min_count or self.min_count + sample = sample or self.sample + drop_total = drop_unique = 0 + + # set effective_min_count to min_count in case max_final_vocab isn't set + self.effective_min_count = min_count + + # if max_final_vocab is specified instead of min_count + # pick a min_count which satisfies max_final_vocab as well as possible + if self.max_final_vocab is not None: + sorted_vocab = sorted(self.raw_vocab.keys(), key=lambda word: self.raw_vocab[word], reverse=True) + calc_min_count = 1 + + if self.max_final_vocab < len(sorted_vocab): + calc_min_count = self.raw_vocab[sorted_vocab[self.max_final_vocab]] + 1 + + self.effective_min_count = max(calc_min_count, min_count) + logger.info( + "max_final_vocab=%d and min_count=%d resulted in calc_min_count=%d, effective_min_count=%d", + self.max_final_vocab, min_count, calc_min_count, self.effective_min_count + ) + + if not update: + logger.info("Loading a fresh vocabulary") + retain_total, retain_words = 0, [] + # Discard words less-frequent than min_count + if not dry_run: + self.wv.index_to_key = [] + # make stored settings match these applied settings + self.min_count = min_count + self.sample = sample + self.wv.key_to_index = {} + + for word, v in iteritems(self.raw_vocab): + if keep_vocab_item(word, v, self.effective_min_count, trim_rule=trim_rule): + retain_words.append(word) + retain_total += v + if not dry_run: + self.wv.key_to_index[word] = len(self.wv.index_to_key) + self.wv.index_to_key.append(word) + else: + drop_unique += 1 + drop_total += v + if not dry_run: + # now update counts + for word in self.wv.index_to_key: + self.wv.set_vecattr(word, 'count', self.raw_vocab[word]) + original_unique_total = len(retain_words) + drop_unique + retain_unique_pct = len(retain_words) * 100 / max(original_unique_total, 1) + logger.info( + "effective_min_count=%d retains %i unique words (%i%% of original %i, drops %i)", + self.effective_min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique + ) + original_total = retain_total + drop_total + retain_pct = retain_total * 100 / max(original_total, 1) + logger.info( + "effective_min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)", + self.effective_min_count, retain_total, retain_pct, original_total, drop_total + ) + else: + logger.info("Updating model with new vocabulary") + new_total = pre_exist_total = 0 + new_words = pre_exist_words = [] + for word, v in iteritems(self.raw_vocab): + if keep_vocab_item(word, v, self.effective_min_count, trim_rule=trim_rule): + if self.wv.has_index_for(word): + pre_exist_words.append(word) + pre_exist_total += v + if not dry_run: + pass + else: + new_words.append(word) + new_total += v + if not dry_run: + self.wv.key_to_index[word] = len(self.wv.index_to_key) + self.wv.index_to_key.append(word) + else: + drop_unique += 1 + drop_total += v + if not dry_run: + # now update counts + self.wv.allocate_vecattrs(attrs=['count'], types=[type(0)]) + for word in self.wv.index_to_key: + self.wv.set_vecattr(word, 'count', self.wv.get_vecattr(word, 'count') + self.raw_vocab.get(word, 0)) + original_unique_total = len(pre_exist_words) + len(new_words) + drop_unique + pre_exist_unique_pct = len(pre_exist_words) * 100 / max(original_unique_total, 1) + new_unique_pct = len(new_words) * 100 / max(original_unique_total, 1) + logger.info( + "New added %i unique words (%i%% of original %i) " + "and increased the count of %i pre-existing words (%i%% of original %i)", + len(new_words), new_unique_pct, 
original_unique_total, len(pre_exist_words), + pre_exist_unique_pct, original_unique_total + ) + retain_words = new_words + pre_exist_words + retain_total = new_total + pre_exist_total + + # Precalculate each vocabulary item's threshold for sampling + if not sample: + # no words downsampled + threshold_count = retain_total + elif sample < 1.0: + # traditional meaning: set parameter as proportion of total + threshold_count = sample * retain_total + else: + # new shorthand: sample >= 1 means downsample all words with higher count than sample + threshold_count = int(sample * (3 + np.sqrt(5)) / 2) + + downsample_total, downsample_unique = 0, 0 + for w in retain_words: + v = self.raw_vocab[w] + word_probability = (np.sqrt(v / threshold_count) + 1) * (threshold_count / v) + if word_probability < 1.0: + downsample_unique += 1 + downsample_total += word_probability * v + else: + word_probability = 1.0 + downsample_total += v + if not dry_run: + self.wv.set_vecattr(w, 'sample_int', np.uint32(word_probability * (2**32 - 1))) + + if not dry_run and not keep_raw_vocab: + logger.info("deleting the raw counts dictionary of %i items", len(self.raw_vocab)) + self.raw_vocab = defaultdict(int) + + logger.info("sample=%g downsamples %i most-common words", sample, downsample_unique) + logger.info( + "downsampling leaves estimated %i word corpus (%.1f%% of prior %i)", + downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total + ) + + # return from each step: words-affected, resulting-corpus-size, extra memory estimates + report_values = { + 'drop_unique': drop_unique, 'retain_total': retain_total, 'downsample_unique': downsample_unique, + 'downsample_total': int(downsample_total), 'num_retained_words': len(retain_words) + } + + if self.null_word: + # create null pseudo-word for padding when using concatenative L1 (run-of-words) + # this word is only ever input – never predicted – so count, huffman-point, etc doesn't matter + self.add_null_word() + + if self.sorted_vocab and not update: + self.wv.sort_by_descending_frequency() + + if self.hs: + # add info about each word's Huffman encoding + self.create_binary_tree() + if self.negative: + # build the table for drawing random words (for negative sampling) + self.make_cum_table() + + return report_values + + def estimate_memory(self, vocab_size=None, report=None): + """Estimate required memory for a model using current settings and provided vocabulary size. + + Parameters + ---------- + vocab_size : int, optional + Number of unique tokens in the vocabulary + report : dict of (str, int), optional + A dictionary from string representations of the model's memory consuming members to their size in bytes. + + Returns + ------- + dict of (str, int) + A dictionary from string representations of the model's memory consuming members to their size in bytes. 
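A quick illustration of reading this report, assuming the default ``hs=0, negative=5`` settings (the byte counts are rough estimates, as the method name suggests):

.. sourcecode:: pycon

    >>> from gensim.models import Word2Vec
    >>>
    >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
    >>> model = Word2Vec(sentences, min_count=1, vector_size=100)
    >>> report = model.estimate_memory()
    >>> sorted(report)
    ['syn1neg', 'total', 'vectors', 'vocab']
    >>> report['vectors'] == len(model.wv) * 100 * 4    # vocab_size * vector_size * 4 bytes per float32
    True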
+ + """ + vocab_size = vocab_size or len(self.wv) + report = report or {} + report['vocab'] = vocab_size * (700 if self.hs else 500) + report['vectors'] = vocab_size * self.vector_size * np.dtype(REAL).itemsize + if self.hs: + report['syn1'] = vocab_size * self.layer1_size * np.dtype(REAL).itemsize + if self.negative: + report['syn1neg'] = vocab_size * self.layer1_size * np.dtype(REAL).itemsize + report['total'] = sum(report.values()) + logger.info( + "estimated required memory for %i words and %i dimensions: %i bytes", + vocab_size, self.vector_size, report['total'] + ) + return report + + def add_null_word(self): + word = '\0' + self.wv.key_to_index[word] = len(self.wv) + self.wv.index_to_key.append(word) + self.wv.set_vecattr(word, 'count', 1) + + def create_binary_tree(self): + """Create a `binary Huffman tree `_ using stored vocabulary + word counts. Frequent words will have shorter binary codes. + Called internally from :meth:`~gensim.models.word2vec.Word2VecVocab.build_vocab`. + + """ + _assign_binary_codes(self.wv) + + def make_cum_table(self, domain=2**31 - 1): + """Create a cumulative-distribution table using stored vocabulary word counts for + drawing random words in the negative-sampling training routines. + + To draw a word index, choose a random integer up to the maximum value in the table (cum_table[-1]), + then finding that integer's sorted insertion point (as if by `bisect_left` or `ndarray.searchsorted()`). + That insertion point is the drawn index, coming up in proportion equal to the increment at that slot. + + """ + vocab_size = len(self.wv.index_to_key) + self.cum_table = np.zeros(vocab_size, dtype=np.uint32) + # compute sum of all power (Z in paper) + train_words_pow = 0.0 + for word_index in range(vocab_size): + count = self.wv.get_vecattr(word_index, 'count') + train_words_pow += count**self.ns_exponent + cumulative = 0.0 + for word_index in range(vocab_size): + count = self.wv.get_vecattr(word_index, 'count') + cumulative += count**self.ns_exponent + self.cum_table[word_index] = round(cumulative / train_words_pow * domain) + if len(self.cum_table) > 0: + assert self.cum_table[-1] == domain + + def prepare_weights(self, update=False): + """Build tables and model weights based on final vocabulary settings.""" + # set initial input/projection and hidden weights + if not update: + self.reset_weights() + else: + self.update_weights() + + @deprecated("Use gensim.models.keyedvectors.pseudorandom_weak_vector() directly") + def seeded_vector(self, seed_string, vector_size): + return pseudorandom_weak_vector(vector_size, seed_string=seed_string, hashfxn=self.hashfxn) + + def reset_weights(self): + """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary.""" + logger.info("resetting layer weights") + self.wv.resize_vectors() + self.wv.randomly_initialize_vectors(seed=self.seed) + if self.hs: + self.syn1 = np.zeros((len(self.wv), self.layer1_size), dtype=REAL) + if self.negative: + self.syn1neg = np.zeros((len(self.wv), self.layer1_size), dtype=REAL) + + self.wv.vectors_lockf = np.ones(1, dtype=REAL) # 0.0 values suppress word-backprop-updates; 1.0 allows + + def update_weights(self): + """Copy all the existing weights, and reset the weights for the newly added vocabulary.""" + logger.info("updating layer weights") + new_range = self.wv.resize_vectors() + gained_vocab = len(new_range) + self.wv.randomly_initialize_vectors(indexes=new_range) + + # Raise an error if an online update is run before initial training on a corpus + if not 
len(self.wv.vectors): + raise RuntimeError( + "You cannot do an online vocabulary-update of a model which has no prior vocabulary. " + "First build the vocabulary of your model with a corpus before doing an online update." + ) + + if self.hs: + self.syn1 = np.vstack([self.syn1, np.zeros((gained_vocab, self.layer1_size), dtype=REAL)]) + if self.negative: + pad = np.zeros((gained_vocab, self.layer1_size), dtype=REAL) + self.syn1neg = np.vstack([self.syn1neg, pad]) + self.wv.norms = None + + # do not suppress learning for already learned words + self.wv.vectors_lockf = np.ones(1, dtype=REAL) # 0.0 values suppress word-backprop-updates; 1.0 allows + + def _do_train_epoch(self, corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch, + total_examples=None, total_words=None, **kwargs): + work, neu1 = thread_private_mem + + if self.sg: + examples, tally, raw_tally = train_epoch_sg(self, corpus_file, offset, cython_vocab, cur_epoch, + total_examples, total_words, work, neu1, self.compute_loss) + else: + examples, tally, raw_tally = train_epoch_cbow(self, corpus_file, offset, cython_vocab, cur_epoch, + total_examples, total_words, work, neu1, self.compute_loss) + + return examples, tally, raw_tally + + def _do_train_job(self, sentences, alpha, inits): + """Train the model on a single batch of sentences. + + Parameters + ---------- + sentences : iterable of list of str + Corpus chunk to be used in this training batch. + alpha : float + The learning rate used in this batch. + inits : (np.ndarray, np.ndarray) + Each worker threads private work memory. + + Returns + ------- + (int, int) + 2-tuple (effective word count after ignoring unknown words and sentence length trimming, total word count). + + """ + work, neu1 = inits + tally = 0 + if self.sg: + tally += train_batch_sg(self, sentences, alpha, work, self.compute_loss) + else: + tally += train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss) + return tally, self._raw_word_count(sentences) + + def _clear_post_train(self): + """Clear any cached vector lengths from the model.""" + self.wv.norms = None + + def train(self, corpus_iterable=None, corpus_file=None, total_examples=None, total_words=None, + epochs=None, start_alpha=None, end_alpha=None, word_count=0, + queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=(), + **kwargs): + """Update the model's neural weights from a sequence of sentences. + + Notes + ----- + To support linear learning-rate decay from (initial) `alpha` to `min_alpha`, and accurate + progress-percentage logging, either `total_examples` (count of sentences) or `total_words` (count of + raw words in sentences) **MUST** be provided. If `sentences` is the same corpus + that was provided to :meth:`~gensim.models.word2vec.Word2Vec.build_vocab` earlier, + you can simply use `total_examples=self.corpus_count`. + + Warnings + -------- + To avoid common mistakes around the model's ability to do multiple training passes itself, an + explicit `epochs` argument **MUST** be provided. In the common and recommended case + where :meth:`~gensim.models.word2vec.Word2Vec.train` is only called once, you can set `epochs=self.iter`. + + Parameters + ---------- + corpus_iterable : iterable of list of str + The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, + consider an iterable that streams the sentences directly from disk/network. 
+ See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` + or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. + See also the `tutorial on data streaming in Python + `_. + corpus_file : str, optional + Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + You may use this argument instead of `sentences` to get a performance boost. Only one of the `sentences` or + `corpus_file` arguments needs to be passed (not both of them). + total_examples : int + Count of sentences. + total_words : int + Count of raw words in sentences. + epochs : int + Number of iterations (epochs) over the corpus. + start_alpha : float, optional + Initial learning rate. If supplied, replaces the starting `alpha` from the constructor, + for this one call to `train()`. + Use only if making multiple calls to `train()`, when you want to manage the alpha learning-rate yourself + (not recommended). + end_alpha : float, optional + Final learning rate. Drops linearly from `start_alpha`. + If supplied, this replaces the final `min_alpha` from the constructor, for this one call to `train()`. + Use only if making multiple calls to `train()`, when you want to manage the alpha learning-rate yourself + (not recommended). + word_count : int, optional + Count of words already trained. Set this to 0 for the usual + case of training on all words in sentences. + queue_factor : int, optional + Multiplier for size of queue (number of workers * queue_factor). + report_delay : float, optional + Seconds to wait before reporting progress. + compute_loss : bool, optional + If True, computes and stores the loss value which can be retrieved using + :meth:`~gensim.models.word2vec.Word2Vec.get_latest_training_loss`. + callbacks : iterable of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional + Sequence of callbacks to be executed at specific stages during training. + + Examples + -------- + ..
sourcecode:: pycon + + >>> from gensim.models import Word2Vec + >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] + >>> + >>> model = Word2Vec(min_count=1) + >>> model.build_vocab(sentences) # prepare the model vocabulary + >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs) # train word vectors + (1, 30) + + """ + self.alpha = start_alpha or self.alpha + self.min_alpha = end_alpha or self.min_alpha + self.epochs = epochs + + self._check_training_sanity( + epochs=epochs, + total_examples=total_examples, + total_words=total_words) + + self.compute_loss = compute_loss + self.running_training_loss = 0.0 + + for callback in callbacks: + callback.on_train_begin(self) + + trained_word_count = 0 + raw_word_count = 0 + start = default_timer() - 0.00001 + job_tally = 0 + + for cur_epoch in range(self.epochs): + for callback in callbacks: + callback.on_epoch_begin(self) + + if corpus_iterable is not None: + trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch = self._train_epoch( + corpus_iterable, cur_epoch=cur_epoch, total_examples=total_examples, + total_words=total_words, queue_factor=queue_factor, report_delay=report_delay, + callbacks=callbacks, **kwargs) + else: + trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch = self._train_epoch_corpusfile( + corpus_file, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words, + callbacks=callbacks, **kwargs) + + trained_word_count += trained_word_count_epoch + raw_word_count += raw_word_count_epoch + job_tally += job_tally_epoch + + for callback in callbacks: + callback.on_epoch_end(self) + + # Log overall time + total_elapsed = default_timer() - start + self._log_train_end(raw_word_count, trained_word_count, total_elapsed, job_tally) + + self.train_count += 1 # number of times train() has been called + self._clear_post_train() + + for callback in callbacks: + callback.on_train_end(self) + return trained_word_count, raw_word_count + + def _worker_loop_corpusfile(self, corpus_file, thread_id, offset, cython_vocab, progress_queue, cur_epoch=0, + total_examples=None, total_words=None, **kwargs): + """Train the model on a `corpus_file` in LineSentence format. + + This function will be called in parallel by multiple workers (threads or processes) to make + optimal use of multicore machines. + + Parameters + ---------- + corpus_file : str + Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + thread_id : int + Thread index starting from 0 to `number of workers - 1`. + offset : int + Offset (in bytes) in the `corpus_file` for a particular worker. + cython_vocab : :class:`~gensim.models.word2vec_inner.CythonVocab` + Copy of the vocabulary in order to access it without GIL. + progress_queue : Queue of (int, int, int) + A queue of progress reports. Each report is represented as a tuple of these 3 elements: + * Size of data chunk processed, for example number of sentences in the corpus chunk. + * Effective word count used in training (after ignoring unknown words and trimming the sentence length). + * Total word count used in training. + **kwargs : object + Additional keyword parameters for the specific model inheriting from this class.
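The epoch loop above invokes `on_train_begin`, `on_epoch_begin`, `on_epoch_end` and `on_train_end` on every supplied callback. A minimal sketch of such a callback, assuming the standard :class:`~gensim.models.callbacks.CallbackAny2Vec` base class (the `EpochLogger` name is only illustrative, not part of this patch):

.. sourcecode:: pycon

    >>> from gensim.models.callbacks import CallbackAny2Vec
    >>>
    >>> class EpochLogger(CallbackAny2Vec):
    ...     """Illustrative callback: count completed epochs while train() runs."""
    ...     def __init__(self):
    ...         self.epoch = 0
    ...     def on_epoch_end(self, model):
    ...         self.epoch += 1
    ...         print("finished epoch", self.epoch)
    >>>
    >>> # model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs, callbacks=[EpochLogger()])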
+ + """ + thread_private_mem = self._get_thread_working_mem() + + examples, tally, raw_tally = self._do_train_epoch( + corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch, + total_examples=total_examples, total_words=total_words, **kwargs) + + progress_queue.put((examples, tally, raw_tally)) + progress_queue.put(None) + + def _worker_loop(self, job_queue, progress_queue): + """Train the model, lifting batches of data from the queue. + + This function will be called in parallel by multiple workers (threads or processes) to make + optimal use of multicore machines. + + Parameters + ---------- + job_queue : Queue of (list of objects, (str, int)) + A queue of jobs still to be processed. The worker will take up jobs from this queue. + Each job is represented by a tuple where the first element is the corpus chunk to be processed and + the second is the dictionary of parameters. + progress_queue : Queue of (int, int, int) + A queue of progress reports. Each report is represented as a tuple of these 3 elements: + * Size of data chunk processed, for example number of sentences in the corpus chunk. + * Effective word count used in training (after ignoring unknown words and trimming the sentence length). + * Total word count used in training. + + """ + thread_private_mem = self._get_thread_working_mem() + jobs_processed = 0 + callbacks = progress_queue.callbacks + while True: + job = job_queue.get() + if job is None: + progress_queue.put(None) + break # no more jobs => quit this worker + data_iterable, job_parameters = job + + for callback in callbacks: + callback.on_batch_begin(self) + + tally, raw_tally = self._do_train_job(data_iterable, job_parameters, thread_private_mem) + + for callback in callbacks: + callback.on_batch_end(self) + + progress_queue.put((len(data_iterable), tally, raw_tally)) # report back progress + jobs_processed += 1 + logger.debug("worker exiting, processed %i jobs", jobs_processed) + + def _job_producer(self, data_iterator, job_queue, cur_epoch=0, total_examples=None, total_words=None): + """Fill the jobs queue using the data found in the input stream. + + Each job is represented by a tuple where the first element is the corpus chunk to be processed and + the second is a dictionary of parameters. + + Parameters + ---------- + data_iterator : iterable of list of objects + The input dataset. This will be split in chunks and these chunks will be pushed to the queue. + job_queue : Queue of (list of object, dict of (str, int)) + A queue of jobs still to be processed. The worker will take up jobs from this queue. + Each job is represented by a tuple where the first element is the corpus chunk to be processed and + the second is the dictionary of parameters. + cur_epoch : int, optional + The current training epoch, needed to compute the training parameters for each job. + For example in many implementations the learning rate would be dropping with the number of epochs. + total_examples : int, optional + Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences + in a corpus. Used to log progress. + total_words : int, optional + Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw + words in a corpus. Used to log progress. 
+ + """ + job_batch, batch_size = [], 0 + pushed_words, pushed_examples = 0, 0 + next_job_params = self._get_job_params(cur_epoch) + job_no = 0 + + for data_idx, data in enumerate(data_iterator): + data_length = self._raw_word_count([data]) + + # can we fit this sentence into the existing job batch? + if batch_size + data_length <= self.batch_words: + # yes => add it to the current job + job_batch.append(data) + batch_size += data_length + else: + job_no += 1 + job_queue.put((job_batch, next_job_params)) + + # update the learning rate for the next job + if total_examples: + # examples-based decay + pushed_examples += len(job_batch) + epoch_progress = 1.0 * pushed_examples / total_examples + else: + # words-based decay + pushed_words += self._raw_word_count(job_batch) + epoch_progress = 1.0 * pushed_words / total_words + next_job_params = self._update_job_params(next_job_params, epoch_progress, cur_epoch) + + # add the sentence that didn't fit as the first item of a new job + job_batch, batch_size = [data], data_length + # add the last job too (may be significantly smaller than batch_words) + if job_batch: + job_no += 1 + job_queue.put((job_batch, next_job_params)) + + if job_no == 0 and self.train_count == 0: + logger.warning( + "train() called with an empty iterator (if not intended, " + "be sure to provide a corpus that offers restartable iteration = an iterable)." + ) + + # give the workers heads up that they can finish -- no more work! + for _ in range(self.workers): + job_queue.put(None) + logger.debug("job loop exiting, total %i jobs", job_no) + + def _log_epoch_progress(self, progress_queue=None, job_queue=None, cur_epoch=0, total_examples=None, + total_words=None, report_delay=1.0, is_corpus_file_mode=None): + """Get the progress report for a single training epoch. + + Parameters + ---------- + progress_queue : Queue of (int, int, int) + A queue of progress reports. Each report is represented as a tuple of these 3 elements: + * size of data chunk processed, for example number of sentences in the corpus chunk. + * Effective word count used in training (after ignoring unknown words and trimming the sentence length). + * Total word count used in training. + job_queue : Queue of (list of object, dict of (str, int)) + A queue of jobs still to be processed. The worker will take up jobs from this queue. + Each job is represented by a tuple where the first element is the corpus chunk to be processed and + the second is the dictionary of parameters. + cur_epoch : int, optional + The current training epoch, needed to compute the training parameters for each job. + For example in many implementations the learning rate would be dropping with the number of epochs. + total_examples : int, optional + Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences + in a corpus. Used to log progress. + total_words : int, optional + Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw + words in a corpus. Used to log progress. + report_delay : float, optional + Number of seconds between two consecutive progress report messages in the logger. + is_corpus_file_mode : bool, optional + Whether training is file-based (corpus_file argument) or not. + + Returns + ------- + (int, int, int) + The epoch report consisting of three elements: + * size of data chunk processed, for example number of sentences in the corpus chunk. 
+ * Effective word count used in training (after ignoring unknown words and trimming the sentence length). + * Total word count used in training. + + """ + example_count, trained_word_count, raw_word_count = 0, 0, 0 + start, next_report = default_timer() - 0.00001, 1.0 + job_tally = 0 + unfinished_worker_count = self.workers + + while unfinished_worker_count > 0: + report = progress_queue.get() # blocks if workers too slow + if report is None: # a thread reporting that it finished + unfinished_worker_count -= 1 + logger.info("worker thread finished; awaiting finish of %i more threads", unfinished_worker_count) + continue + examples, trained_words, raw_words = report + job_tally += 1 + + # update progress stats + example_count += examples + trained_word_count += trained_words # only words in vocab & sampled + raw_word_count += raw_words + + # log progress once every report_delay seconds + elapsed = default_timer() - start + if elapsed >= next_report: + self._log_progress( + job_queue, progress_queue, cur_epoch, example_count, total_examples, + raw_word_count, total_words, trained_word_count, elapsed) + next_report = elapsed + report_delay + # all done; report the final stats + elapsed = default_timer() - start + self._log_epoch_end( + cur_epoch, example_count, total_examples, raw_word_count, total_words, + trained_word_count, elapsed, is_corpus_file_mode) + self.total_train_time += elapsed + return trained_word_count, raw_word_count, job_tally + + def _train_epoch_corpusfile( + self, corpus_file, cur_epoch=0, total_examples=None, total_words=None, callbacks=(), **kwargs): + """Train the model for a single epoch. + + Parameters + ---------- + corpus_file : str + Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + cur_epoch : int, optional + The current training epoch, needed to compute the training parameters for each job. + For example in many implementations the learning rate would be dropping with the number of epochs. + total_examples : int, optional + Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences + in a corpus, used to log progress. + total_words : int + Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw + words in a corpus, used to log progress. Must be provided in order to seek in `corpus_file`. + **kwargs : object + Additional key word parameters for the specific model inheriting from this class. + + Returns + ------- + (int, int, int) + The training report for this epoch consisting of three elements: + * Size of data chunk processed, for example number of sentences in the corpus chunk. + * Effective word count used in training (after ignoring unknown words and trimming the sentence length). + * Total word count used in training. 
+ + """ + if not total_words: + raise ValueError("total_words must be provided alongside corpus_file argument.") + + from gensim.models.word2vec_corpusfile import CythonVocab + from gensim.models.fasttext import FastText + cython_vocab = CythonVocab(self.wv, hs=self.hs, fasttext=isinstance(self, FastText)) + + progress_queue = Queue() + + corpus_file_size = os.path.getsize(corpus_file) + + thread_kwargs = copy.copy(kwargs) + thread_kwargs['cur_epoch'] = cur_epoch + thread_kwargs['total_examples'] = total_examples + thread_kwargs['total_words'] = total_words + workers = [ + threading.Thread( + target=self._worker_loop_corpusfile, + args=( + corpus_file, thread_id, corpus_file_size / self.workers * thread_id, cython_vocab, progress_queue + ), + kwargs=thread_kwargs + ) for thread_id in range(self.workers) + ] + + for thread in workers: + thread.daemon = True + thread.start() + + trained_word_count, raw_word_count, job_tally = self._log_epoch_progress( + progress_queue=progress_queue, job_queue=None, cur_epoch=cur_epoch, + total_examples=total_examples, total_words=total_words, is_corpus_file_mode=True) + + return trained_word_count, raw_word_count, job_tally + + def _train_epoch(self, data_iterable, cur_epoch=0, total_examples=None, total_words=None, + queue_factor=2, report_delay=1.0, callbacks=()): + """Train the model for a single epoch. + + Parameters + ---------- + data_iterable : iterable of list of object + The input corpus. This will be split in chunks and these chunks will be pushed to the queue. + cur_epoch : int, optional + The current training epoch, needed to compute the training parameters for each job. + For example in many implementations the learning rate would be dropping with the number of epochs. + total_examples : int, optional + Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences + in a corpus, used to log progress. + total_words : int, optional + Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw + words in a corpus, used to log progress. + queue_factor : int, optional + Multiplier for size of queue -> size = number of workers * queue_factor. + report_delay : float, optional + Number of seconds between two consecutive progress report messages in the logger. + + Returns + ------- + (int, int, int) + The training report for this epoch consisting of three elements: + * Size of data chunk processed, for example number of sentences in the corpus chunk. + * Effective word count used in training (after ignoring unknown words and trimming the sentence length). + * Total word count used in training. 
+ + """ + job_queue = Queue(maxsize=queue_factor * self.workers) + progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers) + progress_queue.callbacks = callbacks # messy way to pass along for just this session + + workers = [ + threading.Thread( + target=self._worker_loop, + args=(job_queue, progress_queue,)) + for _ in range(self.workers) + ] + + workers.append(threading.Thread( + target=self._job_producer, + args=(data_iterable, job_queue), + kwargs={'cur_epoch': cur_epoch, 'total_examples': total_examples, 'total_words': total_words})) + + for thread in workers: + thread.daemon = True # make interrupting the process with ctrl+c easier + thread.start() + + trained_word_count, raw_word_count, job_tally = self._log_epoch_progress( + progress_queue, job_queue, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words, + report_delay=report_delay, is_corpus_file_mode=False) + + return trained_word_count, raw_word_count, job_tally + + def _get_job_params(self, cur_epoch): + """Get the learning rate used in the current epoch. + + Parameters + ---------- + cur_epoch : int + Current iteration through the corpus + + Returns + ------- + float + The learning rate for this epoch (it is linearly reduced with epochs from `self.alpha` to `self.min_alpha`). + + """ + alpha = self.alpha - ((self.alpha - self.min_alpha) * float(cur_epoch) / self.epochs) + return alpha + + def _update_job_params(self, job_params, epoch_progress, cur_epoch): + """Get the correct learning rate for the next iteration. + + Parameters + ---------- + job_params : dict of (str, obj) + UNUSED. + epoch_progress : float + Ratio of finished work in the current epoch. + cur_epoch : int + Number of current iteration. + + Returns + ------- + float + The learning rate to be used in the next training epoch. + + """ + start_alpha = self.alpha + end_alpha = self.min_alpha + progress = (cur_epoch + epoch_progress) / self.epochs + next_alpha = start_alpha - (start_alpha - end_alpha) * progress + next_alpha = max(end_alpha, next_alpha) + self.min_alpha_yet_reached = next_alpha + return next_alpha + + def _get_thread_working_mem(self): + """Computes the memory used per worker thread. + + Returns + ------- + (np.ndarray, np.ndarray) + Each worker threads private work memory. + + """ + work = matutils.zeros_aligned(self.layer1_size, dtype=REAL) # per-thread private work memory + neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) + return work, neu1 + + def _raw_word_count(self, job): + """Get the number of words in a given job. + + Parameters + ---------- + job: iterable of list of str + The corpus chunk processed in a single batch. + + Returns + ------- + int + Number of raw words in the corpus chunk. + + """ + return sum(len(sentence) for sentence in job) + + def _check_training_sanity(self, epochs=None, total_examples=None, total_words=None, **kwargs): + """Checks whether the training parameters make sense. + + Called right before training starts in :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.train` + and raises warning or errors depending on the severity of the issue in case an inconsistent parameter + combination is detected. + + Parameters + ---------- + epochs : int, optional + Number of training epochs. Must have a (non None) value. + total_examples : int, optional + Number of documents in the corpus. Either `total_examples` or `total_words` **must** be supplied. + total_words : int, optional + Number of words in the corpus. Either `total_examples` or `total_words` **must** be supplied. 
+ **kwargs : object + Unused. Present to preserve signature among base and inherited implementations. + + Raises + ------ + RuntimeError + If one of the required training pre/post-processing steps has not been performed. + ValueError + If the combination of input parameters is inconsistent. + + """ + if self.alpha > self.min_alpha_yet_reached: + logger.warning("Effective 'alpha' higher than previous training cycles") + + if not self.wv.key_to_index: # should be set by `build_vocab` + raise RuntimeError("you must first build vocabulary before training the model") + if not len(self.wv.vectors): + raise RuntimeError("you must initialize vectors before training the model") + + if not hasattr(self, 'corpus_count'): + raise ValueError( + "The number of examples in the training corpus is missing. " + "Please make sure this is set inside the `build_vocab` function. " + "Call the `build_vocab` function before calling `train`." + ) + + if total_words is None and total_examples is None: + raise ValueError( + "You must specify either total_examples or total_words, for proper job parameter updates " + "and progress calculations. " + "The usual value is total_examples=model.corpus_count." + ) + if epochs is None: + raise ValueError("You must specify an explicit epochs count. The usual value is epochs=model.epochs.") + logger.info( + "training model with %i workers on %i vocabulary and %i features, " + "using sg=%s hs=%s sample=%s negative=%s window=%s", + self.workers, len(self.wv), self.layer1_size, self.sg, + self.hs, self.sample, self.negative, self.window + ) + + def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples, + raw_word_count, total_words, trained_word_count, elapsed): + """Callback used to log progress for long-running jobs. + + Parameters + ---------- + job_queue : Queue of (list of object, dict of (str, float)) + The queue of jobs still to be performed by workers. Each job is represented as a tuple containing + the batch of data to be processed and the parameters to be used for the processing as a dict. + progress_queue : Queue of (int, int, int) + A queue of progress reports. Each report is represented as a tuple of these 3 elements: + * size of data chunk processed, for example number of sentences in the corpus chunk. + * Effective word count used in training (after ignoring unknown words and trimming the sentence length). + * Total word count used in training. + cur_epoch : int + The current training iteration through the corpus. + example_count : int + Number of examples (for example, sentences) processed until now. + total_examples : int + Number of all examples present in the input corpus. + raw_word_count : int + Number of words used in training until now. + total_words : int + Number of all words in the input corpus. + trained_word_count : int + Number of effective words used in training until now (after ignoring unknown words and trimming + the sentence length). + elapsed : int + Elapsed time since the beginning of training in seconds. + + Notes + ----- + If you train the model via the `corpus_file` argument, there is no job_queue, so reported job_queue size will + always be equal to -1.
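For reference, `_get_job_params` and `_update_job_params` above amount to a plain linear interpolation of the learning rate across all epochs; a standalone sketch of the same arithmetic, with hyperparameter values that are purely illustrative:

.. sourcecode:: pycon

    >>> alpha, min_alpha, epochs = 0.025, 0.0001, 5  # illustrative hyperparameters
    >>> cur_epoch, epoch_progress = 2, 0.5  # halfway through the third pass
    >>> progress = (cur_epoch + epoch_progress) / epochs
    >>> next_alpha = max(min_alpha, alpha - (alpha - min_alpha) * progress)
    >>> # next_alpha is now roughly 0.0125, halfway between alpha and min_alpha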
+ + """ + if total_examples: + # examples-based progress % + logger.info( + "EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i", + cur_epoch + 1, 100.0 * example_count / total_examples, trained_word_count / elapsed, + -1 if job_queue is None else utils.qsize(job_queue), utils.qsize(progress_queue) + ) + else: + # words-based progress % + logger.info( + "EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i", + cur_epoch + 1, 100.0 * raw_word_count / total_words, trained_word_count / elapsed, + -1 if job_queue is None else utils.qsize(job_queue), utils.qsize(progress_queue) + ) + + def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words, + trained_word_count, elapsed, is_corpus_file_mode): + """Callback used to log the end of a training epoch. + + Parameters + ---------- + cur_epoch : int + The current training iteration through the corpus. + example_count : int + Number of examples (could be sentences for example) processed until now. + total_examples : int + Number of all examples present in the input corpus. + raw_word_count : int + Number of words used in training until now. + total_words : int + Number of all words in the input corpus. + trained_word_count : int + Number of effective words used in training until now (after ignoring unknown words and trimming + the sentence length). + elapsed : int + Elapsed time since the beginning of training in seconds. + is_corpus_file_mode : bool + Whether training is file-based (corpus_file argument) or not. + + Warnings + -------- + In case the corpus is changed while the epoch was running. + + """ + logger.info( + "EPOCH - %i : training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s", + cur_epoch + 1, raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed + ) + + # don't warn if training in file-based mode, because it's expected behavior + if is_corpus_file_mode: + return + + # check that the input corpus hasn't changed during iteration + if total_examples and total_examples != example_count: + logger.warning( + "EPOCH - %i : supplied example count (%i) did not equal expected count (%i)", cur_epoch + 1, + example_count, total_examples + ) + if total_words and total_words != raw_word_count: + logger.warning( + "EPOCH - %i : supplied raw word count (%i) did not equal expected count (%i)", cur_epoch + 1, + raw_word_count, total_words + ) + + def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_tally): + """Callback to log the end of training. + + Parameters + ---------- + raw_word_count : int + Number of words used in the whole training. + trained_word_count : int + Number of effective words used in training (after ignoring unknown words and trimming the sentence length). + total_elapsed : int + Total time spent during training in seconds. + job_tally : int + Total number of jobs processed during training. + + """ + logger.info( + "training on a %i raw words (%i effective words) took %.1fs, %.0f effective words/s", + raw_word_count, trained_word_count, total_elapsed, trained_word_count / total_elapsed + ) + + def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor=2, report_delay=1): + """Score the log probability for a sequence of sentences. + This does not change the fitted model in any way (see :meth:`~gensim.models.word2vec.Word2Vec.train` for that). 
+ + Gensim has currently only implemented score for the hierarchical softmax scheme, + so you need to have run word2vec with `hs=1` and `negative=0` for this to work. + + Note that you should specify `total_sentences`; you'll run into problems if you ask to + score more than this number of sentences but it is inefficient to set the value too high. + + See the `article by Matt Taddy: "Document Classification by Inversion of Distributed Language Representations" + `_ and the + `gensim demo `_ for examples of + how to use such scores in document classification. + + Parameters + ---------- + sentences : iterable of list of str + The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, + consider an iterable that streams the sentences directly from disk/network. + See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` + or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. + total_sentences : int, optional + Count of sentences. + chunksize : int, optional + Chunksize of jobs + queue_factor : int, optional + Multiplier for size of queue (number of workers * queue_factor). + report_delay : float, optional + Seconds to wait before reporting progress. + + """ + logger.info( + "scoring sentences with %i workers on %i vocabulary and %i features, " + "using sg=%s hs=%s sample=%s and negative=%s", + self.workers, len(self.wv), self.layer1_size, self.sg, self.hs, + self.sample, self.negative + ) + + if not self.wv.key_to_index: + raise RuntimeError("you must first build vocabulary before scoring new data") + + if not self.hs: + raise RuntimeError( + "We have currently only implemented score for the hierarchical softmax scheme, " + "so you need to have run word2vec with hs=1 and negative=0 for this to work." + ) + + def worker_loop(): + """Compute log probability for each sentence, lifting lists of sentences from the jobs queue.""" + work = np.zeros(1, dtype=REAL) # for sg hs, we actually only need one memory loc (running sum) + neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) + while True: + job = job_queue.get() + if job is None: # signal to finish + break + ns = 0 + for sentence_id, sentence in job: + if sentence_id >= total_sentences: + break + if self.sg: + score = score_sentence_sg(self, sentence, work) + else: + score = score_sentence_cbow(self, sentence, work, neu1) + sentence_scores[sentence_id] = score + ns += 1 + progress_queue.put(ns) # report progress + + start, next_report = default_timer(), 1.0 + # buffer ahead only a limited number of jobs.. 
this is the reason we can't simply use ThreadPool :( + job_queue = Queue(maxsize=queue_factor * self.workers) + progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers) + + workers = [threading.Thread(target=worker_loop) for _ in range(self.workers)] + for thread in workers: + thread.daemon = True # make interrupting the process with ctrl+c easier + thread.start() + + sentence_count = 0 + sentence_scores = matutils.zeros_aligned(total_sentences, dtype=REAL) + + push_done = False + done_jobs = 0 jobs_source = enumerate(utils.grouper(enumerate(sentences), chunksize)) # fill jobs queue with (id, sentence) job items @@ -848,100 +1685,21 @@ def worker_loop(): pass # already out of loop; continue to next push elapsed = default_timer() - start - self.clear_sims() + self.wv.norms = None # clear any cached lengths logger.info( "scoring %i sentences took %.1fs, %.0f sentences/s", sentence_count, elapsed, sentence_count / elapsed ) return sentence_scores[:sentence_count] - def clear_sims(self): - """Remove all L2-normalized word vectors from the model, to free up memory. - - You can recompute them later again using the :meth:`~gensim.models.word2vec.Word2Vec.init_sims` method. - - """ - self.wv.vectors_norm = None - - def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='utf8', unicode_errors='strict'): - """Merge in an input-hidden weight matrix loaded from the original C word2vec-tool format, - where it intersects with the current vocabulary. - - No words are added to the existing vocabulary, but intersecting words adopt the file's weights, and - non-intersecting words are left alone. - - Parameters - ---------- - fname : str - The file path to load the vectors from. - lockf : float, optional - Lock-factor value to be set for any imported word-vectors; the - default value of 0.0 prevents further updating of the vector during subsequent - training. Use 1.0 to allow further training updates of merged vectors. - binary : bool, optional - If True, `fname` is in the binary word2vec C format. - encoding : str, optional - Encoding of `text` for `unicode` function (python2 only). - unicode_errors : str, optional - Error handling behaviour, used as parameter for `unicode` function (python2 only). - - """ - overlap_count = 0 - logger.info("loading projection weights from %s", fname) - with utils.open(fname, 'rb') as fin: - header = utils.to_unicode(fin.readline(), encoding=encoding) - vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format - if not vector_size == self.wv.vector_size: - raise ValueError("incompatible vector size %d in file %s" % (vector_size, fname)) - # TOCONSIDER: maybe mismatched vectors still useful enough to merge (truncating/padding)? 
- if binary: - binary_len = dtype(REAL).itemsize * vector_size - for _ in range(vocab_size): - # mixed text and binary: read text first, then binary - word = [] - while True: - ch = fin.read(1) - if ch == b' ': - break - if ch != b'\n': # ignore newlines in front of words (some binary files have) - word.append(ch) - word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors) - weights = fromstring(fin.read(binary_len), dtype=REAL) - if word in self.wv.vocab: - overlap_count += 1 - self.wv.vectors[self.wv.vocab[word].index] = weights - self.trainables.vectors_lockf[self.wv.vocab[word].index] = lockf # lock-factor: 0.0=no changes - else: - for line_no, line in enumerate(fin): - parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ") - if len(parts) != vector_size + 1: - raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no) - word, weights = parts[0], [REAL(x) for x in parts[1:]] - if word in self.wv.vocab: - overlap_count += 1 - self.wv.vectors[self.wv.vocab[word].index] = weights - self.trainables.vectors_lockf[self.wv.vocab[word].index] = lockf # lock-factor: 0.0=no changes - logger.info("merged %d vectors into %s matrix from %s", overlap_count, self.wv.vectors.shape, fname) - - @deprecated("Method will be removed in 4.0.0, use self.wv.__getitem__() instead") - def __getitem__(self, words): - """Deprecated. Use `self.wv.__getitem__` instead. - Refer to the documentation for :meth:`~gensim.models.keyedvectors.Word2VecKeyedVectors.__getitem__`. - - """ - return self.wv.__getitem__(words) - - @deprecated("Method will be removed in 4.0.0, use self.wv.__contains__() instead") - def __contains__(self, word): - """Deprecated. Use `self.wv.__contains__` instead. - Refer to the documentation for :meth:`~gensim.models.keyedvectors.Word2VecKeyedVectors.__contains__`. - - """ - return self.wv.__contains__(word) - def predict_output_word(self, context_words_list, topn=10): """Get the probability distribution of the center word given context words. + Note this performs a CBOW-style propagation, even in SG models, + and doesn't quite weight the surrounding words the same as in + training -- so it's just one crude way of using a trained model + as a predictor. + Parameters ---------- context_words_list : list of str @@ -961,35 +1719,24 @@ def predict_output_word(self, context_words_list, topn=10): "so you need to have run word2vec with negative > 0 for this to work." 
) - if not hasattr(self.wv, 'vectors') or not hasattr(self.trainables, 'syn1neg'): + if not hasattr(self.wv, 'vectors') or not hasattr(self, 'syn1neg'): raise RuntimeError("Parameters required for predicting the output words not found.") - word_vocabs = [self.wv.vocab[w] for w in context_words_list if w in self.wv.vocab] - if not word_vocabs: - warnings.warn("All the input context words are out-of-vocabulary for the current model.") + word2_indices = [self.wv.get_index(w) for w in context_words_list if w in self.wv] + if not word2_indices: + logger.warning("All the input context words are out-of-vocabulary for the current model.") return None - word2_indices = [word.index for word in word_vocabs] - - l1 = np_sum(self.wv.vectors[word2_indices], axis=0) + l1 = np.sum(self.wv.vectors[word2_indices], axis=0) if word2_indices and self.cbow_mean: l1 /= len(word2_indices) # propagate hidden -> output and take softmax to get probabilities - prob_values = exp(dot(l1, self.trainables.syn1neg.T)) + prob_values = np.exp(np.dot(l1, self.syn1neg.T)) prob_values /= sum(prob_values) top_indices = matutils.argsort(prob_values, topn=topn, reverse=True) # returning the most probable output words with their probabilities - return [(self.wv.index2word[index1], prob_values[index1]) for index1 in top_indices] - - def init_sims(self, replace=False): - """Deprecated. Use `self.wv.init_sims` instead. - See :meth:`~gensim.models.keyedvectors.Word2VecKeyedVectors.init_sims`. - - """ - if replace and hasattr(self.trainables, 'syn1'): - del self.trainables.syn1 - return self.wv.init_sims(replace) + return [(self.wv.index_to_key[index1], prob_values[index1]) for index1 in top_indices] def reset_from(self, other_model): """Borrow shareable pre-built structures from `other_model` and reset hidden layer weights. @@ -1008,28 +1755,13 @@ def reset_from(self, other_model): Another model to copy the internal structures from. """ - self.wv.vocab = other_model.wv.vocab - self.wv.index2word = other_model.wv.index2word - self.vocabulary.cum_table = other_model.vocabulary.cum_table + self.wv.key_to_index = other_model.wv.key_to_index + self.wv.index_to_key = other_model.wv.index_to_key + self.wv.expandos = other_model.wv.expandos + self.wv.norms = None + self.cum_table = other_model.cum_table self.corpus_count = other_model.corpus_count - self.trainables.reset_weights(self.hs, self.negative, self.wv) - - @staticmethod - def log_accuracy(section): - """Deprecated. Use `self.wv.log_accuracy` instead. - See :meth:`~gensim.models.word2vec.Word2VecKeyedVectors.log_accuracy`. - - """ - return Word2VecKeyedVectors.log_accuracy(section) - - @deprecated("Method will be removed in 4.0.0, use self.wv.evaluate_word_analogies() instead") - def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_insensitive=True): - """Deprecated. Use `self.wv.accuracy` instead. - See :meth:`~gensim.models.word2vec.Word2VecKeyedVectors.accuracy`. - - """ - most_similar = most_similar or Word2VecKeyedVectors.most_similar - return self.wv.accuracy(questions, restrict_vocab, most_similar, case_insensitive) + self.reset_weights() def __str__(self): """Human readable representation of the model's state. 
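A hedged usage sketch for the `predict_output_word` method above; it needs a model trained with `negative > 0` (the default), and the toy corpus is illustrative only:

.. sourcecode:: pycon

    >>> from gensim.models import Word2Vec
    >>>
    >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
    >>> model = Word2Vec(sentences, min_count=1)  # negative sampling is on by default (negative=5)
    >>> # probability distribution over the vocabulary for the centre word,
    >>> # given a CBOW-style average of the known context words
    >>> top = model.predict_output_word(["cat", "meow"], topn=3)
    >>> # `top` is a list of (word, probability) pairs; the values vary from run to run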
@@ -1042,27 +1774,9 @@ def __str__(self): """ return "%s(vocab=%s, size=%s, alpha=%s)" % ( - self.__class__.__name__, len(self.wv.index2word), self.wv.vector_size, self.alpha + self.__class__.__name__, len(self.wv.index_to_key), self.wv.vector_size, self.alpha ) - def delete_temporary_training_data(self, replace_word_vectors_with_normalized=False): - """Discard parameters that are used in training and scoring, to save memory. - - Warnings - -------- - Use only if you're sure you're done training a model. - - Parameters - ---------- - replace_word_vectors_with_normalized : bool, optional - If True, forget the original (not normalized) word vectors and only keep - the L2-normalized word vectors, to save even more memory. - - """ - if replace_word_vectors_with_normalized: - self.init_sims(replace=True) - self._minimize_model() - def save(self, *args, **kwargs): """Save the model. This saved model can be loaded again using :func:`~gensim.models.word2vec.Word2Vec.load`, which supports @@ -1074,8 +1788,8 @@ def save(self, *args, **kwargs): Path to the file. """ - # don't bother storing the cached normalized vectors, recalculable table - kwargs['ignore'] = kwargs.get('ignore', ['vectors_norm', 'cum_table']) + # don't bother storing recalculable table + kwargs['ignore'] = kwargs.get('ignore', []) + ['cum_table', ] super(Word2Vec, self).save(*args, **kwargs) def get_latest_training_loss(self): @@ -1089,36 +1803,8 @@ def get_latest_training_loss(self): """ return self.running_training_loss - @deprecated( - "Method will be removed in 4.0.0, keep just_word_vectors = model.wv to retain just the KeyedVectors instance" - ) - def _minimize_model(self, save_syn1=False, save_syn1neg=False, save_vectors_lockf=False): - if save_syn1 and save_syn1neg and save_vectors_lockf: - return - if hasattr(self.trainables, 'syn1') and not save_syn1: - del self.trainables.syn1 - if hasattr(self.trainables, 'syn1neg') and not save_syn1neg: - del self.trainables.syn1neg - if hasattr(self.trainables, 'vectors_lockf') and not save_vectors_lockf: - del self.trainables.vectors_lockf - self.model_trimmed_post_training = True - - @classmethod - def load_word2vec_format( - cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', - limit=None, datatype=REAL): - """Deprecated. Use :meth:`gensim.models.KeyedVectors.load_word2vec_format` instead.""" - raise DeprecationWarning("Deprecated. Use gensim.models.KeyedVectors.load_word2vec_format instead.") - - def save_word2vec_format(self, fname, fvocab=None, binary=False): - """Deprecated. Use `model.wv.save_word2vec_format` instead. - See :meth:`gensim.models.KeyedVectors.save_word2vec_format`. - - """ - raise DeprecationWarning("Deprecated. Use model.wv.save_word2vec_format instead.") - @classmethod - def load(cls, *args, **kwargs): + def load(cls, *args, rethrow=False, **kwargs): """Load a previously saved :class:`~gensim.models.word2vec.Word2Vec` model. See Also @@ -1139,25 +1825,59 @@ def load(cls, *args, **kwargs): """ try: model = super(Word2Vec, cls).load(*args, **kwargs) - - # for backward compatibility for `max_final_vocab` feature + if not isinstance(model, Word2Vec): + rethrow = True + raise AttributeError("Model of type %s can't be loaded by %s" % (type(model), str(cls))) + # for backward compatibility + if not hasattr(model, 'ns_exponent'): + model.ns_exponent = 0.75 + if model.negative and hasattr(model.wv, 'index2word'): + model.make_cum_table() # rebuild cum_table from vocabulary ## TODO: ??? 
+ if not hasattr(model, 'corpus_count'): + model.corpus_count = None + if not hasattr(model, 'corpus_total_words'): + model.corpus_total_words = None + if not hasattr(model.wv, 'vectors_lockf') and hasattr(model.wv, 'vectors'): + model.wv.vectors_lockf = getattr(model, 'vectors_lockf', np.ones(1, dtype=REAL)) + if not hasattr(model, 'random'): + model.random = np.random.RandomState(model.seed) + if not hasattr(model, 'train_count'): + model.train_count = 0 + model.total_train_time = 0 + if not hasattr(model, 'epochs'): + model.epochs = model.iter + del model.iter if not hasattr(model, 'max_final_vocab'): model.max_final_vocab = None - model.vocabulary.max_final_vocab = None - + if hasattr(model, 'vocabulary'): # re-integrate state that had been moved + for a in ('max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word', 'raw_vocab'): + setattr(model, a, getattr(model.vocabulary, a)) + del model.vocabulary + if hasattr(model, 'trainables'): # re-integrate state that had been moved + for a in ('hashfxn', 'layer1_size', 'seed', 'syn1neg', 'syn1'): + if hasattr(model.trainables, a): + setattr(model, a, getattr(model.trainables, a)) + if hasattr(model, 'syn1'): + model.syn1 = model.syn1 + del model.syn1 + del model.trainables return model - except AttributeError: - logger.info('Model saved using code from earlier Gensim Version. Re-loading old model in a compatible way.') - from gensim.models.deprecated.word2vec import load_old_word2vec - return load_old_word2vec(*args, **kwargs) + except AttributeError as ae: + if rethrow: + raise ae + logger.error( + "Model load error. Was model saved using code from an older Gensim Version? " + "Try loading older model using gensim-3.8.1, then re-saving, to restore " + "compatibility with current code.") + raise ae class BrownCorpus(object): - """Iterate over sentences from the `Brown corpus `_ - (part of `NLTK data `_). - - """ def __init__(self, dirname): + """Iterate over sentences from the `Brown corpus `_ + (part of `NLTK data `_). + + """ self.dirname = dirname def __iter__(self): @@ -1179,8 +1899,8 @@ def __iter__(self): class Text8Corpus(object): - """Iterate over sentences from the "text8" corpus, unzipped from http://mattmahoney.net/dc/text8.zip.""" def __init__(self, fname, max_sentence_length=MAX_WORDS_IN_BATCH): + """Iterate over sentences from the "text8" corpus, unzipped from http://mattmahoney.net/dc/text8.zip.""" self.fname = fname self.max_sentence_length = max_sentence_length @@ -1207,12 +1927,9 @@ def __iter__(self): class LineSentence(object): - """Iterate over a file that contains sentences: one line = one sentence. - Words must be already preprocessed and separated by whitespace. - - """ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): - """ + """Iterate over a file that contains sentences: one line = one sentence. + Words must be already preprocessed and separated by whitespace. Parameters ---------- @@ -1259,22 +1976,20 @@ def __iter__(self): class PathLineSentences(object): - """Like :class:`~gensim.models.word2vec.LineSentence`, but process all files in a directory - in alphabetical order by filename. - - The directory must only contain files that can be read by :class:`gensim.models.word2vec.LineSentence`: - .bz2, .gz, and text files. Any file not ending with .bz2 or .gz is assumed to be a text file. 
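As a reading aid for the corpus helpers in this hunk, a small sketch of streaming a whitespace-tokenized file with `LineSentence` (the path is hypothetical):

.. sourcecode:: pycon

    >>> from gensim.models.word2vec import LineSentence, Word2Vec
    >>>
    >>> sentences = LineSentence("corpus.txt")  # hypothetical file: one sentence per line, tokens separated by whitespace
    >>> model = Word2Vec(sentences, min_count=5, workers=4)
    >>> vocab_size = len(model.wv)  # number of vocabulary keys learned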
+ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): + """Like :class:`~gensim.models.word2vec.LineSentence`, but process all files in a directory + in alphabetical order by filename. - The format of files (either text, or compressed text files) in the path is one sentence = one line, - with words already preprocessed and separated by whitespace. + The directory must only contain files that can be read by :class:`gensim.models.word2vec.LineSentence`: + .bz2, .gz, and text files. Any file not ending with .bz2 or .gz is assumed to be a text file. - Warnings - -------- - Does **not recurse** into subdirectories. + The format of files (either text, or compressed text files) in the path is one sentence = one line, + with words already preprocessed and separated by whitespace. + + Warnings + -------- + Does **not recurse** into subdirectories. - """ - def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): - """ Parameters ---------- source : str @@ -1314,340 +2029,49 @@ def __iter__(self): i += self.max_sentence_length -def _scan_vocab_worker(stream, progress_queue, max_vocab_size=None, trim_rule=None): - """Do an initial scan of all words appearing in stream. - - Note: This function can not be Word2VecVocab's method because - of multiprocessing synchronization specifics in Python. - """ - min_reduce = 1 - vocab = defaultdict(int) - checked_string_types = 0 - sentence_no = -1 - total_words = 0 - for sentence_no, sentence in enumerate(stream): - if not checked_string_types: - if isinstance(sentence, string_types): - log_msg = "Each 'sentences' item should be a list of words (usually unicode strings). " \ - "First item here is instead plain %s." % type(sentence) - progress_queue.put(log_msg) - - checked_string_types += 1 - - for word in sentence: - vocab[word] += 1 - - if max_vocab_size and len(vocab) > max_vocab_size: - utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) - min_reduce += 1 - - total_words += len(sentence) - - progress_queue.put((total_words, sentence_no + 1)) - progress_queue.put(None) - return vocab - - class Word2VecVocab(utils.SaveLoad): - """Vocabulary used by :class:`~gensim.models.word2vec.Word2Vec`.""" - def __init__( - self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0, - max_final_vocab=None, ns_exponent=0.75): - self.max_vocab_size = max_vocab_size - self.min_count = min_count - self.sample = sample - self.sorted_vocab = sorted_vocab - self.null_word = null_word - self.cum_table = None # for negative sampling - self.raw_vocab = None - self.max_final_vocab = max_final_vocab - self.ns_exponent = ns_exponent - - def _scan_vocab(self, sentences, progress_per, trim_rule): - sentence_no = -1 - total_words = 0 - min_reduce = 1 - vocab = defaultdict(int) - checked_string_types = 0 - for sentence_no, sentence in enumerate(sentences): - if not checked_string_types: - if isinstance(sentence, string_types): - logger.warning( - "Each 'sentences' item should be a list of words (usually unicode strings). 
" - "First item here is instead plain %s.", - type(sentence) - ) - checked_string_types += 1 - if sentence_no % progress_per == 0: - logger.info( - "PROGRESS: at sentence #%i, processed %i words, keeping %i word types", - sentence_no, total_words, len(vocab) - ) - for word in sentence: - vocab[word] += 1 - total_words += len(sentence) - - if self.max_vocab_size and len(vocab) > self.max_vocab_size: - utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) - min_reduce += 1 - - corpus_count = sentence_no + 1 - self.raw_vocab = vocab - return total_words, corpus_count - - def scan_vocab(self, sentences=None, corpus_file=None, progress_per=10000, workers=None, trim_rule=None): - logger.info("collecting all words and their counts") - if corpus_file: - sentences = LineSentence(corpus_file) - - total_words, corpus_count = self._scan_vocab(sentences, progress_per, trim_rule) - - logger.info( - "collected %i word types from a corpus of %i raw words and %i sentences", - len(self.raw_vocab), total_words, corpus_count - ) - - return total_words, corpus_count - - def sort_vocab(self, wv): - """Sort the vocabulary so the most frequent words have the lowest indexes.""" - if len(wv.vectors): - raise RuntimeError("cannot sort vocabulary after model weights already initialized.") - wv.index2word.sort(key=lambda word: wv.vocab[word].count, reverse=True) - for i, word in enumerate(wv.index2word): - wv.vocab[word].index = i - - def prepare_vocab( - self, hs, negative, wv, update=False, keep_raw_vocab=False, trim_rule=None, - min_count=None, sample=None, dry_run=False): - """Apply vocabulary settings for `min_count` (discarding less-frequent words) - and `sample` (controlling the downsampling of more-frequent words). - - Calling with `dry_run=True` will only simulate the provided settings and - report the size of the retained vocabulary, effective corpus length, and - estimated memory requirements. Results are both printed via logging and - returned as a dict. - - Delete the raw vocabulary after the scaling is done to free up RAM, - unless `keep_raw_vocab` is set. 
- - """ - min_count = min_count or self.min_count - sample = sample or self.sample - drop_total = drop_unique = 0 - - # set effective_min_count to min_count in case max_final_vocab isn't set - self.effective_min_count = min_count - - # if max_final_vocab is specified instead of min_count - # pick a min_count which satisfies max_final_vocab as well as possible - if self.max_final_vocab is not None: - sorted_vocab = sorted(self.raw_vocab.keys(), key=lambda word: self.raw_vocab[word], reverse=True) - calc_min_count = 1 - - if self.max_final_vocab < len(sorted_vocab): - calc_min_count = self.raw_vocab[sorted_vocab[self.max_final_vocab]] + 1 - - self.effective_min_count = max(calc_min_count, min_count) - logger.info( - "max_final_vocab=%d and min_count=%d resulted in calc_min_count=%d, effective_min_count=%d", - self.max_final_vocab, min_count, calc_min_count, self.effective_min_count - ) - - if not update: - logger.info("Loading a fresh vocabulary") - retain_total, retain_words = 0, [] - # Discard words less-frequent than min_count - if not dry_run: - wv.index2word = [] - # make stored settings match these applied settings - self.min_count = min_count - self.sample = sample - wv.vocab = {} - - for word, v in iteritems(self.raw_vocab): - if keep_vocab_item(word, v, self.effective_min_count, trim_rule=trim_rule): - retain_words.append(word) - retain_total += v - if not dry_run: - wv.vocab[word] = Vocab(count=v, index=len(wv.index2word)) - wv.index2word.append(word) - else: - drop_unique += 1 - drop_total += v - original_unique_total = len(retain_words) + drop_unique - retain_unique_pct = len(retain_words) * 100 / max(original_unique_total, 1) - logger.info( - "effective_min_count=%d retains %i unique words (%i%% of original %i, drops %i)", - self.effective_min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique - ) - original_total = retain_total + drop_total - retain_pct = retain_total * 100 / max(original_total, 1) - logger.info( - "effective_min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)", - self.effective_min_count, retain_total, retain_pct, original_total, drop_total - ) - else: - logger.info("Updating model with new vocabulary") - new_total = pre_exist_total = 0 - new_words = pre_exist_words = [] - for word, v in iteritems(self.raw_vocab): - if keep_vocab_item(word, v, self.effective_min_count, trim_rule=trim_rule): - if word in wv.vocab: - pre_exist_words.append(word) - pre_exist_total += v - if not dry_run: - wv.vocab[word].count += v - else: - new_words.append(word) - new_total += v - if not dry_run: - wv.vocab[word] = Vocab(count=v, index=len(wv.index2word)) - wv.index2word.append(word) - else: - drop_unique += 1 - drop_total += v - original_unique_total = len(pre_exist_words) + len(new_words) + drop_unique - pre_exist_unique_pct = len(pre_exist_words) * 100 / max(original_unique_total, 1) - new_unique_pct = len(new_words) * 100 / max(original_unique_total, 1) - logger.info( - "New added %i unique words (%i%% of original %i) " - "and increased the count of %i pre-existing words (%i%% of original %i)", - len(new_words), new_unique_pct, original_unique_total, len(pre_exist_words), - pre_exist_unique_pct, original_unique_total - ) - retain_words = new_words + pre_exist_words - retain_total = new_total + pre_exist_total - - # Precalculate each vocabulary item's threshold for sampling - if not sample: - # no words downsampled - threshold_count = retain_total - elif sample < 1.0: - # traditional meaning: set parameter as proportion of total - 
threshold_count = sample * retain_total - else: - # new shorthand: sample >= 1 means downsample all words with higher count than sample - threshold_count = int(sample * (3 + sqrt(5)) / 2) - - downsample_total, downsample_unique = 0, 0 - for w in retain_words: - v = self.raw_vocab[w] - word_probability = (sqrt(v / threshold_count) + 1) * (threshold_count / v) - if word_probability < 1.0: - downsample_unique += 1 - downsample_total += word_probability * v - else: - word_probability = 1.0 - downsample_total += v - if not dry_run: - wv.vocab[w].sample_int = int(round(word_probability * 2**32)) - - if not dry_run and not keep_raw_vocab: - logger.info("deleting the raw counts dictionary of %i items", len(self.raw_vocab)) - self.raw_vocab = defaultdict(int) - - logger.info("sample=%g downsamples %i most-common words", sample, downsample_unique) - logger.info( - "downsampling leaves estimated %i word corpus (%.1f%% of prior %i)", - downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total - ) - - # return from each step: words-affected, resulting-corpus-size, extra memory estimates - report_values = { - 'drop_unique': drop_unique, 'retain_total': retain_total, 'downsample_unique': downsample_unique, - 'downsample_total': int(downsample_total), 'num_retained_words': len(retain_words) - } - - if self.null_word: - # create null pseudo-word for padding when using concatenative L1 (run-of-words) - # this word is only ever input – never predicted – so count, huffman-point, etc doesn't matter - self.add_null_word(wv) - - if self.sorted_vocab and not update: - self.sort_vocab(wv) - if hs: - # add info about each word's Huffman encoding - self.create_binary_tree(wv) - if negative: - # build the table for drawing random words (for negative sampling) - self.make_cum_table(wv) - - return report_values - - def add_null_word(self, wv): - word, v = '\0', Vocab(count=1, sample_int=0) - v.index = len(wv.vocab) - wv.index2word.append(word) - wv.vocab[word] = v - - def create_binary_tree(self, wv): - """Create a `binary Huffman tree `_ using stored vocabulary - word counts. Frequent words will have shorter binary codes. - Called internally from :meth:`~gensim.models.word2vec.Word2VecVocab.build_vocab`. - - """ - _assign_binary_codes(wv.vocab) + """Obsolete class retained for now as load-compatibility state capture""" + pass - def make_cum_table(self, wv, domain=2**31 - 1): - """Create a cumulative-distribution table using stored vocabulary word counts for - drawing random words in the negative-sampling training routines. - To draw a word index, choose a random integer up to the maximum value in the table (cum_table[-1]), - then finding that integer's sorted insertion point (as if by `bisect_left` or `ndarray.searchsorted()`). - That insertion point is the drawn index, coming up in proportion equal to the increment at that slot. +class Word2VecTrainables(utils.SaveLoad): + """Obsolete class retained for now as load-compatibility state capture""" + pass - Called internally from :meth:`~gensim.models.word2vec.Word2VecVocab.build_vocab`. 
- """ - vocab_size = len(wv.index2word) - self.cum_table = zeros(vocab_size, dtype=uint32) - # compute sum of all power (Z in paper) - train_words_pow = 0.0 - for word_index in range(vocab_size): - train_words_pow += wv.vocab[wv.index2word[word_index]].count**self.ns_exponent - cumulative = 0.0 - for word_index in range(vocab_size): - cumulative += wv.vocab[wv.index2word[word_index]].count**self.ns_exponent - self.cum_table[word_index] = round(cumulative / train_words_pow * domain) - if len(self.cum_table) > 0: - assert self.cum_table[-1] == domain +class Heapitem(namedtuple('Heapitem', 'count, index, left, right')): + def __lt__(self, other): + return self.count < other.count -def _build_heap(vocab): - heap = list(itervalues(vocab)) +def _build_heap(wv): + heap = list(Heapitem(wv.get_vecattr(i, 'count'), i, None, None) for i in range(len(wv.index_to_key))) heapq.heapify(heap) - for i in range(len(vocab) - 1): + for i in range(len(wv) - 1): min1, min2 = heapq.heappop(heap), heapq.heappop(heap) heapq.heappush( - heap, Vocab(count=min1.count + min2.count, index=i + len(vocab), left=min1, right=min2) + heap, Heapitem(count=min1.count + min2.count, index=i + len(wv), left=min1, right=min2) ) return heap -def _assign_binary_codes(vocab): +def _assign_binary_codes(wv): """ Appends a binary code to each vocab term. Parameters ---------- - vocab : dict - A dictionary of :class:`gensim.models.word2vec.Vocab` objects. - - Notes - ----- - Expects each term to have an .index attribute that contains the order in - which the term was added to the vocabulary. E.g. term.index == 0 means the - term was added to the vocab first. + wv : KeyedVectors + A collection of word-vectors. Sets the .code and .point attributes of each node. Each code is a numpy.array containing 0s and 1s. Each point is an integer. """ - logger.info("constructing a huffman tree from %i words", len(vocab)) + logger.info("constructing a huffman tree from %i words", len(wv)) - heap = _build_heap(vocab) + heap = _build_heap(wv) if not heap: # # TODO: how can we end up with an empty heap? 
@@ -1660,87 +2084,22 @@ def _assign_binary_codes(vocab): stack = [(heap[0], [], [])] while stack: node, codes, points = stack.pop() - if node.index < len(vocab): + if node[1] < len(wv): # node[1] = index # leaf node => store its path from the root - node.code, node.point = codes, points + k = node[1] + wv.set_vecattr(k, 'code', codes) + wv.set_vecattr(k, 'point', points) + # node.code, node.point = codes, points max_depth = max(len(codes), max_depth) else: # inner node => continue recursion - points = array(list(points) + [node.index - len(vocab)], dtype=uint32) - stack.append((node.left, array(list(codes) + [0], dtype=uint8), points)) - stack.append((node.right, array(list(codes) + [1], dtype=uint8), points)) + points = np.array(list(points) + [node.index - len(wv)], dtype=np.uint32) + stack.append((node.left, np.array(list(codes) + [0], dtype=np.uint8), points)) + stack.append((node.right, np.array(list(codes) + [1], dtype=np.uint8), points)) logger.info("built huffman tree with maximum node depth %i", max_depth) -class Word2VecTrainables(utils.SaveLoad): - """Represents the inner shallow neural network used to train :class:`~gensim.models.word2vec.Word2Vec`.""" - def __init__(self, vector_size=100, seed=1, hashfxn=hash): - self.hashfxn = hashfxn - self.layer1_size = vector_size - self.seed = seed - - def prepare_weights(self, hs, negative, wv, update=False, vocabulary=None): - """Build tables and model weights based on final vocabulary settings.""" - # set initial input/projection and hidden weights - if not update: - self.reset_weights(hs, negative, wv) - else: - self.update_weights(hs, negative, wv) - - def seeded_vector(self, seed_string, vector_size): - """Get a random vector (but deterministic by seed_string).""" - # Note: built-in hash() may vary by Python version or even (in Py3.x) per launch - once = random.Generator(random.SFC64(self.hashfxn(seed_string) & 0xffffffff)) - return (once.random(vector_size) - 0.5) / vector_size - - def reset_weights(self, hs, negative, wv): - """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary.""" - logger.info("resetting layer weights") - wv.vectors = empty((len(wv.vocab), wv.vector_size), dtype=REAL) - # randomize weights vector by vector, rather than materializing a huge random matrix in RAM at once - for i in range(len(wv.vocab)): - # construct deterministic seed from word AND seed argument - wv.vectors[i] = self.seeded_vector(wv.index2word[i] + str(self.seed), wv.vector_size) - if hs: - self.syn1 = zeros((len(wv.vocab), self.layer1_size), dtype=REAL) - if negative: - self.syn1neg = zeros((len(wv.vocab), self.layer1_size), dtype=REAL) - wv.vectors_norm = None - - self.vectors_lockf = ones(len(wv.vocab), dtype=REAL) # zeros suppress learning - - def update_weights(self, hs, negative, wv): - """Copy all the existing weights, and reset the weights for the newly added vocabulary.""" - logger.info("updating layer weights") - gained_vocab = len(wv.vocab) - len(wv.vectors) - newvectors = empty((gained_vocab, wv.vector_size), dtype=REAL) - - # randomize the remaining words - for i in range(len(wv.vectors), len(wv.vocab)): - # construct deterministic seed from word AND seed argument - newvectors[i - len(wv.vectors)] = self.seeded_vector(wv.index2word[i] + str(self.seed), wv.vector_size) - - # Raise an error if an online update is run before initial training on a corpus - if not len(wv.vectors): - raise RuntimeError( - "You cannot do an online vocabulary-update of a model which has no prior vocabulary. 
" - "First build the vocabulary of your model with a corpus before doing an online update." - ) - - wv.vectors = vstack([wv.vectors, newvectors]) - - if hs: - self.syn1 = vstack([self.syn1, zeros((gained_vocab, self.layer1_size), dtype=REAL)]) - if negative: - pad = zeros((gained_vocab, self.layer1_size), dtype=REAL) - self.syn1neg = vstack([self.syn1neg, pad]) - wv.vectors_norm = None - - # do not suppress learning for already learned words - self.vectors_lockf = ones(len(wv.vocab), dtype=REAL) # zeros suppress learning - - # Example: ./word2vec.py -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 \ # -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3 if __name__ == "__main__": @@ -1759,7 +2118,7 @@ def update_weights(self, hs, negative, wv): from gensim.models.word2vec import Word2Vec # noqa:F811 avoid referencing __main__ in pickle - seterr(all='raise') # don't ignore numpy errors + np.seterr(all='raise') # don't ignore numpy errors parser = argparse.ArgumentParser() parser.add_argument("-train", help="Use text data from file TRAIN to train the model", required=True) diff --git a/gensim/models/word2vec_corpusfile.pyx b/gensim/models/word2vec_corpusfile.pyx index 184042250e..467b6a2d45 100644 --- a/gensim/models/word2vec_corpusfile.pyx +++ b/gensim/models/word2vec_corpusfile.pyx @@ -41,15 +41,19 @@ cdef class CythonVocab: def __init__(self, wv, hs=0, fasttext=0): cdef VocabItem word - for py_token, vocab_item in iteritems(wv.vocab): + vocab_sample_ints = wv.expandos['sample_int'] + if hs: + vocab_codes = wv.expandos['code'] + vocab_points = wv.expandos['point'] + for py_token in wv.key_to_index.keys(): token = any2utf8(py_token) - word.index = vocab_item.index - word.sample_int = vocab_item.sample_int + word.index = wv.get_index(py_token) + word.sample_int = vocab_sample_ints[word.index] if hs: - word.code = np.PyArray_DATA(vocab_item.code) - word.code_len = len(vocab_item.code) - word.point = np.PyArray_DATA(vocab_item.point) + word.code = np.PyArray_DATA(vocab_codes[word.index]) + word.code_len = len(vocab_codes[word.index]) + word.point = np.PyArray_DATA(vocab_points[word.index]) # subwords information, used only in FastText model if fasttext: @@ -326,11 +330,13 @@ def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expec if c.hs: w2v_fast_sentence_sg_hs( c.points[i], c.codes[i], c.codelens[i], c.syn0, c.syn1, c.size, c.indexes[j], - c.alpha, c.work, c.word_locks, c.compute_loss, &c.running_training_loss) + c.alpha, c.work, c.words_lockf, c.words_lockf_len, c.compute_loss, + &c.running_training_loss) if c.negative: c.next_random = w2v_fast_sentence_sg_neg( c.negative, c.cum_table, c.cum_table_len, c.syn0, c.syn1neg, c.size, - c.indexes[i], c.indexes[j], c.alpha, c.work, c.next_random, c.word_locks, + c.indexes[i], c.indexes[j], c.alpha, c.work, c.next_random, + c.words_lockf, c.words_lockf_len, c.compute_loss, &c.running_training_loss) total_sentences += sentences.size() @@ -421,13 +427,15 @@ def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _exp if c.hs: w2v_fast_sentence_cbow_hs( c.points[i], c.codes[i], c.codelens, c.neu1, c.syn0, c.syn1, c.size, c.indexes, c.alpha, - c.work, i, j, k, c.cbow_mean, c.word_locks, c.compute_loss, &c.running_training_loss) + c.work, i, j, k, c.cbow_mean, c.words_lockf, c.words_lockf_len, c.compute_loss, + &c.running_training_loss) if c.negative: c.next_random = w2v_fast_sentence_cbow_neg( c.negative, c.cum_table, c.cum_table_len, c.codelens, c.neu1, c.syn0, c.syn1neg, c.size, c.indexes, c.alpha, 
c.work, i, j, k, c.cbow_mean, - c.next_random, c.word_locks, c.compute_loss, &c.running_training_loss) + c.next_random, c.words_lockf, c.words_lockf_len, c.compute_loss, + &c.running_training_loss) total_sentences += sentences.size() total_effective_words += effective_words diff --git a/gensim/models/word2vec_inner.pxd b/gensim/models/word2vec_inner.pxd index fabea96321..82abad2f05 100644 --- a/gensim/models/word2vec_inner.pxd +++ b/gensim/models/word2vec_inner.pxd @@ -53,7 +53,8 @@ cdef struct Word2VecConfig: REAL_t running_training_loss, alpha REAL_t *syn0 - REAL_t *word_locks + REAL_t *words_lockf + np.uint32_t words_lockf_len REAL_t *work REAL_t *neu1 @@ -94,32 +95,32 @@ cdef unsigned long long random_int32(unsigned long long *next_random) nogil cdef void w2v_fast_sentence_sg_hs( const np.uint32_t *word_point, const np.uint8_t *word_code, const int codelen, REAL_t *syn0, REAL_t *syn1, const int size, - const np.uint32_t word2_index, const REAL_t alpha, REAL_t *work, REAL_t *word_locks, - const int _compute_loss, REAL_t *_running_training_loss_param) nogil + const np.uint32_t word2_index, const REAL_t alpha, REAL_t *work, REAL_t *words_lockf, + const np.uint32_t lockf_len, const int _compute_loss, REAL_t *_running_training_loss_param) nogil cdef unsigned long long w2v_fast_sentence_sg_neg( const int negative, np.uint32_t *cum_table, unsigned long long cum_table_len, REAL_t *syn0, REAL_t *syn1neg, const int size, const np.uint32_t word_index, const np.uint32_t word2_index, const REAL_t alpha, REAL_t *work, - unsigned long long next_random, REAL_t *word_locks, - const int _compute_loss, REAL_t *_running_training_loss_param) nogil + unsigned long long next_random, REAL_t *words_lockf, + const np.uint32_t lockf_len, const int _compute_loss, REAL_t *_running_training_loss_param) nogil cdef void w2v_fast_sentence_cbow_hs( const np.uint32_t *word_point, const np.uint8_t *word_code, int codelens[MAX_SENTENCE_LEN], REAL_t *neu1, REAL_t *syn0, REAL_t *syn1, const int size, const np.uint32_t indexes[MAX_SENTENCE_LEN], const REAL_t alpha, REAL_t *work, - int i, int j, int k, int cbow_mean, REAL_t *word_locks, - const int _compute_loss, REAL_t *_running_training_loss_param) nogil + int i, int j, int k, int cbow_mean, REAL_t *words_lockf, + const np.uint32_t lockf_len, const int _compute_loss, REAL_t *_running_training_loss_param) nogil cdef unsigned long long w2v_fast_sentence_cbow_neg( const int negative, np.uint32_t *cum_table, unsigned long long cum_table_len, int codelens[MAX_SENTENCE_LEN], REAL_t *neu1, REAL_t *syn0, REAL_t *syn1neg, const int size, const np.uint32_t indexes[MAX_SENTENCE_LEN], const REAL_t alpha, REAL_t *work, - int i, int j, int k, int cbow_mean, unsigned long long next_random, REAL_t *word_locks, - const int _compute_loss, REAL_t *_running_training_loss_param) nogil + int i, int j, int k, int cbow_mean, unsigned long long next_random, REAL_t *words_lockf, + const np.uint32_t lockf_len, const int _compute_loss, REAL_t *_running_training_loss_param) nogil cdef init_w2v_config(Word2VecConfig *c, model, alpha, compute_loss, _work, _neu1=*) diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index 0576773bd5..50bfc803bd 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -74,8 +74,8 @@ cdef void our_saxpy_noblas(const int *N, const float *alpha, const float *X, con cdef void w2v_fast_sentence_sg_hs( const np.uint32_t *word_point, const np.uint8_t *word_code, const int codelen, REAL_t *syn0, REAL_t *syn1, const int 
size, - const np.uint32_t word2_index, const REAL_t alpha, REAL_t *work, REAL_t *word_locks, - const int _compute_loss, REAL_t *_running_training_loss_param) nogil: + const np.uint32_t word2_index, const REAL_t alpha, REAL_t *work, REAL_t *words_lockf, + const np.uint32_t lockf_len, const int _compute_loss, REAL_t *_running_training_loss_param) nogil: """Train on a single effective word from the current batch, using the Skip-Gram model. In this model we are using a given word to predict a context word (a word that is @@ -102,7 +102,7 @@ cdef void w2v_fast_sentence_sg_hs( Learning rate. work Private working memory for each worker. - word_locks + words_lockf Lock factors for each word. A value of 0 will block training. _compute_loss Whether or not the loss should be computed at this step. @@ -135,7 +135,7 @@ cdef void w2v_fast_sentence_sg_hs( our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE) our_saxpy(&size, &g, &syn0[row1], &ONE, &syn1[row2], &ONE) - our_saxpy(&size, &word_locks[word2_index], work, &ONE, &syn0[row1], &ONE) + our_saxpy(&size, &words_lockf[word2_index % lockf_len], work, &ONE, &syn0[row1], &ONE) # to support random draws from negative-sampling cum_table @@ -160,8 +160,8 @@ cdef unsigned long long w2v_fast_sentence_sg_neg( const int negative, np.uint32_t *cum_table, unsigned long long cum_table_len, REAL_t *syn0, REAL_t *syn1neg, const int size, const np.uint32_t word_index, const np.uint32_t word2_index, const REAL_t alpha, REAL_t *work, - unsigned long long next_random, REAL_t *word_locks, - const int _compute_loss, REAL_t *_running_training_loss_param) nogil: + unsigned long long next_random, REAL_t *words_lockf, + const np.uint32_t lockf_len, const int _compute_loss, REAL_t *_running_training_loss_param) nogil: """Train on a single effective word from the current batch, using the Skip-Gram model. In this model we are using a given word to predict a context word (a word that is @@ -193,7 +193,7 @@ cdef unsigned long long w2v_fast_sentence_sg_neg( Private working memory for each worker. next_random Seed to produce the index for the next word to be randomly sampled. - word_locks + words_lockf Lock factors for each word. A value of 0 will block training. _compute_loss Whether or not the loss should be computed at this step. @@ -242,7 +242,7 @@ cdef unsigned long long w2v_fast_sentence_sg_neg( our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE) our_saxpy(&size, &g, &syn0[row1], &ONE, &syn1neg[row2], &ONE) - our_saxpy(&size, &word_locks[word2_index], work, &ONE, &syn0[row1], &ONE) + our_saxpy(&size, &words_lockf[word2_index % lockf_len], work, &ONE, &syn0[row1], &ONE) return next_random @@ -251,7 +251,7 @@ cdef void w2v_fast_sentence_cbow_hs( const np.uint32_t *word_point, const np.uint8_t *word_code, int codelens[MAX_SENTENCE_LEN], REAL_t *neu1, REAL_t *syn0, REAL_t *syn1, const int size, const np.uint32_t indexes[MAX_SENTENCE_LEN], const REAL_t alpha, REAL_t *work, - int i, int j, int k, int cbow_mean, REAL_t *word_locks, + int i, int j, int k, int cbow_mean, REAL_t *words_lockf, const np.uint32_t lockf_len, const int _compute_loss, REAL_t *_running_training_loss_param) nogil: """Train on a single effective word from the current batch, using the CBOW method. @@ -289,7 +289,7 @@ cdef void w2v_fast_sentence_cbow_hs( Index of the word at the end of the context window. cbow_mean If 0, use the sum of the context word vectors as the prediction. If 1, use the mean. - word_locks + words_lockf Lock factors for each word. A value of 0 will block training. 
_compute_loss Whether or not the loss should be computed at this step. @@ -342,15 +342,15 @@ cdef void w2v_fast_sentence_cbow_hs( if m == i: continue else: - our_saxpy(&size, &word_locks[indexes[m]], work, &ONE, &syn0[indexes[m] * size], &ONE) + our_saxpy(&size, &words_lockf[indexes[m] % lockf_len], work, &ONE, &syn0[indexes[m] * size], &ONE) cdef unsigned long long w2v_fast_sentence_cbow_neg( const int negative, np.uint32_t *cum_table, unsigned long long cum_table_len, int codelens[MAX_SENTENCE_LEN], REAL_t *neu1, REAL_t *syn0, REAL_t *syn1neg, const int size, const np.uint32_t indexes[MAX_SENTENCE_LEN], const REAL_t alpha, REAL_t *work, - int i, int j, int k, int cbow_mean, unsigned long long next_random, REAL_t *word_locks, - const int _compute_loss, REAL_t *_running_training_loss_param) nogil: + int i, int j, int k, int cbow_mean, unsigned long long next_random, REAL_t *words_lockf, + const np.uint32_t lockf_len, const int _compute_loss, REAL_t *_running_training_loss_param) nogil: """Train on a single effective word from the current batch, using the CBOW method. Using this method we train the trainable neural network by attempting to predict a @@ -392,7 +392,7 @@ cdef unsigned long long w2v_fast_sentence_cbow_neg( If 0, use the sum of the context word vectors as the prediction. If 1, use the mean. next_random Seed for the drawing the predicted word for the next iteration of the same routine. - word_locks + words_lockf Lock factors for each word. A value of 0 will block training. _compute_loss Whether or not the loss should be computed at this step. @@ -459,7 +459,7 @@ cdef unsigned long long w2v_fast_sentence_cbow_neg( if m == i: continue else: - our_saxpy(&size, &word_locks[indexes[m]], work, &ONE, &syn0[indexes[m]*size], &ONE) + our_saxpy(&size, &words_lockf[indexes[m] % lockf_len], work, &ONE, &syn0[indexes[m]*size], &ONE) return next_random @@ -467,7 +467,7 @@ cdef unsigned long long w2v_fast_sentence_cbow_neg( cdef init_w2v_config(Word2VecConfig *c, model, alpha, compute_loss, _work, _neu1=None): c[0].hs = model.hs c[0].negative = model.negative - c[0].sample = (model.vocabulary.sample != 0) + c[0].sample = (model.sample != 0) c[0].cbow_mean = model.cbow_mean c[0].window = model.window c[0].workers = model.workers @@ -476,17 +476,18 @@ cdef init_w2v_config(Word2VecConfig *c, model, alpha, compute_loss, _work, _neu1 c[0].running_training_loss = model.running_training_loss c[0].syn0 = (np.PyArray_DATA(model.wv.vectors)) - c[0].word_locks = (np.PyArray_DATA(model.trainables.vectors_lockf)) + c[0].words_lockf = (np.PyArray_DATA(model.wv.vectors_lockf)) + c[0].words_lockf_len = len(model.wv.vectors_lockf) c[0].alpha = alpha c[0].size = model.wv.vector_size if c[0].hs: - c[0].syn1 = (np.PyArray_DATA(model.trainables.syn1)) + c[0].syn1 = (np.PyArray_DATA(model.syn1)) if c[0].negative: - c[0].syn1neg = (np.PyArray_DATA(model.trainables.syn1neg)) - c[0].cum_table = (np.PyArray_DATA(model.vocabulary.cum_table)) - c[0].cum_table_len = len(model.vocabulary.cum_table) + c[0].syn1neg = (np.PyArray_DATA(model.syn1neg)) + c[0].cum_table = (np.PyArray_DATA(model.cum_table)) + c[0].cum_table_len = len(model.cum_table) if c[0].negative or c[0].sample: c[0].next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24) @@ -526,27 +527,31 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss): cdef int i, j, k cdef int effective_words = 0, effective_sentences = 0 cdef int sent_idx, idx_start, idx_end + cdef np.uint32_t *vocab_sample_ints init_w2v_config(&c, 
model, alpha, compute_loss, _work) - + if c.sample: + vocab_sample_ints = np.PyArray_DATA(model.wv.expandos['sample_int']) + if c.hs: + vocab_codes = model.wv.expandos['code'] + vocab_points = model.wv.expandos['point'] # prepare C structures so we can go "full C" and release the Python GIL - vlookup = model.wv.vocab c.sentence_idx[0] = 0 # indices of the first sentence always start at 0 for sent in sentences: if not sent: continue # ignore empty sentences; leave effective_sentences unchanged for token in sent: - word = vlookup[token] if token in vlookup else None - if word is None: + if token not in model.wv.key_to_index: continue # leaving `effective_words` unchanged = shortening the sentence = expanding the window - if c.sample and word.sample_int < random_int32(&c.next_random): + word_index = model.wv.key_to_index[token] + if c.sample and vocab_sample_ints[word_index] < random_int32(&c.next_random): continue - c.indexes[effective_words] = word.index + c.indexes[effective_words] = word_index if c.hs: - c.codelens[effective_words] = len(word.code) - c.codes[effective_words] = np.PyArray_DATA(word.code) - c.points[effective_words] = np.PyArray_DATA(word.point) + c.codelens[effective_words] = len(vocab_codes[word_index]) + c.codes[effective_words] = np.PyArray_DATA(vocab_codes[word_index]) + c.points[effective_words] = np.PyArray_DATA(vocab_points[word_index]) effective_words += 1 if effective_words == MAX_SENTENCE_LEN: break # TODO: log warning, tally overflow? @@ -580,9 +585,9 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss): if j == i: continue if c.hs: - w2v_fast_sentence_sg_hs(c.points[i], c.codes[i], c.codelens[i], c.syn0, c.syn1, c.size, c.indexes[j], c.alpha, c.work, c.word_locks, c.compute_loss, &c.running_training_loss) + w2v_fast_sentence_sg_hs(c.points[i], c.codes[i], c.codelens[i], c.syn0, c.syn1, c.size, c.indexes[j], c.alpha, c.work, c.words_lockf, c.words_lockf_len, c.compute_loss, &c.running_training_loss) if c.negative: - c.next_random = w2v_fast_sentence_sg_neg(c.negative, c.cum_table, c.cum_table_len, c.syn0, c.syn1neg, c.size, c.indexes[i], c.indexes[j], c.alpha, c.work, c.next_random, c.word_locks, c.compute_loss, &c.running_training_loss) + c.next_random = w2v_fast_sentence_sg_neg(c.negative, c.cum_table, c.cum_table_len, c.syn0, c.syn1neg, c.size, c.indexes[i], c.indexes[j], c.alpha, c.work, c.next_random, c.words_lockf, c.words_lockf_len, c.compute_loss, &c.running_training_loss) model.running_training_loss = c.running_training_loss return effective_words @@ -618,26 +623,31 @@ def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss): cdef int i, j, k cdef int effective_words = 0, effective_sentences = 0 cdef int sent_idx, idx_start, idx_end + cdef np.uint32_t *vocab_sample_ints init_w2v_config(&c, model, alpha, compute_loss, _work, _neu1) + if c.sample: + vocab_sample_ints = np.PyArray_DATA(model.wv.expandos['sample_int']) + if c.hs: + vocab_codes = model.wv.expandos['code'] + vocab_points = model.wv.expandos['point'] # prepare C structures so we can go "full C" and release the Python GIL - vlookup = model.wv.vocab c.sentence_idx[0] = 0 # indices of the first sentence always start at 0 for sent in sentences: if not sent: continue # ignore empty sentences; leave effective_sentences unchanged for token in sent: - word = vlookup[token] if token in vlookup else None - if word is None: + if token not in model.wv.key_to_index: continue # leaving `effective_words` unchanged = shortening the sentence = expanding the window - if c.sample 
and word.sample_int < random_int32(&c.next_random): + word_index = model.wv.key_to_index[token] + if c.sample and vocab_sample_ints[word_index] < random_int32(&c.next_random): continue - c.indexes[effective_words] = word.index + c.indexes[effective_words] = word_index if c.hs: - c.codelens[effective_words] = len(word.code) - c.codes[effective_words] = np.PyArray_DATA(word.code) - c.points[effective_words] = np.PyArray_DATA(word.point) + c.codelens[effective_words] = len(vocab_codes[word_index]) + c.codes[effective_words] = np.PyArray_DATA(vocab_codes[word_index]) + c.points[effective_words] = np.PyArray_DATA(vocab_points[word_index]) effective_words += 1 if effective_words == MAX_SENTENCE_LEN: break # TODO: log warning, tally overflow? @@ -668,9 +678,9 @@ def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss): if k > idx_end: k = idx_end if c.hs: - w2v_fast_sentence_cbow_hs(c.points[i], c.codes[i], c.codelens, c.neu1, c.syn0, c.syn1, c.size, c.indexes, c.alpha, c.work, i, j, k, c.cbow_mean, c.word_locks, c.compute_loss, &c.running_training_loss) + w2v_fast_sentence_cbow_hs(c.points[i], c.codes[i], c.codelens, c.neu1, c.syn0, c.syn1, c.size, c.indexes, c.alpha, c.work, i, j, k, c.cbow_mean, c.words_lockf, c.words_lockf_len, c.compute_loss, &c.running_training_loss) if c.negative: - c.next_random = w2v_fast_sentence_cbow_neg(c.negative, c.cum_table, c.cum_table_len, c.codelens, c.neu1, c.syn0, c.syn1neg, c.size, c.indexes, c.alpha, c.work, i, j, k, c.cbow_mean, c.next_random, c.word_locks, c.compute_loss, &c.running_training_loss) + c.next_random = w2v_fast_sentence_cbow_neg(c.negative, c.cum_table, c.cum_table_len, c.codelens, c.neu1, c.syn0, c.syn1neg, c.size, c.indexes, c.alpha, c.work, i, j, k, c.cbow_mean, c.next_random, c.words_lockf, c.words_lockf_len, c.compute_loss, &c.running_training_loss) model.running_training_loss = c.running_training_loss return effective_words @@ -709,21 +719,39 @@ def score_sentence_sg(model, sentence, _work): cdef long result = 0 cdef int sentence_len - c.syn1 = (np.PyArray_DATA(model.trainables.syn1)) + c.syn1 = (np.PyArray_DATA(model.syn1)) # convert Python structures to primitive types, so we can release the GIL c.work = np.PyArray_DATA(_work) - vlookup = model.wv.vocab + vocab_codes = model.wv.expandos['code'] + vocab_points = model.wv.expandos['point'] i = 0 for token in sentence: - word = vlookup[token] if token in vlookup else None - if word is None: - continue # should drop the - c.indexes[i] = word.index - c.codelens[i] = len(word.code) - c.codes[i] = np.PyArray_DATA(word.code) - c.points[i] = np.PyArray_DATA(word.point) + word_index = model.wv.key_to_index[token] if token in model.wv.key_to_index else None + if word_index is None: + # For score, should this be a default negative value? + # + # See comment by @gojomo at https://github.com/RaRe-Technologies/gensim/pull/2698/files#r445827846 : + # + # These 'score' functions are a long-ago contribution from @mataddy whose + # current function/utility is unclear. + # I've continued to apply mechanical updates to match other changes, and the code + # still compiles & passes the one (trivial, form-but-not-function) unit test. But it's an + # idiosyncratic technique, and only works for the non-default hs mode. 
Here, in lieu of the + # previous cryptic # should drop the comment, I've asked if for the purposes of this + # particular kind of 'scoring' (really, loss-tallying indicating how divergent this new + # text is from what the model learned during training), shouldn't completely missing + # words imply something very negative, as opposed to nothing-at-all? But probably, this + # functionality should be dropped. (And ultimately, a talented cleanup of the largely-broken + # loss-tallying functions might provide a cleaner window into this same measure of how + # well a text contrasts with model expectations - such as a way to report loss from a + # single invocation of one fo the inner train methods, without changing the model.) + continue + c.indexes[i] = word_index + c.codelens[i] = len(vocab_codes[word_index]) + c.codes[i] = np.PyArray_DATA(vocab_codes[word_index]) + c.points[i] = np.PyArray_DATA(vocab_points[word_index]) result += 1 i += 1 if i == MAX_SENTENCE_LEN: @@ -804,22 +832,23 @@ def score_sentence_cbow(model, sentence, _work, _neu1): cdef int i, j, k cdef long result = 0 - c.syn1 = (np.PyArray_DATA(model.trainables.syn1)) + c.syn1 = (np.PyArray_DATA(model.syn1)) # convert Python structures to primitive types, so we can release the GIL c.work = np.PyArray_DATA(_work) c.neu1 = np.PyArray_DATA(_neu1) - vlookup = model.wv.vocab + vocab_codes = model.wv.expandos['code'] + vocab_points = model.wv.expandos['point'] i = 0 for token in sentence: - word = vlookup[token] if token in vlookup else None - if word is None: + word_index = model.wv.key_to_index[token] if token in model.wv.key_to_index else None + if word_index is None: continue # for score, should this be a default negative value? - c.indexes[i] = word.index - c.codelens[i] = len(word.code) - c.codes[i] = np.PyArray_DATA(word.code) - c.points[i] = np.PyArray_DATA(word.point) + c.indexes[i] = word_index + c.codelens[i] = len(vocab_codes[word_index]) + c.codes[i] = np.PyArray_DATA(vocab_codes[word_index]) + c.points[i] = np.PyArray_DATA(vocab_points[word_index]) result += 1 i += 1 if i == MAX_SENTENCE_LEN: diff --git a/gensim/models/wrappers/__init__.py b/gensim/models/wrappers/__init__.py index 9cd14ea8e7..330abce500 100644 --- a/gensim/models/wrappers/__init__.py +++ b/gensim/models/wrappers/__init__.py @@ -5,6 +5,5 @@ from .ldamallet import LdaMallet # noqa:F401 from .dtmmodel import DtmModel # noqa:F401 from .ldavowpalwabbit import LdaVowpalWabbit # noqa:F401 -from .fasttext import FastText # noqa:F401 from .wordrank import Wordrank # noqa:F401 from .varembed import VarEmbed # noqa:F401 diff --git a/gensim/models/wrappers/fasttext.py b/gensim/models/wrappers/fasttext.py deleted file mode 100644 index bca36c7cb9..0000000000 --- a/gensim/models/wrappers/fasttext.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Author: Jayant Jain -# Copyright (C) 2017 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - - -""" -Warnings --------- -.. deprecated:: 3.2.0 - Use :mod:`gensim.models.fasttext` instead. - - - -Python wrapper around word representation learning from FastText, a library for efficient learning -of word representations and sentence classification [1]. - -This module allows training a word embedding from a training corpus with the additional ability -to obtain word vectors for out-of-vocabulary words, using the fastText C implementation. - -The wrapped model can NOT be updated with new documents for online training -- use gensim's -`Word2Vec` for that. 
- -Example: - -.. sourcecode:: pycon - - >>> from gensim.models.wrappers import FastText - >>> model = FastText.train('/Users/kofola/fastText/fasttext', corpus_file='text8') - >>> print(model['forests']) # prints vector for given out-of-vocabulary word - -.. [1] https://github.com/facebookresearch/fastText#enriching-word-vectors-with-subword-information - - - -""" -from gensim.models.deprecated.fasttext_wrapper import FastText, FastTextKeyedVectors # noqa:F401 -from gensim.models.deprecated.fasttext_wrapper import ft_hash, compute_ngrams # noqa:F401 diff --git a/gensim/models/wrappers/varembed.py b/gensim/models/wrappers/varembed.py index ca8227ac01..cf76dbe13e 100644 --- a/gensim/models/wrappers/varembed.py +++ b/gensim/models/wrappers/varembed.py @@ -19,7 +19,6 @@ from gensim import utils from gensim.models.keyedvectors import KeyedVectors -from gensim.models.word2vec import Vocab logger = logging.getLogger(__name__) @@ -34,7 +33,7 @@ class VarEmbed(KeyedVectors): """ def __init__(self): - self.vector_size = 0 + super(VarEmbed, self).__init__(vector_size=0) self.vocab_size = 0 @classmethod @@ -88,21 +87,22 @@ def load_word_embeddings(self, word_embeddings, word_to_ix): """ logger.info("Loading the vocabulary") - self.vocab = {} - self.index2word = [] + self.key_to_index = {} + self.index_to_key = [] counts = {} for word in word_to_ix: counts[word] = counts.get(word, 0) + 1 self.vocab_size = len(counts) self.vector_size = word_embeddings.shape[1] self.vectors = np.zeros((self.vocab_size, self.vector_size)) - self.index2word = [None] * self.vocab_size - logger.info("Corpus has %i words", len(self.vocab)) + self.index_to_key = [None] * self.vocab_size + logger.info("Corpus has %i words", len(self)) for word_id, word in enumerate(counts): - self.vocab[word] = Vocab(index=word_id, count=counts[word]) + self.index_to_key[word_id] = word + self.key_to_index[word] = word_id + self.set_vecattr(word, 'count', counts[word]) self.vectors[word_id] = word_embeddings[word_to_ix[word]] - self.index2word[word_id] = word - assert((len(self.vocab), self.vector_size) == self.vectors.shape) + assert((len(self.key_to_index), self.vector_size) == self.vectors.shape) logger.info("Loaded matrix of %d size and %d dimensions", self.vocab_size, self.vector_size) def add_morphemes_to_embeddings(self, morfessor_model, morpho_embeddings, morpho_to_ix): @@ -118,12 +118,12 @@ def add_morphemes_to_embeddings(self, morfessor_model, morpho_embeddings, morpho Mapping morpheme to index. 
""" - for word in self.vocab: + for word in self.key_to_index: morpheme_embedding = np.array( [ morpho_embeddings[morpho_to_ix.get(m, -1)] for m in morfessor_model.viterbi_segment(word)[0] ] ).sum(axis=0) - self.vectors[self.vocab[word].index] += morpheme_embedding + self.vectors[self.get_index(word)] += morpheme_embedding logger.info("Added morphemes to word vectors") diff --git a/gensim/scripts/glove2word2vec.py b/gensim/scripts/glove2word2vec.py index 565d209cf7..16de58743d 100644 --- a/gensim/scripts/glove2word2vec.py +++ b/gensim/scripts/glove2word2vec.py @@ -62,6 +62,8 @@ import argparse from gensim import utils +from gensim.utils import deprecated +from gensim.models.keyedvectors import KeyedVectors logger = logging.getLogger(__name__) @@ -87,6 +89,7 @@ def get_glove_info(glove_file_name): return num_lines, num_dims +@deprecated("KeyedVectors.load_word2vec_format(.., binary=False, no_header=True) loads GLoVE text vectors.") def glove2word2vec(glove_input_file, word2vec_output_file): """Convert `glove_input_file` in GloVe format to word2vec format and write it to `word2vec_output_file`. @@ -103,13 +106,11 @@ def glove2word2vec(glove_input_file, word2vec_output_file): Number of vectors (lines) of input file and its dimension. """ - num_lines, num_dims = get_glove_info(glove_input_file) + glovekv = KeyedVectors.load_word2vec_format(glove_input_file, binary=False, no_header=True) + + num_lines, num_dims = len(glovekv), glovekv.vector_size logger.info("converting %i vectors from %s to %s", num_lines, glove_input_file, word2vec_output_file) - with utils.open(word2vec_output_file, 'wb') as fout: - fout.write("{0} {1}\n".format(num_lines, num_dims).encode('utf-8')) - with utils.open(glove_input_file, 'rb') as fin: - for line in fin: - fout.write(line) + glovekv.save_word2vec_format(word2vec_output_file, binary=False) return num_lines, num_dims diff --git a/gensim/scripts/segment_wiki.py b/gensim/scripts/segment_wiki.py index ce9c3398c0..925c8877a0 100644 --- a/gensim/scripts/segment_wiki.py +++ b/gensim/scripts/segment_wiki.py @@ -1,398 +1,398 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Author: Jayant Jain -# Copyright (C) 2016 RaRe Technologies - -"""This script using for extracting plain text out of a raw Wikipedia dump. Input is an xml.bz2 file provided -by MediaWiki that looks like wiki--pages-articles.xml.bz2 or wiki-latest-pages-articles.xml.bz2 -(e.g. 14 GB of https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2). - -It streams through all the XML articles using multiple cores (#cores - 1, by default), -decompressing on the fly and extracting plain text from the articles and their sections. - -For each extracted article, it prints its title, section names and plain text section contents, in json-line format. - -How to use ----------- -#. Process Wikipedia dump with this script :: - - python -m gensim.scripts.segment_wiki -i -f enwiki-latest-pages-articles.xml.bz2 -o enwiki-latest.json.gz - -#. Read output in simple way: - -.. sourcecode:: pycon - - >>> from gensim import utils - >>> import json - >>> - >>> # iterate over the plain text data we just created - >>> with utils.open('enwiki-latest.json.gz', 'rb') as f: - >>> for line in f: - >>> # decode each JSON line into a Python dictionary object - >>> article = json.loads(line) - >>> - >>> # each article has a "title", a mapping of interlinks and a list of "section_titles" and - >>> # "section_texts". 
- >>> print("Article title: %s" % article['title']) - >>> print("Interlinks: %s" + article['interlinks']) - >>> for section_title, section_text in zip(article['section_titles'], article['section_texts']): - >>> print("Section title: %s" % section_title) - >>> print("Section text: %s" % section_text) - - -Notes ------ -Processing the entire English Wikipedia dump takes 1.7 hours (about 3 million articles per hour, -or 10 MB of XML per second) on an 8 core Intel i7-7700 @3.60GHz. - - -Command line arguments ----------------------- - -.. program-output:: python -m gensim.scripts.segment_wiki --help - :ellipsis: 0, -10 - -""" - -import argparse -import json -import logging -import multiprocessing -import re -import sys -from xml.etree import ElementTree -from functools import partial - -from gensim.corpora.wikicorpus import IGNORED_NAMESPACES, WikiCorpus, filter_wiki, find_interlinks, get_namespace, utils -import gensim.utils - -logger = logging.getLogger(__name__) - - -def segment_all_articles(file_path, min_article_character=200, workers=None, include_interlinks=False): - """Extract article titles and sections from a MediaWiki bz2 database dump. - - Parameters - ---------- - file_path : str - Path to MediaWiki dump, typical filename is wiki--pages-articles.xml.bz2 - or wiki-latest-pages-articles.xml.bz2. - - min_article_character : int, optional - Minimal number of character for article (except titles and leading gaps). - - workers: int or None - Number of parallel workers, max(1, multiprocessing.cpu_count() - 1) if None. - - include_interlinks: bool - Whether or not interlinks should be included in the output - - Yields - ------ - (str, list of (str, str), (Optionally) list of (str, str)) - Structure contains (title, [(section_heading, section_content), ...], - (Optionally) [(interlink_article, interlink_text), ...]). - - """ - with gensim.utils.open(file_path, 'rb') as xml_fileobj: - wiki_sections_corpus = _WikiSectionsCorpus( - xml_fileobj, min_article_character=min_article_character, processes=workers, - include_interlinks=include_interlinks) - wiki_sections_corpus.metadata = True - wiki_sections_text = wiki_sections_corpus.get_texts_with_sections() - - for article in wiki_sections_text: - yield article - - -def segment_and_write_all_articles(file_path, output_file, min_article_character=200, workers=None, - include_interlinks=False): - """Write article title and sections to `output_file` (or stdout, if output_file is None). - - The output format is one article per line, in json-line format with 4 fields:: - - 'title' - title of article, - 'section_titles' - list of titles of sections, - 'section_texts' - list of content from sections, - (Optional) 'section_interlinks' - list of interlinks in the article. - - Parameters - ---------- - file_path : str - Path to MediaWiki dump, typical filename is wiki--pages-articles.xml.bz2 - or wiki-latest-pages-articles.xml.bz2. - - output_file : str or None - Path to output file in json-lines format, or None for printing to stdout. - - min_article_character : int, optional - Minimal number of character for article (except titles and leading gaps). - - workers: int or None - Number of parallel workers, max(1, multiprocessing.cpu_count() - 1) if None. 
- - include_interlinks: bool - Whether or not interlinks should be included in the output - """ - if output_file is None: - outfile = getattr(sys.stdout, 'buffer', sys.stdout) # we want write bytes, so for py3 we used 'buffer' - else: - outfile = gensim.utils.open(output_file, 'wb') - - try: - article_stream = segment_all_articles(file_path, min_article_character, workers=workers, - include_interlinks=include_interlinks) - for idx, article in enumerate(article_stream): - article_title, article_sections = article[0], article[1] - if include_interlinks: - interlinks = article[2] - - output_data = { - "title": article_title, - "section_titles": [], - "section_texts": [], - } - if include_interlinks: - output_data["interlinks"] = interlinks - - for section_heading, section_content in article_sections: - output_data["section_titles"].append(section_heading) - output_data["section_texts"].append(section_content) - - if (idx + 1) % 100000 == 0: - logger.info("processed #%d articles (at %r now)", idx + 1, article_title) - outfile.write((json.dumps(output_data) + "\n").encode('utf-8')) - - finally: - if output_file is not None: - outfile.close() - - -def extract_page_xmls(f): - """Extract pages from a MediaWiki database dump. - - Parameters - ---------- - f : file - File descriptor of MediaWiki dump. - - Yields - ------ - str - XML strings for page tags. - - """ - elems = (elem for _, elem in ElementTree.iterparse(f, events=("end",))) - - elem = next(elems) - namespace = get_namespace(elem.tag) - ns_mapping = {"ns": namespace} - page_tag = "{%(ns)s}page" % ns_mapping - - for elem in elems: - if elem.tag == page_tag: - yield ElementTree.tostring(elem) - # Prune the element tree, as per - # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/ - # except that we don't need to prune backlinks from the parent - # because we don't use LXML. - # We do this only for s, since we need to inspect the - # ./revision/text element. The pages comprise the bulk of the - # file, so in practice we prune away enough. - elem.clear() - - -def segment(page_xml, include_interlinks=False): - """Parse the content inside a page tag - - Parameters - ---------- - page_xml : str - Content from page tag. - - include_interlinks : bool - Whether or not interlinks should be parsed. - - Returns - ------- - (str, list of (str, str), (Optionally) list of (str, str)) - Structure contains (title, [(section_heading, section_content), ...], - (Optionally) [(interlink_article, interlink_text), ...]). 
- - """ - elem = ElementTree.fromstring(page_xml) - filter_namespaces = ('0',) - namespace = get_namespace(elem.tag) - ns_mapping = {"ns": namespace} - text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping - title_path = "./{%(ns)s}title" % ns_mapping - ns_path = "./{%(ns)s}ns" % ns_mapping - lead_section_heading = "Introduction" - top_level_heading_regex = r"\n==[^=].*[^=]==\n" - top_level_heading_regex_capture = r"\n==([^=].*[^=])==\n" - - title = elem.find(title_path).text - text = elem.find(text_path).text - ns = elem.find(ns_path).text - if ns not in filter_namespaces: - text = None - - if text is not None: - if include_interlinks: - interlinks = find_interlinks(text) - section_contents = re.split(top_level_heading_regex, text) - section_headings = [lead_section_heading] + re.findall(top_level_heading_regex_capture, text) - section_headings = [heading.strip() for heading in section_headings] - assert len(section_contents) == len(section_headings) - else: - interlinks = [] - section_contents = [] - section_headings = [] - - section_contents = [filter_wiki(section_content) for section_content in section_contents] - sections = list(zip(section_headings, section_contents)) - - if include_interlinks: - return title, sections, interlinks - else: - return title, sections - - -class _WikiSectionsCorpus(WikiCorpus): - """Treat a wikipedia articles dump (wiki--pages-articles.xml.bz2 - or wiki-latest-pages-articles.xml.bz2) as a (read-only) corpus. - - The documents are extracted on-the-fly, so that the whole (massive) dump can stay compressed on disk. - - """ - - def __init__(self, fileobj, min_article_character=200, processes=None, - lemmatize=utils.has_pattern(), filter_namespaces=('0',), include_interlinks=False): - """ - Parameters - ---------- - fileobj : file - File descriptor of MediaWiki dump. - min_article_character : int, optional - Minimal number of character for article (except titles and leading gaps). - processes : int, optional - Number of processes, max(1, multiprocessing.cpu_count() - 1) if None. - lemmatize : bool, optional - If `pattern` package is installed, use fancier shallow parsing to get token lemmas. - Otherwise, use simple regexp tokenization. - filter_namespaces : tuple of int, optional - Enumeration of namespaces that will be ignored. - include_interlinks: bool - Whether or not interlinks should be included in the output - - """ - self.fileobj = fileobj - self.filter_namespaces = filter_namespaces - self.metadata = False - if processes is None: - processes = max(1, multiprocessing.cpu_count() - 1) - self.processes = processes - self.lemmatize = lemmatize - self.min_article_character = min_article_character - self.include_interlinks = include_interlinks - - def get_texts_with_sections(self): - """Iterate over the dump, returning titles and text versions of all sections of articles. - - Notes - ----- - Only articles of sufficient length are returned (short articles & redirects - etc are ignored). - - Note that this iterates over the **texts**; if you want vectors, just use - the standard corpus interface instead of this function: - - .. sourcecode:: pycon - - >>> for vec in wiki_corpus: - >>> print(vec) - - Yields - ------ - (str, list of (str, str), list of (str, str)) - Structure contains (title, [(section_heading, section_content), ...], - (Optionally)[(interlink_article, interlink_text), ...]). 
- - """ - skipped_namespace, skipped_length, skipped_redirect = 0, 0, 0 - total_articles, total_sections = 0, 0 - page_xmls = extract_page_xmls(self.fileobj) - pool = multiprocessing.Pool(self.processes) - # process the corpus in smaller chunks of docs, because multiprocessing.Pool - # is dumb and would load the entire input into RAM at once... - for group in utils.chunkize(page_xmls, chunksize=10 * self.processes, maxsize=1): - for article in pool.imap(partial(segment, include_interlinks=self.include_interlinks), - group): # chunksize=10): partial(merge_names, b='Sons') - article_title, sections = article[0], article[1] - - # article redirects are pruned here - if any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES): # filter non-articles - skipped_namespace += 1 - continue - if not sections or sections[0][1].lstrip().lower().startswith("#redirect"): # filter redirect - skipped_redirect += 1 - continue - if sum(len(body.strip()) for (_, body) in sections) < self.min_article_character: - # filter stubs (incomplete, very short articles) - skipped_length += 1 - continue - total_articles += 1 - total_sections += len(sections) - - if self.include_interlinks: - interlinks = article[2] - yield (article_title, sections, interlinks) - else: - yield (article_title, sections) - - logger.info( - "finished processing %i articles with %i sections (skipped %i redirects, %i stubs, %i ignored namespaces)", - total_articles, total_sections, skipped_redirect, skipped_length, skipped_namespace) - pool.terminate() - self.length = total_articles # cache corpus length - - -if __name__ == "__main__": - logging.basicConfig(format='%(asctime)s - %(module)s - %(levelname)s - %(message)s', level=logging.INFO) - parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, description=__doc__[:-136]) - default_workers = max(1, multiprocessing.cpu_count() - 1) - parser.add_argument('-f', '--file', help='Path to MediaWiki database dump (read-only).', required=True) - parser.add_argument( - '-o', '--output', - help='Path to output file (stdout if not specified). If ends in .gz or .bz2, ' - 'the output file will be automatically compressed (recommended!).') - parser.add_argument( - '-w', '--workers', - help='Number of parallel workers for multi-core systems. Default: %(default)s.', - type=int, - default=default_workers - ) - parser.add_argument( - '-m', '--min-article-character', - help="Ignore articles with fewer characters than this (article stubs). Default: %(default)s.", - type=int, - default=200 - ) - parser.add_argument( - '-i', '--include-interlinks', - help='Include a mapping for interlinks to other articles in the dump. The mappings format is: ' - '"interlinks": [("article_title_1", "interlink_text_1"), ("article_title_2", "interlink_text_2"), ...]', - action='store_true' - ) - args = parser.parse_args() - - logger.info("running %s", " ".join(sys.argv)) - segment_and_write_all_articles( - args.file, args.output, - min_article_character=args.min_article_character, - workers=args.workers, - include_interlinks=args.include_interlinks - ) - - logger.info("finished running %s", sys.argv[0]) +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Author: Jayant Jain +# Copyright (C) 2016 RaRe Technologies + +"""This script using for extracting plain text out of a raw Wikipedia dump. Input is an xml.bz2 file provided +by MediaWiki that looks like wiki--pages-articles.xml.bz2 or wiki-latest-pages-articles.xml.bz2 +(e.g. 
14 GB of https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2). + +It streams through all the XML articles using multiple cores (#cores - 1, by default), +decompressing on the fly and extracting plain text from the articles and their sections. + +For each extracted article, it prints its title, section names and plain text section contents, in json-line format. + +How to use +---------- +#. Process Wikipedia dump with this script :: + + python -m gensim.scripts.segment_wiki -i -f enwiki-latest-pages-articles.xml.bz2 -o enwiki-latest.json.gz + +#. Read output in simple way: + +.. sourcecode:: pycon + + >>> from gensim import utils + >>> import json + >>> + >>> # iterate over the plain text data we just created + >>> with utils.open('enwiki-latest.json.gz', 'rb') as f: + >>> for line in f: + >>> # decode each JSON line into a Python dictionary object + >>> article = json.loads(line) + >>> + >>> # each article has a "title", a mapping of interlinks and a list of "section_titles" and + >>> # "section_texts". + >>> print("Article title: %s" % article['title']) + >>> print("Interlinks: %s" + article['interlinks']) + >>> for section_title, section_text in zip(article['section_titles'], article['section_texts']): + >>> print("Section title: %s" % section_title) + >>> print("Section text: %s" % section_text) + + +Notes +----- +Processing the entire English Wikipedia dump takes 1.7 hours (about 3 million articles per hour, +or 10 MB of XML per second) on an 8 core Intel i7-7700 @3.60GHz. + + +Command line arguments +---------------------- + +.. program-output:: python -m gensim.scripts.segment_wiki --help + :ellipsis: 0, -10 + +""" + +import argparse +import json +import logging +import multiprocessing +import re +import sys +from xml.etree import ElementTree +from functools import partial + +from gensim.corpora.wikicorpus import IGNORED_NAMESPACES, WikiCorpus, filter_wiki, find_interlinks, get_namespace, utils +import gensim.utils + +logger = logging.getLogger(__name__) + + +def segment_all_articles(file_path, min_article_character=200, workers=None, include_interlinks=False): + """Extract article titles and sections from a MediaWiki bz2 database dump. + + Parameters + ---------- + file_path : str + Path to MediaWiki dump, typical filename is wiki--pages-articles.xml.bz2 + or wiki-latest-pages-articles.xml.bz2. + + min_article_character : int, optional + Minimal number of character for article (except titles and leading gaps). + + workers: int or None + Number of parallel workers, max(1, multiprocessing.cpu_count() - 1) if None. + + include_interlinks: bool + Whether or not interlinks should be included in the output + + Yields + ------ + (str, list of (str, str), (Optionally) list of (str, str)) + Structure contains (title, [(section_heading, section_content), ...], + (Optionally) [(interlink_article, interlink_text), ...]). + + """ + with gensim.utils.open(file_path, 'rb') as xml_fileobj: + wiki_sections_corpus = _WikiSectionsCorpus( + xml_fileobj, min_article_character=min_article_character, processes=workers, + include_interlinks=include_interlinks) + wiki_sections_corpus.metadata = True + wiki_sections_text = wiki_sections_corpus.get_texts_with_sections() + + for article in wiki_sections_text: + yield article + + +def segment_and_write_all_articles(file_path, output_file, min_article_character=200, workers=None, + include_interlinks=False): + """Write article title and sections to `output_file` (or stdout, if output_file is None). 
+ + The output format is one article per line, in json-line format with 4 fields:: + + 'title' - title of article, + 'section_titles' - list of titles of sections, + 'section_texts' - list of content from sections, + (Optional) 'section_interlinks' - list of interlinks in the article. + + Parameters + ---------- + file_path : str + Path to MediaWiki dump, typical filename is wiki--pages-articles.xml.bz2 + or wiki-latest-pages-articles.xml.bz2. + + output_file : str or None + Path to output file in json-lines format, or None for printing to stdout. + + min_article_character : int, optional + Minimal number of character for article (except titles and leading gaps). + + workers: int or None + Number of parallel workers, max(1, multiprocessing.cpu_count() - 1) if None. + + include_interlinks: bool + Whether or not interlinks should be included in the output + """ + if output_file is None: + outfile = getattr(sys.stdout, 'buffer', sys.stdout) # we want write bytes, so for py3 we used 'buffer' + else: + outfile = gensim.utils.open(output_file, 'wb') + + try: + article_stream = segment_all_articles(file_path, min_article_character, workers=workers, + include_interlinks=include_interlinks) + for idx, article in enumerate(article_stream): + article_title, article_sections = article[0], article[1] + if include_interlinks: + interlinks = article[2] + + output_data = { + "title": article_title, + "section_titles": [], + "section_texts": [], + } + if include_interlinks: + output_data["interlinks"] = interlinks + + for section_heading, section_content in article_sections: + output_data["section_titles"].append(section_heading) + output_data["section_texts"].append(section_content) + + if (idx + 1) % 100000 == 0: + logger.info("processed #%d articles (at %r now)", idx + 1, article_title) + outfile.write((json.dumps(output_data) + "\n").encode('utf-8')) + + finally: + if output_file is not None: + outfile.close() + + +def extract_page_xmls(f): + """Extract pages from a MediaWiki database dump. + + Parameters + ---------- + f : file + File descriptor of MediaWiki dump. + + Yields + ------ + str + XML strings for page tags. + + """ + elems = (elem for _, elem in ElementTree.iterparse(f, events=("end",))) + + elem = next(elems) + namespace = get_namespace(elem.tag) + ns_mapping = {"ns": namespace} + page_tag = "{%(ns)s}page" % ns_mapping + + for elem in elems: + if elem.tag == page_tag: + yield ElementTree.tostring(elem) + # Prune the element tree, as per + # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/ + # except that we don't need to prune backlinks from the parent + # because we don't use LXML. + # We do this only for s, since we need to inspect the + # ./revision/text element. The pages comprise the bulk of the + # file, so in practice we prune away enough. + elem.clear() + + +def segment(page_xml, include_interlinks=False): + """Parse the content inside a page tag + + Parameters + ---------- + page_xml : str + Content from page tag. + + include_interlinks : bool + Whether or not interlinks should be parsed. + + Returns + ------- + (str, list of (str, str), (Optionally) list of (str, str)) + Structure contains (title, [(section_heading, section_content), ...], + (Optionally) [(interlink_article, interlink_text), ...]). 
+ + """ + elem = ElementTree.fromstring(page_xml) + filter_namespaces = ('0',) + namespace = get_namespace(elem.tag) + ns_mapping = {"ns": namespace} + text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping + title_path = "./{%(ns)s}title" % ns_mapping + ns_path = "./{%(ns)s}ns" % ns_mapping + lead_section_heading = "Introduction" + top_level_heading_regex = r"\n==[^=].*[^=]==\n" + top_level_heading_regex_capture = r"\n==([^=].*[^=])==\n" + + title = elem.find(title_path).text + text = elem.find(text_path).text + ns = elem.find(ns_path).text + if ns not in filter_namespaces: + text = None + + if text is not None: + if include_interlinks: + interlinks = find_interlinks(text) + section_contents = re.split(top_level_heading_regex, text) + section_headings = [lead_section_heading] + re.findall(top_level_heading_regex_capture, text) + section_headings = [heading.strip() for heading in section_headings] + assert len(section_contents) == len(section_headings) + else: + interlinks = [] + section_contents = [] + section_headings = [] + + section_contents = [filter_wiki(section_content) for section_content in section_contents] + sections = list(zip(section_headings, section_contents)) + + if include_interlinks: + return title, sections, interlinks + else: + return title, sections + + +class _WikiSectionsCorpus(WikiCorpus): + """Treat a wikipedia articles dump (wiki--pages-articles.xml.bz2 + or wiki-latest-pages-articles.xml.bz2) as a (read-only) corpus. + + The documents are extracted on-the-fly, so that the whole (massive) dump can stay compressed on disk. + + """ + + def __init__(self, fileobj, min_article_character=200, processes=None, + lemmatize=utils.has_pattern(), filter_namespaces=('0',), include_interlinks=False): + """ + Parameters + ---------- + fileobj : file + File descriptor of MediaWiki dump. + min_article_character : int, optional + Minimal number of character for article (except titles and leading gaps). + processes : int, optional + Number of processes, max(1, multiprocessing.cpu_count() - 1) if None. + lemmatize : bool, optional + If `pattern` package is installed, use fancier shallow parsing to get token lemmas. + Otherwise, use simple regexp tokenization. + filter_namespaces : tuple of int, optional + Enumeration of namespaces that will be ignored. + include_interlinks: bool + Whether or not interlinks should be included in the output + + """ + self.fileobj = fileobj + self.filter_namespaces = filter_namespaces + self.metadata = False + if processes is None: + processes = max(1, multiprocessing.cpu_count() - 1) + self.processes = processes + self.lemmatize = lemmatize + self.min_article_character = min_article_character + self.include_interlinks = include_interlinks + + def get_texts_with_sections(self): + """Iterate over the dump, returning titles and text versions of all sections of articles. + + Notes + ----- + Only articles of sufficient length are returned (short articles & redirects + etc are ignored). + + Note that this iterates over the **texts**; if you want vectors, just use + the standard corpus interface instead of this function: + + .. sourcecode:: pycon + + >>> for vec in wiki_corpus: + >>> print(vec) + + Yields + ------ + (str, list of (str, str), list of (str, str)) + Structure contains (title, [(section_heading, section_content), ...], + (Optionally)[(interlink_article, interlink_text), ...]). 
+ + """ + skipped_namespace, skipped_length, skipped_redirect = 0, 0, 0 + total_articles, total_sections = 0, 0 + page_xmls = extract_page_xmls(self.fileobj) + pool = multiprocessing.Pool(self.processes) + # process the corpus in smaller chunks of docs, because multiprocessing.Pool + # is dumb and would load the entire input into RAM at once... + for group in utils.chunkize(page_xmls, chunksize=10 * self.processes, maxsize=1): + for article in pool.imap(partial(segment, include_interlinks=self.include_interlinks), + group): # chunksize=10): partial(merge_names, b='Sons') + article_title, sections = article[0], article[1] + + # article redirects are pruned here + if any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES): # filter non-articles + skipped_namespace += 1 + continue + if not sections or sections[0][1].lstrip().lower().startswith("#redirect"): # filter redirect + skipped_redirect += 1 + continue + if sum(len(body.strip()) for (_, body) in sections) < self.min_article_character: + # filter stubs (incomplete, very short articles) + skipped_length += 1 + continue + total_articles += 1 + total_sections += len(sections) + + if self.include_interlinks: + interlinks = article[2] + yield (article_title, sections, interlinks) + else: + yield (article_title, sections) + + logger.info( + "finished processing %i articles with %i sections (skipped %i redirects, %i stubs, %i ignored namespaces)", + total_articles, total_sections, skipped_redirect, skipped_length, skipped_namespace) + pool.terminate() + self.length = total_articles # cache corpus length + + +if __name__ == "__main__": + logging.basicConfig(format='%(asctime)s - %(module)s - %(levelname)s - %(message)s', level=logging.INFO) + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, description=__doc__[:-136]) + default_workers = max(1, multiprocessing.cpu_count() - 1) + parser.add_argument('-f', '--file', help='Path to MediaWiki database dump (read-only).', required=True) + parser.add_argument( + '-o', '--output', + help='Path to output file (stdout if not specified). If ends in .gz or .bz2, ' + 'the output file will be automatically compressed (recommended!).') + parser.add_argument( + '-w', '--workers', + help='Number of parallel workers for multi-core systems. Default: %(default)s.', + type=int, + default=default_workers + ) + parser.add_argument( + '-m', '--min-article-character', + help="Ignore articles with fewer characters than this (article stubs). Default: %(default)s.", + type=int, + default=200 + ) + parser.add_argument( + '-i', '--include-interlinks', + help='Include a mapping for interlinks to other articles in the dump. 
The mappings format is: ' + '"interlinks": [("article_title_1", "interlink_text_1"), ("article_title_2", "interlink_text_2"), ...]', + action='store_true' + ) + args = parser.parse_args() + + logger.info("running %s", " ".join(sys.argv)) + segment_and_write_all_articles( + args.file, args.output, + min_article_character=args.min_article_character, + workers=args.workers, + include_interlinks=args.include_interlinks + ) + + logger.info("finished running %s", sys.argv[0]) diff --git a/gensim/similarities/__init__.py b/gensim/similarities/__init__.py index 3c670ba95b..3ab45261ad 100644 --- a/gensim/similarities/__init__.py +++ b/gensim/similarities/__init__.py @@ -13,5 +13,6 @@ from .termsim import ( # noqa:F401 TermSimilarityIndex, UniformTermSimilarityIndex, + WordEmbeddingSimilarityIndex, SparseTermSimilarityMatrix) from .levenshtein import LevenshteinSimilarityIndex # noqa:F401 diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py index 256f276394..daba706eb1 100755 --- a/gensim/similarities/docsim.py +++ b/gensim/similarities/docsim.py @@ -993,7 +993,7 @@ def __str__(self): class WmdSimilarity(interfaces.SimilarityABC): """Compute negative WMD similarity against a corpus of documents. - See :class:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors` for more information. + See :class:`~gensim.models.keyedvectors.KeyedVectors` for more information. Also, tutorial `notebook `_ for more examples. @@ -1022,25 +1022,23 @@ class WmdSimilarity(interfaces.SimilarityABC): >>> sims = index[query] """ - def __init__(self, corpus, w2v_model, num_best=None, normalize_w2v_and_replace=True, chunksize=256): + def __init__(self, corpus, kv_model, num_best=None, chunksize=256): """ Parameters ---------- corpus: iterable of list of str A list of documents, each of which is a list of tokens. - w2v_model: :class:`~gensim.models.word2vec.Word2VecTrainables` - A trained word2vec model. + kv_model: :class:`~gensim.models.keyedvectors.KeyedVectors` + A set of KeyedVectors num_best: int, optional Number of results to retrieve. - normalize_w2v_and_replace: bool, optional - Whether or not to normalize the word2vec vectors to length 1. chunksize : int, optional Size of chunk. """ self.corpus = corpus - self.w2v_model = w2v_model + self.wv = kv_model self.num_best = num_best self.chunksize = chunksize @@ -1050,10 +1048,6 @@ def __init__(self, corpus, w2v_model, num_best=None, normalize_w2v_and_replace=T # index is simply an array from 0 to size of corpus. self.index = numpy.arange(len(corpus)) - if normalize_w2v_and_replace: - # Normalize vectors in word2vec class to length 1. - w2v_model.init_sims(replace=True) - def __len__(self): """Get size of corpus.""" return len(self.corpus) @@ -1087,7 +1081,7 @@ def get_similarities(self, query): result = [] for qidx in range(n_queries): # Compute similarity for each query. - qresult = [self.w2v_model.wv.wmdistance(document, query[qidx]) for document in self.corpus] + qresult = [self.wv.wmdistance(document, query[qidx]) for document in self.corpus] qresult = numpy.array(qresult) qresult = 1. / (1. + qresult) # Similarity is the negative of the distance. 
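For context on the `WmdSimilarity` change above: a `KeyedVectors` instance is now passed in directly and the separate normalization flag is gone. A minimal sketch of the new call pattern, assuming the optional WMD dependency (`pyemd`, at the time of this change) is installed; `common_texts` is the toy corpus bundled with gensim's test utilities:

.. sourcecode:: pycon

    >>> from gensim.models import Word2Vec
    >>> from gensim.similarities import WmdSimilarity
    >>> from gensim.test.utils import common_texts
    >>>
    >>> # train a small model and keep only its KeyedVectors
    >>> kv = Word2Vec(common_texts, vector_size=20, min_count=1).wv
    >>>
    >>> # old: WmdSimilarity(corpus, w2v_model, normalize_w2v_and_replace=True)
    >>> # new: pass the KeyedVectors directly; no normalization argument
    >>> index = WmdSimilarity(common_texts, kv, num_best=3)
    >>> sims = index[['graph', 'trees']]  # query with a tokenized document
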
diff --git a/gensim/similarities/index.py b/gensim/similarities/index.py index 08ecd221c6..392d000b4e 100644 --- a/gensim/similarities/index.py +++ b/gensim/similarities/index.py @@ -47,7 +47,6 @@ from gensim.models.word2vec import Word2Vec from gensim.models.fasttext import FastText from gensim.models import KeyedVectors -from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors _NOANNOY = ImportError( @@ -97,7 +96,7 @@ def __init__(self, model=None, num_trees=None): self.build_from_doc2vec() elif isinstance(self.model, (Word2Vec, FastText)): self.build_from_word2vec() - elif isinstance(self.model, (WordEmbeddingsKeyedVectors, KeyedVectors)): + elif isinstance(self.model, (KeyedVectors,)): self.build_from_keyedvectors() else: raise ValueError("Only a Word2Vec, Doc2Vec, FastText or KeyedVectors instance can be used") diff --git a/gensim/similarities/nmslib.py b/gensim/similarities/nmslib.py index c2e23717d3..4fad9761a5 100644 --- a/gensim/similarities/nmslib.py +++ b/gensim/similarities/nmslib.py @@ -79,7 +79,7 @@ from gensim.models.word2vec import Word2Vec from gensim.models.fasttext import FastText from gensim.models import KeyedVectors -from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors + try: import nmslib except ImportError: @@ -129,7 +129,7 @@ def __init__(self, model, index_params=None, query_time_params=None): self._build_from_doc2vec() elif isinstance(self.model, (Word2Vec, FastText)): self._build_from_word2vec() - elif isinstance(self.model, (WordEmbeddingsKeyedVectors, KeyedVectors)): + elif isinstance(self.model, (KeyedVectors,)): self._build_from_keyedvectors() else: raise ValueError("model must be a Word2Vec, Doc2Vec, FastText or KeyedVectors instance") @@ -181,21 +181,18 @@ def load(cls, fname): def _build_from_word2vec(self): """Build an NMSLIB index using word vectors from a Word2Vec model.""" - self.model.init_sims() self._build_from_model(self.model.wv.vectors_norm, self.model.wv.index2word) def _build_from_doc2vec(self): """Build an NMSLIB index using document vectors from a Doc2Vec model.""" - docvecs = self.model.docvecs - docvecs.init_sims() - labels = [docvecs.index_to_doctag(i) for i in range(0, docvecs.count)] - self._build_from_model(docvecs.vectors_docs_norm, labels) + docvecs = self.model.dv + labels = docvecs.index_to_key + self._build_from_model(docvecs.vectors_norm, labels) def _build_from_keyedvectors(self): """Build an NMSLIB index using word vectors from a KeyedVectors model.""" - self.model.init_sims() self._build_from_model(self.model.vectors_norm, self.model.index2word) def _build_from_model(self, vectors, labels): diff --git a/gensim/similarities/termsim.py b/gensim/similarities/termsim.py index c4999fed37..975d584660 100644 --- a/gensim/similarities/termsim.py +++ b/gensim/similarities/termsim.py @@ -113,6 +113,49 @@ def _shortest_uint_dtype(max_value): return np.uint64 +class WordEmbeddingSimilarityIndex(TermSimilarityIndex): + """ + Use objects of this class to: + + 1) Compute cosine similarities between word embeddings. + 2) Retrieve the closest word embeddings (by cosine similarity) to a given word embedding. + + Parameters + ---------- + keyedvectors : :class:`~gensim.models.keyedvectors.KeyedVectors` + The word embeddings. + threshold : float, optional + Only embeddings more similar than `threshold` are considered when retrieving word embeddings + closest to a given word embedding. + exponent : float, optional + Take the word embedding similarities larger than `threshold` to the power of `exponent`. 
+ kwargs : dict or None + A dict with keyword arguments that will be passed to the `keyedvectors.most_similar` method + when retrieving the word embeddings closest to a given word embedding. + + See Also + -------- + :class:`~gensim.similarities.termsim.SparseTermSimilarityMatrix` + Build a term similarity matrix and compute the Soft Cosine Measure. + + """ + def __init__(self, keyedvectors, threshold=0.0, exponent=2.0, kwargs=None): + self.keyedvectors = keyedvectors + self.threshold = threshold + self.exponent = exponent + self.kwargs = kwargs or {} + super(WordEmbeddingSimilarityIndex, self).__init__() + + def most_similar(self, t1, topn=10): + if t1 not in self.keyedvectors: + logger.debug('an out-of-dictionary term "%s"', t1) + else: + most_similar = self.keyedvectors.most_similar(positive=[t1], topn=topn, **self.kwargs) + for t2, similarity in most_similar: + if similarity > self.threshold: + yield (t2, similarity**self.exponent) + + class SparseTermSimilarityMatrix(SaveLoad): """ Builds a sparse term similarity matrix using a term similarity index. diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index fa154a2497..370897bfdb 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -35,10 +35,11 @@ class D2VTransformer(TransformerMixin, BaseEstimator): `_. """ - def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, - docvecs_mapfile=None, comment=None, trim_rule=None, size=100, alpha=0.025, window=5, min_count=5, - max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, cbow_mean=1, - hashfxn=hash, iter=5, sorted_vocab=1, batch_words=10000): + def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, dv=None, + dv_mapfile=None, comment=None, trim_rule=None, vector_size=100, alpha=0.025, window=5, + min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, + hs=0, negative=5, cbow_mean=1, + hashfxn=hash, epochs=5, sorted_vocab=1, batch_words=10000): """ Parameters @@ -59,11 +60,10 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 in the context strung together. dm_tag_count : int, optional Expected constant number of document tags per document, when using dm_concat mode. - docvecs : :class:`~gensim.models.keyedvectors.Doc2VecKeyedVectors` + dv : :class:`~gensim.models.keyedvectors.KeyedVectors` A mapping from a string or int tag to its vector representation. - Either this or `docvecs_mapfile` **MUST** be supplied. - docvecs_mapfile : str, optional - Path to a file containing the docvecs mapping. If `docvecs` is None, this file will be used to create it. + dv_mapfile : str, optional + Path to a file containing the docvecs mapping. If `dv` is None, this file will be used to create it. comment : str, optional A model descriptive comment, used for logging and debugging purposes. trim_rule : function ((str, int, int) -> int), optional @@ -72,7 +72,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 be trimmed away (:attr:`gensim.utils.RULE_DISCARD`), or handled using the default (:attr:`gensim.utils.RULE_DEFAULT`). If None, then :func:`gensim.utils.keep_vocab_item` will be used. - size : int, optional + vector_size : int, optional Dimensionality of the feature vectors. alpha : float, optional The initial learning rate. 
@@ -108,7 +108,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 Same as `dm_mean`, **unused**. hashfxn : function (object -> int), optional A hashing function. Used to create an initial random reproducible vector by hashing the random seed. - iter : int, optional + epochs : int, optional Number of epochs to iterate through the corpus. sorted_vocab : bool, optional Whether the vocabulary should be sorted internally. @@ -122,13 +122,13 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 self.dbow_words = dbow_words self.dm_concat = dm_concat self.dm_tag_count = dm_tag_count - self.docvecs = docvecs - self.docvecs_mapfile = docvecs_mapfile + self.dv = dv + self.dv_mapfile = dv_mapfile self.comment = comment self.trim_rule = trim_rule # attributes associated with gensim.models.Word2Vec - self.size = size + self.vector_size = vector_size self.alpha = alpha self.window = window self.min_count = min_count @@ -141,7 +141,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 self.negative = negative self.cbow_mean = int(cbow_mean) self.hashfxn = hashfxn - self.iter = iter + self.epochs = epochs self.sorted_vocab = sorted_vocab self.batch_words = batch_words @@ -166,12 +166,12 @@ def fit(self, X, y=None): self.gensim_model = models.Doc2Vec( documents=d2v_sentences, dm_mean=self.dm_mean, dm=self.dm, dbow_words=self.dbow_words, dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count, - docvecs=self.docvecs, docvecs_mapfile=self.docvecs_mapfile, comment=self.comment, - trim_rule=self.trim_rule, vector_size=self.size, alpha=self.alpha, window=self.window, + dv=self.dv, dv_mapfile=self.dv_mapfile, comment=self.comment, + trim_rule=self.trim_rule, vector_size=self.vector_size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, hashfxn=self.hashfxn, - epochs=self.iter, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words + epochs=self.epochs, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words ) return self diff --git a/gensim/sklearn_api/ftmodel.py b/gensim/sklearn_api/ftmodel.py index a1edd6c338..7acd22cfc2 100644 --- a/gensim/sklearn_api/ftmodel.py +++ b/gensim/sklearn_api/ftmodel.py @@ -18,7 +18,7 @@ >>> from gensim.sklearn_api import FTTransformer >>> >>> # Create a model to represent each word by a 10 dimensional vector. - >>> model = FTTransformer(size=10, min_count=1, seed=1) + >>> model = FTTransformer(vector_size=10, min_count=1, seed=1) >>> >>> # What is the vector representations of the word 'graph' and 'system'? >>> wordvecs = model.fit(common_texts).transform(['graph', 'system']) @@ -56,10 +56,10 @@ class FTTransformer(TransformerMixin, BaseEstimator): Information `_. 
""" - def __init__(self, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, + def __init__(self, sg=0, hs=0, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, negative=5, ns_exponent=0.75, - cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, + cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=10000): """ @@ -71,7 +71,7 @@ def __init__(self, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, hs : {1,0}, optional If 1, hierarchical softmax will be used for model training. If set to 0, and `negative` is non-zero, negative sampling will be used. - size : int, optional + vector_size : int, optional Dimensionality of the word vectors. alpha : float, optional The initial learning rate. @@ -113,7 +113,7 @@ def __init__(self, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used. hashfxn : function, optional Hash function to use to randomly initialize weights, for increased training reproducibility. - iter : int, optional + epochs : int, optional Number of iterations (epochs) over the corpus. min_n : int, optional Minimum length of char n-grams to be used for training word representations. @@ -148,7 +148,7 @@ def __init__(self, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, self.gensim_model = None self.sg = sg self.hs = hs - self.size = size + self.vector_size = vector_size self.alpha = alpha self.window = window self.min_count = min_count @@ -162,7 +162,7 @@ def __init__(self, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, self.ns_exponent = ns_exponent self.cbow_mean = cbow_mean self.hashfxn = hashfxn - self.iter = iter + self.epochs = epochs self.null_word = null_word self.min_n = min_n self.max_n = max_n @@ -189,13 +189,13 @@ def fit(self, X, y=None): """ self.gensim_model = models.FastText( - sentences=X, sg=self.sg, hs=self.hs, size=self.size, + sentences=X, sg=self.sg, hs=self.hs, vector_size=self.vector_size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, word_ngrams=self.word_ngrams, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, negative=self.negative, ns_exponent=self.ns_exponent, cbow_mean=self.cbow_mean, - hashfxn=self.hashfxn, iter=self.iter, null_word=self.null_word, + hashfxn=self.hashfxn, epochs=self.epochs, null_word=self.null_word, min_n=self.min_n, max_n=self.max_n, sorted_vocab=self.sorted_vocab, bucket=self.bucket, trim_rule=self.trim_rule, batch_words=self.batch_words @@ -212,7 +212,7 @@ def transform(self, words): Returns ------- - np.ndarray of shape [`len(words)`, `size`] + np.ndarray of shape [`len(words)`, `vector_size`] A 2D array where each row is the vector of one word. 
""" @@ -225,4 +225,4 @@ def transform(self, words): if isinstance(words, six.string_types): words = [words] vectors = [self.gensim_model.wv[word] for word in words] - return np.reshape(np.array(vectors), (len(words), self.size)) + return np.reshape(np.array(vectors), (len(words), self.vector_size)) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index 07091c2dde..ae64b56e3e 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -18,7 +18,7 @@ >>> from gensim.sklearn_api import W2VTransformer >>> >>> # Create a model to represent each word by a 10 dimensional vector. - >>> model = W2VTransformer(size=10, min_count=1, seed=1) + >>> model = W2VTransformer(vector_size=10, min_count=1, seed=1) >>> >>> # What is the vector representation of the word 'graph'? >>> wordvecs = model.fit(common_texts).transform(['graph', 'system']) @@ -40,14 +40,14 @@ class W2VTransformer(TransformerMixin, BaseEstimator): Estimation of Word Representations in Vector Space" `_. """ - def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, - workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, + def __init__(self, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, + workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000): """ Parameters ---------- - size : int + vector_size : int Dimensionality of the feature vectors. alpha : float The initial learning rate. @@ -85,7 +85,7 @@ def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size= If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used. hashfxn : callable (object -> int), optional A hashing function. Used to create an initial random reproducible vector by hashing the random seed. - iter : int + epochs : int Number of iterations (epochs) over the corpus. 
null_word : int {1, 0} If 1, a null pseudo-word will be created for padding when using concatenative L1 (run-of-words) @@ -106,7 +106,7 @@ def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size= """ self.gensim_model = None - self.size = size + self.vector_size = vector_size self.alpha = alpha self.window = window self.min_count = min_count @@ -120,7 +120,7 @@ def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size= self.negative = negative self.cbow_mean = int(cbow_mean) self.hashfxn = hashfxn - self.iter = iter + self.epochs = epochs self.null_word = null_word self.trim_rule = trim_rule self.sorted_vocab = sorted_vocab @@ -144,11 +144,11 @@ def fit(self, X, y=None): """ self.gensim_model = models.Word2Vec( - sentences=X, size=self.size, alpha=self.alpha, + sentences=X, vector_size=self.vector_size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, sg=self.sg, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, - hashfxn=self.hashfxn, iter=self.iter, null_word=self.null_word, trim_rule=self.trim_rule, + hashfxn=self.hashfxn, epochs=self.epochs, null_word=self.null_word, trim_rule=self.trim_rule, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words ) return self @@ -163,7 +163,7 @@ def transform(self, words): Returns ------- - np.ndarray of shape [`len(words)`, `size`] + np.ndarray of shape [`len(words)`, `vector_size`] A 2D array where each row is the vector of one word. """ @@ -176,7 +176,7 @@ def transform(self, words): if isinstance(words, six.string_types): words = [words] vectors = [self.gensim_model.wv[word] for word in words] - return np.reshape(np.array(vectors), (len(words), self.size)) + return np.reshape(np.array(vectors), (len(words), self.vector_size)) def partial_fit(self, X): raise NotImplementedError( diff --git a/gensim/test/test_data/compatible-hash-false.model b/gensim/test/test_data/compatible-hash-false.model deleted file mode 100644 index 5a76fa7f6b..0000000000 Binary files a/gensim/test/test_data/compatible-hash-false.model and /dev/null differ diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index d8b358f1fa..aa958b744d 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -75,7 +75,7 @@ def test_persistence(self): @unittest.skipIf(os.name == 'nt' and six.PY2, "corpus_file training is not supported on Windows + Py27") def test_persistence_fromfile(self): """Test storing/loading the entire model.""" - with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file: + with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file: save_lee_corpus_as_line_sentence(corpus_file) tmpf = get_tmpfile('gensim_doc2vec.tst') @@ -88,100 +88,133 @@ def testPersistenceWord2VecFormat(self): model = doc2vec.Doc2Vec(DocsLeeCorpus(), min_count=1) # test saving both document and word embedding test_doc_word = get_tmpfile('gensim_doc2vec.dw') - model.save_word2vec_format(test_doc_word, doctag_vec=True, word_vec=True, binary=True) - binary_model_dv = keyedvectors.KeyedVectors.load_word2vec_format(test_doc_word, binary=True) - self.assertEqual(len(model.wv.vocab) + len(model.docvecs), len(binary_model_dv.vocab)) + model.save_word2vec_format(test_doc_word, doctag_vec=True, word_vec=True, binary=False) + binary_model_dv = keyedvectors.KeyedVectors.load_word2vec_format(test_doc_word, binary=False) + self.assertEqual(len(model.wv) + 
len(model.dv), len(binary_model_dv)) # test saving document embedding only test_doc = get_tmpfile('gensim_doc2vec.d') model.save_word2vec_format(test_doc, doctag_vec=True, word_vec=False, binary=True) binary_model_dv = keyedvectors.KeyedVectors.load_word2vec_format(test_doc, binary=True) - self.assertEqual(len(model.docvecs), len(binary_model_dv.vocab)) + self.assertEqual(len(model.dv), len(binary_model_dv)) # test saving word embedding only test_word = get_tmpfile('gensim_doc2vec.w') model.save_word2vec_format(test_word, doctag_vec=False, word_vec=True, binary=True) binary_model_dv = keyedvectors.KeyedVectors.load_word2vec_format(test_word, binary=True) - self.assertEqual(len(model.wv.vocab), len(binary_model_dv.vocab)) + self.assertEqual(len(model.wv), len(binary_model_dv)) - def testLoadOldModel(self): - """Test loading doc2vec models from previous version""" + def obsolete_testLoadOldModel(self): + """Test loading an old doc2vec model from indeterminate version""" - model_file = 'doc2vec_old' + model_file = 'doc2vec_old' # which version?!? model = doc2vec.Doc2Vec.load(datapath(model_file)) self.assertTrue(model.wv.vectors.shape == (3955, 100)) - self.assertTrue(len(model.wv.vocab) == 3955) + self.assertTrue(len(model.wv) == 3955) self.assertTrue(len(model.wv.index2word) == 3955) self.assertIsNone(model.corpus_total_words) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), model.vector_size)) - self.assertTrue(model.trainables.vectors_lockf.shape == (3955, )) - self.assertTrue(model.vocabulary.cum_table.shape == (3955, )) + self.assertTrue(model.syn1neg.shape == (len(model.wv), model.vector_size)) + self.assertTrue(model.wv.vectors_lockf.shape == (3955, )) + self.assertTrue(model.cum_table.shape == (3955, )) - self.assertTrue(model.docvecs.vectors_docs.shape == (300, 100)) - self.assertTrue(model.trainables.vectors_docs_lockf.shape == (300, )) - self.assertTrue(model.docvecs.max_rawint == 299) - self.assertTrue(model.docvecs.count == 300) + self.assertTrue(model.dv.vectors.shape == (300, 100)) + self.assertTrue(model.dv.vectors_lockf.shape == (300, )) + self.assertTrue(len(model.dv) == 300) self.model_sanity(model) + def obsolete_testLoadOldModelSeparates(self): + """Test loading an old doc2vec model from indeterminate version""" + # Model stored in multiple files model_file = 'doc2vec_old_sep' model = doc2vec.Doc2Vec.load(datapath(model_file)) self.assertTrue(model.wv.vectors.shape == (3955, 100)) - self.assertTrue(len(model.wv.vocab) == 3955) + self.assertTrue(len(model.wv) == 3955) self.assertTrue(len(model.wv.index2word) == 3955) self.assertIsNone(model.corpus_total_words) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), model.vector_size)) - self.assertTrue(model.trainables.vectors_lockf.shape == (3955, )) - self.assertTrue(model.vocabulary.cum_table.shape == (3955, )) - - self.assertTrue(model.docvecs.vectors_docs.shape == (300, 100)) - self.assertTrue(model.trainables.vectors_docs_lockf.shape == (300, )) - self.assertTrue(model.docvecs.max_rawint == 299) - self.assertTrue(model.docvecs.count == 300) + self.assertTrue(model.syn1neg.shape == (len(model.wv), model.vector_size)) + self.assertTrue(model.wv.vectors_lockf.shape == (3955, )) + self.assertTrue(model.cum_table.shape == (3955, )) + self.assertTrue(model.dv.vectors.shape == (300, 100)) + self.assertTrue(model.dv.vectors_lockf.shape == (300, )) + self.assertTrue(len(model.dv) == 300) self.model_sanity(model) - # load really old model + def 
obsolete_test_load_old_models_pre_1_0(self): + """Test loading pre-1.0 models""" model_file = 'd2v-lee-v0.13.0' model = doc2vec.Doc2Vec.load(datapath(model_file)) self.model_sanity(model) - # Test loading doc2vec models from all previous versions old_versions = [ '0.12.0', '0.12.1', '0.12.2', '0.12.3', '0.12.4', '0.13.0', '0.13.1', '0.13.2', '0.13.3', '0.13.4', - '1.0.0', '1.0.1', '2.0.0', '2.1.0', '2.2.0', '2.3.0', - '3.0.0', '3.1.0', '3.2.0', '3.3.0', '3.4.0' ] + for old_version in old_versions: + self._check_old_version(old_version) - saved_models_dir = datapath('old_d2v_models/d2v_{}.mdl') + def obsolete_test_load_old_models_1_x(self): + """Test loading 1.x models""" + old_versions = [ + '1.0.0', '1.0.1', + ] for old_version in old_versions: - model = doc2vec.Doc2Vec.load(saved_models_dir.format(old_version)) - self.assertTrue(len(model.wv.vocab) == 3) - self.assertIsNone(model.corpus_total_words) - self.assertTrue(model.wv.vectors.shape == (3, 4)) - self.assertTrue(model.docvecs.vectors_docs.shape == (2, 4)) - self.assertTrue(model.docvecs.count == 2) - # check if inferring vectors for new documents and similarity search works. - doc0_inferred = model.infer_vector(list(DocsLeeCorpus())[0].words) - sims_to_infer = model.docvecs.most_similar([doc0_inferred], topn=len(model.docvecs)) - self.assertTrue(sims_to_infer) - # check if inferring vectors and similarity search works after saving and loading back the model - tmpf = get_tmpfile('gensim_doc2vec.tst') - model.save(tmpf) - loaded_model = doc2vec.Doc2Vec.load(tmpf) - doc0_inferred = loaded_model.infer_vector(list(DocsLeeCorpus())[0].words) - sims_to_infer = loaded_model.docvecs.most_similar([doc0_inferred], topn=len(loaded_model.docvecs)) - self.assertTrue(sims_to_infer) + self._check_old_version(old_version) + + def obsolete_test_load_old_models_2_x(self): + """Test loading 2.x models""" + old_versions = [ + '2.0.0', '2.1.0', '2.2.0', '2.3.0', + ] + for old_version in old_versions: + self._check_old_version(old_version) + + def obsolete_test_load_old_models_pre_3_3(self): + """Test loading 3.x models""" + old_versions = [ + '3.2.0', '3.1.0', '3.0.0' + ] + for old_version in old_versions: + self._check_old_version(old_version) + + def obsolete_test_load_old_models_post_3_2(self): + """Test loading 3.x models""" + old_versions = [ + '3.4.0', '3.3.0', + ] + for old_version in old_versions: + self._check_old_version(old_version) + + def _check_old_version(self, old_version): + logging.info("TESTING LOAD of %s Doc2Vec MODEL", old_version) + saved_models_dir = datapath('old_d2v_models/d2v_{}.mdl') + model = doc2vec.Doc2Vec.load(saved_models_dir.format(old_version)) + self.assertTrue(len(model.wv) == 3) + self.assertIsNone(model.corpus_total_words) + self.assertTrue(model.wv.vectors.shape == (3, 4)) + self.assertTrue(model.dv.vectors.shape == (2, 4)) + self.assertTrue(len(model.dv) == 2) + # check if inferring vectors for new documents and similarity search works. 
+ doc0_inferred = model.infer_vector(list(DocsLeeCorpus())[0].words) + sims_to_infer = model.dv.most_similar([doc0_inferred], topn=len(model.dv)) + self.assertTrue(sims_to_infer) + # check if inferring vectors and similarity search works after saving and loading back the model + tmpf = get_tmpfile('gensim_doc2vec.tst') + model.save(tmpf) + loaded_model = doc2vec.Doc2Vec.load(tmpf) + doc0_inferred = loaded_model.infer_vector(list(DocsLeeCorpus())[0].words) + sims_to_infer = loaded_model.dv.most_similar([doc0_inferred], topn=len(loaded_model.dv)) + self.assertTrue(sims_to_infer) def testDoc2vecTrainParameters(self): model = doc2vec.Doc2Vec(vector_size=50) - model.build_vocab(documents=list_corpus) + model.build_vocab(corpus_iterable=list_corpus) self.assertRaises(TypeError, model.train, corpus_file=11111) - self.assertRaises(TypeError, model.train, documents=11111) - self.assertRaises(TypeError, model.train, documents=sentences, corpus_file='test') - self.assertRaises(TypeError, model.train, documents=None, corpus_file=None) + self.assertRaises(TypeError, model.train, corpus_iterable=11111) + self.assertRaises(TypeError, model.train, corpus_iterable=sentences, corpus_file='test') + self.assertRaises(TypeError, model.train, corpus_iterable=None, corpus_file=None) self.assertRaises(TypeError, model.train, corpus_file=sentences) @unittest.skipIf(os.name == 'nt', "See another test for Windows below") @@ -297,9 +330,9 @@ def test_int_doctags(self): model = doc2vec.Doc2Vec(min_count=1) model.build_vocab(corpus) - self.assertEqual(len(model.docvecs.vectors_docs), 300) - self.assertEqual(model.docvecs[0].shape, (100,)) - self.assertEqual(model.docvecs[np.int64(0)].shape, (100,)) + self.assertEqual(len(model.dv.vectors), 300) + self.assertEqual(model.dv[0].shape, (100,)) + self.assertEqual(model.dv[np.int64(0)].shape, (100,)) self.assertRaises(KeyError, model.__getitem__, '_*0') def test_missing_string_doctag(self): @@ -310,7 +343,7 @@ def test_missing_string_doctag(self): model = doc2vec.Doc2Vec(min_count=1) model.build_vocab(corpus) - self.assertRaises(KeyError, model.docvecs.__getitem__, 'not_a_tag') + self.assertRaises(KeyError, model.dv.__getitem__, 'not_a_tag') def test_string_doctags(self): """Test doc2vec doctag alternatives""" @@ -321,19 +354,17 @@ def test_string_doctags(self): model = doc2vec.Doc2Vec(min_count=1) model.build_vocab(corpus) - self.assertEqual(len(model.docvecs.vectors_docs), 300) - self.assertEqual(model.docvecs[0].shape, (100,)) - self.assertEqual(model.docvecs['_*0'].shape, (100,)) - self.assertTrue(all(model.docvecs['_*0'] == model.docvecs[0])) - self.assertTrue(max(d.offset for d in model.docvecs.doctags.values()) < len(model.docvecs.doctags)) - self.assertTrue( - max( - model.docvecs._int_index(str_key, model.docvecs.doctags, model.docvecs.max_rawint) - for str_key in model.docvecs.doctags.keys()) - < len(model.docvecs.vectors_docs) + self.assertEqual(len(model.dv.vectors), 300) + self.assertEqual(model.dv[0].shape, (100,)) + self.assertEqual(model.dv['_*0'].shape, (100,)) + self.assertTrue(all(model.dv['_*0'] == model.dv[0])) + self.assertTrue(max(model.dv.key_to_index.values()) < len(model.dv.index_to_key)) + self.assertLess( + max(model.dv.get_index(str_key) for str_key in model.dv.key_to_index.keys()), + len(model.dv.vectors) ) - # verify docvecs.most_similar() returns string doctags rather than indexes - self.assertEqual(model.docvecs.offset2doctag[0], model.docvecs.most_similar([model.docvecs[0]])[0][0]) + # verify dv.most_similar() returns string doctags rather 
than indexes + self.assertEqual(model.dv.index_to_key[0], model.dv.most_similar([model.dv[0]])[0][0]) def test_empty_errors(self): # no input => "RuntimeError: you must first build vocabulary before training the model" @@ -344,37 +375,39 @@ def test_empty_errors(self): def test_similarity_unseen_docs(self): """Test similarity of out of training sentences""" - rome_str = ['rome', 'italy'] - car_str = ['car'] + rome_words = ['rome', 'italy'] + car_words = ['car'] corpus = list(DocsLeeCorpus(True)) model = doc2vec.Doc2Vec(min_count=1) model.build_vocab(corpus) self.assertTrue( - model.docvecs.similarity_unseen_docs(model, rome_str, rome_str) - > model.docvecs.similarity_unseen_docs(model, rome_str, car_str) + model.similarity_unseen_docs(rome_words, rome_words) + > model.similarity_unseen_docs(rome_words, car_words) ) def model_sanity(self, model, keep_training=True): """Any non-trivial model on DocsLeeCorpus can pass these sanity checks""" fire1 = 0 # doc 0 sydney fires fire2 = np.int64(8) # doc 8 sydney fires - tennis1 = 6 # doc 6 tennis + alt1 = 29 # doc 29 palestine # inferred vector should be top10 close to bulk-trained one doc0_inferred = model.infer_vector(list(DocsLeeCorpus())[0].words) - sims_to_infer = model.docvecs.most_similar([doc0_inferred], topn=len(model.docvecs)) - f_rank = [docid for docid, sim in sims_to_infer].index(fire1) + sims_to_infer = model.dv.most_similar([doc0_inferred], topn=len(model.dv)) + sims_ids = [docid for docid, sim in sims_to_infer] + self.assertTrue(fire1 in sims_ids, "{0} not found in {1}".format(fire1, sims_to_infer)) + f_rank = sims_ids.index(fire1) self.assertLess(f_rank, 10) # fire2 should be top30 close to fire1 - sims = model.docvecs.most_similar(fire1, topn=len(model.docvecs)) + sims = model.dv.most_similar(fire1, topn=len(model.dv)) f2_rank = [docid for docid, sim in sims].index(fire2) self.assertLess(f2_rank, 30) # same sims should appear in lookup by vec as by index - doc0_vec = model.docvecs[fire1] - sims2 = model.docvecs.most_similar(positive=[doc0_vec], topn=21) + doc0_vec = model.dv[fire1] + sims2 = model.dv.most_similar(positive=[doc0_vec], topn=21) sims2 = [(id, sim) for id, sim in sims2 if id != fire1] # ignore the doc itself sims = sims[:20] self.assertEqual(list(zip(*sims))[0], list(zip(*sims2))[0]) # same doc ids @@ -382,30 +415,31 @@ def model_sanity(self, model, keep_training=True): # sim results should be in clip range if given clip_sims = \ - model.docvecs.most_similar(fire1, clip_start=len(model.docvecs) // 2, clip_end=len(model.docvecs) * 2 // 3) + model.dv.most_similar(fire1, clip_start=len(model.dv) // 2, clip_end=len(model.dv) * 2 // 3) sims_doc_id = [docid for docid, sim in clip_sims] for s_id in sims_doc_id: - self.assertTrue(len(model.docvecs) // 2 <= s_id <= len(model.docvecs) * 2 // 3) + self.assertTrue(len(model.dv) // 2 <= s_id <= len(model.dv) * 2 // 3) - # tennis doc should be out-of-place among fire news - self.assertEqual(model.docvecs.doesnt_match([fire1, tennis1, fire2]), tennis1) + # fire docs should be closer than fire-alt + self.assertLess(model.dv.similarity(fire1, alt1), model.dv.similarity(fire1, fire2)) + self.assertLess(model.dv.similarity(fire2, alt1), model.dv.similarity(fire1, fire2)) - # fire docs should be closer than fire-tennis - self.assertTrue(model.docvecs.similarity(fire1, fire2) > model.docvecs.similarity(fire1, tennis1)) + # alt doc should be out-of-place among fire news + self.assertEqual(model.dv.doesnt_match([fire1, alt1, fire2]), alt1) # keep training after save if keep_training: - tmpf 
= get_tmpfile('gensim_doc2vec.tst') + tmpf = get_tmpfile('gensim_doc2vec_resave.tst') model.save(tmpf) loaded = doc2vec.Doc2Vec.load(tmpf) - loaded.train(documents=sentences, total_examples=loaded.corpus_count, epochs=loaded.epochs) + loaded.train(corpus_iterable=sentences, total_examples=loaded.corpus_count, epochs=loaded.epochs) def test_training(self): """Test doc2vec training.""" corpus = DocsLeeCorpus() model = doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=20, workers=1) model.build_vocab(corpus) - self.assertEqual(model.docvecs.vectors_docs.shape, (300, 100)) + self.assertEqual(model.dv.vectors.shape, (300, 100)) model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs) self.model_sanity(model) @@ -417,12 +451,12 @@ def test_training(self): @unittest.skipIf(os.name == 'nt' and six.PY2, "corpus_file training is not supported on Windows + Py27") def test_training_fromfile(self): """Test doc2vec training.""" - with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file: + with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file: save_lee_corpus_as_line_sentence(corpus_file) model = doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=20, workers=1) model.build_vocab(corpus_file=corpus_file) - self.assertEqual(model.docvecs.vectors_docs.shape, (300, 100)) + self.assertEqual(model.dv.vectors.shape, (300, 100)) model.train(corpus_file=corpus_file, total_words=model.corpus_total_words, epochs=model.epochs) self.model_sanity(model) @@ -438,7 +472,7 @@ def test_dbow_hs(self): @unittest.skipIf(os.name == 'nt' and six.PY2, "corpus_file training is not supported on Windows + Py27") def test_dbow_hs_fromfile(self): """Test DBOW doc2vec training.""" - with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file: + with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file: save_lee_corpus_as_line_sentence(corpus_file) model = doc2vec.Doc2Vec(corpus_file=corpus_file, dm=0, hs=1, negative=0, min_count=2, epochs=20) self.model_sanity(model) @@ -454,7 +488,7 @@ def test_dmm_hs(self): @unittest.skipIf(os.name == 'nt' and six.PY2, "corpus_file training is not supported on Windows + Py27") def test_dmm_hs_fromfile(self): """Test DBOW doc2vec training.""" - with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file: + with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file: save_lee_corpus_as_line_sentence(corpus_file) model = doc2vec.Doc2Vec( list_corpus, dm=1, dm_mean=1, vector_size=24, window=4, @@ -473,7 +507,7 @@ def test_dms_hs(self): @unittest.skipIf(os.name == 'nt' and six.PY2, "corpus_file training is not supported on Windows + Py27") def test_dms_hs_fromfile(self): """Test DBOW doc2vec training.""" - with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file: + with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file: save_lee_corpus_as_line_sentence(corpus_file) model = doc2vec.Doc2Vec( list_corpus, dm=1, dm_mean=0, vector_size=24, window=4, hs=1, @@ -492,7 +526,7 @@ def test_dmc_hs(self): @unittest.skipIf(os.name == 'nt' and six.PY2, "corpus_file training is not supported on Windows + Py27") def test_dmc_hs_fromfile(self): """Test DBOW doc2vec training.""" - with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file: + with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file: save_lee_corpus_as_line_sentence(corpus_file) model = doc2vec.Doc2Vec( list_corpus, dm=1, dm_concat=1, vector_size=24, window=4, @@ -502,15 +536,15 @@ def 
test_dmc_hs_fromfile(self): def test_dbow_neg(self): """Test DBOW doc2vec training.""" - model = doc2vec.Doc2Vec(list_corpus, dm=0, hs=0, negative=10, min_count=2, epochs=20) + model = doc2vec.Doc2Vec(list_corpus, vector_size=16, dm=0, hs=0, negative=5, min_count=2, epochs=40) self.model_sanity(model) @unittest.skipIf(os.name == 'nt' and six.PY2, "corpus_file training is not supported on Windows + Py27") def test_dbow_neg_fromfile(self): """Test DBOW doc2vec training.""" - with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file: + with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file: save_lee_corpus_as_line_sentence(corpus_file) - model = doc2vec.Doc2Vec(list_corpus, dm=0, hs=0, negative=10, min_count=2, epochs=20) + model = doc2vec.Doc2Vec(list_corpus, vector_size=16, dm=0, hs=0, negative=5, min_count=2, epochs=40) self.model_sanity(model) def test_dmm_neg(self): @@ -524,7 +558,7 @@ def test_dmm_neg(self): @unittest.skipIf(os.name == 'nt' and six.PY2, "corpus_file training is not supported on Windows + Py27") def test_dmm_neg_fromfile(self): """Test DBOW doc2vec training.""" - with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file: + with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file: save_lee_corpus_as_line_sentence(corpus_file) model = doc2vec.Doc2Vec( list_corpus, dm=1, dm_mean=1, vector_size=24, window=4, hs=0, @@ -543,7 +577,7 @@ def test_dms_neg(self): @unittest.skipIf(os.name == 'nt' and six.PY2, "corpus_file training is not supported on Windows + Py27") def test_dms_neg_fromfile(self): """Test DBOW doc2vec training.""" - with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file: + with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file: save_lee_corpus_as_line_sentence(corpus_file) model = doc2vec.Doc2Vec( list_corpus, dm=1, dm_mean=0, vector_size=24, window=4, hs=0, @@ -562,7 +596,7 @@ def test_dmc_neg(self): @unittest.skipIf(os.name == 'nt' and six.PY2, "corpus_file training is not supported on Windows + Py27") def test_dmc_neg_fromfile(self): """Test DBOW doc2vec training.""" - with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file: + with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file: save_lee_corpus_as_line_sentence(corpus_file) model = doc2vec.Doc2Vec( list_corpus, dm=1, dm_concat=1, vector_size=24, window=4, hs=0, @@ -571,12 +605,13 @@ def test_dmc_neg_fromfile(self): self.model_sanity(model) def test_parallel(self): - """Test doc2vec parallel training.""" - corpus = utils.RepeatCorpus(DocsLeeCorpus(), 10000) + """Test doc2vec parallel training with more than default 3 threads.""" + # repeat the ~300 doc (~60000 word) Lee corpus to get 6000 docs (~1.2M words) + corpus = utils.RepeatCorpus(DocsLeeCorpus(), 6000) - for workers in [2, 4]: - model = doc2vec.Doc2Vec(corpus, workers=workers) - self.model_sanity(model) + # use smaller batches-to-workers for more contention + model = doc2vec.Doc2Vec(corpus, workers=6, batch_words=5000) + self.model_sanity(model) def test_deterministic_hs(self): """Test doc2vec results identical with identical RNG seed.""" @@ -606,58 +641,25 @@ def test_deterministic_dmc(self): self.models_equal(model, model2) def test_mixed_tag_types(self): - """Ensure alternating int/string tags don't share indexes in vectors_docs""" + """Ensure alternating int/string tags don't share indexes in vectors""" mixed_tag_corpus = [doc2vec.TaggedDocument(words, [i, words[0]]) for i, words in enumerate(raw_sentences)] model = doc2vec.Doc2Vec() 
model.build_vocab(mixed_tag_corpus) - expected_length = len(sentences) + len(model.docvecs.doctags) # 9 sentences, 7 unique first tokens - self.assertEqual(len(model.docvecs.vectors_docs), expected_length) + expected_length = len(sentences) + len(model.dv.key_to_index) # 9 sentences, 7 unique first tokens + self.assertEqual(len(model.dv.vectors), expected_length) + # TODO: test saving in word2vec format def models_equal(self, model, model2): # check words/hidden-weights - self.assertEqual(len(model.wv.vocab), len(model2.wv.vocab)) + self.assertEqual(len(model.wv), len(model2.wv)) self.assertTrue(np.allclose(model.wv.vectors, model2.wv.vectors)) if model.hs: - self.assertTrue(np.allclose(model.trainables.syn1, model2.trainables.syn1)) + self.assertTrue(np.allclose(model.syn1, model2.syn1)) if model.negative: - self.assertTrue(np.allclose(model.trainables.syn1neg, model2.trainables.syn1neg)) + self.assertTrue(np.allclose(model.syn1neg, model2.syn1neg)) # check docvecs - self.assertEqual(len(model.docvecs.doctags), len(model2.docvecs.doctags)) - self.assertEqual(len(model.docvecs.offset2doctag), len(model2.docvecs.offset2doctag)) - - def test_delete_temporary_training_data(self): - """Test doc2vec model after delete_temporary_training_data""" - for i in [0, 1]: - for j in [0, 1]: - model = doc2vec.Doc2Vec(sentences, vector_size=5, min_count=1, window=4, hs=i, negative=j) - if i: - self.assertTrue(hasattr(model.trainables, 'syn1')) - if j: - self.assertTrue(hasattr(model.trainables, 'syn1neg')) - self.assertTrue(hasattr(model, 'syn0_lockf')) - model.delete_temporary_training_data(keep_doctags_vectors=False, keep_inference=False) - self.assertTrue(len(model['human']), 10) - self.assertTrue(model.wv.vocab['graph'].count, 5) - self.assertTrue(not hasattr(model.trainables, 'syn1')) - self.assertTrue(not hasattr(model.trainables, 'syn1neg')) - self.assertTrue(not hasattr(model.trainables, 'syn0_lockf')) - self.assertTrue(model.docvecs and not hasattr(model.docvecs, 'vectors_docs')) - self.assertTrue(model.docvecs and not hasattr(model.docvecs, 'doctag_syn0_lockf')) - model = doc2vec.Doc2Vec( - list_corpus, dm=1, dm_mean=1, vector_size=24, window=4, hs=1, - negative=0, alpha=0.05, min_count=2, epochs=20 - ) - model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True) - self.assertTrue(model.docvecs and hasattr(model.docvecs, 'vectors_docs')) - self.assertTrue(hasattr(model.trainables, 'syn1')) - self.model_sanity(model, keep_training=False) - model = doc2vec.Doc2Vec( - list_corpus, dm=1, dm_mean=1, vector_size=24, window=4, hs=0, - negative=1, alpha=0.05, min_count=2, epochs=20 - ) - model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True) - self.model_sanity(model, keep_training=False) - self.assertTrue(hasattr(model.trainables, 'syn1neg')) + self.assertEqual(len(model.dv), len(model2.dv)) + self.assertEqual(len(model.dv.index_to_key), len(model2.dv.index_to_key)) def test_word_vec_non_writeable(self): model = keyedvectors.KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c')) @@ -666,17 +668,17 @@ def test_word_vec_non_writeable(self): vector *= 0 @log_capture() - def testBuildVocabWarning(self, line): + def testBuildVocabWarning(self, loglines): """Test if logger warning is raised on non-ideal input to a doc2vec model""" raw_sentences = ['human', 'machine'] sentences = [doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(raw_sentences)] model = doc2vec.Doc2Vec() model.build_vocab(sentences) warning = "Each 'words' should be 
a list of words (usually unicode strings)." - self.assertTrue(warning in str(line)) + self.assertTrue(warning in str(loglines)) @log_capture() - def testTrainWarning(self, line): + def testTrainWarning(self, loglines): """Test if warning is raised if alpha rises during subsequent calls to train()""" raw_sentences = [['human'], ['graph', 'trees']] @@ -690,7 +692,7 @@ def testTrainWarning(self, line): if epoch == 5: model.alpha += 0.05 warning = "Effective 'alpha' higher than previous training cycles" - self.assertTrue(warning in str(line)) + self.assertTrue(warning in str(loglines)) def testLoadOnClassError(self): """Test if exception is raised when loading doc2vec model on instance""" @@ -717,8 +719,8 @@ class ConcatenatedDoc2Vec(object): def __init__(self, models): self.models = models - if hasattr(models[0], 'docvecs'): - self.docvecs = ConcatenatedDocvecs([model.docvecs for model in models]) + if hasattr(models[0], 'dv'): + self.dv = ConcatenatedDocvecs([model.dv for model in models]) def __getitem__(self, token): return np.concatenate([model[token] for model in self.models]) @@ -836,4 +838,4 @@ def read_su_sentiment_rotten_tomatoes(dirname, lowercase=True): if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) - unittest.main() + unittest.main(module='gensim.test.test_doc2vec') diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 8f691d4608..c8c9b0582c 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -9,18 +9,18 @@ import os import subprocess import struct +import sys +import six import numpy as np from gensim import utils from gensim.models.word2vec import LineSentence -from gensim.models.fasttext import FastText as FT_gensim -from gensim.models.wrappers.fasttext import FastTextKeyedVectors -from gensim.models.wrappers.fasttext import FastText as FT_wrapper -from gensim.models.keyedvectors import Word2VecKeyedVectors +from gensim.models.fasttext import FastText as FT_gensim, FastTextKeyedVectors, _unpack +from gensim.models.keyedvectors import KeyedVectors from gensim.test.utils import datapath, get_tmpfile, temporary_file, common_texts as sentences import gensim.models._fasttext_bin - +from gensim.models.fasttext_inner import compute_ngrams, compute_ngrams_bytes, ft_hash_bytes import gensim.models.fasttext @@ -37,7 +37,7 @@ # Limit the size of FastText ngram buckets, for RAM reasons. 
# See https://github.com/RaRe-Technologies/gensim/issues/2790 -BUCKET = 5000 +BUCKET = 10000 FT_HOME = os.environ.get("FT_HOME") FT_CMD = os.path.join(FT_HOME, "fasttext") if FT_HOME else None @@ -70,88 +70,88 @@ def setUp(self): self.test_new_model_file = datapath('lee_fasttext_new.bin') def test_training(self): - model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1, bucket=BUCKET) + model = FT_gensim(vector_size=12, min_count=1, hs=1, negative=0, seed=42, workers=1, bucket=BUCKET) model.build_vocab(sentences) self.model_sanity(model) model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs) sims = model.wv.most_similar('graph', topn=10) - self.assertEqual(model.wv.vectors.shape, (12, 10)) - self.assertEqual(len(model.wv.vocab), 12) - self.assertEqual(model.wv.vectors_vocab.shape[1], 10) - self.assertEqual(model.wv.vectors_ngrams.shape[1], 10) + self.assertEqual(model.wv.vectors.shape, (12, 12)) + self.assertEqual(len(model.wv), 12) + self.assertEqual(model.wv.vectors_vocab.shape[1], 12) + self.assertEqual(model.wv.vectors_ngrams.shape[1], 12) self.model_sanity(model) # test querying for "most similar" by vector - graph_vector = model.wv.vectors_norm[model.wv.vocab['graph'].index] + graph_vector = model.wv.get_vector('graph', use_norm=True) sims2 = model.wv.most_similar(positive=[graph_vector], topn=11) sims2 = [(w, sim) for w, sim in sims2 if w != 'graph'] # ignore 'graph' itself self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above - model2 = FT_gensim(sentences, size=10, min_count=1, hs=1, negative=0, seed=42, workers=1, bucket=BUCKET) + model2 = FT_gensim(sentences, vector_size=12, min_count=1, hs=1, negative=0, seed=42, workers=1, bucket=BUCKET) self.models_equal(model, model2) # verify oov-word vector retrieval invocab_vec = model.wv['minors'] # invocab word - self.assertEqual(len(invocab_vec), 10) + self.assertEqual(len(invocab_vec), 12) oov_vec = model.wv['minor'] # oov word - self.assertEqual(len(oov_vec), 10) + self.assertEqual(len(oov_vec), 12) def testFastTextTrainParameters(self): - model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1, bucket=BUCKET) - model.build_vocab(sentences=sentences) + model = FT_gensim(vector_size=12, min_count=1, hs=1, negative=0, seed=42, workers=1, bucket=BUCKET) + model.build_vocab(corpus_iterable=sentences) self.assertRaises(TypeError, model.train, corpus_file=11111) - self.assertRaises(TypeError, model.train, sentences=11111) - self.assertRaises(TypeError, model.train, sentences=sentences, corpus_file='test') - self.assertRaises(TypeError, model.train, sentences=None, corpus_file=None) + self.assertRaises(TypeError, model.train, corpus_iterable=11111) + self.assertRaises(TypeError, model.train, corpus_iterable=sentences, corpus_file='test') + self.assertRaises(TypeError, model.train, corpus_iterable=None, corpus_file=None) self.assertRaises(TypeError, model.train, corpus_file=sentences) def test_training_fromfile(self): with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file: utils.save_as_line_sentence(sentences, corpus_file) - model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1, bucket=BUCKET) + model = FT_gensim(vector_size=12, min_count=1, hs=1, negative=0, seed=42, workers=1, bucket=BUCKET) model.build_vocab(corpus_file=corpus_file) self.model_sanity(model) model.train(corpus_file=corpus_file, total_words=model.corpus_total_words, epochs=model.epochs) sims = model.wv.most_similar('graph', 
topn=10) - self.assertEqual(model.wv.vectors.shape, (12, 10)) - self.assertEqual(len(model.wv.vocab), 12) - self.assertEqual(model.wv.vectors_vocab.shape[1], 10) - self.assertEqual(model.wv.vectors_ngrams.shape[1], 10) + self.assertEqual(model.wv.vectors.shape, (12, 12)) + self.assertEqual(len(model.wv), 12) + self.assertEqual(model.wv.vectors_vocab.shape[1], 12) + self.assertEqual(model.wv.vectors_ngrams.shape[1], 12) self.model_sanity(model) # test querying for "most similar" by vector - graph_vector = model.wv.vectors_norm[model.wv.vocab['graph'].index] + graph_vector = model.wv.get_vector('graph', use_norm=True) sims2 = model.wv.most_similar(positive=[graph_vector], topn=11) sims2 = [(w, sim) for w, sim in sims2 if w != 'graph'] # ignore 'graph' itself self.assertEqual(sims, sims2) # verify oov-word vector retrieval invocab_vec = model.wv['minors'] # invocab word - self.assertEqual(len(invocab_vec), 10) + self.assertEqual(len(invocab_vec), 12) oov_vec = model.wv['minor'] # oov word - self.assertEqual(len(oov_vec), 10) + self.assertEqual(len(oov_vec), 12) def models_equal(self, model, model2): - self.assertEqual(len(model.wv.vocab), len(model2.wv.vocab)) - self.assertEqual(model.wv.num_ngram_vectors, model2.wv.num_ngram_vectors) + self.assertEqual(len(model.wv), len(model2.wv)) + self.assertEqual(model.wv.bucket, model2.wv.bucket) self.assertTrue(np.allclose(model.wv.vectors_vocab, model2.wv.vectors_vocab)) self.assertTrue(np.allclose(model.wv.vectors_ngrams, model2.wv.vectors_ngrams)) self.assertTrue(np.allclose(model.wv.vectors, model2.wv.vectors)) if model.hs: - self.assertTrue(np.allclose(model.trainables.syn1, model2.trainables.syn1)) + self.assertTrue(np.allclose(model.syn1, model2.syn1)) if model.negative: - self.assertTrue(np.allclose(model.trainables.syn1neg, model2.trainables.syn1neg)) - most_common_word = max(model.wv.vocab.items(), key=lambda item: item[1].count)[0] + self.assertTrue(np.allclose(model.syn1neg, model2.syn1neg)) + most_common_word = max(model.wv.key_to_index, key=lambda word: model.wv.get_vecattr(word, 'count'))[0] self.assertTrue(np.allclose(model.wv[most_common_word], model2.wv[most_common_word])) def test_persistence(self): @@ -164,7 +164,7 @@ def test_persistence(self): wv.save(tmpf) loaded_wv = FastTextKeyedVectors.load(tmpf) self.assertTrue(np.allclose(wv.vectors_ngrams, loaded_wv.vectors_ngrams)) - self.assertEqual(len(wv.vocab), len(loaded_wv.vocab)) + self.assertEqual(len(wv), len(loaded_wv)) def test_persistence_fromfile(self): with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file: @@ -179,27 +179,26 @@ def test_persistence_fromfile(self): wv.save(tmpf) loaded_wv = FastTextKeyedVectors.load(tmpf) self.assertTrue(np.allclose(wv.vectors_ngrams, loaded_wv.vectors_ngrams)) - self.assertEqual(len(wv.vocab), len(loaded_wv.vocab)) - - def test_norm_vectors_not_saved(self): - tmpf = get_tmpfile('gensim_fasttext.tst') - model = FT_gensim(sentences, min_count=1, bucket=BUCKET) - model.init_sims() - model.save(tmpf) - loaded_model = FT_gensim.load(tmpf) - self.assertTrue(loaded_model.wv.vectors_norm is None) - self.assertTrue(loaded_model.wv.vectors_ngrams_norm is None) - - wv = model.wv - wv.save(tmpf) - loaded_kv = FastTextKeyedVectors.load(tmpf) - self.assertTrue(loaded_kv.vectors_norm is None) - self.assertTrue(loaded_kv.vectors_ngrams_norm is None) + self.assertEqual(len(wv), len(loaded_wv)) def model_sanity(self, model): - self.assertEqual(model.wv.vectors.shape, (len(model.wv.vocab), model.vector_size)) - 
self.assertEqual(model.wv.vectors_vocab.shape, (len(model.wv.vocab), model.vector_size)) - self.assertEqual(model.wv.vectors_ngrams.shape, (model.wv.num_ngram_vectors, model.vector_size)) + self.model_structural_sanity(model) + # TODO: add semantic tests, where appropriate + + def model_structural_sanity(self, model): + """Check a model for basic self-consistency, necessary properties & property + correspondences, but no semantic tests.""" + self.assertEqual(model.wv.vectors.shape, (len(model.wv), model.vector_size)) + self.assertEqual(model.wv.vectors_vocab.shape, (len(model.wv), model.vector_size)) + self.assertEqual(model.wv.vectors_ngrams.shape, (model.wv.bucket, model.vector_size)) + self.assertLessEqual(len(model.wv.vectors_ngrams_lockf), len(model.wv.vectors_ngrams)) + self.assertLessEqual(len(model.wv.vectors_vocab_lockf), len(model.wv.index_to_key)) + self.assertTrue(np.isfinite(model.wv.vectors_ngrams).all(), "NaN in ngrams") + self.assertTrue(np.isfinite(model.wv.vectors_vocab).all(), "NaN in vectors_vocab") + if model.negative: + self.assertTrue(np.isfinite(model.syn1neg).all(), "NaN in syn1neg") + if model.hs: + self.assertTrue(np.isfinite(model.syn1).all(), "NaN in syn1neg") def test_load_fasttext_format(self): try: @@ -208,8 +207,8 @@ def test_load_fasttext_format(self): self.fail('Unable to load FastText model from file %s: %s' % (self.test_model_file, exc)) vocab_size, model_size = 1762, 10 self.assertEqual(model.wv.vectors.shape, (vocab_size, model_size)) - self.assertEqual(len(model.wv.vocab), vocab_size, model_size) - self.assertEqual(model.wv.vectors_ngrams.shape, (model.wv.num_ngram_vectors, model_size)) + self.assertEqual(len(model.wv), vocab_size, model_size) + self.assertEqual(model.wv.vectors_ngrams.shape, (model.wv.bucket, model_size)) expected_vec = [ -0.57144, @@ -243,16 +242,16 @@ def test_load_fasttext_format(self): actual_vec_oov = model.wv["rejection"] self.assertTrue(np.allclose(actual_vec_oov, expected_vec_oov, atol=1e-4)) - self.assertEqual(model.vocabulary.min_count, 5) + self.assertEqual(model.min_count, 5) self.assertEqual(model.window, 5) self.assertEqual(model.epochs, 5) self.assertEqual(model.negative, 5) - self.assertEqual(model.vocabulary.sample, 0.0001) - self.assertEqual(model.trainables.bucket, 1000) + self.assertEqual(model.sample, 0.0001) + self.assertEqual(model.wv.bucket, 1000) self.assertEqual(model.wv.max_n, 6) self.assertEqual(model.wv.min_n, 3) - self.assertEqual(model.wv.vectors.shape, (len(model.wv.vocab), model.vector_size)) - self.assertEqual(model.wv.vectors_ngrams.shape, (model.wv.num_ngram_vectors, model.vector_size)) + self.assertEqual(model.wv.vectors.shape, (len(model.wv), model.vector_size)) + self.assertEqual(model.wv.vectors_ngrams.shape, (model.wv.bucket, model.vector_size)) def test_load_fasttext_new_format(self): try: @@ -261,8 +260,8 @@ def test_load_fasttext_new_format(self): self.fail('Unable to load FastText model from file %s: %s' % (self.test_new_model_file, exc)) vocab_size, model_size = 1763, 10 self.assertEqual(new_model.wv.vectors.shape, (vocab_size, model_size)) - self.assertEqual(len(new_model.wv.vocab), vocab_size, model_size) - self.assertEqual(new_model.wv.vectors_ngrams.shape, (new_model.wv.num_ngram_vectors, model_size)) + self.assertEqual(len(new_model.wv), vocab_size, model_size) + self.assertEqual(new_model.wv.vectors_ngrams.shape, (new_model.wv.bucket, model_size)) expected_vec = [ -0.025627, @@ -296,16 +295,16 @@ def test_load_fasttext_new_format(self): actual_vec_oov = new_model.wv["rejection"] 
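The hunks above replace the removed `wv.vocab` dict with the flat KeyedVectors accessors. A minimal sketch of the same accessors on a throwaway model, assuming only calls that appear in these hunks; the toy corpus and parameter values are illustrative, not the Lee corpus used by the real tests.

    import numpy as np
    from gensim.models import FastText

    toy_corpus = [["human", "interface", "computer"], ["graph", "trees", "minors"]]
    model = FastText(toy_corpus, vector_size=12, min_count=1, bucket=100, epochs=5)

    assert len(model.wv) == len(model.wv.key_to_index)        # was: len(model.wv.vocab)
    assert model.wv.vectors_ngrams.shape == (model.wv.bucket, model.vector_size)
    unit_vec = model.wv.get_vector("graph", use_norm=True)    # was: wv.vectors_norm[wv.vocab["graph"].index]
    assert np.isclose(np.linalg.norm(unit_vec), 1.0)
    assert model.wv.get_vecattr("graph", "count") >= 1        # was: wv.vocab["graph"].count
    assert model.wv.has_index_for("minors")                   # in-vocab word
    assert not model.wv.has_index_for("minor")                # OOV word...
    assert model.wv["minor"].shape == (model.vector_size,)    # ...still gets an ngram-composed vector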
self.assertTrue(np.allclose(actual_vec_oov, expected_vec_oov, atol=1e-4)) - self.assertEqual(new_model.vocabulary.min_count, 5) + self.assertEqual(new_model.min_count, 5) self.assertEqual(new_model.window, 5) self.assertEqual(new_model.epochs, 5) self.assertEqual(new_model.negative, 5) - self.assertEqual(new_model.vocabulary.sample, 0.0001) - self.assertEqual(new_model.trainables.bucket, 1000) + self.assertEqual(new_model.sample, 0.0001) + self.assertEqual(new_model.wv.bucket, 1000) self.assertEqual(new_model.wv.max_n, 6) self.assertEqual(new_model.wv.min_n, 3) - self.assertEqual(new_model.wv.vectors.shape, (len(new_model.wv.vocab), new_model.vector_size)) - self.assertEqual(new_model.wv.vectors_ngrams.shape, (new_model.wv.num_ngram_vectors, new_model.vector_size)) + self.assertEqual(new_model.wv.vectors.shape, (len(new_model.wv), new_model.vector_size)) + self.assertEqual(new_model.wv.vectors_ngrams.shape, (new_model.wv.bucket, new_model.vector_size)) def test_load_model_supervised(self): with self.assertRaises(NotImplementedError): @@ -380,18 +379,19 @@ def test_most_similar_cosmul(self): def test_lookup(self): # In vocab, sanity check - self.assertTrue('night' in self.test_model.wv.vocab) + self.assertTrue('night' in self.test_model.wv.key_to_index) self.assertTrue(np.allclose(self.test_model.wv['night'], self.test_model.wv[['night']])) # Out of vocab check - self.assertFalse('nights' in self.test_model.wv.vocab) + self.assertFalse('nights' in self.test_model.wv.key_to_index) self.assertTrue(np.allclose(self.test_model.wv['nights'], self.test_model.wv[['nights']])) def test_contains(self): # In vocab, sanity check - self.assertTrue('night' in self.test_model.wv.vocab) + self.assertTrue('night' in self.test_model.wv.key_to_index) self.assertTrue('night' in self.test_model.wv) # Out of vocab check - self.assertFalse('nights' in self.test_model.wv.vocab) + self.assertFalse(self.test_model.wv.has_index_for('nights')) + self.assertFalse('nights' in self.test_model.wv.key_to_index) self.assertTrue('nights' in self.test_model.wv) @unittest.skipIf(PYEMD_EXT is False, "pyemd not installed") @@ -405,8 +405,8 @@ def test_wm_distance(self): def test_cbow_hs_training(self): model_gensim = FT_gensim( - size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0, - min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + vector_size=48, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0, + min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET) lee_data = LineSentence(datapath('lee_background.cor')) @@ -428,14 +428,17 @@ def test_cbow_hs_training(self): u'flights', u'during', u'comes'] - overlap_count = len(set(sims_gensim_words).intersection(expected_sims_words)) - self.assertGreaterEqual(overlap_count, 2) + overlaps = set(sims_gensim_words).intersection(expected_sims_words) + overlap_count = len(overlaps) + self.assertGreaterEqual( + overlap_count, 2, + "only %i overlap in expected %s & actual %s" % (overlap_count, expected_sims_words, sims_gensim_words)) def test_cbow_hs_training_fromfile(self): with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file: model_gensim = FT_gensim( - size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0, - min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + vector_size=48, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0, + min_count=5, epochs=10, batch_words=1000, 
word_ngrams=1, sample=1e-3, min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET * 4) lee_data = LineSentence(datapath('lee_background.cor')) @@ -461,14 +464,17 @@ def test_cbow_hs_training_fromfile(self): u'flights', u'during', u'comes'] - overlap_count = len(set(sims_gensim_words).intersection(expected_sims_words)) - self.assertGreaterEqual(overlap_count, 2) + overlaps = set(sims_gensim_words).intersection(expected_sims_words) + overlap_count = len(overlaps) + self.assertGreaterEqual( + overlap_count, 2, + "only %i overlap in expected %s & actual %s" % (overlap_count, expected_sims_words, sims_gensim_words)) def test_sg_hs_training(self): model_gensim = FT_gensim( - size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0, - min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + vector_size=48, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0, + min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET) lee_data = LineSentence(datapath('lee_background.cor')) @@ -490,14 +496,17 @@ def test_sg_hs_training(self): u'manslaughter', u'north', u'flight'] - overlap_count = len(set(sims_gensim_words).intersection(expected_sims_words)) - self.assertGreaterEqual(overlap_count, 2) + overlaps = set(sims_gensim_words).intersection(expected_sims_words) + overlap_count = len(overlaps) + self.assertGreaterEqual( + overlap_count, 2, + "only %i overlap in expected %s & actual %s" % (overlap_count, expected_sims_words, sims_gensim_words)) def test_sg_hs_training_fromfile(self): with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file: model_gensim = FT_gensim( - size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0, - min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + vector_size=48, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0, + min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET) lee_data = LineSentence(datapath('lee_background.cor')) @@ -523,14 +532,17 @@ def test_sg_hs_training_fromfile(self): u'manslaughter', u'north', u'flight'] - overlap_count = len(set(sims_gensim_words).intersection(expected_sims_words)) - self.assertGreaterEqual(overlap_count, 2) + overlaps = set(sims_gensim_words).intersection(expected_sims_words) + overlap_count = len(overlaps) + self.assertGreaterEqual( + overlap_count, 2, + "only %i overlap in expected %s & actual %s" % (overlap_count, expected_sims_words, sims_gensim_words)) def test_cbow_neg_training(self): model_gensim = FT_gensim( - size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5, - min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + vector_size=48, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5, + min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET) lee_data = LineSentence(datapath('lee_background.cor')) @@ -552,14 +564,17 @@ def test_cbow_neg_training(self): u'remains', u'overnight', u'running'] - overlap_count = len(set(sims_gensim_words).intersection(expected_sims_words)) - self.assertGreaterEqual(overlap_count, 2) + overlaps = set(sims_gensim_words).intersection(expected_sims_words) + overlap_count = len(overlaps) + self.assertGreaterEqual( + overlap_count, 2, + "only %i overlap in 
expected %s & actual %s" % (overlap_count, expected_sims_words, sims_gensim_words)) def test_cbow_neg_training_fromfile(self): with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file: model_gensim = FT_gensim( - size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5, - min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + vector_size=48, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5, + min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET) lee_data = LineSentence(datapath('lee_background.cor')) @@ -585,14 +600,17 @@ def test_cbow_neg_training_fromfile(self): u'remains', u'overnight', u'running'] - overlap_count = len(set(sims_gensim_words).intersection(expected_sims_words)) - self.assertGreaterEqual(overlap_count, 2) + overlaps = set(sims_gensim_words).intersection(expected_sims_words) + overlap_count = len(overlaps) + self.assertGreaterEqual( + overlap_count, 2, + "only %i overlap in expected %s & actual %s" % (overlap_count, expected_sims_words, sims_gensim_words)) def test_sg_neg_training(self): model_gensim = FT_gensim( - size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5, - min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + vector_size=48, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5, + min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET * 4) lee_data = LineSentence(datapath('lee_background.cor')) @@ -614,14 +632,17 @@ def test_sg_neg_training(self): u'firm', u'singles', u'death'] - overlap_count = len(set(sims_gensim_words).intersection(expected_sims_words)) - self.assertGreaterEqual(overlap_count, 2) + overlaps = set(sims_gensim_words).intersection(expected_sims_words) + overlap_count = len(overlaps) + self.assertGreaterEqual( + overlap_count, 2, + "only %i overlap in expected %s & actual %s" % (overlap_count, expected_sims_words, sims_gensim_words)) def test_sg_neg_training_fromfile(self): with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file: model_gensim = FT_gensim( - size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5, - min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + vector_size=48, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5, + min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET * 4) lee_data = LineSentence(datapath('lee_background.cor')) @@ -647,17 +668,20 @@ def test_sg_neg_training_fromfile(self): u'firm', u'singles', u'death'] - overlap_count = len(set(sims_gensim_words).intersection(expected_sims_words)) - self.assertGreaterEqual(overlap_count, 2) + overlaps = set(sims_gensim_words).intersection(expected_sims_words) + overlap_count = len(overlaps) + self.assertGreaterEqual( + overlap_count, 2, + "only %i overlap in expected %s & actual %s" % (overlap_count, expected_sims_words, sims_gensim_words)) def test_online_learning(self): - model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0, bucket=BUCKET) - self.assertTrue(len(model_hs.wv.vocab), 12) - self.assertTrue(model_hs.wv.vocab['graph'].count, 3) + model_hs = FT_gensim(sentences, vector_size=12, min_count=1, seed=42, hs=1, negative=0, bucket=BUCKET) + self.assertEqual(len(model_hs.wv), 12) + 
self.assertEqual(model_hs.wv.get_vecattr('graph', 'count'), 3) model_hs.build_vocab(new_sentences, update=True) # update vocab - self.assertEqual(len(model_hs.wv.vocab), 14) - self.assertTrue(model_hs.wv.vocab['graph'].count, 4) - self.assertTrue(model_hs.wv.vocab['artificial'].count, 4) + self.assertEqual(len(model_hs.wv), 14) + self.assertEqual(model_hs.wv.get_vecattr('graph', 'count'), 4) + self.assertEqual(model_hs.wv.get_vecattr('artificial', 'count'), 4) def test_online_learning_fromfile(self): with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file, \ @@ -666,23 +690,23 @@ def test_online_learning_fromfile(self): utils.save_as_line_sentence(new_sentences, new_corpus_file) model_hs = FT_gensim( - corpus_file=corpus_file, size=10, min_count=1, seed=42, hs=1, negative=0, bucket=BUCKET) - self.assertTrue(len(model_hs.wv.vocab), 12) - self.assertTrue(model_hs.wv.vocab['graph'].count, 3) + corpus_file=corpus_file, vector_size=12, min_count=1, seed=42, hs=1, negative=0, bucket=BUCKET) + self.assertTrue(len(model_hs.wv), 12) + self.assertTrue(model_hs.wv.get_vecattr('graph', 'count'), 3) model_hs.build_vocab(corpus_file=new_corpus_file, update=True) # update vocab - self.assertEqual(len(model_hs.wv.vocab), 14) - self.assertTrue(model_hs.wv.vocab['graph'].count, 4) - self.assertTrue(model_hs.wv.vocab['artificial'].count, 4) + self.assertEqual(len(model_hs.wv), 14) + self.assertTrue(model_hs.wv.get_vecattr('graph', 'count'), 4) + self.assertTrue(model_hs.wv.get_vecattr('artificial', 'count'), 4) def test_online_learning_after_save(self): tmpf = get_tmpfile('gensim_fasttext.tst') - model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5, bucket=BUCKET) + model_neg = FT_gensim(sentences, vector_size=12, min_count=0, seed=42, hs=0, negative=5, bucket=BUCKET) model_neg.save(tmpf) model_neg = FT_gensim.load(tmpf) - self.assertTrue(len(model_neg.wv.vocab), 12) + self.assertTrue(len(model_neg.wv), 12) model_neg.build_vocab(new_sentences, update=True) # update vocab model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.epochs) - self.assertEqual(len(model_neg.wv.vocab), 14) + self.assertEqual(len(model_neg.wv), 14) def test_online_learning_after_save_fromfile(self): with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file, \ @@ -692,31 +716,34 @@ def test_online_learning_after_save_fromfile(self): tmpf = get_tmpfile('gensim_fasttext.tst') model_neg = FT_gensim( - corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=0, negative=5, bucket=BUCKET) + corpus_file=corpus_file, vector_size=12, min_count=0, seed=42, hs=0, negative=5, bucket=BUCKET) model_neg.save(tmpf) model_neg = FT_gensim.load(tmpf) - self.assertTrue(len(model_neg.wv.vocab), 12) + self.assertTrue(len(model_neg.wv), 12) model_neg.build_vocab(corpus_file=new_corpus_file, update=True) # update vocab model_neg.train(corpus_file=new_corpus_file, total_words=model_neg.corpus_total_words, epochs=model_neg.epochs) - self.assertEqual(len(model_neg.wv.vocab), 14) + self.assertEqual(len(model_neg.wv), 14) def online_sanity(self, model): terro, others = [], [] - for x in list_corpus: - if 'terrorism' in x: - terro.append(x) + for line in list_corpus: + if 'terrorism' in line: + terro.append(line) else: - others.append(x) - self.assertTrue(all('terrorism' not in x for x in others)) + others.append(line) + self.assertTrue(all('terrorism' not in line for line in others)) model.build_vocab(others) + start_vecs = model.wv.vectors_vocab.copy() model.train(others, 
total_examples=model.corpus_count, epochs=model.epochs) + # checks that `vectors_vocab` has been changed by training + self.assertFalse(np.all(np.equal(start_vecs, model.wv.vectors_vocab))) # checks that `vectors` is different from `vectors_vocab` self.assertFalse(np.all(np.equal(model.wv.vectors, model.wv.vectors_vocab))) - self.assertFalse('terrorism' in model.wv.vocab) + self.assertFalse('terrorism' in model.wv.key_to_index) model.build_vocab(terro, update=True) # update vocab self.assertTrue(model.wv.vectors_ngrams.dtype == 'float32') - self.assertTrue('terrorism' in model.wv.vocab) + self.assertTrue('terrorism' in model.wv.key_to_index) orig0_all = np.copy(model.wv.vectors_ngrams) model.train(terro, total_examples=len(terro), epochs=model.epochs) self.assertFalse(np.allclose(model.wv.vectors_ngrams, orig0_all)) @@ -724,16 +751,16 @@ def online_sanity(self, model): self.assertLess(0., sim) def test_sg_hs_online(self): - model = FT_gensim(sg=1, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=1, bucket=BUCKET) + model = FT_gensim(sg=1, window=2, hs=1, negative=0, min_count=3, epochs=1, seed=42, workers=1, bucket=BUCKET) self.online_sanity(model) def test_sg_neg_online(self): - model = FT_gensim(sg=1, window=2, hs=0, negative=5, min_count=3, iter=1, seed=42, workers=1, bucket=BUCKET) + model = FT_gensim(sg=1, window=2, hs=0, negative=5, min_count=3, epochs=1, seed=42, workers=1, bucket=BUCKET) self.online_sanity(model) def test_cbow_hs_online(self): model = FT_gensim( - sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=1, + sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, min_count=3, epochs=1, seed=42, workers=1, bucket=BUCKET, ) self.online_sanity(model) @@ -741,12 +768,12 @@ def test_cbow_hs_online(self): def test_cbow_neg_online(self): model = FT_gensim( sg=0, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=5, - min_count=5, iter=1, seed=42, workers=1, sample=0, bucket=BUCKET + min_count=5, epochs=1, seed=42, workers=1, sample=0, bucket=BUCKET ) self.online_sanity(model) def test_get_vocab_word_vecs(self): - model = FT_gensim(size=10, min_count=1, seed=42, bucket=BUCKET) + model = FT_gensim(vector_size=12, min_count=1, seed=42, bucket=BUCKET) model.build_vocab(sentences) original_syn0_vocab = np.copy(model.wv.vectors_vocab) model.wv.adjust_vectors() @@ -755,44 +782,45 @@ def test_get_vocab_word_vecs(self): def test_persistence_word2vec_format(self): """Test storing/loading the model in word2vec format.""" tmpf = get_tmpfile('gensim_fasttext_w2v_format.tst') - model = FT_gensim(sentences, min_count=1, size=10, bucket=BUCKET) + model = FT_gensim(sentences, min_count=1, vector_size=12, bucket=BUCKET) model.wv.save_word2vec_format(tmpf, binary=True) - loaded_model_kv = Word2VecKeyedVectors.load_word2vec_format(tmpf, binary=True) - self.assertEqual(len(model.wv.vocab), len(loaded_model_kv.vocab)) + loaded_model_kv = KeyedVectors.load_word2vec_format(tmpf, binary=True) + self.assertEqual(len(model.wv), len(loaded_model_kv)) self.assertTrue(np.allclose(model.wv['human'], loaded_model_kv['human'])) def test_bucket_ngrams(self): - model = FT_gensim(size=10, min_count=1, bucket=20) + model = FT_gensim(vector_size=12, min_count=1, bucket=20) model.build_vocab(sentences) - self.assertEqual(model.wv.vectors_ngrams.shape, (20, 10)) + self.assertEqual(model.wv.vectors_ngrams.shape, (20, 12)) model.build_vocab(new_sentences, update=True) - self.assertEqual(model.wv.vectors_ngrams.shape, (20, 10)) + 
self.assertEqual(model.wv.vectors_ngrams.shape, (20, 12)) def test_estimate_memory(self): - model = FT_gensim(sg=1, hs=1, size=10, negative=5, min_count=3, bucket=BUCKET) + model = FT_gensim(sg=1, hs=1, vector_size=12, negative=5, min_count=3, bucket=BUCKET) model.build_vocab(sentences) report = model.estimate_memory() self.assertEqual(report['vocab'], 2800) - self.assertEqual(report['syn0_vocab'], 160) - self.assertEqual(report['syn1'], 160) - self.assertEqual(report['syn1neg'], 160) - self.assertEqual(report['syn0_ngrams'], 2240) - self.assertEqual(report['buckets_word'], 640) - self.assertEqual(report['total'], 6160) - - @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32") - def testLoadOldModel(self): + self.assertEqual(report['syn0_vocab'], 192) + self.assertEqual(report['syn1'], 192) + self.assertEqual(report['syn1neg'], 192) + # FIXME: these fixed numbers for particular implementation generations encumber changes without real QA + # perhaps instead verify reports' total is within some close factor of a deep-audit of actual memory used? + self.assertEqual(report['syn0_ngrams'], model.vector_size * np.dtype(np.float32).itemsize * BUCKET) + self.assertEqual(report['buckets_word'], 688) + self.assertEqual(report['total'], 484064) + + def obsolete_testLoadOldModel(self): """Test loading fasttext models from previous version""" model_file = 'fasttext_old' model = FT_gensim.load(datapath(model_file)) self.assertTrue(model.wv.vectors.shape == (12, 100)) - self.assertTrue(len(model.wv.vocab) == 12) - self.assertTrue(len(model.wv.index2word) == 12) + self.assertTrue(len(model.wv) == 12) + self.assertTrue(len(model.wv.index_to_key) == 12) self.assertIsNone(model.corpus_total_words) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), model.vector_size)) - self.assertTrue(model.trainables.vectors_lockf.shape == (12, )) - self.assertTrue(model.vocabulary.cum_table.shape == (12, )) + self.assertTrue(model.syn1neg.shape == (len(model.wv), model.vector_size)) + self.assertTrue(model.wv.vectors_lockf.shape == (12, )) + self.assertTrue(model.cum_table.shape == (12, )) self.assertEqual(model.wv.vectors_vocab.shape, (12, 100)) self.assertEqual(model.wv.vectors_ngrams.shape, (2000000, 100)) @@ -801,92 +829,16 @@ def testLoadOldModel(self): model_file = 'fasttext_old_sep' model = FT_gensim.load(datapath(model_file)) self.assertTrue(model.wv.vectors.shape == (12, 100)) - self.assertTrue(len(model.wv.vocab) == 12) - self.assertTrue(len(model.wv.index2word) == 12) + self.assertTrue(len(model.wv) == 12) + self.assertTrue(len(model.wv.index_to_key) == 12) self.assertIsNone(model.corpus_total_words) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), model.vector_size)) - self.assertTrue(model.trainables.vectors_lockf.shape == (12, )) - self.assertTrue(model.vocabulary.cum_table.shape == (12, )) + self.assertTrue(model.syn1neg.shape == (len(model.wv), model.vector_size)) + self.assertTrue(model.wv.vectors_lockf.shape == (12, )) + self.assertTrue(model.cum_table.shape == (12, )) self.assertEqual(model.wv.vectors_vocab.shape, (12, 100)) self.assertEqual(model.wv.vectors_ngrams.shape, (2000000, 100)) - def compare_with_wrapper(self, model_gensim, model_wrapper): - # make sure we get >=2 overlapping words for top-10 similar words suggested for `night` - sims_gensim = model_gensim.wv.most_similar('night', topn=10) - sims_gensim_words = (list(map(lambda x: x[0], sims_gensim))) # get similar words - - sims_wrapper = model_wrapper.most_similar('night', topn=10) 
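The `obsolete_testLoadOldModel` hunks above flatten the old `model.trainables.*` / `model.vocabulary.*` indirection onto the model itself or its KeyedVectors. A hedged summary of the correspondence, plus a tiny compatibility helper for code that may still receive a 3.x-style pickle; only renames visible in this diff are listed, anything beyond them would be an assumption.

    # old (gensim 3.x) location        ->  new location after this refactoring
    # model.trainables.syn1            ->  model.syn1
    # model.trainables.syn1neg         ->  model.syn1neg
    # model.trainables.vectors_lockf   ->  model.wv.vectors_lockf
    # model.trainables.bucket          ->  model.wv.bucket
    # model.vocabulary.min_count       ->  model.min_count
    # model.vocabulary.sample          ->  model.sample
    # model.vocabulary.cum_table       ->  model.cum_table
    # model.wv.num_ngram_vectors       ->  model.wv.bucket
    # model.wv.index2word              ->  model.wv.index_to_key
    # model.wv.vocab[w].count          ->  model.wv.get_vecattr(w, "count")

    def get_bucket(model):
        """Read the ngram bucket count from either layout (sketch, not exhaustive)."""
        if hasattr(model.wv, "bucket"):      # refactored models: lives on the KeyedVectors
            return model.wv.bucket
        return model.trainables.bucket       # legacy 3.x models: lived on model.trainables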
- sims_wrapper_words = (list(map(lambda x: x[0], sims_wrapper))) # get similar words - - overlap_count = len(set(sims_gensim_words).intersection(sims_wrapper_words)) - - # overlap increases as we increase `iter` value, min overlap set to 2 to avoid unit-tests taking too long - # this limit can be increased when using Cython code - self.assertGreaterEqual(overlap_count, 2) - - @unittest.skipIf(not FT_HOME, "FT_HOME env variable not set, skipping test") - def test_cbow_hs_against_wrapper(self): - tmpf = get_tmpfile('gensim_fasttext.tst') - model_wrapper = FT_wrapper.train(ft_path=FT_CMD, corpus_file=datapath('lee_background.cor'), - output_file=tmpf, model='cbow', size=50, alpha=0.05, window=5, min_count=5, - word_ngrams=1, - loss='hs', sample=1e-3, negative=0, iter=5, min_n=3, max_n=6, sorted_vocab=1, - threads=12) - - model_gensim = FT_gensim(size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0, - min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, - sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET) - - lee_data = LineSentence(datapath('lee_background.cor')) - model_gensim.build_vocab(lee_data) - orig0 = np.copy(model_gensim.wv.vectors[0]) - model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.epochs) - self.assertFalse((orig0 == model_gensim.wv.vectors[0]).all()) # vector should vary after training - self.compare_with_wrapper(model_gensim, model_wrapper) - - @unittest.skipIf(not FT_HOME, "FT_HOME env variable not set, skipping test") - def test_sg_hs_against_wrapper(self): - - tmpf = get_tmpfile('gensim_fasttext.tst') - model_wrapper = FT_wrapper.train(ft_path=FT_CMD, corpus_file=datapath('lee_background.cor'), - output_file=tmpf, model='skipgram', size=50, alpha=0.025, window=5, - min_count=5, word_ngrams=1, - loss='hs', sample=1e-3, negative=0, iter=5, min_n=3, max_n=6, sorted_vocab=1, - threads=12) - - model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0, - min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, - sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET) - - lee_data = LineSentence(datapath('lee_background.cor')) - model_gensim.build_vocab(lee_data) - orig0 = np.copy(model_gensim.wv.vectors[0]) - model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.epochs) - self.assertFalse((orig0 == model_gensim.wv.vectors[0]).all()) # vector should vary after training - self.compare_with_wrapper(model_gensim, model_wrapper) - - def test_vocab_pruning(self): - """Does the model correctly interpret the max_final_vocab parameter?""" - sentences = [ - ["graph", "system"], - ["graph", "system"], - ["system", "eps"], - ["graph", "system"], - ] - model = FT_gensim(sentences, size=10, min_count=2, max_final_vocab=2) - self.assertEqual(len(model.wv.vocab), 2) - self.assertEqual(model.wv.vocab['graph'].count, 3) - self.assertEqual(model.wv.vocab['system'].count, 4) - - model = FT_gensim(sentences, size=10, min_count=2, max_final_vocab=1) - self.assertEqual(len(model.wv.vocab), 1) - self.assertEqual(model.wv.vocab['system'].count, 4) - - model = FT_gensim(sentences, size=10, min_count=4) - self.assertEqual(len(model.wv.vocab), 1) - self.assertEqual(model.wv.vocab['system'].count, 4) - with open(datapath('toy-data.txt')) as fin: TOY_SENTENCES = [fin.read().strip().split(' ')] @@ -896,7 +848,7 @@ def train_gensim(bucket=100, min_count=5): # # Set parameters to match those in the load_native function # 
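The `train_gensim` hunk just below is one of many in this file applying the constructor-level renames. A hedged sketch of the same pattern in isolation, with a toy corpus and arbitrary sizes; `size`, `iter` and `sentences=` are the pre-refactoring spellings.

    from gensim.models import FastText

    sentences = [["human", "interface", "computer"], ["graph", "trees", "minors"]]

    # before: FastText(size=10, iter=5, min_count=1, bucket=100)
    model = FastText(vector_size=10, epochs=5, min_count=1, bucket=100)

    # before: model.build_vocab(sentences=sentences)
    model.build_vocab(corpus_iterable=sentences)

    # `corpus_file=` remains available and is still mutually exclusive with an iterable
    model.train(corpus_iterable=sentences, total_examples=model.corpus_count, epochs=model.epochs)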
- model = FT_gensim(bucket=bucket, size=5, alpha=0.05, workers=1, sample=0.0001, min_count=min_count) + model = FT_gensim(bucket=bucket, vector_size=5, alpha=0.05, workers=1, sample=0.0001, min_count=min_count) model.build_vocab(TOY_SENTENCES) model.train(TOY_SENTENCES, total_examples=len(TOY_SENTENCES), epochs=model.epochs) return model @@ -923,28 +875,10 @@ def load_vec(fin): def compare_wv(a, b, t): - a_count = {key: value.count for (key, value) in a.vocab.items()} - b_count = {key: value.count for (key, value) in b.vocab.items()} + a_count = {key: a.get_vecattr(key, 'count') for key in a.key_to_index} + b_count = {key: b.get_vecattr(key, 'count') for key in b.key_to_index} t.assertEqual(a_count, b_count) - # - # We don't compare indices because they depend on several things we - # cannot control during testing: - # - # 1. The order in which ties are broken when sorting the vocabulary - # in prepare_vocab - # 2. The order in which vocab terms are added to vocab_raw - # - if False: - a_indices = {key: value.index for (key, value) in a.vocab.items()} - b_indices = {key: value.index for (key, value) in b.vocab.items()} - a_words = [k for k in sorted(a_indices, key=lambda x: a_indices[x])] - b_words = [k for k in sorted(b_indices, key=lambda x: b_indices[x])] - t.assertEqual(a_words, b_words) - - t.assertEqual(a.index2word, b.index2word) - t.assertEqual(a.hash2index, b.hash2index) - # # We do not compare most matrices directly, because they will never # be equal unless many conditions are strictly controlled. @@ -955,11 +889,6 @@ def compare_wv(a, b, t): t.assertEqual(a.vectors_vocab.shape, b.vectors_vocab.shape) # t.assertTrue(np.allclose(a.vectors_vocab, b.vectors_vocab)) - # - # Only if match_gensim=True in init_post_load - # - # t.assertEqual(a.vectors_ngrams.shape, b.vectors_ngrams.shape) - def compare_nn(a, b, t): # @@ -991,6 +920,7 @@ def compare_vocabulary(a, b, t): class NativeTrainingContinuationTest(unittest.TestCase): maxDiff = None + model_structural_sanity = TestFastTextModel.model_structural_sanity def setUp(self): # @@ -1017,89 +947,74 @@ def test_in_vocab(self): expected = dict(load_vec(fin)) for word, expected_vector in expected.items(): - actual_vector = native.wv.word_vec(word) + actual_vector = native.wv.get_vector(word) self.assertTrue(np.allclose(expected_vector, actual_vector, atol=1e-5)) + self.model_structural_sanity(native) + def test_out_of_vocab(self): """Test for correct representation of out-of-vocab words.""" native = load_native() for word, expected_vector in self.oov_expected.items(): - actual_vector = native.wv.word_vec(word) + actual_vector = native.wv.get_vector(word) self.assertTrue(np.allclose(expected_vector, actual_vector, atol=1e-5)) - @unittest.skip('this test does not pass currently, I suspect a bug in our FT implementation') - def test_out_of_vocab_gensim(self): - """Test whether gensim gives similar results to FB for OOV words. - - Seems to be broken for our toy model. - """ - model = train_gensim() - - for word, expected_vector in self.oov_expected.items(): - actual_vector = model.wv.word_vec(word) - self.assertTrue(np.allclose(expected_vector, actual_vector, atol=1e-5)) + self.model_structural_sanity(native) def test_sanity(self): """Compare models trained on toy data. 
They should be equal.""" trained = train_gensim() native = load_native() - self.assertEqual(trained.bucket, native.bucket) + self.assertEqual(trained.wv.bucket, native.wv.bucket) # # Only if match_gensim=True in init_post_load # - # self.assertEqual(trained.num_ngram_vectors, native.num_ngram_vectors) + # self.assertEqual(trained.bucket, native.bucket) compare_wv(trained.wv, native.wv, self) - compare_vocabulary(trained.vocabulary, native.vocabulary, self) - compare_nn(trained.trainables, native.trainables, self) + compare_vocabulary(trained, native, self) + compare_nn(trained, native, self) + + self.model_structural_sanity(trained) + self.model_structural_sanity(native) def test_continuation_native(self): """Ensure that training has had a measurable effect.""" native = load_native() + self.model_structural_sanity(native) # # Pick a word that's is in both corpuses. # Its vectors should be different between training runs. # - word = 'human' - old_vector = native.wv.word_vec(word).tolist() + word = 'human' # FIXME: this isn't actually in model, except via OOV ngrams + old_vector = native.wv.get_vector(word).tolist() native.train(list_corpus, total_examples=len(list_corpus), epochs=native.epochs) - new_vector = native.wv.word_vec(word).tolist() + new_vector = native.wv.get_vector(word).tolist() self.assertNotEqual(old_vector, new_vector) + self.model_structural_sanity(native) def test_continuation_gensim(self): """Ensure that continued training has had a measurable effect.""" model = train_gensim(min_count=0) + self.model_structural_sanity(model) vectors_ngrams_before = np.copy(model.wv.vectors_ngrams) word = 'human' - old_vector = model.wv.word_vec(word).tolist() + old_vector = model.wv.get_vector(word).tolist() model.train(list_corpus, total_examples=len(list_corpus), epochs=model.epochs) vectors_ngrams_after = np.copy(model.wv.vectors_ngrams) self.assertFalse(np.allclose(vectors_ngrams_before, vectors_ngrams_after)) - new_vector = model.wv.word_vec(word).tolist() + new_vector = model.wv.get_vector(word).tolist() self.assertNotEqual(old_vector, new_vector) - - def test_continuation_load_gensim(self): - # - # This is a model from 3.6.0 - # - model = FT_gensim.load(datapath('compatible-hash-false.model')) - vectors_ngrams_before = np.copy(model.wv.vectors_ngrams) - old_vector = model.wv.word_vec('human').tolist() - - model.train(list_corpus, total_examples=len(list_corpus), epochs=model.epochs) - new_vector = model.wv.word_vec('human').tolist() - - self.assertFalse(np.allclose(vectors_ngrams_before, model.wv.vectors_ngrams)) - self.assertNotEqual(old_vector, new_vector) + self.model_structural_sanity(model) def test_save_load_gensim(self): """Test that serialization works end-to-end. Not crashing is a success.""" @@ -1115,9 +1030,11 @@ def test_save_load_gensim(self): train_gensim().save(model_name) model = FT_gensim.load(model_name) + self.model_structural_sanity(model) model.train(list_corpus, total_examples=len(list_corpus), epochs=model.epochs) model.save(model_name) + self.model_structural_sanity(model) def test_save_load_native(self): """Test that serialization works end-to-end. 
Not crashing is a success.""" @@ -1128,21 +1045,24 @@ def test_save_load_native(self): load_native().save(model_name) model = FT_gensim.load(model_name) + self.model_structural_sanity(model) model.train(list_corpus, total_examples=len(list_corpus), epochs=model.epochs) model.save(model_name) + self.model_structural_sanity(model) def test_load_native_pretrained(self): model = gensim.models.fasttext.load_facebook_model(datapath('toy-model-pretrained.bin')) - actual = model['monarchist'] + actual = model.wv['monarchist'] expected = np.array([0.76222, 1.0669, 0.7055, -0.090969, -0.53508]) self.assertTrue(np.allclose(expected, actual, atol=10e-4)) + self.model_structural_sanity(model) def test_load_native_vectors(self): cap_path = datapath("crime-and-punishment.bin") fbkv = gensim.models.fasttext.load_facebook_vectors(cap_path) - self.assertFalse('landlord' in fbkv.vocab) - self.assertTrue('landlady' in fbkv.vocab) + self.assertFalse('landlord' in fbkv.key_to_index) + self.assertTrue('landlady' in fbkv.key_to_index) oov_vector = fbkv['landlord'] iv_vector = fbkv['landlady'] self.assertFalse(np.allclose(oov_vector, iv_vector)) @@ -1153,6 +1073,7 @@ def test_no_ngrams(self): v1 = model.wv[''] origin = np.zeros(v1.shape, v1.dtype) self.assertTrue(np.allclose(v1, origin)) + self.model_structural_sanity(model) def _train_model_with_pretrained_vectors(): @@ -1176,23 +1097,13 @@ class HashCompatibilityTest(unittest.TestCase): def test_compatibility_true(self): m = FT_gensim.load(datapath('compatible-hash-true.model')) self.assertTrue(m.wv.compatible_hash) - self.assertEqual(m.trainables.bucket, m.wv.bucket) - - def test_compatibility_false(self): - # - # Originally obtained using and older version of gensim (e.g. 3.6.0). - # - m = FT_gensim.load(datapath('compatible-hash-false.model')) - self.assertFalse(m.wv.compatible_hash) - self.assertEqual(m.trainables.bucket, m.wv.bucket) def test_hash_native(self): m = load_native() self.assertTrue(m.wv.compatible_hash) - self.assertEqual(m.trainables.bucket, m.wv.bucket) -class HashTest(unittest.TestCase): +class FTHashResultsTest(unittest.TestCase): """Loosely based on the test described here: https://github.com/RaRe-Technologies/gensim/issues/2059#issuecomment-432300777 @@ -1233,6 +1144,232 @@ def test_out_of_vocab(self): self.assertTrue(np.allclose(expected[longword], actual[longword], atol=1e-5)) +def hash_main(alg): + """Generate hash values for test from standard input.""" + + assert six.PY3, 'this only works under Py3' + + hashmap = { + 'cy_bytes': ft_hash_bytes, + } + try: + fun = hashmap[alg] + except KeyError: + raise KeyError('invalid alg: %r expected one of %r' % (alg, sorted(hashmap))) + + for line in sys.stdin: + if 'bytes' in alg: + words = line.encode('utf-8').rstrip().split(b' ') + else: + words = line.rstrip().split(' ') + for word in words: + print('u%r: %r,' % (word, fun(word))) + + +class FTHashFunctionsTest(unittest.TestCase): + def setUp(self): + # + # I obtained these expected values using: + # + # $ echo word1 ... wordN | python -c 'from gensim.test.test_fasttext import hash_main;hash_main("alg")' # noqa: E501 + # + # where alg is cy_bytes (previous options had included: py_bytes, py_broken, cy_bytes, cy_broken.) 
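The reference hashes listed below were generated by piping words through `hash_main`. The sketch here shows how such a hash value feeds ngram lookup: each byte-ngram hash, reduced modulo `bucket`, selects a row of `wv.vectors_ngrams`, and gensim averages those rows to build an OOV word's vector. The import path is assumed to match this module's own import of the Cython helpers.

    from gensim.models.fasttext_inner import compute_ngrams_bytes, ft_hash_bytes

    word = 'команда'
    assert ft_hash_bytes(word.encode('utf-8')) == 1725507386   # first entry in the table below

    bucket = 2_000_000   # Facebook fastText's default bucket count
    rows = sorted({ft_hash_bytes(ng) % bucket for ng in compute_ngrams_bytes(word, 3, 6)})
    print(rows)          # indices into wv.vectors_ngrams contributing to this word's vector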
+ + # + self.expected = { + u'команда': 1725507386, + u'маленьких': 3011324125, + u'друзей': 737001801, + u'возит': 4225261911, + u'грузы': 1301826944, + u'всех': 706328732, + u'быстрей': 1379730754, + u'mysterious': 1903186891, + u'asteroid': 1988297200, + u'odyssey': 310195777, + u'introduction': 2848265721, + u'北海道': 4096045468, + u'札幌': 3909947444, + u'西区': 3653372632, + } + + def test_cython(self): + actual = {k: ft_hash_bytes(k.encode('utf-8')) for k in self.expected} + self.assertEqual(self.expected, actual) + + +# +# Run with: +# +# python -c 'import gensim.test.test_fasttext as t;t.ngram_main()' py_text 3 5 +# +def ngram_main(): + """Generate ngrams for tests from standard input.""" + + alg = sys.argv[1] + minn = int(sys.argv[2]) + maxn = int(sys.argv[3]) + + assert six.PY3, 'this only works under Py3' + assert minn <= maxn, 'expected sane command-line parameters' + + hashmap = { + 'cy_text': compute_ngrams, + 'cy_bytes': compute_ngrams_bytes, + } + try: + fun = hashmap[alg] + except KeyError: + raise KeyError('invalid alg: %r expected one of %r' % (alg, sorted(hashmap))) + + for line in sys.stdin: + word = line.rstrip('\n') + ngrams = fun(word, minn, maxn) + print("%r: %r," % (word, ngrams)) + + +class NgramsTest(unittest.TestCase): + def setUp(self): + self.expected_text = { + 'test': ['', '', ''], + 'at the': [ + '', + '', '' + ], + 'at\nthe': [ + '', + '', '' + ], + 'тест': ['<те', 'тес', 'ест', 'ст>', '<тес', 'тест', 'ест>', '<тест', 'тест>'], + 'テスト': ['<テス', 'テスト', 'スト>', '<テスト', 'テスト>', '<テスト>'], + '試し': ['<試し', '試し>', '<試し>'], + } + self.expected_bytes = { + 'test': [b'', b'est', b'est>', b'st>'], + 'at the': [ + b'', b'the', b'the>', b'he>' + ], + 'тест': [ + b'<\xd1\x82\xd0\xb5', b'<\xd1\x82\xd0\xb5\xd1\x81', b'<\xd1\x82\xd0\xb5\xd1\x81\xd1\x82', + b'\xd1\x82\xd0\xb5\xd1\x81', b'\xd1\x82\xd0\xb5\xd1\x81\xd1\x82', b'\xd1\x82\xd0\xb5\xd1\x81\xd1\x82>', + b'\xd0\xb5\xd1\x81\xd1\x82', b'\xd0\xb5\xd1\x81\xd1\x82>', b'\xd1\x81\xd1\x82>' + ], + 'テスト': [ + b'<\xe3\x83\x86\xe3\x82\xb9', b'<\xe3\x83\x86\xe3\x82\xb9\xe3\x83\x88', + b'<\xe3\x83\x86\xe3\x82\xb9\xe3\x83\x88>', b'\xe3\x83\x86\xe3\x82\xb9\xe3\x83\x88', + b'\xe3\x83\x86\xe3\x82\xb9\xe3\x83\x88>', b'\xe3\x82\xb9\xe3\x83\x88>' + ], + '試し': [b'<\xe8\xa9\xa6\xe3\x81\x97', b'<\xe8\xa9\xa6\xe3\x81\x97>', b'\xe8\xa9\xa6\xe3\x81\x97>'], + } + + self.expected_text_wide_unicode = { + '🚑🚒🚓🚕': [ + '<🚑🚒', '🚑🚒🚓', '🚒🚓🚕', '🚓🚕>', + '<🚑🚒🚓', '🚑🚒🚓🚕', '🚒🚓🚕>', '<🚑🚒🚓🚕', '🚑🚒🚓🚕>' + ], + } + self.expected_bytes_wide_unicode = { + '🚑🚒🚓🚕': [ + b'<\xf0\x9f\x9a\x91\xf0\x9f\x9a\x92', + b'<\xf0\x9f\x9a\x91\xf0\x9f\x9a\x92\xf0\x9f\x9a\x93', + b'<\xf0\x9f\x9a\x91\xf0\x9f\x9a\x92\xf0\x9f\x9a\x93\xf0\x9f\x9a\x95', + b'\xf0\x9f\x9a\x91\xf0\x9f\x9a\x92\xf0\x9f\x9a\x93', + b'\xf0\x9f\x9a\x91\xf0\x9f\x9a\x92\xf0\x9f\x9a\x93\xf0\x9f\x9a\x95', + b'\xf0\x9f\x9a\x91\xf0\x9f\x9a\x92\xf0\x9f\x9a\x93\xf0\x9f\x9a\x95>', + b'\xf0\x9f\x9a\x92\xf0\x9f\x9a\x93\xf0\x9f\x9a\x95', + b'\xf0\x9f\x9a\x92\xf0\x9f\x9a\x93\xf0\x9f\x9a\x95>', + b'\xf0\x9f\x9a\x93\xf0\x9f\x9a\x95>' + ], + } + + def test_text_cy(self): + for word in self.expected_text: + expected = self.expected_text[word] + actual = compute_ngrams(word, 3, 5) + self.assertEqual(expected, actual) + + @unittest.skipIf(sys.maxunicode == 0xffff, "Python interpreter doesn't support UCS-4 (wide unicode)") + def test_text_cy_wide_unicode(self): + for word in self.expected_text_wide_unicode: + expected = self.expected_text_wide_unicode[word] + actual = compute_ngrams(word, 3, 5) + self.assertEqual(expected, actual) + + def 
test_bytes_cy(self): + for word in self.expected_bytes: + expected = self.expected_bytes[word] + actual = compute_ngrams_bytes(word, 3, 5) + self.assertEqual(expected, actual) + + expected_text = self.expected_text[word] + actual_text = [n.decode('utf-8') for n in actual] + self.assertEqual(sorted(expected_text), sorted(actual_text)) + + for word in self.expected_bytes_wide_unicode: + expected = self.expected_bytes_wide_unicode[word] + actual = compute_ngrams_bytes(word, 3, 5) + self.assertEqual(expected, actual) + + expected_text = self.expected_text_wide_unicode[word] + actual_text = [n.decode('utf-8') for n in actual] + self.assertEqual(sorted(expected_text), sorted(actual_text)) + + def test_fb(self): + """Test against results from Facebook's implementation.""" + with utils.open(datapath('fb-ngrams.txt'), 'r', encoding='utf-8') as fin: + fb = dict(_read_fb(fin)) + + for word, expected in fb.items(): + # + # The model was trained with minn=3, maxn=6 + # + actual = compute_ngrams(word, 3, 6) + self.assertEqual(sorted(expected), sorted(actual)) + + +def _read_fb(fin): + """Read ngrams from output of the FB utility.""" + # + # $ cat words.txt + # test + # at the + # at\nthe + # тест + # テスト + # 試し + # 🚑🚒🚓🚕 + # $ while read w; + # do + # echo ""; + # echo $w; + # ./fasttext print-ngrams gensim/test/test_data/crime-and-punishment.bin "$w"; + # echo ""; + # done < words.txt > gensim/test/test_data/fb-ngrams.txt + # + while fin: + line = fin.readline().rstrip() + if not line: + break + + assert line == '' + word = fin.readline().rstrip() + + fin.readline() # ignore this line, it contains an origin vector for the full term + + ngrams = [] + while True: + line = fin.readline().rstrip() + if line == '': + break + + columns = line.split(' ') + term = ' '.join(columns[:-5]) + ngrams.append(term) + + yield word, ngrams + + class ZeroBucketTest(unittest.TestCase): def test_in_vocab(self): model = train_gensim(bucket=0) @@ -1351,7 +1488,7 @@ class SaveFacebookFormatModelTest(unittest.TestCase): def _check_roundtrip(self, sg): model_params = { "sg": sg, - "size": 10, + "vector_size": 10, "min_count": 1, "hs": 1, "negative": 5, @@ -1369,13 +1506,13 @@ def _check_roundtrip(self, sg): self.assertEqual(model_trained.negative, model_loaded.negative) self.assertEqual(model_trained.hs, model_loaded.hs) self.assertEqual(model_trained.sg, model_loaded.sg) - self.assertEqual(model_trained.trainables.bucket, model_loaded.trainables.bucket) + self.assertEqual(model_trained.wv.bucket, model_loaded.wv.bucket) self.assertEqual(model_trained.wv.min_n, model_loaded.wv.min_n) self.assertEqual(model_trained.wv.max_n, model_loaded.wv.max_n) - self.assertEqual(model_trained.vocabulary.sample, model_loaded.vocabulary.sample) - self.assertEqual(set(model_trained.wv.index2word), set(model_loaded.wv.index2word)) + self.assertEqual(model_trained.sample, model_loaded.sample) + self.assertEqual(set(model_trained.wv.index_to_key), set(model_loaded.wv.index_to_key)) - for w in model_trained.wv.index2word: + for w in model_trained.wv.index_to_key: v_orig = model_trained.wv[w] v_loaded = model_loaded.wv[w] self.assertLess(calc_max_diff(v_orig, v_loaded), MAX_WORDVEC_COMPONENT_DIFFERENCE) @@ -1406,7 +1543,7 @@ class SaveGensimByteIdentityTest(unittest.TestCase): def _check_roundtrip_file_file(self, sg): model_params = { "sg": sg, - "size": 10, + "vector_size": 10, "min_count": 1, "hs": 1, "negative": 0, @@ -1435,7 +1572,7 @@ def _save_test_model(out_base_fname, model_params): inp_fname = datapath('lee_background.cor') model_type = 
"cbow" if model_params["sg"] == 0 else "skipgram" - size = str(model_params["size"]) + size = str(model_params["vector_size"]) seed = str(model_params["seed"]) cmd = [ @@ -1457,7 +1594,7 @@ class SaveFacebookByteIdentityTest(unittest.TestCase): """ def _check_roundtrip_file_file(self, sg): - model_params = {"size": 10, "sg": sg, "seed": 42} + model_params = {"vector_size": 10, "sg": sg, "seed": 42} # fasttext tool creates both *vec and *bin files, so we have to remove both, even thought *vec is unused @@ -1506,7 +1643,7 @@ class SaveFacebookFormatReadingTest(unittest.TestCase): def _check_load_fasttext_format(self, sg): model_params = { "sg": sg, - "size": 10, + "vector_size": 10, "min_count": 1, "hs": 1, "negative": 5, @@ -1516,9 +1653,9 @@ def _check_load_fasttext_format(self, sg): with temporary_file("load_fasttext.bin") as fpath: model = _create_and_save_fb_model(fpath, model_params) - wv = _read_wordvectors_using_fasttext(fpath, model.wv.index2word) + wv = _read_wordvectors_using_fasttext(fpath, model.wv.index_to_key) - for i, w in enumerate(model.wv.index2word): + for i, w in enumerate(model.wv.index_to_key): diff = calc_max_diff(wv[i, :], model.wv[w]) # Because fasttext command line prints vectors with limited accuracy self.assertLess(diff, 1.0e-4) @@ -1530,6 +1667,38 @@ def test_cbow(self): self._check_load_fasttext_format(sg=0) +class UnpackTest(unittest.TestCase): + def test_sanity(self): + m = np.array(range(9)) + m.shape = (3, 3) + hash2index = {10: 0, 11: 1, 12: 2} + + n = _unpack(m, 25, hash2index) + self.assertTrue(np.all(np.array([0, 1, 2]) == n[10])) + self.assertTrue(np.all(np.array([3, 4, 5]) == n[11])) + self.assertTrue(np.all(np.array([6, 7, 8]) == n[12])) + + def test_tricky(self): + m = np.array(range(9)) + m.shape = (3, 3) + hash2index = {1: 0, 0: 1, 12: 2} + + n = _unpack(m, 25, hash2index) + self.assertTrue(np.all(np.array([3, 4, 5]) == n[0])) + self.assertTrue(np.all(np.array([0, 1, 2]) == n[1])) + self.assertTrue(np.all(np.array([6, 7, 8]) == n[12])) + + def test_identity(self): + m = np.array(range(9)) + m.shape = (3, 3) + hash2index = {0: 0, 1: 1, 2: 2} + + n = _unpack(m, 25, hash2index) + self.assertTrue(np.all(np.array([0, 1, 2]) == n[0])) + self.assertTrue(np.all(np.array([3, 4, 5]) == n[1])) + self.assertTrue(np.all(np.array([6, 7, 8]) == n[2])) + + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() diff --git a/gensim/test/test_fasttext_wrapper.py b/gensim/test/test_fasttext_wrapper.py deleted file mode 100644 index 9c619a3efb..0000000000 --- a/gensim/test/test_fasttext_wrapper.py +++ /dev/null @@ -1,382 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2010 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -""" -Automated tests for checking transformation algorithms (the models package). 
-""" - -import logging -import unittest -import os - -import numpy - -from gensim.models.wrappers import fasttext -from gensim.models import keyedvectors -from gensim.test.utils import datapath, get_tmpfile - - -try: - from pyemd import emd # noqa:F401 - PYEMD_EXT = True -except (ImportError, ValueError): - PYEMD_EXT = False - - -logger = logging.getLogger(__name__) - - -class TestFastText(unittest.TestCase): - def setUp(self): - ft_home = os.environ.get('FT_HOME', None) - self.ft_path = os.path.join(ft_home, 'fasttext') if ft_home else None - self.corpus_file = datapath('lee_background.cor') - self.test_model_file = datapath('lee_fasttext') - self.test_new_model_file = datapath('lee_fasttext_new') - # Load pre-trained model to perform tests in case FastText binary isn't available in test environment - self.test_model = fasttext.FastText.load_fasttext_format(self.test_model_file) - - def model_sanity(self, model): - """Even tiny models trained on any corpus should pass these sanity checks""" - self.assertEqual(model.wv.syn0.shape, (len(model.wv.vocab), model.vector_size)) - self.assertEqual(model.wv.syn0_ngrams.shape, (model.num_ngram_vectors, model.vector_size)) - - def models_equal(self, model1, model2): - self.assertEqual(len(model1.wv.vocab), len(model2.wv.vocab)) - self.assertEqual(set(model1.wv.vocab.keys()), set(model2.wv.vocab.keys())) - self.assertTrue(numpy.allclose(model1.wv.syn0, model2.wv.syn0)) - self.assertTrue(numpy.allclose(model1.wv.syn0_ngrams, model2.wv.syn0_ngrams)) - - def testTraining(self): - """Test self.test_model successfully trained, parameters and weights correctly loaded""" - if self.ft_path is None: - logger.info("FT_HOME env variable not set, skipping test") - return # Use self.skipTest once python < 2.7 is no longer supported - vocab_size, model_size = 1763, 10 - tmpf = get_tmpfile('gensim_fasttext_wrapper.tst') - trained_model = fasttext.FastText.train( - self.ft_path, self.corpus_file, size=model_size, output_file=tmpf - ) - - self.assertEqual(trained_model.wv.syn0.shape, (vocab_size, model_size)) - self.assertEqual(len(trained_model.wv.vocab), vocab_size) - self.assertEqual(trained_model.wv.syn0_ngrams.shape[1], model_size) - self.model_sanity(trained_model) - - # Tests temporary training files deleted - self.assertFalse(os.path.exists('%s.bin' % tmpf)) - - def testMinCount(self): - """Tests words with frequency less than `min_count` absent from vocab""" - if self.ft_path is None: - logger.info("FT_HOME env variable not set, skipping test") - return # Use self.skipTest once python < 2.7 is no longer supported - tmpf = get_tmpfile('gensim_fasttext_wrapper.tst') - test_model_min_count_5 = fasttext.FastText.train( - self.ft_path, self.corpus_file, output_file=tmpf, size=10, min_count=5 - ) - self.assertTrue('forests' not in test_model_min_count_5.wv.vocab) - - test_model_min_count_1 = fasttext.FastText.train( - self.ft_path, self.corpus_file, output_file=tmpf, size=10, min_count=1 - ) - self.assertTrue('forests' in test_model_min_count_1.wv.vocab) - - def testModelSize(self): - """Tests output vector dimensions are the same as the value for `size` param""" - if self.ft_path is None: - logger.info("FT_HOME env variable not set, skipping test") - return # Use self.skipTest once python < 2.7 is no longer supported - tmpf = get_tmpfile('gensim_fasttext_wrapper.tst') - test_model_size_20 = fasttext.FastText.train( - self.ft_path, self.corpus_file, output_file=tmpf, size=20 - ) - self.assertEqual(test_model_size_20.vector_size, 20) - 
self.assertEqual(test_model_size_20.wv.syn0.shape[1], 20) - self.assertEqual(test_model_size_20.wv.syn0_ngrams.shape[1], 20) - - def testPersistence(self): - """Test storing/loading the entire model.""" - tmpf = get_tmpfile('gensim_fasttext_wrapper.tst') - self.test_model.save(tmpf) - loaded = fasttext.FastText.load(tmpf) - self.models_equal(self.test_model, loaded) - - self.test_model.save(tmpf, sep_limit=0) - self.models_equal(self.test_model, fasttext.FastText.load(tmpf)) - - def testNormalizedVectorsNotSaved(self): - """Test syn0norm/syn0_ngrams_norm aren't saved in model file""" - tmpf = get_tmpfile('gensim_fasttext_wrapper.tst') - self.test_model.init_sims() - self.test_model.save(tmpf) - loaded = fasttext.FastText.load(tmpf) - self.assertTrue(loaded.wv.syn0norm is None) - self.assertTrue(loaded.wv.syn0_ngrams_norm is None) - - wv = self.test_model.wv - wv.save(tmpf) - loaded_kv = keyedvectors.KeyedVectors.load(tmpf) - self.assertTrue(loaded_kv.syn0norm is None) - self.assertTrue(loaded_kv.syn0_ngrams_norm is None) - - def testLoadFastTextFormat(self): - """Test model successfully loaded from fastText .bin file""" - try: - model = fasttext.FastText.load_fasttext_format(self.test_model_file) - except Exception as exc: - self.fail('Unable to load FastText model from file %s: %s' % (self.test_model_file, exc)) - vocab_size, model_size = 1762, 10 - self.assertEqual(model.wv.syn0.shape, (vocab_size, model_size)) - self.assertEqual(len(model.wv.vocab), vocab_size, model_size) - self.assertEqual(model.wv.syn0_ngrams.shape, (model.num_ngram_vectors, model_size)) - - expected_vec = [ - -0.57144, - -0.0085561, - 0.15748, - -0.67855, - -0.25459, - -0.58077, - -0.09913, - 1.1447, - 0.23418, - 0.060007 - ] # obtained using ./fasttext print-word-vectors lee_fasttext_new.bin - self.assertTrue(numpy.allclose(model["hundred"], expected_vec, atol=1e-4)) - - # vector for oov words are slightly different from original FastText due to discarding unused ngrams - # obtained using a modified version of ./fasttext print-word-vectors lee_fasttext_new.bin - expected_vec_oov = [ - -0.23825, - -0.58482, - -0.22276, - -0.41215, - 0.91015, - -1.6786, - -0.26724, - 0.58818, - 0.57828, - 0.75801 - ] - self.assertTrue(numpy.allclose(model["rejection"], expected_vec_oov, atol=1e-4)) - - self.assertEqual(model.min_count, 5) - self.assertEqual(model.window, 5) - self.assertEqual(model.iter, 5) - self.assertEqual(model.negative, 5) - self.assertEqual(model.sample, 0.0001) - self.assertEqual(model.bucket, 1000) - self.assertEqual(model.wv.max_n, 6) - self.assertEqual(model.wv.min_n, 3) - self.model_sanity(model) - - def testLoadFastTextNewFormat(self): - """ Test model successfully loaded from fastText (new format) .bin file """ - try: - new_model = fasttext.FastText.load_fasttext_format(self.test_new_model_file) - except Exception as exc: - self.fail('Unable to load FastText model from file %s: %s' % (self.test_new_model_file, exc)) - vocab_size, model_size = 1763, 10 - self.assertEqual(new_model.wv.syn0.shape, (vocab_size, model_size)) - self.assertEqual(len(new_model.wv.vocab), vocab_size, model_size) - self.assertEqual(new_model.wv.syn0_ngrams.shape, (new_model.num_ngram_vectors, model_size)) - - expected_vec = [ - -0.025627, - -0.11448, - 0.18116, - -0.96779, - 0.2532, - -0.93224, - 0.3929, - 0.12679, - -0.19685, - -0.13179 - ] # obtained using ./fasttext print-word-vectors lee_fasttext_new.bin - self.assertTrue(numpy.allclose(new_model["hundred"], expected_vec, atol=1e-4)) - - # vector for oov words are slightly 
different from original FastText due to discarding unused ngrams - # obtained using a modified version of ./fasttext print-word-vectors lee_fasttext_new.bin - expected_vec_oov = [ - -0.53378, - -0.19, - 0.013482, - -0.86767, - -0.21684, - -0.89928, - 0.45124, - 0.18025, - -0.14128, - 0.22508 - ] - self.assertTrue(numpy.allclose(new_model["rejection"], expected_vec_oov, atol=1e-4)) - - self.assertEqual(new_model.min_count, 5) - self.assertEqual(new_model.window, 5) - self.assertEqual(new_model.iter, 5) - self.assertEqual(new_model.negative, 5) - self.assertEqual(new_model.sample, 0.0001) - self.assertEqual(new_model.bucket, 1000) - self.assertEqual(new_model.wv.max_n, 6) - self.assertEqual(new_model.wv.min_n, 3) - self.model_sanity(new_model) - - def testLoadFileName(self): - """ Test model accepts input as both `/path/to/model` or `/path/to/model.bin` """ - self.assertTrue(fasttext.FastText.load_fasttext_format(datapath('lee_fasttext_new'))) - self.assertTrue(fasttext.FastText.load_fasttext_format(datapath('lee_fasttext_new.bin'))) - - def testLoadModelSupervised(self): - """Test loading model with supervised learning labels""" - with self.assertRaises(NotImplementedError): - fasttext.FastText.load_fasttext_format(datapath('pang_lee_polarity_fasttext')) - - def testLoadModelWithNonAsciiVocab(self): - """Test loading model with non-ascii words in vocab""" - model = fasttext.FastText.load_fasttext_format(datapath('non_ascii_fasttext')) - self.assertTrue(u'který' in model) - try: - vector = model[u'který'] # noqa:F841 - except UnicodeDecodeError: - self.fail('Unable to access vector for utf8 encoded non-ascii word') - - def testLoadModelNonUtf8Encoding(self): - """Test loading model with words in user-specified encoding""" - model = fasttext.FastText.load_fasttext_format(datapath('cp852_fasttext'), encoding='cp852') - self.assertTrue(u'který' in model) - try: - vector = model[u'který'] # noqa:F841 - except KeyError: - self.fail('Unable to access vector for cp-852 word') - - def testNSimilarity(self): - """Test n_similarity for in-vocab and out-of-vocab words""" - # In vocab, sanity check - self.assertTrue(numpy.allclose(self.test_model.n_similarity(['the', 'and'], ['and', 'the']), 1.0)) - self.assertEqual(self.test_model.n_similarity(['the'], ['and']), self.test_model.n_similarity(['and'], ['the'])) - # Out of vocab check - self.assertTrue(numpy.allclose(self.test_model.n_similarity(['night', 'nights'], ['nights', 'night']), 1.0)) - self.assertEqual( - self.test_model.n_similarity(['night'], ['nights']), - self.test_model.n_similarity(['nights'], ['night']) - ) - - def testSimilarity(self): - """Test similarity for in-vocab and out-of-vocab words""" - # In vocab, sanity check - self.assertTrue(numpy.allclose(self.test_model.similarity('the', 'the'), 1.0)) - self.assertEqual(self.test_model.similarity('the', 'and'), self.test_model.similarity('and', 'the')) - # Out of vocab check - self.assertTrue(numpy.allclose(self.test_model.similarity('nights', 'nights'), 1.0)) - self.assertEqual(self.test_model.similarity('night', 'nights'), self.test_model.similarity('nights', 'night')) - - def testMostSimilar(self): - """Test most_similar for in-vocab and out-of-vocab words""" - # In vocab, sanity check - self.assertEqual(len(self.test_model.most_similar(positive=['the', 'and'], topn=5)), 5) - self.assertEqual(self.test_model.most_similar('the'), self.test_model.most_similar(positive=['the'])) - # Out of vocab check - self.assertEqual(len(self.test_model.most_similar(['night', 'nights'], topn=5)), 5) - 
self.assertEqual(self.test_model.most_similar('nights'), self.test_model.most_similar(positive=['nights'])) - - def testMostSimilarCosmul(self): - """Test most_similar_cosmul for in-vocab and out-of-vocab words""" - # In vocab, sanity check - self.assertEqual(len(self.test_model.most_similar_cosmul(positive=['the', 'and'], topn=5)), 5) - self.assertEqual( - self.test_model.most_similar_cosmul('the'), - self.test_model.most_similar_cosmul(positive=['the'])) - # Out of vocab check - self.assertEqual(len(self.test_model.most_similar_cosmul(['night', 'nights'], topn=5)), 5) - self.assertEqual( - self.test_model.most_similar_cosmul('nights'), - self.test_model.most_similar_cosmul(positive=['nights'])) - - def testLookup(self): - """Tests word vector lookup for in-vocab and out-of-vocab words""" - # In vocab, sanity check - self.assertTrue('night' in self.test_model.wv.vocab) - self.assertTrue(numpy.allclose(self.test_model['night'], self.test_model[['night']])) - # Out of vocab check - self.assertFalse('nights' in self.test_model.wv.vocab) - self.assertTrue(numpy.allclose(self.test_model['nights'], self.test_model[['nights']])) - # Word with no ngrams in model - self.assertRaises(KeyError, lambda: self.test_model['a!@']) - - def testContains(self): - """Tests __contains__ for in-vocab and out-of-vocab words""" - # In vocab, sanity check - self.assertTrue('night' in self.test_model.wv.vocab) - self.assertTrue('night' in self.test_model) - # Out of vocab check - self.assertFalse('nights' in self.test_model.wv.vocab) - self.assertTrue('nights' in self.test_model) - # Word with no ngrams in model - self.assertFalse('a!@' in self.test_model.wv.vocab) - self.assertFalse('a!@' in self.test_model) - - @unittest.skipIf(PYEMD_EXT is False, "pyemd not installed") - def testWmdistance(self): - """Tests wmdistance for docs with in-vocab and out-of-vocab words""" - doc = ['night', 'payment'] - oov_doc = ['nights', 'forests', 'payments'] - ngrams_absent_doc = ['a!@', 'b#$'] - - dist = self.test_model.wmdistance(doc, oov_doc) - self.assertNotEqual(float('inf'), dist) - dist = self.test_model.wmdistance(doc, ngrams_absent_doc) - self.assertEqual(float('inf'), dist) - - def testDoesntMatch(self): - """Tests doesnt_match for list of out-of-vocab words""" - oov_words = ['nights', 'forests', 'payments'] - # Out of vocab check - for word in oov_words: - self.assertFalse(word in self.test_model.wv.vocab) - try: - self.test_model.doesnt_match(oov_words) - except Exception: - self.fail('model.doesnt_match raises exception for oov words') - - def testHash(self): - # Tests FastText.ft_hash method return values to those obtained from original C implementation - ft_hash = fasttext.ft_hash('test') - self.assertEqual(ft_hash, 2949673445) - ft_hash = fasttext.ft_hash('word') - self.assertEqual(ft_hash, 1788406269) - - def testConsistentDtype(self): - """Test that the same dtype is returned for OOV words as for words in the vocabulary""" - vocab_word = 'night' - oov_word = 'wordnotpresentinvocabulary' - self.assertIn(vocab_word, self.test_model.wv.vocab) - self.assertNotIn(oov_word, self.test_model.wv.vocab) - - vocab_embedding = self.test_model[vocab_word] - oov_embedding = self.test_model[oov_word] - self.assertEqual(vocab_embedding.dtype, oov_embedding.dtype) - - def testPersistenceForOldVersions(self): - """Test backward compatibility for models saved with versions < 3.0.0""" - old_model_path = datapath('ft_model_2.3.0') - loaded_model = fasttext.FastText.load(old_model_path) - self.assertEqual(loaded_model.vector_size, 
10) - self.assertEqual(loaded_model.wv.syn0.shape[1], 10) - self.assertEqual(loaded_model.wv.syn0_ngrams.shape[1], 10) - # in-vocab word - in_expected_vec = numpy.array([-2.44566941, -1.54802394, -2.61103821, -1.88549316, 1.02860415, - 1.19031894, 2.01627707, 1.98942184, -1.39095843, -0.65036952]) - self.assertTrue(numpy.allclose(loaded_model["the"], in_expected_vec, atol=1e-4)) - # out-of-vocab word - out_expected_vec = numpy.array([-1.34948218, -0.8686831, -1.51483142, -1.0164026, 0.56272298, - 0.66228276, 1.06477463, 1.1355902, -0.80972326, -0.39845538]) - self.assertTrue(numpy.allclose(loaded_model["random_word"], out_expected_vec, atol=1e-4)) - - -if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) - unittest.main() diff --git a/gensim/test/test_keras_integration.py b/gensim/test/test_keras_integration.py index cc7af1892d..0319bddaf1 100644 --- a/gensim/test/test_keras_integration.py +++ b/gensim/test/test_keras_integration.py @@ -25,7 +25,7 @@ class TestKerasWord2VecWrapper(unittest.TestCase): def setUp(self): - self.model_cos_sim = word2vec.Word2Vec(common_texts, size=100, min_count=1, hs=1) + self.model_cos_sim = word2vec.Word2Vec(common_texts, vector_size=100, min_count=1, hs=1) self.model_twenty_ng = word2vec.Word2Vec(min_count=1) def testWord2VecTraining(self): @@ -33,13 +33,13 @@ def testWord2VecTraining(self): Test word2vec training. """ model = self.model_cos_sim - self.assertTrue(model.wv.vectors.shape == (len(model.wv.vocab), 100)) - self.assertTrue(model.trainables.syn1.shape == (len(model.wv.vocab), 100)) + self.assertTrue(model.wv.vectors.shape == (len(model.wv), 100)) + self.assertTrue(model.syn1.shape == (len(model.wv), 100)) sims = model.wv.most_similar('graph', topn=10) # self.assertTrue(sims[0][0] == 'trees', sims) # most similar # test querying for "most similar" by vector - graph_vector = model.wv.vectors_norm[model.wv.vocab['graph'].index] + graph_vector = model.wv.vectors_norm[model.wv.get_index('graph')] sims2 = model.wv.most_similar(positive=[graph_vector], topn=11) sims2 = [(w, sim) for w, sim in sims2 if w != 'graph'] # ignore 'graph' itself self.assertEqual(sims, sims2) @@ -65,8 +65,8 @@ def testEmbeddingLayerCosineSim(self): word_a = 'graph' word_b = 'trees' output = model.predict([ - np.asarray([keras_w2v_model.wv.vocab[word_a].index]), - np.asarray([keras_w2v_model.wv.vocab[word_b].index]) + np.asarray([keras_w2v_model.wv.get_index(word_a)]), + np.asarray([keras_w2v_model.wv.get_index(word_b)]) ]) # output is the cosine distance between the two words (as a similarity measure) diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py index 43cea8625c..f3e9329f75 100644 --- a/gensim/test/test_keyedvectors.py +++ b/gensim/test/test_keyedvectors.py @@ -12,79 +12,21 @@ import logging import unittest -from mock import patch import numpy as np -from gensim.corpora import Dictionary -from gensim.models.keyedvectors import ( - KeyedVectors, WordEmbeddingSimilarityIndex, FastTextKeyedVectors, REAL, -) +from gensim.models.keyedvectors import KeyedVectors, REAL, pseudorandom_weak_vector from gensim.test.utils import datapath import gensim.models.keyedvectors logger = logging.getLogger(__name__) -class TestWordEmbeddingSimilarityIndex(unittest.TestCase): - def setUp(self): - self.vectors = KeyedVectors.load_word2vec_format( - datapath('euclidean_vectors.bin'), binary=True, datatype=np.float64) - - def test_most_similar(self): - """Test most_similar returns expected 
results.""" - - # check the handling of out-of-dictionary terms - index = WordEmbeddingSimilarityIndex(self.vectors) - self.assertLess(0, len(list(index.most_similar(u"holiday", topn=10)))) - self.assertEqual(0, len(list(index.most_similar(u"out-of-dictionary term", topn=10)))) - - # check that the topn works as expected - index = WordEmbeddingSimilarityIndex(self.vectors) - results = list(index.most_similar(u"holiday", topn=10)) - self.assertLess(0, len(results)) - self.assertGreaterEqual(10, len(results)) - results = list(index.most_similar(u"holiday", topn=20)) - self.assertLess(10, len(results)) - self.assertGreaterEqual(20, len(results)) - - # check that the term itself is not returned - index = WordEmbeddingSimilarityIndex(self.vectors) - terms = [term for term, similarity in index.most_similar(u"holiday", topn=len(self.vectors.vocab))] - self.assertFalse(u"holiday" in terms) - - # check that the threshold works as expected - index = WordEmbeddingSimilarityIndex(self.vectors, threshold=0.0) - results = list(index.most_similar(u"holiday", topn=10)) - self.assertLess(0, len(results)) - self.assertGreaterEqual(10, len(results)) - - index = WordEmbeddingSimilarityIndex(self.vectors, threshold=1.0) - results = list(index.most_similar(u"holiday", topn=10)) - self.assertEqual(0, len(results)) - - # check that the exponent works as expected - index = WordEmbeddingSimilarityIndex(self.vectors, exponent=1.0) - first_similarities = np.array([similarity for term, similarity in index.most_similar(u"holiday", topn=10)]) - index = WordEmbeddingSimilarityIndex(self.vectors, exponent=2.0) - second_similarities = np.array([similarity for term, similarity in index.most_similar(u"holiday", topn=10)]) - self.assertTrue(np.allclose(first_similarities ** 2.0, second_similarities)) - - class TestKeyedVectors(unittest.TestCase): def setUp(self): self.vectors = KeyedVectors.load_word2vec_format( - datapath('euclidean_vectors.bin'), binary=True, datatype=np.float64) - - def test_similarity_matrix(self): - """Test similarity_matrix returns expected results.""" - - documents = [[u"government", u"denied", u"holiday"], [u"holiday", u"slowing", u"hollingworth"]] - dictionary = Dictionary(documents) - similarity_matrix = self.vectors.similarity_matrix(dictionary).todense() - - # checking the existence of ones on the main diagonal - self.assertTrue( - (np.diag(similarity_matrix) == np.ones(similarity_matrix.shape[0])).all()) + datapath('euclidean_vectors.bin'), binary=True) + self.model_path = datapath("w2v_keyedvectors_load_test.modeldata") + self.vocab_path = datapath("w2v_keyedvectors_load_test.vocab") def test_most_similar(self): """Test most_similar returns expected results.""" @@ -104,7 +46,7 @@ def test_most_similar_topn(self): self.assertEqual(len(self.vectors.most_similar('war', topn=10)), 10) predicted = self.vectors.most_similar('war', topn=None) - self.assertEqual(len(predicted), len(self.vectors.vocab)) + self.assertEqual(len(predicted), len(self.vectors)) predicted = self.vectors.most_similar('war', topn=0) self.assertEqual(len(predicted), 0) @@ -123,7 +65,7 @@ def test_relative_cosine_similarity(self): ] # synonyms for "good" as per wordnet cos_sim = [] for i in range(len(wordnet_syn)): - if wordnet_syn[i] in self.vectors.vocab: + if wordnet_syn[i] in self.vectors: cos_sim.append(self.vectors.similarity("good", wordnet_syn[i])) cos_sim = sorted(cos_sim, reverse=True) # cosine_similarity of "good" with wordnet_syn in decreasing order # computing relative_cosine_similarity of two similar words @@ -199,21 
+141,17 @@ def test_similarity(self): self.assertTrue(np.allclose(self.vectors.similarity('war', 'war'), 1)) self.assertTrue(np.allclose(self.vectors.similarity('war', 'conflict'), 0.93305397)) - def test_words_closer_than(self): + def test_closer_than(self): """Test words_closer_than returns expected value for distinct and identical nodes.""" - self.assertEqual(self.vectors.words_closer_than('war', 'war'), []) + self.assertEqual(self.vectors.closer_than('war', 'war'), []) expected = set(['conflict', 'administration']) - self.assertEqual(set(self.vectors.words_closer_than('war', 'terrorism')), expected) + self.assertEqual(set(self.vectors.closer_than('war', 'terrorism')), expected) def test_rank(self): """Test rank returns expected value for distinct and identical nodes.""" self.assertEqual(self.vectors.rank('war', 'war'), 1) self.assertEqual(self.vectors.rank('war', 'terrorism'), 3) - def test_wv_property(self): - """Test that the deprecated `wv` property returns `self`. To be removed in v4.0.0.""" - self.assertTrue(self.vectors is self.vectors) - def test_add_single(self): """Test that adding entity in a manual way works correctly.""" entities = ['___some_entity{}_not_present_in_keyed_vectors___'.format(i) for i in range(5)] @@ -240,9 +178,9 @@ def test_add_multiple(self): vectors = [np.random.randn(self.vectors.vector_size) for _ in range(5)] # Test `add` on already filled kv. - vocab_size = len(self.vectors.vocab) + vocab_size = len(self.vectors) self.vectors.add(entities, vectors, replace=False) - self.assertEqual(vocab_size + len(entities), len(self.vectors.vocab)) + self.assertEqual(vocab_size + len(entities), len(self.vectors)) for ent, vector in zip(entities, vectors): self.assertTrue(np.allclose(self.vectors[ent], vector)) @@ -250,7 +188,7 @@ def test_add_multiple(self): # Test `add` on empty kv. kv = KeyedVectors(self.vectors.vector_size) kv[entities] = vectors - self.assertEqual(len(kv.vocab), len(entities)) + self.assertEqual(len(kv), len(entities)) for ent, vector in zip(entities, vectors): self.assertTrue(np.allclose(kv[ent], vector)) @@ -266,184 +204,197 @@ def test_add_type(self): def test_set_item(self): """Test that __setitem__ works correctly.""" - vocab_size = len(self.vectors.vocab) + vocab_size = len(self.vectors) # Add new entity. entity = '___some_new_entity___' vector = np.random.randn(self.vectors.vector_size) self.vectors[entity] = vector - self.assertEqual(len(self.vectors.vocab), vocab_size + 1) + self.assertEqual(len(self.vectors), vocab_size + 1) self.assertTrue(np.allclose(self.vectors[entity], vector)) # Replace vector for entity in vocab. - vocab_size = len(self.vectors.vocab) + vocab_size = len(self.vectors) vector = np.random.randn(self.vectors.vector_size) self.vectors['war'] = vector - self.assertEqual(len(self.vectors.vocab), vocab_size) + self.assertEqual(len(self.vectors), vocab_size) self.assertTrue(np.allclose(self.vectors['war'], vector)) # __setitem__ on several entities. 
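
Throughout this test module the old `KeyedVectors.vocab` dictionary is replaced by dedicated accessors. A minimal sketch of that mapping, assuming the gensim 4.x `KeyedVectors` API these tests exercise; the keys, vectors, and the `count` attribute mentioned in the comments are invented for illustration and are not part of the test suite:

```python
import numpy as np
from gensim.models import KeyedVectors

# Invented keys and vectors, purely to illustrate the vocab -> KeyedVectors mapping.
kv = KeyedVectors(vector_size=3)
kv.add(["war", "conflict"], [np.random.rand(3).astype(np.float32) for _ in range(2)])

assert len(kv) == 2                                    # old: len(kv.vocab)
assert kv.get_index("war") == kv.key_to_index["war"]   # old: kv.vocab["war"].index
assert "war" in kv                                     # old: "war" in kv.vocab
vec = kv["conflict"]                                   # unchanged: vector lookup by key
# Per-key metadata (e.g. corpus frequency) moves from kv.vocab[key].count
# to kv.get_vecattr(key, "count") / kv.set_vecattr(key, "count", value).
```
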
- vocab_size = len(self.vectors.vocab) + vocab_size = len(self.vectors) entities = ['war', '___some_new_entity1___', '___some_new_entity2___', 'terrorism', 'conflict'] vectors = [np.random.randn(self.vectors.vector_size) for _ in range(len(entities))] self.vectors[entities] = vectors - self.assertEqual(len(self.vectors.vocab), vocab_size + 2) + self.assertEqual(len(self.vectors), vocab_size + 2) for ent, vector in zip(entities, vectors): self.assertTrue(np.allclose(self.vectors[ent], vector)) - def test_ft_kv_backward_compat_w_360(self): - kv = KeyedVectors.load(datapath("ft_kv_3.6.0.model.gz")) - ft_kv = FastTextKeyedVectors.load(datapath("ft_kv_3.6.0.model.gz")) - - expected = ['trees', 'survey', 'system', 'graph', 'interface'] - actual = [word for (word, similarity) in kv.most_similar("human", topn=5)] - - self.assertEqual(actual, expected) - - actual = [word for (word, similarity) in ft_kv.most_similar("human", topn=5)] - - self.assertEqual(actual, expected) - - -class L2NormTest(unittest.TestCase): - def test(self): - m = np.array(range(1, 10), dtype=np.float32) - m.shape = (3, 3) - - norm = gensim.models.keyedvectors._l2_norm(m) - self.assertFalse(np.allclose(m, norm)) - - gensim.models.keyedvectors._l2_norm(m, replace=True) - self.assertTrue(np.allclose(m, norm)) - - -class UnpackTest(unittest.TestCase): - def test_copy_sanity(self): - m = np.array(range(9)) - m.shape = (3, 3) - hash2index = {10: 0, 11: 1, 12: 2} - - n = gensim.models.keyedvectors._unpack_copy(m, 25, hash2index) - self.assertTrue(np.all(m[0] == n[10])) - self.assertTrue(np.all(m[1] == n[11])) - self.assertTrue(np.all(m[2] == n[12])) - - def test_sanity(self): - m = np.array(range(9)) - m.shape = (3, 3) - hash2index = {10: 0, 11: 1, 12: 2} - - n = gensim.models.keyedvectors._unpack(m, 25, hash2index) - self.assertTrue(np.all(np.array([0, 1, 2]) == n[10])) - self.assertTrue(np.all(np.array([3, 4, 5]) == n[11])) - self.assertTrue(np.all(np.array([6, 7, 8]) == n[12])) - - def test_tricky(self): - m = np.array(range(9)) - m.shape = (3, 3) - hash2index = {1: 0, 0: 1, 12: 2} - - n = gensim.models.keyedvectors._unpack(m, 25, hash2index) - self.assertTrue(np.all(np.array([3, 4, 5]) == n[0])) - self.assertTrue(np.all(np.array([0, 1, 2]) == n[1])) - self.assertTrue(np.all(np.array([6, 7, 8]) == n[12])) - - def test_identity(self): - m = np.array(range(9)) - m.shape = (3, 3) - hash2index = {0: 0, 1: 1, 2: 2} - - n = gensim.models.keyedvectors._unpack(m, 25, hash2index) - self.assertTrue(np.all(np.array([0, 1, 2]) == n[0])) - self.assertTrue(np.all(np.array([3, 4, 5]) == n[1])) - self.assertTrue(np.all(np.array([6, 7, 8]) == n[2])) - - -class Gensim320Test(unittest.TestCase): - def test(self): - path = datapath('old_keyedvectors_320.dat') - vectors = gensim.models.keyedvectors.KeyedVectors.load(path) - self.assertTrue(vectors.word_vec('computer') is not None) - - -class Word2VecKeyedVectorsTest(unittest.TestCase): - def setUp(self): - self.model_path = datapath("w2v_keyedvectors_load_test.modeldata") - self.vocab_path = datapath("w2v_keyedvectors_load_test.vocab") - def test_load_model_and_vocab_file_strict(self): - """Test loading model and vocab files which have decoding errors: strict mode""" + """Test loading model and voacab files which have decoding errors: strict mode""" with self.assertRaises(UnicodeDecodeError): gensim.models.KeyedVectors.load_word2vec_format( self.model_path, fvocab=self.vocab_path, binary=False, unicode_errors="strict") def test_load_model_and_vocab_file_replace(self): - """Test loading model and 
vocab files which have decoding errors: replace mode""" + """Test loading model and voacab files which have decoding errors: replace mode""" model = gensim.models.KeyedVectors.load_word2vec_format( self.model_path, fvocab=self.vocab_path, binary=False, unicode_errors="replace") - self.assertEqual(model.vocab[u'ありがとう�'].count, 123) - self.assertEqual(model.vocab[u'どういたしまして�'].count, 789) - self.assertEqual(model.vocab[u'ありがとう�'].index, 0) - self.assertEqual(model.vocab[u'どういたしまして�'].index, 1) + self.assertEqual(model.get_vecattr(u'ありがとう�', 'count'), 123) + self.assertEqual(model.get_vecattr(u'どういたしまして�', 'count'), 789) + self.assertEqual(model.key_to_index[u'ありがとう�'], 0) + self.assertEqual(model.key_to_index[u'どういたしまして�'], 1) self.assertTrue(np.array_equal( model.get_vector(u'ありがとう�'), np.array([.6, .6, .6], dtype=np.float32))) self.assertTrue(np.array_equal( model.get_vector(u'どういたしまして�'), np.array([.1, .2, .3], dtype=np.float32))) def test_load_model_and_vocab_file_ignore(self): - """Test loading model and vocab files which have decoding errors: ignore mode""" + """Test loading model and voacab files which have decoding errors: ignore mode""" model = gensim.models.KeyedVectors.load_word2vec_format( self.model_path, fvocab=self.vocab_path, binary=False, unicode_errors="ignore") - print(model.vocab.keys()) - self.assertEqual(model.vocab[u'ありがとう'].count, 123) - self.assertEqual(model.vocab[u'どういたしまして'].count, 789) - self.assertEqual(model.vocab[u'ありがとう'].index, 0) - self.assertEqual(model.vocab[u'どういたしまして'].index, 1) + self.assertEqual(model.get_vecattr(u'ありがとう', 'count'), 123) + self.assertEqual(model.get_vecattr(u'どういたしまして', 'count'), 789) + self.assertEqual(model.key_to_index[u'ありがとう'], 0) + self.assertEqual(model.key_to_index[u'どういたしまして'], 1) self.assertTrue(np.array_equal( model.get_vector(u'ありがとう'), np.array([.6, .6, .6], dtype=np.float32))) self.assertTrue(np.array_equal( model.get_vector(u'どういたしまして'), np.array([.1, .2, .3], dtype=np.float32))) + def test_save_reload(self): + randkv = KeyedVectors(vector_size=100) + count = 20 + keys = [str(i) for i in range(count)] + weights = [pseudorandom_weak_vector(randkv.vector_size) for _ in range(count)] + randkv.add(keys, weights) + tmpfiletxt = gensim.test.utils.get_tmpfile("tmp_kv.txt") + randkv.save_word2vec_format(tmpfiletxt, binary=False) + reloadtxtkv = KeyedVectors.load_word2vec_format(tmpfiletxt, binary=False) + self.assertEqual(randkv.index_to_key, reloadtxtkv.index_to_key) + self.assertTrue((randkv.vectors == reloadtxtkv.vectors).all()) + tmpfilebin = gensim.test.utils.get_tmpfile("tmp_kv.bin") + randkv.save_word2vec_format(tmpfilebin, binary=True) + reloadbinkv = KeyedVectors.load_word2vec_format(tmpfilebin, binary=True) + self.assertEqual(randkv.index_to_key, reloadbinkv.index_to_key) + self.assertTrue((randkv.vectors == reloadbinkv.vectors).all()) + + def test_no_header(self): + randkv = KeyedVectors(vector_size=100) + count = 20 + keys = [str(i) for i in range(count)] + weights = [pseudorandom_weak_vector(randkv.vector_size) for _ in range(count)] + randkv.add(keys, weights) + tmpfiletxt = gensim.test.utils.get_tmpfile("tmp_kv.txt") + randkv.save_word2vec_format(tmpfiletxt, binary=False, write_header=False) + reloadtxtkv = KeyedVectors.load_word2vec_format(tmpfiletxt, binary=False, no_header=True) + self.assertEqual(randkv.index_to_key, reloadtxtkv.index_to_key) + self.assertTrue((randkv.vectors == reloadtxtkv.vectors).all()) -try: - import keras # noqa: F401 - - KERAS_INSTALLED = True -except ImportError: - KERAS_INSTALLED = 
False - - -@unittest.skipUnless(KERAS_INSTALLED, 'keras needs to be installed for this test') -class WordEmbeddingsKeyedVectorsTest(unittest.TestCase): - def setUp(self): - self.vectors = KeyedVectors.load_word2vec_format( - datapath('euclidean_vectors.bin'), binary=True, datatype=np.float64) - - def test_get_keras_embedding_word_index_none(self): - embedding_layer = self.vectors.get_keras_embedding() - self.assertEqual(self.vectors.vectors.shape, embedding_layer._initial_weights[0].shape) - self.assertTrue(np.array_equal( - self.vectors['is'], embedding_layer._initial_weights[0][self.vectors.vocab['is'].index, :])) - def test_get_keras_embedding_word_index_passed(self): - word_index = {'is': 1, 'to': 2} - embedding_layer = self.vectors.get_keras_embedding(word_index=word_index) - self.assertEqual(embedding_layer._initial_weights[0].shape, (3, self.vectors.vectors.shape[1])) - self.assertTrue(np.array_equal( - self.vectors['is'], embedding_layer._initial_weights[0][1, :])) - - @patch('numpy.random.normal') - def test_get_keras_embedding_word_index_passed_with_oov_word(self, normal_func): - normal_func.return_value = np.zeros((3, self.vectors.vectors.shape[1])) - word_index = {'is': 1, 'not_a_real_word': 2} - embedding_layer = self.vectors.get_keras_embedding(word_index=word_index) - self.assertEqual(embedding_layer._initial_weights[0].shape, (3, self.vectors.vectors.shape[1])) - self.assertTrue( - np.array_equal(embedding_layer._initial_weights[0][2, :], np.zeros(self.vectors.vectors.shape[1]))) +class Gensim320Test(unittest.TestCase): + def test(self): + path = datapath('old_keyedvectors_320.dat') + vectors = gensim.models.keyedvectors.KeyedVectors.load(path) + self.assertTrue(vectors.get_vector('computer') is not None) + + +def save_dict_to_word2vec_formated_file(fname, word2vec_dict): + + with gensim.utils.open(fname, "bw") as f: + + num_words = len(word2vec_dict) + vector_length = len(list(word2vec_dict.values())[0]) + + header = "%d %d\n" % (num_words, vector_length) + f.write(header.encode(encoding="ascii")) + + for word, vector in word2vec_dict.items(): + f.write(word.encode()) + f.write(' '.encode()) + f.write(np.array(vector).astype(np.float32).tobytes()) + + +class LoadWord2VecFormatTest(unittest.TestCase): + + def assert_dict_equal_to_model(self, d, m): + self.assertEqual(len(d), len(m)) + + for word in d.keys(): + self.assertSequenceEqual(list(d[word]), list(m[word])) + + def verify_load2vec_binary_result(self, w2v_dict, binary_chunk_size, limit): + tmpfile = gensim.test.utils.get_tmpfile("tmp_w2v") + save_dict_to_word2vec_formated_file(tmpfile, w2v_dict) + w2v_model = \ + gensim.models.keyedvectors._load_word2vec_format( + cls=gensim.models.KeyedVectors, + fname=tmpfile, + binary=True, + limit=limit, + binary_chunk_size=binary_chunk_size) + if limit is None: + limit = len(w2v_dict) + + w2v_keys_postprocessed = list(w2v_dict.keys())[:limit] + w2v_dict_postprocessed = {k.lstrip(): w2v_dict[k] for k in w2v_keys_postprocessed} + + self.assert_dict_equal_to_model(w2v_dict_postprocessed, w2v_model) + + def test_load_word2vec_format_basic(self): + w2v_dict = {"abc": [1, 2, 3], + "cde": [4, 5, 6], + "def": [7, 8, 9]} + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=5, limit=None) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=16, limit=None) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=1024, limit=None) + + w2v_dict = {"abc": [1, 2, 3], + "cdefg": [4, 5, 6], + "d": [7, 8, 9]} + self.verify_load2vec_binary_result(w2v_dict, 
binary_chunk_size=5, limit=None) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=16, limit=None) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=1024, limit=None) + + def test_load_word2vec_format_limit(self): + w2v_dict = {"abc": [1, 2, 3], + "cde": [4, 5, 6], + "def": [7, 8, 9]} + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=5, limit=1) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=16, limit=1) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=1024, limit=1) + + w2v_dict = {"abc": [1, 2, 3], + "cde": [4, 5, 6], + "def": [7, 8, 9]} + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=5, limit=2) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=16, limit=2) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=1024, limit=2) + + w2v_dict = {"abc": [1, 2, 3], + "cdefg": [4, 5, 6], + "d": [7, 8, 9]} + + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=5, limit=1) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=16, limit=1) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=1024, limit=1) + + w2v_dict = {"abc": [1, 2, 3], + "cdefg": [4, 5, 6], + "d": [7, 8, 9]} + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=5, limit=2) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=16, limit=2) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=1024, limit=2) + + def test_load_word2vec_format_space_stripping(self): + w2v_dict = {"\nabc": [1, 2, 3], + "cdefdg": [4, 5, 6], + "\n\ndef": [7, 8, 9]} + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=5, limit=None) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=5, limit=1) if __name__ == '__main__': diff --git a/gensim/test/test_ldaseqmodel.py b/gensim/test/test_ldaseqmodel.py index 227c78b4f6..2982331ceb 100644 --- a/gensim/test/test_ldaseqmodel.py +++ b/gensim/test/test_ldaseqmodel.py @@ -204,7 +204,8 @@ def setUp(self): corpus = [dictionary.doc2bow(text) for text in texts] self.ldaseq = ldaseqmodel.LdaSeqModel( corpus=corpus, id2word=dictionary, num_topics=2, - time_slice=[10, 10, 11], initialize='own', sstats=sstats + time_slice=[10, 10, 11], initialize='own', sstats=sstats, + passes=2, lda_inference_max_iter=10, em_min_iter=1, em_max_iter=4 ) # testing topic word proportions diff --git a/gensim/test/test_poincare.py b/gensim/test/test_poincare.py index 9ea020da51..67b2668e02 100644 --- a/gensim/test/test_poincare.py +++ b/gensim/test/test_poincare.py @@ -57,24 +57,24 @@ def setUp(self): self.data_large = PoincareRelations(datapath('poincare_hypernyms_large.tsv')) def models_equal(self, model_1, model_2): - self.assertEqual(len(model_1.kv.vocab), len(model_2.kv.vocab)) - self.assertEqual(set(model_1.kv.vocab.keys()), set(model_2.kv.vocab.keys())) - self.assertTrue(np.allclose(model_1.kv.syn0, model_2.kv.syn0)) + self.assertEqual(len(model_1.kv), len(model_2.kv)) + self.assertEqual(set(model_1.kv.index_to_key), set(model_2.kv.index_to_key)) + self.assertTrue(np.allclose(model_1.kv.vectors, model_2.kv.vectors)) def test_data_counts(self): """Tests whether data has been loaded correctly and completely.""" model = PoincareModel(self.data) self.assertEqual(len(model.all_relations), 5) - self.assertEqual(len(model.node_relations[model.kv.vocab['kangaroo.n.01'].index]), 3) - self.assertEqual(len(model.kv.vocab), 7) + self.assertEqual(len(model.node_relations[model.kv.get_index('kangaroo.n.01')]), 3) + 
self.assertEqual(len(model.kv), 7) self.assertTrue('mammal.n.01' not in model.node_relations) def test_data_counts_with_bytes(self): """Tests whether input bytes data is loaded correctly and completely.""" model = PoincareModel([(b'\x80\x01c', b'\x50\x71a'), (b'node.1', b'node.2')]) self.assertEqual(len(model.all_relations), 2) - self.assertEqual(len(model.node_relations[model.kv.vocab[b'\x80\x01c'].index]), 1) - self.assertEqual(len(model.kv.vocab), 4) + self.assertEqual(len(model.node_relations[model.kv.get_index(b'\x80\x01c')]), 1) + self.assertEqual(len(model.kv), 4) self.assertTrue(b'\x50\x71a' not in model.node_relations) def test_persistence(self): @@ -96,12 +96,12 @@ def test_persistence_separate_file(self): def test_online_learning(self): """Tests whether additional input data is loaded correctly and completely.""" model = PoincareModel(self.data, burn_in=0, negative=3) - self.assertEqual(len(model.kv.vocab), 7) - self.assertEqual(model.kv.vocab['kangaroo.n.01'].count, 3) - self.assertEqual(model.kv.vocab['cat.n.01'].count, 1) + self.assertEqual(len(model.kv), 7) + self.assertEqual(model.kv.get_vecattr('kangaroo.n.01', 'count'), 3) + self.assertEqual(model.kv.get_vecattr('cat.n.01', 'count'), 1) model.build_vocab([('kangaroo.n.01', 'cat.n.01')], update=True) # update vocab - self.assertEqual(model.kv.vocab['kangaroo.n.01'].count, 4) - self.assertEqual(model.kv.vocab['cat.n.01'].count, 2) + self.assertEqual(model.kv.get_vecattr('kangaroo.n.01', 'count'), 4) + self.assertEqual(model.kv.get_vecattr('cat.n.01', 'count'), 2) def test_train_after_load(self): """Tests whether the model can be trained correctly after loading from disk.""" @@ -116,17 +116,17 @@ def test_train_after_load(self): def test_persistence_old_model(self): """Tests whether model from older gensim version is loaded correctly.""" loaded = PoincareModel.load(datapath('poincare_test_3.4.0')) - self.assertEqual(loaded.kv.syn0.shape, (239, 2)) - self.assertEqual(len(loaded.kv.vocab), 239) + self.assertEqual(loaded.kv.vectors.shape, (239, 2)) + self.assertEqual(len(loaded.kv), 239) self.assertEqual(loaded.size, 2) self.assertEqual(len(loaded.all_relations), 200) def test_train_old_model_after_load(self): """Tests whether loaded model from older gensim version can be trained correctly.""" loaded = PoincareModel.load(datapath('poincare_test_3.4.0')) - old_vectors = np.copy(loaded.kv.syn0) + old_vectors = np.copy(loaded.kv.vectors) loaded.train(epochs=2) - self.assertFalse(np.allclose(old_vectors, loaded.kv.syn0)) + self.assertFalse(np.allclose(old_vectors, loaded.kv.vectors)) def test_invalid_data_raises_error(self): """Tests that error is raised on invalid input data.""" @@ -140,34 +140,34 @@ def test_invalid_data_raises_error(self): def test_vector_shape(self): """Tests whether vectors are initialized with the correct size.""" model = PoincareModel(self.data, size=20) - self.assertEqual(model.kv.syn0.shape, (7, 20)) + self.assertEqual(model.kv.vectors.shape, (7, 20)) def test_vector_dtype(self): """Tests whether vectors have the correct dtype before and after training.""" model = PoincareModel(self.data_large, dtype=np.float32, burn_in=0, negative=3) - self.assertEqual(model.kv.syn0.dtype, np.float32) + self.assertEqual(model.kv.vectors.dtype, np.float32) model.train(epochs=1) - self.assertEqual(model.kv.syn0.dtype, np.float32) + self.assertEqual(model.kv.vectors.dtype, np.float32) def test_training(self): """Tests that vectors are different before and after training.""" model = PoincareModel(self.data_large, burn_in=0, 
negative=3) - old_vectors = np.copy(model.kv.syn0) + old_vectors = np.copy(model.kv.vectors) model.train(epochs=2) - self.assertFalse(np.allclose(old_vectors, model.kv.syn0)) + self.assertFalse(np.allclose(old_vectors, model.kv.vectors)) def test_training_multiple(self): """Tests that calling train multiple times results in different vectors.""" model = PoincareModel(self.data_large, burn_in=0, negative=3) model.train(epochs=2) - old_vectors = np.copy(model.kv.syn0) + old_vectors = np.copy(model.kv.vectors) model.train(epochs=1) - self.assertFalse(np.allclose(old_vectors, model.kv.syn0)) + self.assertFalse(np.allclose(old_vectors, model.kv.vectors)) - old_vectors = np.copy(model.kv.syn0) + old_vectors = np.copy(model.kv.vectors) model.train(epochs=0) - self.assertTrue(np.allclose(old_vectors, model.kv.syn0)) + self.assertTrue(np.allclose(old_vectors, model.kv.vectors)) def test_gradients_check(self): """Tests that the model is trained successfully with gradients check enabled.""" @@ -192,22 +192,22 @@ def test_reproducible(self): model_2 = PoincareModel(self.data_large, seed=1, negative=3, burn_in=1) model_2.train(epochs=2) - self.assertTrue(np.allclose(model_1.kv.syn0, model_2.kv.syn0)) + self.assertTrue(np.allclose(model_1.kv.vectors, model_2.kv.vectors)) def test_burn_in(self): """Tests that vectors are different after burn-in.""" model = PoincareModel(self.data, burn_in=1, negative=3) - original_vectors = np.copy(model.kv.syn0) + original_vectors = np.copy(model.kv.vectors) model.train(epochs=0) - self.assertFalse(np.allclose(model.kv.syn0, original_vectors)) + self.assertFalse(np.allclose(model.kv.vectors, original_vectors)) def test_burn_in_only_done_once(self): """Tests that burn-in does not happen when train is called a second time.""" model = PoincareModel(self.data, negative=3, burn_in=1) model.train(epochs=0) - original_vectors = np.copy(model.kv.syn0) + original_vectors = np.copy(model.kv.vectors) model.train(epochs=0) - self.assertTrue(np.allclose(model.kv.syn0, original_vectors)) + self.assertTrue(np.allclose(model.kv.vectors, original_vectors)) def test_negatives(self): """Tests that correct number of negatives are sampled.""" @@ -268,7 +268,7 @@ def test_most_similar_topn(self): self.assertEqual(len(self.vectors.most_similar('dog.n.01', topn=10)), 10) predicted = self.vectors.most_similar('dog.n.01', topn=None) - self.assertEqual(len(predicted), len(self.vectors.vocab) - 1) + self.assertEqual(len(predicted), len(self.vectors) - 1) self.assertEqual(predicted[-1][0], 'gallant_fox.n.01') def test_most_similar_raises_keyerror(self): @@ -311,7 +311,7 @@ def test_distances(self): self.assertTrue(np.allclose(distances, [4.5278745, 0])) distances = self.vectors.distances('dog.n.01') - self.assertEqual(len(distances), len(self.vectors.vocab)) + self.assertEqual(len(distances), len(self.vectors)) self.assertTrue(np.allclose(distances[-1], 10.04756)) def test_distances_with_vector_input(self): @@ -321,7 +321,7 @@ def test_distances_with_vector_input(self): self.assertTrue(np.allclose(distances, [4.5278745, 0])) distances = self.vectors.distances(input_vector) - self.assertEqual(len(distances), len(self.vectors.vocab)) + self.assertEqual(len(distances), len(self.vectors)) self.assertTrue(np.allclose(distances[-1], 10.04756)) def test_poincare_distances_batch(self): @@ -383,11 +383,11 @@ def test_difference_in_hierarchy(self): self.assertTrue(np.allclose(self.vectors.difference_in_hierarchy('mammal.n.01', 'dog.n.01'), 0.9384287)) 
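
The Poincaré tests are adapted to the same renames: the embedding matrix is now `kv.vectors` (formerly `kv.syn0`), keyed lookups go through `get_index` and `len(kv)`, and `words_closer_than` becomes `closer_than`. A small sketch under those assumptions, using an invented toy hierarchy rather than the repository's test data, and skipping training since the accessors work on the freshly initialized model:

```python
from gensim.models.poincare import PoincareModel

# Tiny invented hierarchy, just to show the renamed accessors.
relations = [
    ("kangaroo.n.01", "marsupial.n.01"),
    ("marsupial.n.01", "mammal.n.01"),
    ("dog.n.01", "canine.n.02"),
    ("canine.n.02", "mammal.n.01"),
]
model = PoincareModel(relations, size=2, negative=2)

print(model.kv.vectors.shape)               # old: model.kv.syn0.shape
print(model.kv.get_index("mammal.n.01"))    # old: model.kv.vocab["mammal.n.01"].index
print(len(model.kv))                        # old: len(model.kv.vocab)
print(model.kv.closer_than("dog.n.01", "kangaroo.n.01"))  # old: words_closer_than(...)
```
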
self.assertTrue(np.allclose(self.vectors.difference_in_hierarchy('dog.n.01', 'mammal.n.01'), -0.9384287)) - def test_words_closer_than(self): - """Test words_closer_than returns expected value for distinct and identical nodes.""" - self.assertEqual(self.vectors.words_closer_than('dog.n.01', 'dog.n.01'), []) + def test_closer_than(self): + """Test closer_than returns expected value for distinct and identical nodes.""" + self.assertEqual(self.vectors.closer_than('dog.n.01', 'dog.n.01'), []) expected = set(['canine.n.02', 'hunting_dog.n.01']) - self.assertEqual(set(self.vectors.words_closer_than('dog.n.01', 'carnivore.n.01')), expected) + self.assertEqual(set(self.vectors.closer_than('dog.n.01', 'carnivore.n.01')), expected) def test_rank(self): """Test rank returns expected value for distinct and identical nodes.""" diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py index 5fa441a2d5..3556438655 100644 --- a/gensim/test/test_similarities.py +++ b/gensim/test/test_similarities.py @@ -28,6 +28,7 @@ from gensim.test.utils import (datapath, get_tmpfile, common_texts as texts, common_dictionary as dictionary, common_corpus as corpus) from gensim.similarities import UniformTermSimilarityIndex +from gensim.similarities import WordEmbeddingSimilarityIndex from gensim.similarities import SparseTermSimilarityMatrix from gensim.similarities import LevenshteinSimilarityIndex from gensim.similarities.docsim import _nlargest @@ -301,7 +302,7 @@ def setUp(self): class TestWmdSimilarity(unittest.TestCase, _TestSimilarityABC): def setUp(self): self.cls = similarities.WmdSimilarity - self.w2v_model = Word2Vec(texts, min_count=1) + self.w2v_model = Word2Vec(texts, min_count=1).wv def factoryMethod(self): # Override factoryMethod. @@ -552,11 +553,10 @@ def setUp(self): def testWord2Vec(self): model = word2vec.Word2Vec(texts, min_count=1) - model.init_sims() index = self.indexer(model, 10) self.assertVectorIsSimilarToItself(model.wv, index) - self.assertApproxNeighborsMatchExact(model, model.wv, index) + self.assertApproxNeighborsMatchExact(model.wv, model.wv, index) self.assertIndexSaved(index) self.assertLoadedIndexEqual(index, model) @@ -571,11 +571,10 @@ def __iter__(self): yield line.lower().strip().split() model = FastText(LeeReader(datapath('lee.cor')), bucket=5000) - model.init_sims() index = self.indexer(model, 10) self.assertVectorIsSimilarToItself(model.wv, index) - self.assertApproxNeighborsMatchExact(model, model.wv, index) + self.assertApproxNeighborsMatchExact(model.wv, model.wv, index) self.assertIndexSaved(index) self.assertLoadedIndexEqual(index, model) @@ -606,8 +605,8 @@ def assertVectorIsSimilarToItself(self, wv, index): def assertApproxNeighborsMatchExact(self, model, wv, index): vector = wv.vectors_norm[0] - approx_neighbors = model.wv.most_similar([vector], topn=5, indexer=index) - exact_neighbors = model.wv.most_similar(positive=[vector], topn=5) + approx_neighbors = model.most_similar([vector], topn=5, indexer=index) + exact_neighbors = model.most_similar(positive=[vector], topn=5) approx_words = [neighbor[0] for neighbor in approx_neighbors] exact_words = [neighbor[0] for neighbor in exact_neighbors] @@ -616,11 +615,11 @@ def assertApproxNeighborsMatchExact(self, model, wv, index): def assertAllSimilaritiesDisableIndexer(self, model, wv, index): vector = wv.vectors_norm[0] - approx_similarities = model.wv.most_similar([vector], topn=None, indexer=index) - exact_similarities = model.wv.most_similar(positive=[vector], topn=None) + approx_similarities = 
model.most_similar([vector], topn=None, indexer=index) + exact_similarities = model.most_similar(positive=[vector], topn=None) self.assertEqual(approx_similarities, exact_similarities) - self.assertEqual(len(approx_similarities), len(wv.vectors.vocab)) + self.assertEqual(len(approx_similarities), len(wv.vectors)) def assertIndexSaved(self, index): fname = get_tmpfile('gensim_similarities.tst.pkl') @@ -654,9 +653,8 @@ def setUp(self): from gensim.similarities.index import AnnoyIndexer self.model = doc2vec.Doc2Vec(sentences, min_count=1) - self.model.init_sims() self.index = AnnoyIndexer(self.model, 300) - self.vector = self.model.docvecs.vectors_docs_norm[0] + self.vector = self.model.dv.vectors_norm[0] def testDocumentIsSimilarToItself(self): approx_neighbors = self.index.most_similar(self.vector, 1) @@ -666,8 +664,8 @@ def testDocumentIsSimilarToItself(self): self.assertAlmostEqual(similarity, 1.0, places=2) def testApproxNeighborsMatchExact(self): - approx_neighbors = self.model.docvecs.most_similar([self.vector], topn=5, indexer=self.index) - exact_neighbors = self.model.docvecs.most_similar( + approx_neighbors = self.model.dv.most_similar([self.vector], topn=5, indexer=self.index) + exact_neighbors = self.model.dv.most_similar( positive=[self.vector], topn=5) approx_words = [neighbor[0] for neighbor in approx_neighbors] @@ -715,11 +713,10 @@ def setUp(self): def test_word2vec(self): model = word2vec.Word2Vec(texts, min_count=1) - model.init_sims() index = self.indexer(model) self.assertVectorIsSimilarToItself(model.wv, index) - self.assertApproxNeighborsMatchExact(model, model.wv, index) + self.assertApproxNeighborsMatchExact(model.wv, model.wv, index) self.assertIndexSaved(index) self.assertLoadedIndexEqual(index, model) @@ -734,11 +731,10 @@ def __iter__(self): yield line.lower().strip().split() model = FastText(LeeReader(datapath('lee.cor')), bucket=5000) - model.init_sims() index = self.indexer(model) self.assertVectorIsSimilarToItself(model.wv, index) - self.assertApproxNeighborsMatchExact(model, model.wv, index) + self.assertApproxNeighborsMatchExact(model.wv, model.wv, index) self.assertIndexSaved(index) self.assertLoadedIndexEqual(index, model) @@ -767,8 +763,8 @@ def assertVectorIsSimilarToItself(self, wv, index): def assertApproxNeighborsMatchExact(self, model, wv, index): vector = wv.vectors_norm[0] - approx_neighbors = model.wv.most_similar([vector], topn=5, indexer=index) - exact_neighbors = model.wv.most_similar(positive=[vector], topn=5) + approx_neighbors = model.most_similar([vector], topn=5, indexer=index) + exact_neighbors = model.most_similar(positive=[vector], topn=5) approx_words = [neighbor[0] for neighbor in approx_neighbors] exact_words = [neighbor[0] for neighbor in exact_neighbors] @@ -806,9 +802,8 @@ def setUp(self): from gensim.similarities.nmslib import NmslibIndexer self.model = doc2vec.Doc2Vec(sentences, min_count=1) - self.model.init_sims() self.index = NmslibIndexer(self.model) - self.vector = self.model.docvecs.vectors_docs_norm[0] + self.vector = self.model.dv.vectors_norm[0] def test_document_is_similar_to_itself(self): approx_neighbors = self.index.most_similar(self.vector, 1) @@ -818,8 +813,8 @@ def test_document_is_similar_to_itself(self): self.assertAlmostEqual(similarity, 1.0, places=2) def test_approx_neighbors_match_exact(self): - approx_neighbors = self.model.docvecs.most_similar([self.vector], topn=5, indexer=self.index) - exact_neighbors = self.model.docvecs.most_similar( + approx_neighbors = self.model.dv.most_similar([self.vector], 
topn=5, indexer=self.index) + exact_neighbors = self.model.dv.most_similar( positive=[self.vector], topn=5) approx_words = [neighbor[0] for neighbor in approx_neighbors] @@ -1237,6 +1232,51 @@ def test_most_similar(self): self.assertTrue(scipy.sparse.issparse(similarity_matrix.matrix)) +class TestWordEmbeddingSimilarityIndex(unittest.TestCase): + def setUp(self): + self.vectors = KeyedVectors.load_word2vec_format( + datapath('euclidean_vectors.bin'), binary=True, datatype=numpy.float64) + + def test_most_similar(self): + """Test most_similar returns expected results.""" + + # check the handling of out-of-dictionary terms + index = WordEmbeddingSimilarityIndex(self.vectors) + self.assertLess(0, len(list(index.most_similar(u"holiday", topn=10)))) + self.assertEqual(0, len(list(index.most_similar(u"out-of-dictionary term", topn=10)))) + + # check that the topn works as expected + index = WordEmbeddingSimilarityIndex(self.vectors) + results = list(index.most_similar(u"holiday", topn=10)) + self.assertLess(0, len(results)) + self.assertGreaterEqual(10, len(results)) + results = list(index.most_similar(u"holiday", topn=20)) + self.assertLess(10, len(results)) + self.assertGreaterEqual(20, len(results)) + + # check that the term itself is not returned + index = WordEmbeddingSimilarityIndex(self.vectors) + terms = [term for term, similarity in index.most_similar(u"holiday", topn=len(self.vectors))] + self.assertFalse(u"holiday" in terms) + + # check that the threshold works as expected + index = WordEmbeddingSimilarityIndex(self.vectors, threshold=0.0) + results = list(index.most_similar(u"holiday", topn=10)) + self.assertLess(0, len(results)) + self.assertGreaterEqual(10, len(results)) + + index = WordEmbeddingSimilarityIndex(self.vectors, threshold=1.0) + results = list(index.most_similar(u"holiday", topn=10)) + self.assertEqual(0, len(results)) + + # check that the exponent works as expected + index = WordEmbeddingSimilarityIndex(self.vectors, exponent=1.0) + first_similarities = numpy.array([similarity for term, similarity in index.most_similar(u"holiday", topn=10)]) + index = WordEmbeddingSimilarityIndex(self.vectors, exponent=2.0) + second_similarities = numpy.array([similarity for term, similarity in index.most_similar(u"holiday", topn=10)]) + self.assertTrue(numpy.allclose(first_similarities**2.0, second_similarities)) + + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index e14fef351e..9dc7d303eb 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -1,4 +1,5 @@ import os +import logging import unittest import numpy import codecs @@ -521,7 +522,8 @@ def testModelNotFitted(self): class TestLdaSeqWrapper(unittest.TestCase): def setUp(self): self.model = LdaSeqTransformer( - id2word=dictionary_ldaseq, num_topics=2, time_slice=[10, 10, 11], initialize='gensim' + id2word=dictionary_ldaseq, num_topics=2, time_slice=[10, 10, 11], initialize='gensim', + passes=2, lda_inference_max_iter=10, em_min_iter=1, em_max_iter=4 ) self.model.fit(corpus_ldaseq) @@ -549,7 +551,10 @@ def testPipeline(self): test_target = data.target[0:2] id2word = Dictionary([x.split() for x in test_data]) corpus = [id2word.doc2bow(i.split()) for i in test_data] - model = LdaSeqTransformer(id2word=id2word, num_topics=2, time_slice=[1, 1, 1], initialize='gensim') + model = LdaSeqTransformer( + id2word=id2word, num_topics=2, 
time_slice=[1, 1, 1], initialize='gensim', + passes=2, lda_inference_max_iter=10, em_min_iter=1, em_max_iter=4 + ) clf = linear_model.LogisticRegression(penalty='l2', C=0.1) text_ldaseq = Pipeline([('features', model,), ('classifier', clf)]) text_ldaseq.fit(corpus, test_target) @@ -582,7 +587,10 @@ def testPersistence(self): self.assertTrue(passed) def testModelNotFitted(self): - ldaseq_wrapper = LdaSeqTransformer(num_topics=2) + ldaseq_wrapper = LdaSeqTransformer( + num_topics=2, + passes=2, lda_inference_max_iter=10, em_min_iter=1, em_max_iter=4 + ) doc = list(corpus_ldaseq)[0] self.assertRaises(NotFittedError, ldaseq_wrapper.transform, doc) @@ -658,7 +666,7 @@ def testModelNotFitted(self): class TestWord2VecWrapper(unittest.TestCase): def setUp(self): numpy.random.seed(0) - self.model = W2VTransformer(size=10, min_count=0, seed=42) + self.model = W2VTransformer(vector_size=10, min_count=0, seed=42) self.model.fit(texts) def testTransform(self): @@ -667,21 +675,21 @@ def testTransform(self): words = words + texts[0] matrix = self.model.transform(words) self.assertEqual(matrix.shape[0], 3) - self.assertEqual(matrix.shape[1], self.model.size) + self.assertEqual(matrix.shape[1], self.model.vector_size) # tranform one word word = texts[0][0] matrix = self.model.transform(word) self.assertEqual(matrix.shape[0], 1) - self.assertEqual(matrix.shape[1], self.model.size) + self.assertEqual(matrix.shape[1], self.model.vector_size) def testConsistencyWithGensimModel(self): # training a W2VTransformer - self.model = W2VTransformer(size=10, min_count=0, seed=42) + self.model = W2VTransformer(vector_size=10, min_count=0, seed=42) self.model.fit(texts) # training a Gensim Word2Vec model with the same params - gensim_w2vmodel = models.Word2Vec(texts, size=10, min_count=0, seed=42) + gensim_w2vmodel = models.Word2Vec(texts, vector_size=10, min_count=0, seed=42) word = texts[0][0] vec_transformer_api = self.model.transform(word) # vector returned by W2VTransformer @@ -691,7 +699,7 @@ def testConsistencyWithGensimModel(self): def testPipeline(self): numpy.random.seed(0) # set fixed seed to get similar values everytime - model = W2VTransformer(size=10, min_count=1) + model = W2VTransformer(vector_size=10, min_count=1) model.fit(w2v_texts) class_dict = {'mathematics': 1, 'physics': 0} @@ -728,7 +736,7 @@ def testPersistence(self): # sanity check for transformation operation self.assertEqual(loaded_transformed_vecs.shape[0], 1) - self.assertEqual(loaded_transformed_vecs.shape[1], model_load.size) + self.assertEqual(loaded_transformed_vecs.shape[1], model_load.vector_size) # comparing the original and loaded models original_transformed_vecs = self.model.transform(word) @@ -736,7 +744,7 @@ def testPersistence(self): self.assertTrue(passed) def testModelNotFitted(self): - w2vmodel_wrapper = W2VTransformer(size=10, min_count=0, seed=42) + w2vmodel_wrapper = W2VTransformer(vector_size=10, min_count=0, seed=42) word = texts[0][0] self.assertRaises(NotFittedError, w2vmodel_wrapper.transform, word) @@ -835,13 +843,13 @@ def testTransform(self): docs = [w2v_texts[0], w2v_texts[1], w2v_texts[2]] matrix = self.model.transform(docs) self.assertEqual(matrix.shape[0], 3) - self.assertEqual(matrix.shape[1], self.model.size) + self.assertEqual(matrix.shape[1], self.model.vector_size) # tranform one document doc = w2v_texts[0] matrix = self.model.transform(doc) self.assertEqual(matrix.shape[0], 1) - self.assertEqual(matrix.shape[1], self.model.size) + self.assertEqual(matrix.shape[1], self.model.vector_size) def 
testFitTransform(self): model = D2VTransformer(min_count=1) @@ -850,13 +858,13 @@ def testFitTransform(self): docs = [w2v_texts[0], w2v_texts[1], w2v_texts[2]] matrix = model.fit_transform(docs) self.assertEqual(matrix.shape[0], 3) - self.assertEqual(matrix.shape[1], model.size) + self.assertEqual(matrix.shape[1], model.vector_size) # fit and transform one document doc = w2v_texts[0] matrix = model.fit_transform(doc) self.assertEqual(matrix.shape[0], 1) - self.assertEqual(matrix.shape[1], model.size) + self.assertEqual(matrix.shape[1], model.vector_size) def testSetGetParams(self): # updating only one param @@ -896,7 +904,7 @@ def testPersistence(self): # sanity check for transformation operation self.assertEqual(loaded_transformed_vecs.shape[0], 1) - self.assertEqual(loaded_transformed_vecs.shape[1], model_load.size) + self.assertEqual(loaded_transformed_vecs.shape[1], model_load.vector_size) # comparing the original and loaded models original_transformed_vecs = self.model.transform(doc) @@ -1301,9 +1309,9 @@ def testModelNotFitted(self): self.assertRaises(NotFittedError, phrases_transformer.transform, phrases_sentences[0]) -class TestFastTextWrapper(unittest.TestCase): +class TestFTTransformer(unittest.TestCase): def setUp(self): - self.model = FTTransformer(size=10, min_count=0, seed=42, bucket=5000) + self.model = FTTransformer(vector_size=10, min_count=0, seed=42, bucket=5000) self.model.fit(texts) def testTransform(self): @@ -1312,30 +1320,30 @@ def testTransform(self): words = words + texts[0] matrix = self.model.transform(words) self.assertEqual(matrix.shape[0], 3) - self.assertEqual(matrix.shape[1], self.model.size) + self.assertEqual(matrix.shape[1], self.model.vector_size) # tranform one word word = texts[0][0] matrix = self.model.transform(word) self.assertEqual(matrix.shape[0], 1) - self.assertEqual(matrix.shape[1], self.model.size) + self.assertEqual(matrix.shape[1], self.model.vector_size) # verify oov-word vector retrieval invocab_vec = self.model.transform("computer") # invocab word self.assertEqual(invocab_vec.shape[0], 1) - self.assertEqual(invocab_vec.shape[1], self.model.size) + self.assertEqual(invocab_vec.shape[1], self.model.vector_size) oov_vec = self.model.transform('compute') # oov word self.assertEqual(oov_vec.shape[0], 1) - self.assertEqual(oov_vec.shape[1], self.model.size) + self.assertEqual(oov_vec.shape[1], self.model.vector_size) def testConsistencyWithGensimModel(self): # training a FTTransformer - self.model = FTTransformer(size=10, min_count=0, seed=42, workers=1, bucket=5000) + self.model = FTTransformer(vector_size=10, min_count=0, seed=42, workers=1, bucket=5000) self.model.fit(texts) # training a Gensim FastText model with the same params - gensim_ftmodel = models.FastText(texts, size=10, min_count=0, seed=42, workers=1, bucket=5000) + gensim_ftmodel = models.FastText(texts, vector_size=10, min_count=0, seed=42, workers=1, bucket=5000) # vectors returned by FTTransformer vecs_transformer_api = self.model.transform( @@ -1353,7 +1361,7 @@ def testConsistencyWithGensimModel(self): self.assertTrue(passed) def testPipeline(self): - model = FTTransformer(size=10, min_count=1, bucket=5000) + model = FTTransformer(vector_size=10, min_count=1, bucket=5000) model.fit(w2v_texts) class_dict = {'mathematics': 1, 'physics': 0} @@ -1391,7 +1399,7 @@ def testPersistence(self): # sanity check for transformation operation self.assertEqual(loaded_transformed_vecs.shape[0], len(words)) - self.assertEqual(loaded_transformed_vecs.shape[1], model_load.size) + 
self.assertEqual(loaded_transformed_vecs.shape[1], model_load.vector_size) # comparing the original and loaded models original_transformed_vecs = self.model.transform(words) @@ -1399,10 +1407,11 @@ def testPersistence(self): self.assertTrue(passed) def testModelNotFitted(self): - ftmodel_wrapper = FTTransformer(size=10, min_count=0, seed=42, bucket=5000) + ftmodel_wrapper = FTTransformer(vector_size=10, min_count=0, seed=42, bucket=5000) word = texts[0][0] self.assertRaises(NotFittedError, ftmodel_wrapper.transform, word) if __name__ == '__main__': + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() diff --git a/gensim/test/test_translation_matrix.py b/gensim/test/test_translation_matrix.py index 7be7ce4b63..8846dc617d 100644 --- a/gensim/test/test_translation_matrix.py +++ b/gensim/test/test_translation_matrix.py @@ -2,7 +2,7 @@ # encoding: utf-8 from collections import namedtuple import unittest -import math +import logging import numpy as np @@ -91,28 +91,35 @@ def setUp(self): filename = datapath("alldata-id-10.txt") train_docs = read_sentiment_docs(filename) self.train_docs = train_docs - self.source_doc_vec_file = datapath("small_tag_doc_5_iter50") - self.target_doc_vec_file = datapath("large_tag_doc_10_iter50") - - self.source_doc_vec = Doc2Vec.load(self.source_doc_vec_file) - self.target_doc_vec = Doc2Vec.load(self.target_doc_vec_file) + self.source_doc_vec = Doc2Vec(documents=train_docs[:5], vector_size=8, epochs=50, seed=1) + self.target_doc_vec = Doc2Vec(documents=train_docs, vector_size=8, epochs=50, seed=2) def test_translation_matrix(self): model = translation_matrix.BackMappingTranslationMatrix( self.source_doc_vec, self.target_doc_vec, self.train_docs[:5] ) transmat = model.train(self.train_docs[:5]) - self.assertEqual(transmat.shape, (100, 100)) + self.assertEqual(transmat.shape, (8, 8)) def test_infer_vector(self): + """Test that translation gives similar results to traditional inference. + + This may not be completely sensible/salient with such tiny data, but + replaces a nonsensical test. 
+ """ model = translation_matrix.BackMappingTranslationMatrix( self.source_doc_vec, self.target_doc_vec, self.train_docs[:5] ) model.train(self.train_docs[:5]) - infered_vec = model.infer_vector(self.target_doc_vec.docvecs[self.train_docs[5].tags]) - self.assertEqual(infered_vec.shape, (100, )) + backmapped_vec = model.infer_vector(self.target_doc_vec.dv[self.train_docs[5].tags]) + self.assertEqual(backmapped_vec.shape, (8, )) + + d2v_inferred_vector = self.source_doc_vec.infer_vector(self.train_docs[5].words) + + distance = cosine(backmapped_vec, d2v_inferred_vector) + self.assertLessEqual(distance, 0.1) + - expected = 0.6453547135 - eps = 1e-6 - caculated = cosine(self.target_doc_vec.docvecs[self.train_docs[5].tags], infered_vec) - self.assertLessEqual(math.fabs(caculated - expected), eps) +if __name__ == '__main__': + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) + unittest.main() diff --git a/gensim/test/test_utils.py b/gensim/test/test_utils.py index 910dea3fb1..626c0de06c 100644 --- a/gensim/test/test_utils.py +++ b/gensim/test/test_utils.py @@ -8,7 +8,6 @@ """ from __future__ import unicode_literals -import sys import logging import unittest @@ -18,8 +17,6 @@ from gensim import utils from gensim.test.utils import datapath, get_tmpfile -import gensim.models.utils_any2vec - class TestIsCorpus(unittest.TestCase): def test_None(self): @@ -268,257 +265,6 @@ def test_save_as_line_sentence_ru(self): self.assertEqual(sentences, ref_sentences) -def hash_main(alg): - """Generate hash values for test from standard input.""" - import sys - import six - - assert six.PY3, 'this only works under Py3' - - hashmap = { - 'cy_broken': gensim.models.utils_any2vec.ft_hash_broken, - 'cy_bytes': gensim.models.utils_any2vec.ft_hash_bytes, - } - try: - fun = hashmap[alg] - except KeyError: - raise KeyError('invalid alg: %r expected one of %r' % (alg, sorted(hashmap))) - - for line in sys.stdin: - if 'bytes' in alg: - words = line.encode('utf-8').rstrip().split(b' ') - else: - words = line.rstrip().split(' ') - for word in words: - print('u%r: %r,' % (word, fun(word))) - - -class HashTest(unittest.TestCase): - def setUp(self): - # - # I obtained these expected values using: - # - # $ echo word1 ... wordN | python -c 'from gensim.test.test_utils import hash_main;hash_main("alg")' # noqa: E501 - # - # where alg is one of py_bytes, py_broken, cy_bytes, cy_broken. 
- - # - self.expected = { - u'команда': 1725507386, - u'маленьких': 3011324125, - u'друзей': 737001801, - u'возит': 4225261911, - u'грузы': 1301826944, - u'всех': 706328732, - u'быстрей': 1379730754, - u'mysterious': 1903186891, - u'asteroid': 1988297200, - u'odyssey': 310195777, - u'introduction': 2848265721, - u'北海道': 4096045468, - u'札幌': 3909947444, - u'西区': 3653372632, - } - self.expected_broken = { - u'команда': 962806708, - u'маленьких': 3633597485, - u'друзей': 214728041, - u'возит': 3590926132, - u'грузы': 3674544745, - u'всех': 3931012458, - u'быстрей': 822471432, - u'mysterious': 1903186891, - u'asteroid': 1988297200, - u'odyssey': 310195777, - u'introduction': 2848265721, - u'北海道': 4017049120, - u'札幌': 1706980764, - u'西区': 1113327900, - } - - def test_cython(self): - actual = {k: gensim.models.utils_any2vec.ft_hash_bytes(k.encode('utf-8')) for k in self.expected} - self.assertEqual(self.expected, actual) - - def test_cython_broken(self): - actual = {k: gensim.models.utils_any2vec.ft_hash_broken(k) for k in self.expected} - self.assertEqual(self.expected_broken, actual) - - -# -# Run with: -# -# python -c 'import gensim.test.test_utils as t;t.ngram_main()' py_text 3 5 -# -def ngram_main(): - """Generate ngrams for tests from standard input.""" - import sys - import six - - alg = sys.argv[1] - minn = int(sys.argv[2]) - maxn = int(sys.argv[3]) - - assert six.PY3, 'this only works under Py3' - assert minn <= maxn, 'expected sane command-line parameters' - - hashmap = { - 'cy_text': gensim.models.utils_any2vec.compute_ngrams, - 'cy_bytes': gensim.models.utils_any2vec.compute_ngrams_bytes, - } - try: - fun = hashmap[alg] - except KeyError: - raise KeyError('invalid alg: %r expected one of %r' % (alg, sorted(hashmap))) - - for line in sys.stdin: - word = line.rstrip('\n') - ngrams = fun(word, minn, maxn) - print("%r: %r," % (word, ngrams)) - - -class NgramsTest(unittest.TestCase): - def setUp(self): - self.expected_text = { - 'test': ['', '', ''], - 'at the': [ - '', - '', '' - ], - 'at\nthe': [ - '', - '', '' - ], - 'тест': ['<те', 'тес', 'ест', 'ст>', '<тес', 'тест', 'ест>', '<тест', 'тест>'], - 'テスト': ['<テス', 'テスト', 'スト>', '<テスト', 'テスト>', '<テスト>'], - '試し': ['<試し', '試し>', '<試し>'], - } - self.expected_bytes = { - 'test': [b'', b'est', b'est>', b'st>'], - 'at the': [ - b'', b'the', b'the>', b'he>' - ], - 'тест': [ - b'<\xd1\x82\xd0\xb5', b'<\xd1\x82\xd0\xb5\xd1\x81', b'<\xd1\x82\xd0\xb5\xd1\x81\xd1\x82', - b'\xd1\x82\xd0\xb5\xd1\x81', b'\xd1\x82\xd0\xb5\xd1\x81\xd1\x82', b'\xd1\x82\xd0\xb5\xd1\x81\xd1\x82>', - b'\xd0\xb5\xd1\x81\xd1\x82', b'\xd0\xb5\xd1\x81\xd1\x82>', b'\xd1\x81\xd1\x82>' - ], - 'テスト': [ - b'<\xe3\x83\x86\xe3\x82\xb9', b'<\xe3\x83\x86\xe3\x82\xb9\xe3\x83\x88', - b'<\xe3\x83\x86\xe3\x82\xb9\xe3\x83\x88>', b'\xe3\x83\x86\xe3\x82\xb9\xe3\x83\x88', - b'\xe3\x83\x86\xe3\x82\xb9\xe3\x83\x88>', b'\xe3\x82\xb9\xe3\x83\x88>' - ], - '試し': [b'<\xe8\xa9\xa6\xe3\x81\x97', b'<\xe8\xa9\xa6\xe3\x81\x97>', b'\xe8\xa9\xa6\xe3\x81\x97>'], - } - - self.expected_text_wide_unicode = { - '🚑🚒🚓🚕': [ - '<🚑🚒', '🚑🚒🚓', '🚒🚓🚕', '🚓🚕>', - '<🚑🚒🚓', '🚑🚒🚓🚕', '🚒🚓🚕>', '<🚑🚒🚓🚕', '🚑🚒🚓🚕>' - ], - } - self.expected_bytes_wide_unicode = { - '🚑🚒🚓🚕': [ - b'<\xf0\x9f\x9a\x91\xf0\x9f\x9a\x92', - b'<\xf0\x9f\x9a\x91\xf0\x9f\x9a\x92\xf0\x9f\x9a\x93', - b'<\xf0\x9f\x9a\x91\xf0\x9f\x9a\x92\xf0\x9f\x9a\x93\xf0\x9f\x9a\x95', - b'\xf0\x9f\x9a\x91\xf0\x9f\x9a\x92\xf0\x9f\x9a\x93', - b'\xf0\x9f\x9a\x91\xf0\x9f\x9a\x92\xf0\x9f\x9a\x93\xf0\x9f\x9a\x95', - b'\xf0\x9f\x9a\x91\xf0\x9f\x9a\x92\xf0\x9f\x9a\x93\xf0\x9f\x9a\x95>', - 
b'\xf0\x9f\x9a\x92\xf0\x9f\x9a\x93\xf0\x9f\x9a\x95', - b'\xf0\x9f\x9a\x92\xf0\x9f\x9a\x93\xf0\x9f\x9a\x95>', - b'\xf0\x9f\x9a\x93\xf0\x9f\x9a\x95>' - ], - } - - def test_text_cy(self): - for word in self.expected_text: - expected = self.expected_text[word] - actual = gensim.models.utils_any2vec.compute_ngrams(word, 3, 5) - self.assertEqual(expected, actual) - - @unittest.skipIf(sys.maxunicode == 0xffff, "Python interpreter doesn't support UCS-4 (wide unicode)") - def test_text_cy_wide_unicode(self): - for word in self.expected_text_wide_unicode: - expected = self.expected_text_wide_unicode[word] - actual = gensim.models.utils_any2vec.compute_ngrams(word, 3, 5) - self.assertEqual(expected, actual) - - def test_bytes_cy(self): - for word in self.expected_bytes: - expected = self.expected_bytes[word] - actual = gensim.models.utils_any2vec.compute_ngrams_bytes(word, 3, 5) - self.assertEqual(expected, actual) - - expected_text = self.expected_text[word] - actual_text = [n.decode('utf-8') for n in actual] - self.assertEqual(sorted(expected_text), sorted(actual_text)) - - for word in self.expected_bytes_wide_unicode: - expected = self.expected_bytes_wide_unicode[word] - actual = gensim.models.utils_any2vec.compute_ngrams_bytes(word, 3, 5) - self.assertEqual(expected, actual) - - expected_text = self.expected_text_wide_unicode[word] - actual_text = [n.decode('utf-8') for n in actual] - self.assertEqual(sorted(expected_text), sorted(actual_text)) - - def test_fb(self): - """Test against results from Facebook's implementation.""" - with utils.open(datapath('fb-ngrams.txt'), 'r', encoding='utf-8') as fin: - fb = dict(_read_fb(fin)) - - for word, expected in fb.items(): - # - # The model was trained with minn=3, maxn=6 - # - actual = gensim.models.utils_any2vec.compute_ngrams(word, 3, 6) - self.assertEqual(sorted(expected), sorted(actual)) - - -def _read_fb(fin): - """Read ngrams from output of the FB utility.""" - # - # $ cat words.txt - # test - # at the - # at\nthe - # тест - # テスト - # 試し - # 🚑🚒🚓🚕 - # $ while read w; - # do - # echo ""; - # echo $w; - # ./fasttext print-ngrams gensim/test/test_data/crime-and-punishment.bin "$w"; - # echo ""; - # done < words.txt > gensim/test/test_data/fb-ngrams.txt - # - while fin: - line = fin.readline().rstrip() - if not line: - break - - assert line == '' - word = fin.readline().rstrip() - - fin.readline() # ignore this line, it contains an origin vector for the full term - - ngrams = [] - while True: - line = fin.readline().rstrip() - if line == '': - break - - columns = line.split(' ') - term = ' '.join(columns[:-5]) - ngrams.append(term) - - yield word, ngrams - - if __name__ == '__main__': logging.root.setLevel(logging.WARNING) unittest.main() diff --git a/gensim/test/test_utils_any2vec.py b/gensim/test/test_utils_any2vec.py deleted file mode 100644 index f4c5c2c430..0000000000 --- a/gensim/test/test_utils_any2vec.py +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2017 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -""" -Automated tests for checking utils_any2vec functionality. 
-""" - -import logging -import unittest - -import numpy as np - -import gensim.utils -import gensim.test.utils - -import gensim.models.utils_any2vec - - -logger = logging.getLogger(__name__) - - -def save_dict_to_word2vec_formated_file(fname, word2vec_dict): - - with gensim.utils.open(fname, "bw") as f: - - num_words = len(word2vec_dict) - vector_length = len(list(word2vec_dict.values())[0]) - - header = "%d %d\n" % (num_words, vector_length) - f.write(header.encode(encoding="ascii")) - - for word, vector in word2vec_dict.items(): - f.write(word.encode()) - f.write(' '.encode()) - f.write(np.array(vector).astype(np.float32).tobytes()) - - -class LoadWord2VecFormatTest(unittest.TestCase): - - def assert_dict_equal_to_model(self, d, m): - self.assertEqual(len(d), len(m.vocab)) - - for word in d.keys(): - self.assertSequenceEqual(list(d[word]), list(m[word])) - - def verify_load2vec_binary_result(self, w2v_dict, binary_chunk_size, limit): - tmpfile = gensim.test.utils.get_tmpfile("tmp_w2v") - save_dict_to_word2vec_formated_file(tmpfile, w2v_dict) - w2v_model = \ - gensim.models.utils_any2vec._load_word2vec_format( - cls=gensim.models.KeyedVectors, - fname=tmpfile, - binary=True, - limit=limit, - binary_chunk_size=binary_chunk_size) - if limit is None: - limit = len(w2v_dict) - - w2v_keys_postprocessed = list(w2v_dict.keys())[:limit] - w2v_dict_postprocessed = {k.lstrip(): w2v_dict[k] for k in w2v_keys_postprocessed} - - self.assert_dict_equal_to_model(w2v_dict_postprocessed, w2v_model) - - def test_load_word2vec_format_basic(self): - w2v_dict = {"abc": [1, 2, 3], - "cde": [4, 5, 6], - "def": [7, 8, 9]} - self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=5, limit=None) - self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=16, limit=None) - self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=1024, limit=None) - - w2v_dict = {"abc": [1, 2, 3], - "cdefg": [4, 5, 6], - "d": [7, 8, 9]} - self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=5, limit=None) - self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=16, limit=None) - self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=1024, limit=None) - - def test_load_word2vec_format_limit(self): - w2v_dict = {"abc": [1, 2, 3], - "cde": [4, 5, 6], - "def": [7, 8, 9]} - self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=5, limit=1) - self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=16, limit=1) - self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=1024, limit=1) - - w2v_dict = {"abc": [1, 2, 3], - "cde": [4, 5, 6], - "def": [7, 8, 9]} - self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=5, limit=2) - self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=16, limit=2) - self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=1024, limit=2) - - w2v_dict = {"abc": [1, 2, 3], - "cdefg": [4, 5, 6], - "d": [7, 8, 9]} - - self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=5, limit=1) - self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=16, limit=1) - self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=1024, limit=1) - - w2v_dict = {"abc": [1, 2, 3], - "cdefg": [4, 5, 6], - "d": [7, 8, 9]} - self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=5, limit=2) - self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=16, limit=2) - self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=1024, limit=2) - - def test_load_word2vec_format_space_stripping(self): - w2v_dict = {"\nabc": 
[1, 2, 3], - "cdefdg": [4, 5, 6], - "\n\ndef": [7, 8, 9]} - self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=5, limit=None) - self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=5, limit=1) - - -if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) - unittest.main() diff --git a/gensim/test/test_varembed_wrapper.py b/gensim/test/test_varembed_wrapper.py index 54401a15e6..9d0a16d6e3 100644 --- a/gensim/test/test_varembed_wrapper.py +++ b/gensim/test/test_varembed_wrapper.py @@ -48,7 +48,7 @@ def testSimilarity(self): def model_sanity(self, model): """Check vocabulary and vector size""" self.assertEqual(model.vectors.shape, (model.vocab_size, model.vector_size)) - self.assertTrue(model.vectors.shape[0] == len(model.vocab)) + self.assertTrue(model.vectors.shape[0] == len(model)) @unittest.skipIf(sys.version_info < (2, 7), 'Supported only on Python 2.7 and above') def testAddMorphemesToEmbeddings(self): diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index ef176754da..1be1ea9d21 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -20,7 +20,8 @@ from gensim import utils from gensim.models import word2vec, keyedvectors -from gensim.test.utils import datapath, get_tmpfile, temporary_file, common_texts as sentences +from gensim.test.utils import datapath, get_tmpfile, temporary_file, common_texts as sentences, \ + LeeCorpus, lee_corpus_list from testfixtures import log_capture try: @@ -30,15 +31,6 @@ PYEMD_EXT = False -class LeeCorpus(object): - def __iter__(self): - with open(datapath('lee_background.cor')) as f: - for line in f: - yield utils.simple_preprocess(line) - - -list_corpus = list(LeeCorpus()) - new_sentences = [ ['computer', 'artificial', 'intelligence'], ['artificial', 'trees'], @@ -75,45 +67,26 @@ def testBuildVocabFromFreq(self): 'survey': 2, 'user': 3, 'human': 2, 'time': 2, 'interface': 2, 'response': 2 } - model_hs = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=1, negative=0) - model_neg = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=0, negative=5) + freq_dict_orig = freq_dict.copy() + model_hs = word2vec.Word2Vec(vector_size=10, min_count=0, seed=42, hs=1, negative=0) + model_neg = word2vec.Word2Vec(vector_size=10, min_count=0, seed=42, hs=0, negative=5) model_hs.build_vocab_from_freq(freq_dict) model_neg.build_vocab_from_freq(freq_dict) - self.assertEqual(len(model_hs.wv.vocab), 12) - self.assertEqual(len(model_neg.wv.vocab), 12) - self.assertEqual(model_hs.wv.vocab['minors'].count, 2) - self.assertEqual(model_hs.wv.vocab['graph'].count, 3) - self.assertEqual(model_hs.wv.vocab['system'].count, 4) - self.assertEqual(model_hs.wv.vocab['trees'].count, 3) - self.assertEqual(model_hs.wv.vocab['eps'].count, 2) - self.assertEqual(model_hs.wv.vocab['computer'].count, 2) - self.assertEqual(model_hs.wv.vocab['survey'].count, 2) - self.assertEqual(model_hs.wv.vocab['user'].count, 3) - self.assertEqual(model_hs.wv.vocab['human'].count, 2) - self.assertEqual(model_hs.wv.vocab['time'].count, 2) - self.assertEqual(model_hs.wv.vocab['interface'].count, 2) - self.assertEqual(model_hs.wv.vocab['response'].count, 2) - self.assertEqual(model_neg.wv.vocab['minors'].count, 2) - self.assertEqual(model_neg.wv.vocab['graph'].count, 3) - self.assertEqual(model_neg.wv.vocab['system'].count, 4) - self.assertEqual(model_neg.wv.vocab['trees'].count, 3) - self.assertEqual(model_neg.wv.vocab['eps'].count, 2) - 
self.assertEqual(model_neg.wv.vocab['computer'].count, 2) - self.assertEqual(model_neg.wv.vocab['survey'].count, 2) - self.assertEqual(model_neg.wv.vocab['user'].count, 3) - self.assertEqual(model_neg.wv.vocab['human'].count, 2) - self.assertEqual(model_neg.wv.vocab['time'].count, 2) - self.assertEqual(model_neg.wv.vocab['interface'].count, 2) - self.assertEqual(model_neg.wv.vocab['response'].count, 2) + self.assertEqual(len(model_hs.wv), 12) + self.assertEqual(len(model_neg.wv), 12) + for k in freq_dict_orig.keys(): + self.assertEqual(model_hs.wv.get_vecattr(k, 'count'), freq_dict_orig[k]) + self.assertEqual(model_neg.wv.get_vecattr(k, 'count'), freq_dict_orig[k]) + new_freq_dict = { 'computer': 1, 'artificial': 4, 'human': 1, 'graph': 1, 'intelligence': 4, 'system': 1, 'trees': 1 } model_hs.build_vocab_from_freq(new_freq_dict, update=True) model_neg.build_vocab_from_freq(new_freq_dict, update=True) - self.assertEqual(model_hs.wv.vocab['graph'].count, 4) - self.assertEqual(model_hs.wv.vocab['artificial'].count, 4) - self.assertEqual(len(model_hs.wv.vocab), 14) - self.assertEqual(len(model_neg.wv.vocab), 14) + self.assertEqual(model_hs.wv.get_vecattr('graph', 'count'), 4) + self.assertEqual(model_hs.wv.get_vecattr('artificial', 'count'), 4) + self.assertEqual(len(model_hs.wv), 14) + self.assertEqual(len(model_neg.wv), 14) def testPruneVocab(self): """Test Prune vocab while scanning sentences""" @@ -123,10 +96,10 @@ def testPruneVocab(self): ["system", "eps"], ["graph", "system"] ] - model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0) - self.assertEqual(len(model.wv.vocab), 2) - self.assertEqual(model.wv.vocab['graph'].count, 3) - self.assertEqual(model.wv.vocab['system'].count, 4) + model = word2vec.Word2Vec(sentences, vector_size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0) + self.assertEqual(len(model.wv), 2) + self.assertEqual(model.wv.get_vecattr('graph', 'count'), 3) + self.assertEqual(model.wv.get_vecattr('system', 'count'), 4) sentences = [ ["graph", "system"], @@ -135,63 +108,63 @@ def testPruneVocab(self): ["graph", "system"], ["minors", "survey", "minors", "survey", "minors"] ] - model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0) - self.assertEqual(len(model.wv.vocab), 3) - self.assertEqual(model.wv.vocab['graph'].count, 3) - self.assertEqual(model.wv.vocab['minors'].count, 3) - self.assertEqual(model.wv.vocab['system'].count, 4) + model = word2vec.Word2Vec(sentences, vector_size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0) + self.assertEqual(len(model.wv), 3) + self.assertEqual(model.wv.get_vecattr('graph', 'count'), 3) + self.assertEqual(model.wv.get_vecattr('minors', 'count'), 3) + self.assertEqual(model.wv.get_vecattr('system', 'count'), 4) def testTotalWordCount(self): - model = word2vec.Word2Vec(size=10, min_count=0, seed=42) - total_words = model.vocabulary.scan_vocab(sentences)[0] + model = word2vec.Word2Vec(vector_size=10, min_count=0, seed=42) + total_words = model.scan_vocab(sentences)[0] self.assertEqual(total_words, 29) def testMaxFinalVocab(self): # Test for less restricting effect of max_final_vocab # max_final_vocab is specified but has no effect - model = word2vec.Word2Vec(size=10, max_final_vocab=4, min_count=4, sample=0) - model.vocabulary.scan_vocab(sentences) - reported_values = model.vocabulary.prepare_vocab(wv=model.wv, hs=0, negative=0) + model = word2vec.Word2Vec(vector_size=10, max_final_vocab=4, min_count=4, 
sample=0) + model.scan_vocab(sentences) + reported_values = model.prepare_vocab() self.assertEqual(reported_values['drop_unique'], 11) self.assertEqual(reported_values['retain_total'], 4) self.assertEqual(reported_values['num_retained_words'], 1) - self.assertEqual(model.vocabulary.effective_min_count, 4) + self.assertEqual(model.effective_min_count, 4) # Test for more restricting effect of max_final_vocab # results in setting a min_count more restricting than specified min_count - model = word2vec.Word2Vec(size=10, max_final_vocab=4, min_count=2, sample=0) - model.vocabulary.scan_vocab(sentences) - reported_values = model.vocabulary.prepare_vocab(wv=model.wv, hs=0, negative=0) + model = word2vec.Word2Vec(vector_size=10, max_final_vocab=4, min_count=2, sample=0) + model.scan_vocab(sentences) + reported_values = model.prepare_vocab() self.assertEqual(reported_values['drop_unique'], 8) self.assertEqual(reported_values['retain_total'], 13) self.assertEqual(reported_values['num_retained_words'], 4) - self.assertEqual(model.vocabulary.effective_min_count, 3) + self.assertEqual(model.effective_min_count, 3) def testOnlineLearning(self): """Test that the algorithm is able to add new words to the vocabulary and to a trained model when using a sorted vocabulary""" - model_hs = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=1, negative=0) - model_neg = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=0, negative=5) - self.assertTrue(len(model_hs.wv.vocab), 12) - self.assertTrue(model_hs.wv.vocab['graph'].count, 3) + model_hs = word2vec.Word2Vec(sentences, vector_size=10, min_count=0, seed=42, hs=1, negative=0) + model_neg = word2vec.Word2Vec(sentences, vector_size=10, min_count=0, seed=42, hs=0, negative=5) + self.assertTrue(len(model_hs.wv), 12) + self.assertTrue(model_hs.wv.get_vecattr('graph', 'count'), 3) model_hs.build_vocab(new_sentences, update=True) model_neg.build_vocab(new_sentences, update=True) - self.assertTrue(model_hs.wv.vocab['graph'].count, 4) - self.assertTrue(model_hs.wv.vocab['artificial'].count, 4) - self.assertEqual(len(model_hs.wv.vocab), 14) - self.assertEqual(len(model_neg.wv.vocab), 14) + self.assertTrue(model_hs.wv.get_vecattr('graph', 'count'), 4) + self.assertTrue(model_hs.wv.get_vecattr('artificial', 'count'), 4) + self.assertEqual(len(model_hs.wv), 14) + self.assertEqual(len(model_neg.wv), 14) def testOnlineLearningAfterSave(self): """Test that the algorithm is able to add new words to the vocabulary and to a trained model when using a sorted vocabulary""" tmpf = get_tmpfile('gensim_word2vec.tst') - model_neg = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=0, negative=5) + model_neg = word2vec.Word2Vec(sentences, vector_size=10, min_count=0, seed=42, hs=0, negative=5) model_neg.save(tmpf) model_neg = word2vec.Word2Vec.load(tmpf) - self.assertTrue(len(model_neg.wv.vocab), 12) + self.assertTrue(len(model_neg.wv), 12) model_neg.build_vocab(new_sentences, update=True) model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.epochs) - self.assertEqual(len(model_neg.wv.vocab), 14) + self.assertEqual(len(model_neg.wv), 14) @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27") def testOnlineLearningFromFile(self): @@ -202,20 +175,22 @@ def testOnlineLearningFromFile(self): utils.save_as_line_sentence(sentences, corpus_file) utils.save_as_line_sentence(new_sentences, new_corpus_file) - model_hs = word2vec.Word2Vec(corpus_file=corpus_file, size=10, 
min_count=0, seed=42, hs=1, negative=0) - model_neg = word2vec.Word2Vec(corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=0, negative=5) - self.assertTrue(len(model_hs.wv.vocab), 12) - self.assertTrue(model_hs.wv.vocab['graph'].count, 3) + model_hs = word2vec.Word2Vec(corpus_file=corpus_file, vector_size=10, min_count=0, seed=42, + hs=1, negative=0) + model_neg = word2vec.Word2Vec(corpus_file=corpus_file, vector_size=10, min_count=0, seed=42, + hs=0, negative=5) + self.assertTrue(len(model_hs.wv), 12) + self.assertTrue(model_hs.wv.get_vecattr('graph', 'count'), 3) model_hs.build_vocab(corpus_file=new_corpus_file, update=True) model_hs.train(corpus_file=new_corpus_file, total_words=model_hs.corpus_total_words, epochs=model_hs.epochs) model_neg.build_vocab(corpus_file=new_corpus_file, update=True) model_neg.train( corpus_file=new_corpus_file, total_words=model_hs.corpus_total_words, epochs=model_hs.epochs) - self.assertTrue(model_hs.wv.vocab['graph'].count, 4) - self.assertTrue(model_hs.wv.vocab['artificial'].count, 4) - self.assertEqual(len(model_hs.wv.vocab), 14) - self.assertEqual(len(model_neg.wv.vocab), 14) + self.assertTrue(model_hs.wv.get_vecattr('graph', 'count'), 4) + self.assertTrue(model_hs.wv.get_vecattr('artificial', 'count'), 4) + self.assertEqual(len(model_hs.wv), 14) + self.assertEqual(len(model_neg.wv), 14) @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27") def testOnlineLearningAfterSaveFromFile(self): @@ -227,31 +202,32 @@ def testOnlineLearningAfterSaveFromFile(self): utils.save_as_line_sentence(new_sentences, new_corpus_file) tmpf = get_tmpfile('gensim_word2vec.tst') - model_neg = word2vec.Word2Vec(corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=0, negative=5) + model_neg = word2vec.Word2Vec(corpus_file=corpus_file, vector_size=10, min_count=0, seed=42, + hs=0, negative=5) model_neg.save(tmpf) model_neg = word2vec.Word2Vec.load(tmpf) - self.assertTrue(len(model_neg.wv.vocab), 12) + self.assertTrue(len(model_neg.wv), 12) # Check that training works on the same data after load without calling build_vocab model_neg.train(corpus_file=corpus_file, total_words=model_neg.corpus_total_words, epochs=model_neg.epochs) # Train on new corpus file model_neg.build_vocab(corpus_file=new_corpus_file, update=True) model_neg.train(corpus_file=new_corpus_file, total_words=model_neg.corpus_total_words, epochs=model_neg.epochs) - self.assertEqual(len(model_neg.wv.vocab), 14) + self.assertEqual(len(model_neg.wv), 14) def onlineSanity(self, model, trained_model=False): terro, others = [], [] - for x in list_corpus: - if 'terrorism' in x: - terro.append(x) + for line in lee_corpus_list: + if 'terrorism' in line: + terro.append(line) else: - others.append(x) - self.assertTrue(all('terrorism' not in x for x in others)) + others.append(line) + self.assertTrue(all('terrorism' not in line for line in others)) model.build_vocab(others, update=trained_model) model.train(others, total_examples=model.corpus_count, epochs=model.epochs) - self.assertFalse('terrorism' in model.wv.vocab) + self.assertFalse('terrorism' in model.wv) model.build_vocab(terro, update=True) - self.assertTrue('terrorism' in model.wv.vocab) + self.assertTrue('terrorism' in model.wv) orig0 = np.copy(model.wv.vectors) model.train(terro, total_examples=len(terro), epochs=model.epochs) self.assertFalse(np.allclose(model.wv.vectors, orig0)) @@ -260,19 +236,19 @@ def onlineSanity(self, model, trained_model=False): def test_sg_hs_online(self): """Test skipgram w/ 
hierarchical softmax""" - model = word2vec.Word2Vec(sg=1, window=5, hs=1, negative=0, min_count=3, iter=10, seed=42, workers=2) + model = word2vec.Word2Vec(sg=1, window=5, hs=1, negative=0, min_count=3, epochs=10, seed=42, workers=2) self.onlineSanity(model) def test_sg_neg_online(self): """Test skipgram w/ negative sampling""" - model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=3, iter=10, seed=42, workers=2) + model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=3, epochs=10, seed=42, workers=2) self.onlineSanity(model) def test_cbow_hs_online(self): """Test CBOW w/ hierarchical softmax""" model = word2vec.Word2Vec( sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0, - min_count=3, iter=10, seed=42, workers=2 + min_count=3, epochs=20, seed=42, workers=2 ) self.onlineSanity(model) @@ -280,7 +256,7 @@ def test_cbow_neg_online(self): """Test CBOW w/ negative sampling""" model = word2vec.Word2Vec( sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15, - min_count=5, iter=10, seed=42, workers=2, sample=0 + min_count=5, epochs=10, seed=42, workers=2, sample=0 ) self.onlineSanity(model) @@ -295,7 +271,7 @@ def testPersistence(self): wv.save(tmpf) loaded_wv = keyedvectors.KeyedVectors.load(tmpf) self.assertTrue(np.allclose(wv.vectors, loaded_wv.vectors)) - self.assertEqual(len(wv.vocab), len(loaded_wv.vocab)) + self.assertEqual(len(wv), len(loaded_wv)) @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27") def testPersistenceFromFile(self): @@ -312,7 +288,7 @@ def testPersistenceFromFile(self): wv.save(tmpf) loaded_wv = keyedvectors.KeyedVectors.load(tmpf) self.assertTrue(np.allclose(wv.vectors, loaded_wv.vectors)) - self.assertEqual(len(wv.vocab), len(loaded_wv.vocab)) + self.assertEqual(len(wv), len(loaded_wv)) def testPersistenceWithConstructorRule(self): """Test storing/loading the entire model with a vocab trimming rule passed in the constructor.""" @@ -324,15 +300,15 @@ def testPersistenceWithConstructorRule(self): def testRuleWithMinCount(self): """Test that returning RULE_DEFAULT from trim_rule triggers min_count.""" model = word2vec.Word2Vec(sentences + [["occurs_only_once"]], min_count=2, trim_rule=_rule) - self.assertTrue("human" not in model.wv.vocab) - self.assertTrue("occurs_only_once" not in model.wv.vocab) - self.assertTrue("interface" in model.wv.vocab) + self.assertTrue("human" not in model.wv) + self.assertTrue("occurs_only_once" not in model.wv) + self.assertTrue("interface" in model.wv) def testRule(self): """Test applying vocab trim_rule to build_vocab instead of constructor.""" model = word2vec.Word2Vec(min_count=1) model.build_vocab(sentences, trim_rule=_rule) - self.assertTrue("human" not in model.wv.vocab) + self.assertTrue("human" not in model.wv) def testLambdaRule(self): """Test that lambda trim_rule works.""" @@ -340,23 +316,9 @@ def rule(word, count, min_count): return utils.RULE_DISCARD if word == "human" else utils.RULE_DEFAULT model = word2vec.Word2Vec(sentences, min_count=1, trim_rule=rule) - self.assertTrue("human" not in model.wv.vocab) - - def testSyn0NormNotSaved(self): - """Test syn0norm isn't saved in model file""" - tmpf = get_tmpfile('gensim_word2vec.tst') - model = word2vec.Word2Vec(sentences, min_count=1) - model.init_sims() - model.save(tmpf) - loaded_model = word2vec.Word2Vec.load(tmpf) - self.assertTrue(loaded_model.wv.vectors_norm is None) - - wv = model.wv - wv.save(tmpf) - loaded_kv = keyedvectors.KeyedVectors.load(tmpf) - 
self.assertTrue(loaded_kv.vectors_norm is None) + self.assertTrue("human" not in model.wv) - def testLoadPreKeyedVectorModel(self): + def obsolete_testLoadPreKeyedVectorModel(self): """Test loading pre-KeyedVectors word2vec model""" if sys.version_info[:2] == (3, 4): @@ -369,33 +331,31 @@ def testLoadPreKeyedVectorModel(self): # Model stored in one file model_file = 'word2vec_pre_kv%s' % model_file_suffix model = word2vec.Word2Vec.load(datapath(model_file)) - self.assertTrue(model.wv.vectors.shape == (len(model.wv.vocab), model.vector_size)) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), model.vector_size)) + self.assertTrue(model.wv.vectors.shape == (len(model.wv), model.vector_size)) + self.assertTrue(model.syn1neg.shape == (len(model.wv), model.vector_size)) # Model stored in multiple files model_file = 'word2vec_pre_kv_sep%s' % model_file_suffix model = word2vec.Word2Vec.load(datapath(model_file)) - self.assertTrue(model.wv.vectors.shape == (len(model.wv.vocab), model.vector_size)) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), model.vector_size)) + self.assertTrue(model.wv.vectors.shape == (len(model.wv), model.vector_size)) + self.assertTrue(model.syn1neg.shape == (len(model.wv), model.vector_size)) def testLoadPreKeyedVectorModelCFormat(self): """Test loading pre-KeyedVectors word2vec model saved in word2vec format""" model = keyedvectors.KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c')) - self.assertTrue(model.vectors.shape[0] == len(model.vocab)) + self.assertTrue(model.vectors.shape[0] == len(model)) def testPersistenceWord2VecFormat(self): """Test storing/loading the entire model in word2vec format.""" tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) - model.init_sims() model.wv.save_word2vec_format(tmpf, binary=True) binary_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True) - binary_model_kv.init_sims(replace=False) self.assertTrue(np.allclose(model.wv['human'], binary_model_kv['human'])) norm_only_model = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True) - norm_only_model.init_sims(replace=True) + norm_only_model.unit_normalize_all() self.assertFalse(np.allclose(model.wv['human'], norm_only_model['human'])) - self.assertTrue(np.allclose(model.wv.vectors_norm[model.wv.vocab['human'].index], norm_only_model['human'])) + self.assertTrue(np.allclose(model.wv.get_vector('human', use_norm=True), norm_only_model['human'])) limited_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True, limit=3) self.assertEqual(len(limited_model_kv.vectors), 3) half_precision_model_kv = keyedvectors.KeyedVectors.load_word2vec_format( @@ -406,7 +366,6 @@ def testPersistenceWord2VecFormat(self): def testNoTrainingCFormat(self): tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) - model.init_sims() model.wv.save_word2vec_format(tmpf, binary=True) kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True) binary_model = word2vec.Word2Vec() @@ -416,7 +375,6 @@ def testNoTrainingCFormat(self): def testTooShortBinaryWord2VecFormat(self): tfile = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) - model.init_sims() model.wv.save_word2vec_format(tfile, binary=True) f = open(tfile, 'r+b') f.write(b'13') # write wrong (too-long) vector count @@ -426,7 +384,6 @@ def testTooShortBinaryWord2VecFormat(self): def testTooShortTextWord2VecFormat(self): tfile = 
get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) - model.init_sims() model.wv.save_word2vec_format(tfile, binary=False) f = open(tfile, 'r+b') f.write(b'13') # write wrong (too-long) vector count @@ -437,37 +394,35 @@ def testPersistenceWord2VecFormatNonBinary(self): """Test storing/loading the entire model in word2vec non-binary format.""" tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) - model.init_sims() model.wv.save_word2vec_format(tmpf, binary=False) text_model = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=False) - text_model.init_sims(False) self.assertTrue(np.allclose(model.wv['human'], text_model['human'], atol=1e-6)) norm_only_model = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=False) - norm_only_model.init_sims(True) + norm_only_model.unit_normalize_all() self.assertFalse(np.allclose(model.wv['human'], norm_only_model['human'], atol=1e-6)) self.assertTrue(np.allclose( - model.wv.vectors_norm[model.wv.vocab['human'].index], norm_only_model['human'], atol=1e-4 + model.wv.get_vector('human', use_norm=True), norm_only_model['human'], atol=1e-4 )) def testPersistenceWord2VecFormatWithVocab(self): """Test storing/loading the entire model and vocabulary in word2vec format.""" tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) - model.init_sims() testvocab = get_tmpfile('gensim_word2vec.vocab') model.wv.save_word2vec_format(tmpf, testvocab, binary=True) binary_model_with_vocab_kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, testvocab, binary=True) - self.assertEqual(model.wv.vocab['human'].count, binary_model_with_vocab_kv.vocab['human'].count) + self.assertEqual(model.wv.get_vecattr('human', 'count'), + binary_model_with_vocab_kv.get_vecattr('human', 'count')) def testPersistenceKeyedVectorsFormatWithVocab(self): """Test storing/loading the entire model and vocabulary in word2vec format.""" tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) - model.init_sims() testvocab = get_tmpfile('gensim_word2vec.vocab') model.wv.save_word2vec_format(tmpf, testvocab, binary=True) kv_binary_model_with_vocab = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, testvocab, binary=True) - self.assertEqual(model.wv.vocab['human'].count, kv_binary_model_with_vocab.vocab['human'].count) + self.assertEqual(model.wv.get_vecattr('human', 'count'), + kv_binary_model_with_vocab.get_vecattr('human', 'count')) def testPersistenceWord2VecFormatCombinationWithStandardPersistence(self): """Test storing/loading the entire model and vocabulary in word2vec format chained with @@ -475,7 +430,6 @@ def testPersistenceWord2VecFormatCombinationWithStandardPersistence(self): It was possible prior to 1.0.0 release, now raises Exception""" tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) - model.init_sims() testvocab = get_tmpfile('gensim_word2vec.vocab') model.wv.save_word2vec_format(tmpf, testvocab, binary=True) binary_model_with_vocab_kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, testvocab, binary=True) @@ -502,18 +456,18 @@ def testVocab(self): # try vocab building explicitly, using all words model = word2vec.Word2Vec(min_count=1, hs=1, negative=0) model.build_vocab(corpus) - self.assertTrue(len(model.wv.vocab) == 6981) + self.assertTrue(len(model.wv) == 6981) # with min_count=1, we're not throwing away anything, # so make sure the word counts add up to be the entire 
corpus - self.assertEqual(sum(v.count for v in model.wv.vocab.values()), total_words) + self.assertEqual(sum(model.wv.get_vecattr(k, 'count') for k in model.wv.key_to_index), total_words) # make sure the binary codes are correct - np.allclose(model.wv.vocab['the'].code, [1, 1, 0, 0]) + np.allclose(model.wv.get_vecattr('the', 'code'), [1, 1, 0, 0]) # test building vocab with default params model = word2vec.Word2Vec(hs=1, negative=0) model.build_vocab(corpus) - self.assertTrue(len(model.wv.vocab) == 1750) - np.allclose(model.wv.vocab['the'].code, [1, 1, 1, 0]) + self.assertTrue(len(model.wv) == 1750) + np.allclose(model.wv.get_vecattr('the', 'code'), [1, 1, 1, 0]) # no input => "RuntimeError: you must first build vocabulary before training the model" self.assertRaises(RuntimeError, word2vec.Word2Vec, []) @@ -524,24 +478,24 @@ def testVocab(self): def testTraining(self): """Test word2vec training.""" # build vocabulary, don't train yet - model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0) + model = word2vec.Word2Vec(vector_size=2, min_count=1, hs=1, negative=0) model.build_vocab(sentences) - self.assertTrue(model.wv.vectors.shape == (len(model.wv.vocab), 2)) - self.assertTrue(model.trainables.syn1.shape == (len(model.wv.vocab), 2)) + self.assertTrue(model.wv.vectors.shape == (len(model.wv), 2)) + self.assertTrue(model.syn1.shape == (len(model.wv), 2)) model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs) sims = model.wv.most_similar('graph', topn=10) # self.assertTrue(sims[0][0] == 'trees', sims) # most similar # test querying for "most similar" by vector - graph_vector = model.wv.vectors_norm[model.wv.vocab['graph'].index] + graph_vector = model.wv.vectors_norm[model.wv.get_index('graph')] sims2 = model.wv.most_similar(positive=[graph_vector], topn=11) sims2 = [(w, sim) for w, sim in sims2 if w != 'graph'] # ignore 'graph' itself self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above - model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0) + model2 = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, hs=1, negative=0) self.models_equal(model, model2) @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27") @@ -551,25 +505,25 @@ def testTrainingFromFile(self): with temporary_file(get_tmpfile('gensim_word2vec.tst')) as tf: utils.save_as_line_sentence(sentences, tf) - model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0) + model = word2vec.Word2Vec(vector_size=2, min_count=1, hs=1, negative=0) model.build_vocab(corpus_file=tf) - self.assertTrue(model.wv.vectors.shape == (len(model.wv.vocab), 2)) - self.assertTrue(model.trainables.syn1.shape == (len(model.wv.vocab), 2)) + self.assertTrue(model.wv.vectors.shape == (len(model.wv), 2)) + self.assertTrue(model.syn1.shape == (len(model.wv), 2)) model.train(corpus_file=tf, total_words=model.corpus_total_words, epochs=model.epochs) sims = model.wv.most_similar('graph', topn=10) # self.assertTrue(sims[0][0] == 'trees', sims) # most similar # test querying for "most similar" by vector - graph_vector = model.wv.vectors_norm[model.wv.vocab['graph'].index] + graph_vector = model.wv.vectors_norm[model.wv.get_index('graph')] sims2 = model.wv.most_similar(positive=[graph_vector], topn=11) sims2 = [(w, sim) for w, sim in sims2 if w != 'graph'] # ignore 'graph' itself self.assertEqual(sims, sims2) def testScoring(self): """Test word2vec scoring.""" - model = word2vec.Word2Vec(sentences, size=2, 
min_count=1, hs=1, negative=0) + model = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, hs=1, negative=0) # just score and make sure they exist scores = model.score(sentences, len(sentences)) @@ -580,14 +534,16 @@ def testLocking(self): corpus = LeeCorpus() # build vocabulary, don't train yet for sg in range(2): # test both cbow and sg - model = word2vec.Word2Vec(size=4, hs=1, negative=5, min_count=1, sg=sg, window=5) + model = word2vec.Word2Vec(vector_size=4, hs=1, negative=5, min_count=1, sg=sg, window=5) model.build_vocab(corpus) # remember two vectors locked0 = np.copy(model.wv.vectors[0]) unlocked1 = np.copy(model.wv.vectors[1]) + # allocate a full lockf array (not just the default single value for all) + model.wv.vectors_lockf = np.ones(len(model.wv), dtype=np.float32) # lock the vector in slot 0 against change - model.trainables.vectors_lockf[0] = 0.0 + model.wv.vectors_lockf[0] = 0.0 model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs) self.assertFalse((unlocked1 == model.wv.vectors[1]).all()) # unlocked vector should vary @@ -609,14 +565,14 @@ def testEvaluateWordAnalogies(self): def testEvaluateWordPairs(self): """Test Spearman and Pearson correlation coefficients give sane results on similarity datasets""" corpus = word2vec.LineSentence(datapath('head500.noblanks.cor.bz2')) - model = word2vec.Word2Vec(corpus, min_count=3, iter=10) + model = word2vec.Word2Vec(corpus, min_count=3, epochs=20) correlation = model.wv.evaluate_word_pairs(datapath('wordsim353.tsv')) pearson = correlation[0][0] spearman = correlation[1][0] oov = correlation[2] - self.assertTrue(0.1 < pearson < 1.0) - self.assertTrue(0.1 < spearman < 1.0) - self.assertTrue(0.0 <= oov < 90.0) + self.assertTrue(0.1 < pearson < 1.0, "pearson %f not between 0.1 & 1.0" % pearson) + self.assertTrue(0.1 < spearman < 1.0, "spearman %f not between 0.1 and 1.0" % spearman) + self.assertTrue(0.0 <= oov < 90.0, "oov %f not between 0.0 and 90.0" % oov) @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27") def testEvaluateWordPairsFromFile(self): @@ -624,28 +580,28 @@ def testEvaluateWordPairsFromFile(self): with temporary_file(get_tmpfile('gensim_word2vec.tst')) as tf: utils.save_as_line_sentence(word2vec.LineSentence(datapath('head500.noblanks.cor.bz2')), tf) - model = word2vec.Word2Vec(corpus_file=tf, min_count=3, iter=10) + model = word2vec.Word2Vec(corpus_file=tf, min_count=3, epochs=20) correlation = model.wv.evaluate_word_pairs(datapath('wordsim353.tsv')) pearson = correlation[0][0] spearman = correlation[1][0] oov = correlation[2] - self.assertTrue(0.1 < pearson < 1.0) - self.assertTrue(0.1 < spearman < 1.0) - self.assertTrue(0.0 <= oov < 90.0) + self.assertTrue(0.1 < pearson < 1.0, "pearson %f not between 0.1 & 1.0" % pearson) + self.assertTrue(0.1 < spearman < 1.0, "spearman %f not between 0.1 and 1.0" % spearman) + self.assertTrue(0.0 <= oov < 90.0, "oov %f not between 0.0 and 90.0" % oov) def model_sanity(self, model, train=True, with_corpus_file=False): """Even tiny models trained on LeeCorpus should pass these sanity checks""" # run extra before/after training tests if train=True if train: - model.build_vocab(list_corpus) + model.build_vocab(lee_corpus_list) orig0 = np.copy(model.wv.vectors[0]) if with_corpus_file: tmpfile = get_tmpfile('gensim_word2vec.tst') - utils.save_as_line_sentence(list_corpus, tmpfile) + utils.save_as_line_sentence(lee_corpus_list, tmpfile) model.train(corpus_file=tmpfile, total_words=model.corpus_total_words,
epochs=model.epochs) else: - model.train(list_corpus, total_examples=model.corpus_count, epochs=model.epochs) + model.train(lee_corpus_list, total_examples=model.corpus_count, epochs=model.epochs) self.assertFalse((orig0 == model.wv.vectors[1]).all()) # vector should vary after training sims = model.wv.most_similar('war', topn=len(model.wv.index2word)) t_rank = [word for word, score in sims].index('terrorism') @@ -658,29 +614,29 @@ def model_sanity(self, model, train=True, with_corpus_file=False): def test_sg_hs(self): """Test skipgram w/ hierarchical softmax""" - model = word2vec.Word2Vec(sg=1, window=4, hs=1, negative=0, min_count=5, iter=10, workers=2) + model = word2vec.Word2Vec(sg=1, window=4, hs=1, negative=0, min_count=5, epochs=10, workers=2) self.model_sanity(model) @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27") def test_sg_hs_fromfile(self): - model = word2vec.Word2Vec(sg=1, window=4, hs=1, negative=0, min_count=5, iter=10, workers=2) + model = word2vec.Word2Vec(sg=1, window=4, hs=1, negative=0, min_count=5, epochs=10, workers=2) self.model_sanity(model, with_corpus_file=True) def test_sg_neg(self): """Test skipgram w/ negative sampling""" - model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=5, iter=10, workers=2) + model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=5, epochs=10, workers=2) self.model_sanity(model) @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27") def test_sg_neg_fromfile(self): - model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=5, iter=10, workers=2) + model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=5, epochs=10, workers=2) self.model_sanity(model, with_corpus_file=True) def test_cbow_hs(self): """Test CBOW w/ hierarchical softmax""" model = word2vec.Word2Vec( sg=0, cbow_mean=1, alpha=0.05, window=8, hs=1, negative=0, - min_count=5, iter=10, workers=2, batch_words=1000 + min_count=5, epochs=20, workers=2, batch_words=1000 ) self.model_sanity(model) @@ -688,7 +644,7 @@ def test_cbow_hs(self): def test_cbow_hs_fromfile(self): model = word2vec.Word2Vec( sg=0, cbow_mean=1, alpha=0.05, window=8, hs=1, negative=0, - min_count=5, iter=10, workers=2, batch_words=1000 + min_count=5, epochs=20, workers=2, batch_words=1000 ) self.model_sanity(model, with_corpus_file=True) @@ -696,7 +652,7 @@ def test_cbow_neg(self): """Test CBOW w/ negative sampling""" model = word2vec.Word2Vec( sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15, - min_count=5, iter=10, workers=2, sample=0 + min_count=5, epochs=10, workers=2, sample=0 ) self.model_sanity(model) @@ -704,17 +660,17 @@ def test_cbow_neg(self): def test_cbow_neg_fromfile(self): model = word2vec.Word2Vec( sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15, - min_count=5, iter=10, workers=2, sample=0 + min_count=5, epochs=10, workers=2, sample=0 ) self.model_sanity(model, with_corpus_file=True) def test_cosmul(self): - model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0) + model = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, hs=1, negative=0) sims = model.wv.most_similar_cosmul('graph', topn=10) # self.assertTrue(sims[0][0] == 'trees', sims) # most similar # test querying for "most similar" by vector - graph_vector = model.wv.vectors_norm[model.wv.vocab['graph'].index] + graph_vector = model.wv.vectors_norm[model.wv.get_index('graph')] sims2 = 
model.wv.most_similar_cosmul(positive=[graph_vector], topn=11) sims2 = [(w, sim) for w, sim in sims2 if w != 'graph'] # ignore 'graph' itself self.assertEqual(sims, sims2) @@ -723,75 +679,75 @@ def testTrainingCbow(self): """Test CBOW word2vec training.""" # to test training, make the corpus larger by repeating its sentences over and over # build vocabulary, don't train yet - model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=1, negative=0) + model = word2vec.Word2Vec(vector_size=2, min_count=1, sg=0, hs=1, negative=0) model.build_vocab(sentences) - self.assertTrue(model.wv.vectors.shape == (len(model.wv.vocab), 2)) - self.assertTrue(model.trainables.syn1.shape == (len(model.wv.vocab), 2)) + self.assertTrue(model.wv.vectors.shape == (len(model.wv), 2)) + self.assertTrue(model.syn1.shape == (len(model.wv), 2)) model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs) sims = model.wv.most_similar('graph', topn=10) # self.assertTrue(sims[0][0] == 'trees', sims) # most similar # test querying for "most similar" by vector - graph_vector = model.wv.vectors_norm[model.wv.vocab['graph'].index] + graph_vector = model.wv.vectors_norm[model.wv.get_index('graph')] sims2 = model.wv.most_similar(positive=[graph_vector], topn=11) sims2 = [(w, sim) for w, sim in sims2 if w != 'graph'] # ignore 'graph' itself self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above - model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0, hs=1, negative=0) + model2 = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, sg=0, hs=1, negative=0) self.models_equal(model, model2) def testTrainingSgNegative(self): """Test skip-gram (negative sampling) word2vec training.""" # to test training, make the corpus larger by repeating its sentences over and over # build vocabulary, don't train yet - model = word2vec.Word2Vec(size=2, min_count=1, sg=1, hs=0, negative=2) + model = word2vec.Word2Vec(vector_size=2, min_count=1, sg=1, hs=0, negative=2) model.build_vocab(sentences) - self.assertTrue(model.wv.vectors.shape == (len(model.wv.vocab), 2)) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), 2)) + self.assertTrue(model.wv.vectors.shape == (len(model.wv), 2)) + self.assertTrue(model.syn1neg.shape == (len(model.wv), 2)) model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs) sims = model.wv.most_similar('graph', topn=10) # self.assertTrue(sims[0][0] == 'trees', sims) # most similar # test querying for "most similar" by vector - graph_vector = model.wv.vectors_norm[model.wv.vocab['graph'].index] + graph_vector = model.wv.vectors_norm[model.wv.get_index('graph')] sims2 = model.wv.most_similar(positive=[graph_vector], topn=11) sims2 = [(w, sim) for w, sim in sims2 if w != 'graph'] # ignore 'graph' itself self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above - model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=1, hs=0, negative=2) + model2 = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, sg=1, hs=0, negative=2) self.models_equal(model, model2) def testTrainingCbowNegative(self): """Test CBOW (negative sampling) word2vec training.""" # to test training, make the corpus larger by repeating its sentences over and over # build vocabulary, don't train yet - model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2) + model = word2vec.Word2Vec(vector_size=2, min_count=1, sg=0, hs=0, negative=2) model.build_vocab(sentences) - 
self.assertTrue(model.wv.vectors.shape == (len(model.wv.vocab), 2)) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), 2)) + self.assertTrue(model.wv.vectors.shape == (len(model.wv), 2)) + self.assertTrue(model.syn1neg.shape == (len(model.wv), 2)) model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs) sims = model.wv.most_similar('graph', topn=10) # self.assertTrue(sims[0][0] == 'trees', sims) # most similar # test querying for "most similar" by vector - graph_vector = model.wv.vectors_norm[model.wv.vocab['graph'].index] + graph_vector = model.wv.vectors_norm[model.wv.get_index('graph')] sims2 = model.wv.most_similar(positive=[graph_vector], topn=11) sims2 = [(w, sim) for w, sim in sims2 if w != 'graph'] # ignore 'graph' itself self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above - model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0, hs=0, negative=2) + model2 = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, sg=0, hs=0, negative=2) self.models_equal(model, model2) def testSimilarities(self): """Test similarity and n_similarity methods.""" # The model is trained using CBOW - model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2) + model = word2vec.Word2Vec(vector_size=2, min_count=1, sg=0, hs=0, negative=2) model.build_vocab(sentences) model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs) @@ -803,7 +759,7 @@ def testSimilarities(self): def testSimilarBy(self): """Test word2vec similar_by_word and similar_by_vector.""" - model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0) + model = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, hs=1, negative=0) wordsims = model.wv.similar_by_word('graph', topn=10) wordsims2 = model.wv.most_similar(positive='graph', topn=10) vectorsims = model.wv.similar_by_vector(model.wv['graph'], topn=10) @@ -813,15 +769,17 @@ def testSimilarBy(self): def testParallel(self): """Test word2vec parallel training.""" - corpus = utils.RepeatCorpus(LeeCorpus(), 10000) + corpus = utils.RepeatCorpus(LeeCorpus(), 10000) # repeats about 33 times - for workers in [2, 4]: - model = word2vec.Word2Vec(corpus, workers=workers) - sims = model.wv.most_similar('israeli') # noqa:F841 + for workers in [4, ]: # [4, 2] + model = word2vec.Word2Vec(corpus, vector_size=16, min_count=(10 * 33), workers=workers) + origin_word = 'israeli' + expected_neighbor = 'palestinian' + sims = model.wv.most_similar(origin_word, topn=len(model.wv)) # the exact vectors and therefore similarities may differ, due to different thread collisions/randomization - # so let's test only for top3 - # TODO: commented out for now; find a more robust way to compare against "gold standard" - # self.assertTrue('palestinian' in [sims[i][0] for i in range(3)]) + # so let's only require the expected neighbor to rank within the top 20 + neighbor_rank = [word for word, sim in sims].index(expected_neighbor) + self.assertLess(neighbor_rank, 20) def testRNG(self): """Test word2vec results identical with identical RNG seed.""" @@ -830,41 +788,16 @@ def testRNG(self): self.models_equal(model, model2) def models_equal(self, model, model2): - self.assertEqual(len(model.wv.vocab), len(model2.wv.vocab)) + self.assertEqual(len(model.wv), len(model2.wv)) self.assertTrue(np.allclose(model.wv.vectors, model2.wv.vectors)) if model.hs: - self.assertTrue(np.allclose(model.trainables.syn1, model2.trainables.syn1)) + self.assertTrue(np.allclose(model.syn1, model2.syn1)) if model.negative: -
self.assertTrue(np.allclose(model.trainables.syn1neg, model2.trainables.syn1neg)) - most_common_word = max(model.wv.vocab.items(), key=lambda item: item[1].count)[0] + self.assertTrue(np.allclose(model.syn1neg, model2.syn1neg)) + most_common_word_index = np.argsort(model.wv.expandos['count'])[-1] + most_common_word = model.wv.index_to_key[most_common_word_index] self.assertTrue(np.allclose(model.wv[most_common_word], model2.wv[most_common_word])) - def testDeleteTemporaryTrainingData(self): - """Test word2vec model after delete_temporary_training_data""" - for i in [0, 1]: - for j in [0, 1]: - model = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=i, negative=j) - if i: - self.assertTrue(hasattr(model.trainables, 'syn1')) - if j: - self.assertTrue(hasattr(model, 'syn1neg')) - self.assertTrue(hasattr(model, 'syn0_lockf')) - model.delete_temporary_training_data(replace_word_vectors_with_normalized=True) - self.assertTrue(len(model.wv['human']), 10) - self.assertTrue(len(model.wv.vocab), 12) - self.assertTrue(model.wv.vocab['graph'].count, 3) - self.assertTrue(not hasattr(model.trainables, 'syn1')) - self.assertTrue(not hasattr(model.trainables, 'syn1neg')) - self.assertTrue(not hasattr(model.trainables, 'syn0_lockf')) - - def testNormalizeAfterTrainingData(self): - tmpf = get_tmpfile('gensim_word2vec.tst') - model = word2vec.Word2Vec(sentences, min_count=1) - model.save(tmpf) - norm_only_model = word2vec.Word2Vec.load(tmpf) - norm_only_model.delete_temporary_training_data(replace_word_vectors_with_normalized=True) - self.assertFalse(np.allclose(model.wv['human'], norm_only_model.wv['human'])) - def testPredictOutputWord(self): '''Test word2vec predict_output_word method handling for negative sampling scheme''' # under normal circumstances @@ -878,7 +811,6 @@ def testPredictOutputWord(self): # when required model parameters have been deleted tmpf = get_tmpfile('gensim_word2vec.tst') - model_with_neg.init_sims() model_with_neg.wv.save_word2vec_format(tmpf, binary=True) kv_model_with_neg = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True) binary_model_with_neg = word2vec.Word2Vec() @@ -890,78 +822,121 @@ def testPredictOutputWord(self): self.assertRaises(RuntimeError, model_without_neg.predict_output_word, ['system', 'human']) def testLoadOldModel(self): - """Test loading word2vec models from previous version""" + """Test loading an old word2vec model of indeterminate version""" - model_file = 'word2vec_old' + model_file = 'word2vec_old' # which version?!? 
model = word2vec.Word2Vec.load(datapath(model_file)) self.assertTrue(model.wv.vectors.shape == (12, 100)) - self.assertTrue(len(model.wv.vocab) == 12) + self.assertTrue(len(model.wv) == 12) self.assertTrue(len(model.wv.index2word) == 12) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), model.wv.vector_size)) - self.assertTrue(model.trainables.vectors_lockf.shape == (12,)) - self.assertTrue(model.vocabulary.cum_table.shape == (12,)) + self.assertTrue(model.syn1neg.shape == (len(model.wv), model.wv.vector_size)) + self.assertTrue(len(model.wv.vectors_lockf.shape) > 0) + self.assertTrue(model.cum_table.shape == (12,)) self.onlineSanity(model, trained_model=True) + def testLoadOldModelSeparates(self): + """Test loading an old word2vec model of indeterminate version""" + # Model stored in multiple files model_file = 'word2vec_old_sep' model = word2vec.Word2Vec.load(datapath(model_file)) self.assertTrue(model.wv.vectors.shape == (12, 100)) - self.assertTrue(len(model.wv.vocab) == 12) + self.assertTrue(len(model.wv) == 12) self.assertTrue(len(model.wv.index2word) == 12) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), model.wv.vector_size)) - self.assertTrue(model.trainables.vectors_lockf.shape == (12,)) - self.assertTrue(model.vocabulary.cum_table.shape == (12,)) + self.assertTrue(model.syn1neg.shape == (len(model.wv), model.wv.vector_size)) + self.assertTrue(len(model.wv.vectors_lockf.shape) > 0) + self.assertTrue(model.cum_table.shape == (12,)) self.onlineSanity(model, trained_model=True) + def obsolete_test_load_old_models_pre_1_0(self): + """Test loading pre-1.0 models""" # load really old model model_file = 'w2v-lee-v0.12.0' model = word2vec.Word2Vec.load(datapath(model_file)) self.onlineSanity(model, trained_model=True) + old_versions = [ + '0.12.0', '0.12.1', '0.12.2', '0.12.3', '0.12.4', + '0.13.0', '0.13.1', '0.13.2', '0.13.3', '0.13.4', + ] + + for old_version in old_versions: + self._check_old_version(old_version) + + def test_load_old_models_1_x(self): + """Test loading 1.x models""" + + old_versions = [ + '1.0.0', '1.0.1', + ] + + for old_version in old_versions: + self._check_old_version(old_version) + + def test_load_old_models_2_x(self): + """Test loading 2.x models""" + + old_versions = [ + '2.0.0', '2.1.0', '2.2.0', '2.3.0', + ] + + for old_version in old_versions: + self._check_old_version(old_version) + + def test_load_old_models_3_x(self): + """Test loading 3.x models""" + # test for max_final_vocab for model saved in 3.3 model_file = 'word2vec_3.3' model = word2vec.Word2Vec.load(datapath(model_file)) self.assertEqual(model.max_final_vocab, None) - self.assertEqual(model.vocabulary.max_final_vocab, None) + self.assertEqual(model.max_final_vocab, None) - # Test loading word2vec models from all previous versions old_versions = [ - '0.12.0', '0.12.1', '0.12.2', '0.12.3', '0.12.4', - '0.13.0', '0.13.1', '0.13.2', '0.13.3', '0.13.4', - '1.0.0', '1.0.1', '2.0.0', '2.1.0', '2.2.0', '2.3.0', '3.0.0', '3.1.0', '3.2.0', '3.3.0', '3.4.0' ] - saved_models_dir = datapath('old_w2v_models/w2v_{}.mdl') for old_version in old_versions: - model = word2vec.Word2Vec.load(saved_models_dir.format(old_version)) - self.assertIsNone(model.corpus_total_words) - self.assertTrue(len(model.wv.vocab) == 3) + self._check_old_version(old_version) + + def _check_old_version(self, old_version): + logging.info("TESTING LOAD of %s Word2Vec MODEL", old_version) + saved_models_dir = datapath('old_w2v_models/w2v_{}.mdl') + model = 
diff --git a/gensim/test/utils.py b/gensim/test/utils.py
index 1802984e68..ffc402c13d 100644
--- a/gensim/test/utils.py
+++ b/gensim/test/utils.py
@@ -73,6 +73,7 @@
 import shutil

 from gensim.corpora import Dictionary
+from gensim.utils import simple_preprocess

 module_path = os.path.dirname(__file__)  # needed because sample data files are located in the same folder
@@ -205,3 +206,13 @@ def temporary_file(name=""):

 common_dictionary = Dictionary(common_texts)
 common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
+
+
+class LeeCorpus(object):
+    def __iter__(self):
+        with open(datapath('lee_background.cor')) as f:
+            for line in f:
+                yield simple_preprocess(line)
+
+
+lee_corpus_list = list(LeeCorpus())
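The new `LeeCorpus`/`lee_corpus_list` helpers give tests a shared, pre-tokenized corpus instead of each test building its own. A rough usage sketch, assuming the patched `gensim.test.utils` is importable; the parameter values are arbitrary:

```python
from gensim.models import Word2Vec
from gensim.test.utils import lee_corpus_list  # added by this patch

# Train a small throwaway model on the shared Lee corpus, then grow its vocabulary online,
# mirroring what _check_old_version() does with a freshly loaded old model.
model = Word2Vec(lee_corpus_list, vector_size=24, min_count=5, epochs=2)
model.build_vocab(lee_corpus_list, update=True)
model.train(lee_corpus_list, total_examples=model.corpus_count, epochs=model.epochs)
print(len(model.wv))
```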
diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py
index 79ffc132fd..477a9f2bc3 100644
--- a/gensim/topic_coherence/text_analysis.py
+++ b/gensim/topic_coherence/text_analysis.py
@@ -621,7 +621,7 @@ def __init__(self, relevant_ids, dictionary, model=None, **model_kwargs):

     def not_in_vocab(self, words):
         uniq_words = set(utils.flatten(words))
-        return set(word for word in uniq_words if word not in self.model.vocab)
+        return set(word for word in uniq_words if word not in self.model)

     def get_occurrences(self, word):
         """Return number of docs the word occurs in, once `accumulate` has been called."""
@@ -629,7 +629,7 @@ def get_occurrences(self, word):
             self.token2id[word]  # is this a token or an id?
         except KeyError:
             word = self.dictionary.id2token[word]
-        return self.model.vocab[word].count
+        return self.model.get_vecattr(word, 'count')

     def get_co_occurrences(self, word1, word2):
         """Return number of docs the words co-occur in, once `accumulate` has been called."""
@@ -663,4 +663,4 @@ def _words_with_embeddings(self, ids):
             ids = [ids]

         words = [self.dictionary.id2token[word_id] for word_id in ids]
-        return [word for word in words if word in self.model.vocab]
+        return [word for word in words if word in self.model]
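A standalone illustration of the membership change in `text_analysis.py`: containment checks now go directly against the KeyedVectors object rather than its removed `.vocab` dict. This is only a sketch, with a throwaway toy model standing in for the accumulator's real `self.model`:

```python
from gensim.models import Word2Vec

kv = Word2Vec([["graph", "trees"], ["graph", "minors"]], vector_size=8, min_count=1).wv

# `word in kv` replaces `word in kv.vocab`.
words = ["graph", "unknown_word", "trees"]
present = [w for w in words if w in kv]          # analogous to _words_with_embeddings()
missing = set(w for w in words if w not in kv)   # analogous to not_in_vocab()
print(present, missing)
```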
diff --git a/gensim/utils.py b/gensim/utils.py
index 90c9279338..bb9ee2fa02 100644
--- a/gensim/utils.py
+++ b/gensim/utils.py
@@ -64,6 +64,9 @@
 )
 """An exception that gensim code raises when Cython extensions are unavailable."""

+#: A default, shared numpy-Generator-based PRNG for any/all uses that don't require seeding
+default_prng = np.random.default_rng()
+

 def get_random_state(seed):
     """Generate :class:`numpy.random.RandomState` based on input seed.
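A sketch of how callers that don't need reproducible seeding might draw from the new shared `default_prng` (a `numpy.random.Generator`); the specific draws below are illustrative only:

```python
from gensim.utils import default_prng  # added by this patch

# Draw from the shared, unseeded Generator instead of constructing a fresh RandomState.
noise = default_prng.random(5)                    # 5 floats in [0.0, 1.0)
pick = default_prng.integers(0, 100)              # one int in [0, 100)
shuffled = default_prng.permutation([1, 2, 3, 4])
print(noise, pick, shuffled)
```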
diff --git a/setup.py b/setup.py
index 5caa6206c5..abd19aa2c7 100644
--- a/setup.py
+++ b/setup.py
@@ -25,7 +25,6 @@
     'gensim.models.word2vec_inner': 'gensim/models/word2vec_inner.c',
     'gensim.corpora._mmreader': 'gensim/corpora/_mmreader.c',
     'gensim.models.fasttext_inner': 'gensim/models/fasttext_inner.c',
-    'gensim.models._utils_any2vec': 'gensim/models/_utils_any2vec.c',
     'gensim._matutils': 'gensim/_matutils.c',
     'gensim.models.nmf_pgd': 'gensim/models/nmf_pgd.c',
 }
@@ -42,7 +41,9 @@ def need_cython():
     """Return True if we need Cython to translate any of the extensions.

     If the extensions have already been translated to C/C++, then we don't need
-    to install Cython and perform the translation."""
+    to install Cython and perform the translation.
+
+    """
     expected = list(c_extensions.values()) + list(cpp_extensions.values())
     return any([not os.path.isfile(f) for f in expected])

@@ -51,7 +52,14 @@ def make_c_ext(use_cython=False):
     for module, source in c_extensions.items():
         if use_cython:
             source = source.replace('.c', '.pyx')
-        yield Extension(module, sources=[source], language='c')
+        extra_args = []
+#        extra_args.extend(['-g', '-O0'])  # uncomment if optimization limiting crash info
+        yield Extension(
+            module,
+            sources=[source],
+            language='c',
+            extra_compile_args=extra_args,
+        )


 def make_cpp_ext(use_cython=False):
@@ -62,7 +70,7 @@ def make_cpp_ext(use_cython=False):
         extra_args.append('-std=c++11')
     elif system == 'Darwin':
         extra_args.extend(['-stdlib=libc++', '-std=c++11'])
-
+#    extra_args.extend(['-g', '-O0'])  # uncomment if optimization limiting crash info
     for module, source in cpp_extensions.items():
         if use_cython:
             source = source.replace('.cpp', '.pyx')
@@ -164,7 +172,7 @@ def run(self):
 Features
 ---------

-* All algorithms are **memory-independent** w.r.t. the corpus size (can process input larger than RAM, streamed, out-of-core),
+* All algorithms are **memory-independent** w.r.t. the corpus size (can process input larger than RAM, streamed, out-of-core)
 * **Intuitive interfaces**

   * easy to plug in your own input corpus/datastream (simple streaming API)
@@ -254,7 +262,10 @@ def run(self):

 distributed_env = ['Pyro4 >= 4.27']

-win_testenv = [
+visdom_req = ['visdom >= 0.1.8, != 0.1.8.7']
+
+# packages included for build-testing everywhere
+core_testenv = [
     'pytest',
     'pytest-rerunfailures',
     'mock',
@@ -265,13 +276,24 @@ def run(self):
     'Morfessor==2.0.2a4',
     'python-Levenshtein >= 0.10.2',
     'scikit-learn',
-    # The following packages are commented out because they don't install on Windows. So skip the
-    # related tests in AppVeyor. We still test them in Linux via Travis, see linux_testenv below.
-    # See https://github.com/RaRe-Technologies/gensim/pull/2814
-    # 'tensorflow',
-    # 'keras',
 ]

+# Add additional requirements for testing on Linux that are skipped on Windows.
+linux_testenv = core_testenv[:] + visdom_req + ['pyemd', ]
+if sys.version_info >= (3, 7):
+    # HACK: Installing tensorflow causes a segfault in Travis on py3.6. Other Pythons work – a mystery.
+    # See https://github.com/RaRe-Technologies/gensim/pull/2814#issuecomment-621477948
+    linux_testenv += [
+        'tensorflow',
+        'keras==2.3.1',
+    ]
+
+# Skip problematic/uninstallable packages (& thus related conditional tests) in Windows builds.
+# We still test them in Linux via Travis, see linux_testenv above.
+# See https://github.com/RaRe-Technologies/gensim/pull/2814
+win_testenv = core_testenv[:]
+
+#
 # This list partially duplicates requirements_docs.txt.
 # The main difference is that we don't include version pins here unless
 # absolutely necessary, whereas requirements_docs.txt includes pins for
@@ -281,8 +303,8 @@ def run(self):
 #
 # https://packaging.python.org/discussions/install-requires-vs-requirements/
 #
-visdom_req = ['visdom >= 0.1.8, != 0.1.8.7']
-docs_testenv = win_testenv + distributed_env + visdom_req + [
+
+docs_testenv = core_testenv + distributed_env + visdom_req + [
     'sphinx <= 2.4.4',  # avoid `sphinx >= 3.0` that breaks the build
     'sphinx-gallery',
     'sphinxcontrib.programoutput',
@@ -306,17 +328,6 @@ def run(self):
     'pandas',
 ]

-# Add additional requirements for testing on Linux. We skip some tests on Windows,
-# because the libraries below are too tricky to install there.
-linux_testenv = win_testenv[:] + visdom_req
-if sys.version_info >= (3, 7):
-    # HACK: Installing tensorflow causes a segfault in Travis on py3.6. Other Pythons work – a mystery.
-    # See https://github.com/RaRe-Technologies/gensim/pull/2814#issuecomment-621477948
-    linux_testenv += [
-        'tensorflow',
-        'keras',
-    ]
-
 NUMPY_STR = 'numpy >= 1.11.3'

 #
 # We pin the Cython version for reproducibility. We expect our extensions
@@ -330,6 +341,7 @@ def run(self):
     'scipy >= 0.18.1',
     'six >= 1.5.0',
     'smart_open >= 1.8.1',
+    "dataclasses; python_version < '3.7'",  # pre-py3.7 needs `dataclasses` backport for use of `dataclass` in doc2vec.py
 ]

 setup_requires = [NUMPY_STR]
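The new `dataclasses` requirement uses a PEP 508 environment marker; the sketch below shows a rough setup-time equivalent, purely to illustrate what the marker means (the marker itself is preferable, since it is evaluated at install time and so works correctly for wheels):

```python
import sys

install_requires = [
    'numpy >= 1.11.3',
    'scipy >= 0.18.1',
    'six >= 1.5.0',
    'smart_open >= 1.8.1',
]

# Same intent as "dataclasses; python_version < '3.7'": only pre-3.7
# interpreters need the dataclasses backport used by doc2vec.py.
if sys.version_info < (3, 7):
    install_requires.append('dataclasses')

print(install_requires)
```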
diff --git a/tox.ini b/tox.ini
index e7ca40eaaf..1d0c0b0e09 100644
--- a/tox.ini
+++ b/tox.ini
@@ -17,7 +17,7 @@ ignore = F821  ; TODO remove me when all examples in docstrings will be executab
 exclude=.venv, .git, .tox, dist, doc, build, gensim/models/deprecated

 [pytest]
-addopts = -rfxEXs --durations=20 --showlocals --reruns 3 --reruns-delay 1
+addopts = -rfxEXs --durations=20 --showlocals

 [testenv]
 recreate = True
@@ -50,7 +50,8 @@ commands =

 [testenv:flake8]
 recreate = True
-deps = flake8
+deps =
+    flake8==3.7.9  # 3.8.0 triggers "AttributeError: 'Namespace' object has no attribute 'output_file'"

 commands = flake8 gensim/ {posargs}