From d398e92291e3de4d6d2a73016150cd9aa2e52714 Mon Sep 17 00:00:00 2001 From: funasshi Date: Wed, 22 Jun 2022 16:23:26 +0900 Subject: [PATCH 1/6] fix loading old gensim model by new model --- gensim/models/word2vec.py | 345 ++++++++++++++++++++++++-------------- 1 file changed, 215 insertions(+), 130 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 061dcfc817..e8a8f96d96 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -197,7 +197,7 @@ import numpy as np from gensim.utils import keep_vocab_item, call_on_class_only, deprecated -from gensim.models.keyedvectors import KeyedVectors, pseudorandom_weak_vector +from gensim.models.keyedvectors import KeyedVectors, pseudorandom_weak_vector, Vocab from gensim import utils, matutils from smart_open.compression import get_supported_extensions @@ -223,26 +223,28 @@ CORPUSFILE_VERSION = -1 def train_epoch_sg( - model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, - _work, _neu1, compute_loss, - ): - raise RuntimeError("Training with corpus_file argument is not supported") + model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, + _work, _neu1, compute_loss, + ): + raise RuntimeError( + "Training with corpus_file argument is not supported") def train_epoch_cbow( - model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, - _work, _neu1, compute_loss, - ): - raise RuntimeError("Training with corpus_file argument is not supported") + model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, + _work, _neu1, compute_loss, + ): + raise RuntimeError( + "Training with corpus_file argument is not supported") class Word2Vec(utils.SaveLoad): def __init__( - self, sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5, - max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(), - comment=None, max_final_vocab=None, shrink_windows=True, - ): + self, sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5, + max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, + sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, + trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(), + comment=None, max_final_vocab=None, shrink_windows=True, + ): """Train, use and evaluate neural networks described in https://code.google.com/p/word2vec/. 
Once you're finished training a model (=no more updates, only querying) @@ -410,11 +412,13 @@ def __init__( self.wv = KeyedVectors(vector_size) # EXPERIMENTAL lockf feature; create minimal no-op lockf arrays (1 element of 1.0) # advanced users should directly resize/adjust as desired after any vocab growth - self.wv.vectors_lockf = np.ones(1, dtype=REAL) # 0.0 values suppress word-backprop-updates; 1.0 allows + # 0.0 values suppress word-backprop-updates; 1.0 allows + self.wv.vectors_lockf = np.ones(1, dtype=REAL) self.hashfxn = hashfxn self.seed = seed - if not hasattr(self, 'layer1_size'): # set unless subclass already set (as for Doc2Vec dm_concat mode) + # set unless subclass already set (as for Doc2Vec dm_concat mode) + if not hasattr(self, 'layer1_size'): self.layer1_size = vector_size self.comment = comment @@ -422,8 +426,10 @@ def __init__( self.load = call_on_class_only if corpus_iterable is not None or corpus_file is not None: - self._check_corpus_sanity(corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=(epochs + 1)) - self.build_vocab(corpus_iterable=corpus_iterable, corpus_file=corpus_file, trim_rule=trim_rule) + self._check_corpus_sanity( + corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=(epochs + 1)) + self.build_vocab(corpus_iterable=corpus_iterable, + corpus_file=corpus_file, trim_rule=trim_rule) self.train( corpus_iterable=corpus_iterable, corpus_file=corpus_file, total_examples=self.corpus_count, total_words=self.corpus_total_words, epochs=self.epochs, start_alpha=self.alpha, @@ -444,9 +450,9 @@ def __init__( self.add_lifecycle_event("created", params=str(self)) def build_vocab( - self, corpus_iterable=None, corpus_file=None, update=False, progress_per=10000, - keep_raw_vocab=False, trim_rule=None, **kwargs, - ): + self, corpus_iterable=None, corpus_file=None, update=False, progress_per=10000, + keep_raw_vocab=False, trim_rule=None, **kwargs, + ): """Build vocabulary from a sequence of sentences (can be a once-only generator stream). Parameters @@ -484,19 +490,23 @@ def build_vocab( Keyword arguments propagated to `self.prepare_vocab`. """ - self._check_corpus_sanity(corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=1) + self._check_corpus_sanity( + corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=1) total_words, corpus_count = self.scan_vocab( corpus_iterable=corpus_iterable, corpus_file=corpus_file, progress_per=progress_per, trim_rule=trim_rule) self.corpus_count = corpus_count self.corpus_total_words = total_words - report_values = self.prepare_vocab(update=update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs) - report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words']) + report_values = self.prepare_vocab( + update=update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs) + report_values['memory'] = self.estimate_memory( + vocab_size=report_values['num_retained_words']) self.prepare_weights(update=update) - self.add_lifecycle_event("build_vocab", update=update, trim_rule=str(trim_rule)) + self.add_lifecycle_event( + "build_vocab", update=update, trim_rule=str(trim_rule)) def build_vocab_from_freq( - self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False, - ): + self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False, + ): """Build vocabulary from a dictionary of word frequencies. 
Parameters @@ -539,8 +549,10 @@ def build_vocab_from_freq( self.raw_vocab = raw_vocab # trim by min_count & precalculate downsampling - report_values = self.prepare_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) - report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words']) + report_values = self.prepare_vocab( + keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) + report_values['memory'] = self.estimate_memory( + vocab_size=report_values['num_retained_words']) self.prepare_weights(update=update) # build tables & arrays def _scan_vocab(self, sentences, progress_per, trim_rule): @@ -580,7 +592,8 @@ def scan_vocab(self, corpus_iterable=None, corpus_file=None, progress_per=10000, if corpus_file: corpus_iterable = LineSentence(corpus_file) - total_words, corpus_count = self._scan_vocab(corpus_iterable, progress_per, trim_rule) + total_words, corpus_count = self._scan_vocab( + corpus_iterable, progress_per, trim_rule) logger.info( "collected %i word types from a corpus of %i raw words and %i sentences", @@ -590,9 +603,9 @@ def scan_vocab(self, corpus_iterable=None, corpus_file=None, progress_per=10000, return total_words, corpus_count def prepare_vocab( - self, update=False, keep_raw_vocab=False, trim_rule=None, - min_count=None, sample=None, dry_run=False, - ): + self, update=False, keep_raw_vocab=False, trim_rule=None, + min_count=None, sample=None, dry_run=False, + ): """Apply vocabulary settings for `min_count` (discarding less-frequent words) and `sample` (controlling the downsampling of more-frequent words). @@ -615,7 +628,8 @@ def prepare_vocab( # If max_final_vocab is specified instead of min_count, # pick a min_count which satisfies max_final_vocab as well as possible. if self.max_final_vocab is not None: - sorted_vocab = sorted(self.raw_vocab.keys(), key=lambda word: self.raw_vocab[word], reverse=True) + sorted_vocab = sorted(self.raw_vocab.keys( + ), key=lambda word: self.raw_vocab[word], reverse=True) calc_min_count = 1 if self.max_final_vocab < len(sorted_vocab): @@ -656,7 +670,8 @@ def prepare_vocab( for word in self.wv.index_to_key: self.wv.set_vecattr(word, 'count', self.raw_vocab[word]) original_unique_total = len(retain_words) + drop_unique - retain_unique_pct = len(retain_words) * 100 / max(original_unique_total, 1) + retain_unique_pct = len(retain_words) * \ + 100 / max(original_unique_total, 1) self.add_lifecycle_event( "prepare_vocab", msg=( @@ -690,7 +705,8 @@ def prepare_vocab( new_words.append(word) new_total += v if not dry_run: - self.wv.key_to_index[word] = len(self.wv.index_to_key) + self.wv.key_to_index[word] = len( + self.wv.index_to_key) self.wv.index_to_key.append(word) else: drop_unique += 1 @@ -699,10 +715,14 @@ def prepare_vocab( # now update counts self.wv.allocate_vecattrs(attrs=['count'], types=[type(0)]) for word in self.wv.index_to_key: - self.wv.set_vecattr(word, 'count', self.wv.get_vecattr(word, 'count') + self.raw_vocab.get(word, 0)) - original_unique_total = len(pre_exist_words) + len(new_words) + drop_unique - pre_exist_unique_pct = len(pre_exist_words) * 100 / max(original_unique_total, 1) - new_unique_pct = len(new_words) * 100 / max(original_unique_total, 1) + self.wv.set_vecattr(word, 'count', self.wv.get_vecattr( + word, 'count') + self.raw_vocab.get(word, 0)) + original_unique_total = len( + pre_exist_words) + len(new_words) + drop_unique + pre_exist_unique_pct = len( + pre_exist_words) * 100 / max(original_unique_total, 1) + new_unique_pct = len(new_words) * 100 / \ 
+ max(original_unique_total, 1) self.add_lifecycle_event( "prepare_vocab", msg=( @@ -728,7 +748,8 @@ def prepare_vocab( downsample_total, downsample_unique = 0, 0 for w in retain_words: v = self.raw_vocab[w] - word_probability = (np.sqrt(v / threshold_count) + 1) * (threshold_count / v) + word_probability = ( + np.sqrt(v / threshold_count) + 1) * (threshold_count / v) if word_probability < 1.0: downsample_unique += 1 downsample_total += word_probability * v @@ -736,13 +757,16 @@ def prepare_vocab( word_probability = 1.0 downsample_total += v if not dry_run: - self.wv.set_vecattr(w, 'sample_int', np.uint32(word_probability * (2**32 - 1))) + self.wv.set_vecattr(w, 'sample_int', np.uint32( + word_probability * (2**32 - 1))) if not dry_run and not keep_raw_vocab: - logger.info("deleting the raw counts dictionary of %i items", len(self.raw_vocab)) + logger.info( + "deleting the raw counts dictionary of %i items", len(self.raw_vocab)) self.raw_vocab = defaultdict(int) - logger.info("sample=%g downsamples %i most-common words", sample, downsample_unique) + logger.info("sample=%g downsamples %i most-common words", + sample, downsample_unique) self.add_lifecycle_event( "prepare_vocab", msg=( @@ -793,11 +817,14 @@ def estimate_memory(self, vocab_size=None, report=None): vocab_size = vocab_size or len(self.wv) report = report or {} report['vocab'] = vocab_size * (700 if self.hs else 500) - report['vectors'] = vocab_size * self.vector_size * np.dtype(REAL).itemsize + report['vectors'] = vocab_size * \ + self.vector_size * np.dtype(REAL).itemsize if self.hs: - report['syn1'] = vocab_size * self.layer1_size * np.dtype(REAL).itemsize + report['syn1'] = vocab_size * \ + self.layer1_size * np.dtype(REAL).itemsize if self.negative: - report['syn1neg'] = vocab_size * self.layer1_size * np.dtype(REAL).itemsize + report['syn1neg'] = vocab_size * \ + self.layer1_size * np.dtype(REAL).itemsize report['total'] = sum(report.values()) logger.info( "estimated required memory for %i words and %i dimensions: %i bytes", @@ -839,7 +866,8 @@ def make_cum_table(self, domain=2**31 - 1): for word_index in range(vocab_size): count = self.wv.get_vecattr(word_index, 'count') cumulative += count**float(self.ns_exponent) - self.cum_table[word_index] = round(cumulative / train_words_pow * domain) + self.cum_table[word_index] = round( + cumulative / train_words_pow * domain) if len(self.cum_table) > 0: assert self.cum_table[-1] == domain @@ -863,7 +891,8 @@ def init_weights(self): if self.hs: self.syn1 = np.zeros((len(self.wv), self.layer1_size), dtype=REAL) if self.negative: - self.syn1neg = np.zeros((len(self.wv), self.layer1_size), dtype=REAL) + self.syn1neg = np.zeros( + (len(self.wv), self.layer1_size), dtype=REAL) def update_weights(self): """Copy all the existing weights, and reset the weights for the newly added vocabulary.""" @@ -879,7 +908,8 @@ def update_weights(self): gained_vocab = len(self.wv.vectors) - preresize_count if self.hs: - self.syn1 = np.vstack([self.syn1, np.zeros((gained_vocab, self.layer1_size), dtype=REAL)]) + self.syn1 = np.vstack([self.syn1, np.zeros( + (gained_vocab, self.layer1_size), dtype=REAL)]) if self.negative: pad = np.zeros((gained_vocab, self.layer1_size), dtype=REAL) self.syn1neg = np.vstack([self.syn1neg, pad]) @@ -910,9 +940,9 @@ def init_sims(self, replace=False): self.wv.init_sims(replace=replace) def _do_train_epoch( - self, corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch, - total_examples=None, total_words=None, **kwargs, - ): + self, corpus_file, 
thread_id, offset, cython_vocab, thread_private_mem, cur_epoch, + total_examples=None, total_words=None, **kwargs, + ): work, neu1 = thread_private_mem if self.sg: @@ -949,9 +979,11 @@ def _do_train_job(self, sentences, alpha, inits): work, neu1 = inits tally = 0 if self.sg: - tally += train_batch_sg(self, sentences, alpha, work, self.compute_loss) + tally += train_batch_sg(self, sentences, + alpha, work, self.compute_loss) else: - tally += train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss) + tally += train_batch_cbow(self, sentences, + alpha, work, neu1, self.compute_loss) return tally, self._raw_word_count(sentences) def _clear_post_train(self): @@ -959,11 +991,11 @@ def _clear_post_train(self): self.wv.norms = None def train( - self, corpus_iterable=None, corpus_file=None, total_examples=None, total_words=None, - epochs=None, start_alpha=None, end_alpha=None, word_count=0, - queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=(), - **kwargs, - ): + self, corpus_iterable=None, corpus_file=None, total_examples=None, total_words=None, + epochs=None, start_alpha=None, end_alpha=None, word_count=0, + queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=(), + **kwargs, + ): """Update the model's neural weights from a sequence of sentences. Notes @@ -1039,8 +1071,10 @@ def train( self.min_alpha = end_alpha or self.min_alpha self.epochs = epochs - self._check_training_sanity(epochs=epochs, total_examples=total_examples, total_words=total_words) - self._check_corpus_sanity(corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=epochs) + self._check_training_sanity( + epochs=epochs, total_examples=total_examples, total_words=total_words) + self._check_corpus_sanity( + corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=epochs) self.add_lifecycle_event( "train", @@ -1085,7 +1119,8 @@ def train( # Log overall time total_elapsed = default_timer() - start - self._log_train_end(raw_word_count, trained_word_count, total_elapsed, job_tally) + self._log_train_end( + raw_word_count, trained_word_count, total_elapsed, job_tally) self.train_count += 1 # number of times train() has been called self._clear_post_train() @@ -1096,9 +1131,9 @@ def train( return trained_word_count, raw_word_count def _worker_loop_corpusfile( - self, corpus_file, thread_id, offset, cython_vocab, progress_queue, cur_epoch=0, - total_examples=None, total_words=None, **kwargs, - ): + self, corpus_file, thread_id, offset, cython_vocab, progress_queue, cur_epoch=0, + total_examples=None, total_words=None, **kwargs, + ): """Train the model on a `corpus_file` in LineSentence format. 
This function will be called in parallel by multiple workers (threads or processes) to make @@ -1160,9 +1195,11 @@ def _worker_loop(self, job_queue, progress_queue): break # no more jobs => quit this worker data_iterable, alpha = job - tally, raw_tally = self._do_train_job(data_iterable, alpha, thread_private_mem) + tally, raw_tally = self._do_train_job( + data_iterable, alpha, thread_private_mem) - progress_queue.put((len(data_iterable), tally, raw_tally)) # report back progress + # report back progress + progress_queue.put((len(data_iterable), tally, raw_tally)) jobs_processed += 1 logger.debug("worker exiting, processed %i jobs", jobs_processed) @@ -1238,9 +1275,9 @@ def _job_producer(self, data_iterator, job_queue, cur_epoch=0, total_examples=No logger.debug("job loop exiting, total %i jobs", job_no) def _log_epoch_progress( - self, progress_queue=None, job_queue=None, cur_epoch=0, total_examples=None, - total_words=None, report_delay=1.0, is_corpus_file_mode=None, - ): + self, progress_queue=None, job_queue=None, cur_epoch=0, total_examples=None, + total_words=None, report_delay=1.0, is_corpus_file_mode=None, + ): """Get the progress report for a single training epoch. Parameters @@ -1286,7 +1323,8 @@ def _log_epoch_progress( report = progress_queue.get() # blocks if workers too slow if report is None: # a thread reporting that it finished unfinished_worker_count -= 1 - logger.debug("worker thread finished; awaiting finish of %i more threads", unfinished_worker_count) + logger.debug( + "worker thread finished; awaiting finish of %i more threads", unfinished_worker_count) continue examples, trained_words, raw_words = report job_tally += 1 @@ -1312,8 +1350,8 @@ def _log_epoch_progress( return trained_word_count, raw_word_count, job_tally def _train_epoch_corpusfile( - self, corpus_file, cur_epoch=0, total_examples=None, total_words=None, callbacks=(), **kwargs, - ): + self, corpus_file, cur_epoch=0, total_examples=None, total_words=None, callbacks=(), **kwargs, + ): """Train the model for a single epoch. Parameters @@ -1342,11 +1380,13 @@ def _train_epoch_corpusfile( """ if not total_words: - raise ValueError("total_words must be provided alongside corpus_file argument.") + raise ValueError( + "total_words must be provided alongside corpus_file argument.") from gensim.models.word2vec_corpusfile import CythonVocab from gensim.models.fasttext import FastText - cython_vocab = CythonVocab(self.wv, hs=self.hs, fasttext=isinstance(self, FastText)) + cython_vocab = CythonVocab( + self.wv, hs=self.hs, fasttext=isinstance(self, FastText)) progress_queue = Queue() @@ -1360,7 +1400,8 @@ def _train_epoch_corpusfile( threading.Thread( target=self._worker_loop_corpusfile, args=( - corpus_file, thread_id, corpus_file_size / self.workers * thread_id, cython_vocab, progress_queue + corpus_file, thread_id, corpus_file_size / + self.workers * thread_id, cython_vocab, progress_queue ), kwargs=thread_kwargs ) for thread_id in range(self.workers) @@ -1377,9 +1418,9 @@ def _train_epoch_corpusfile( return trained_word_count, raw_word_count, job_tally def _train_epoch( - self, data_iterable, cur_epoch=0, total_examples=None, total_words=None, - queue_factor=2, report_delay=1.0, callbacks=(), - ): + self, data_iterable, cur_epoch=0, total_examples=None, total_words=None, + queue_factor=2, report_delay=1.0, callbacks=(), + ): """Train the model for a single epoch. Parameters @@ -1468,7 +1509,8 @@ def _get_thread_working_mem(self): Each worker threads private work memory. 
""" - work = matutils.zeros_aligned(self.layer1_size, dtype=REAL) # per-thread private work memory + work = matutils.zeros_aligned( + self.layer1_size, dtype=REAL) # per-thread private work memory neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) return work, neu1 @@ -1491,11 +1533,14 @@ def _raw_word_count(self, job): def _check_corpus_sanity(self, corpus_iterable=None, corpus_file=None, passes=1): """Checks whether the corpus parameters make sense.""" if corpus_file is None and corpus_iterable is None: - raise TypeError("Either one of corpus_file or corpus_iterable value must be provided") + raise TypeError( + "Either one of corpus_file or corpus_iterable value must be provided") if corpus_file is not None and corpus_iterable is not None: - raise TypeError("Both corpus_file and corpus_iterable must not be provided at the same time") + raise TypeError( + "Both corpus_file and corpus_iterable must not be provided at the same time") if corpus_iterable is None and not os.path.isfile(corpus_file): - raise TypeError("Parameter corpus_file must be a valid path to a file, got %r instead" % corpus_file) + raise TypeError( + "Parameter corpus_file must be a valid path to a file, got %r instead" % corpus_file) if corpus_iterable is not None and not isinstance(corpus_iterable, Iterable): raise TypeError( "The corpus_iterable must be an iterable of lists of strings, got %r instead" % corpus_iterable) @@ -1534,12 +1579,15 @@ def _check_training_sanity(self, epochs=0, total_examples=None, total_words=None """ if self.alpha > self.min_alpha_yet_reached: - logger.warning("Effective 'alpha' higher than previous training cycles") + logger.warning( + "Effective 'alpha' higher than previous training cycles") if not self.wv.key_to_index: # should be set by `build_vocab` - raise RuntimeError("you must first build vocabulary before training the model") + raise RuntimeError( + "you must first build vocabulary before training the model") if not len(self.wv.vectors): - raise RuntimeError("you must initialize vectors before training the model") + raise RuntimeError( + "you must initialize vectors before training the model") if total_words is None and total_examples is None: raise ValueError( @@ -1549,12 +1597,13 @@ def _check_training_sanity(self, epochs=0, total_examples=None, total_words=None "in the model is sufficient: total_examples=model.corpus_count." ) if epochs is None or epochs <= 0: - raise ValueError("You must specify an explicit epochs count. The usual value is epochs=model.epochs.") + raise ValueError( + "You must specify an explicit epochs count. The usual value is epochs=model.epochs.") def _log_progress( - self, job_queue, progress_queue, cur_epoch, example_count, total_examples, - raw_word_count, total_words, trained_word_count, elapsed - ): + self, job_queue, progress_queue, cur_epoch, example_count, total_examples, + raw_word_count, total_words, trained_word_count, elapsed + ): """Callback used to log progress for long running jobs. Parameters @@ -1605,9 +1654,9 @@ def _log_progress( ) def _log_epoch_end( - self, cur_epoch, example_count, total_examples, raw_word_count, total_words, - trained_word_count, elapsed, is_corpus_file_mode - ): + self, cur_epoch, example_count, total_examples, raw_word_count, total_words, + trained_word_count, elapsed, is_corpus_file_mode + ): """Callback used to log the end of a training epoch. 
Parameters @@ -1716,7 +1765,8 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor ) if not self.wv.key_to_index: - raise RuntimeError("you must first build vocabulary before scoring new data") + raise RuntimeError( + "you must first build vocabulary before scoring new data") if not self.hs: raise RuntimeError( @@ -1726,7 +1776,8 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor def worker_loop(): """Compute log probability for each sentence, lifting lists of sentences from the jobs queue.""" - work = np.zeros(1, dtype=REAL) # for sg hs, we actually only need one memory loc (running sum) + work = np.zeros( + 1, dtype=REAL) # for sg hs, we actually only need one memory loc (running sum) neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) while True: job = job_queue.get() @@ -1749,7 +1800,8 @@ def worker_loop(): job_queue = Queue(maxsize=queue_factor * self.workers) progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers) - workers = [threading.Thread(target=worker_loop) for _ in range(self.workers)] + workers = [threading.Thread(target=worker_loop) + for _ in range(self.workers)] for thread in workers: thread.daemon = True # make interrupting the process with ctrl+c easier thread.start() @@ -1775,13 +1827,16 @@ def worker_loop(): logger.debug("putting job #%i in the queue", job_no) job_queue.put(items) except StopIteration: - logger.info("reached end of input; waiting to finish %i outstanding jobs", job_no - done_jobs + 1) + logger.info( + "reached end of input; waiting to finish %i outstanding jobs", job_no - done_jobs + 1) for _ in range(self.workers): - job_queue.put(None) # give the workers heads up that they can finish -- no more work! + # give the workers heads up that they can finish -- no more work! 
+ job_queue.put(None) push_done = True try: while done_jobs < (job_no + 1) or not push_done: - ns = progress_queue.get(push_done) # only block after all jobs pushed + # only block after all jobs pushed + ns = progress_queue.get(push_done) sentence_count += ns done_jobs += 1 elapsed = default_timer() - start @@ -1790,7 +1845,8 @@ def worker_loop(): "PROGRESS: at %.2f%% sentences, %.0f sentences/s", 100.0 * sentence_count, sentence_count / elapsed ) - next_report = elapsed + report_delay # don't flood log, wait report_delay seconds + # don't flood log, wait report_delay seconds + next_report = elapsed + report_delay else: # loop ended by job count; really done break @@ -1834,11 +1890,14 @@ def predict_output_word(self, context_words_list, topn=10): ) if not hasattr(self.wv, 'vectors') or not hasattr(self, 'syn1neg'): - raise RuntimeError("Parameters required for predicting the output words not found.") - word2_indices = [self.wv.get_index(w) for w in context_words_list if w in self.wv] + raise RuntimeError( + "Parameters required for predicting the output words not found.") + word2_indices = [self.wv.get_index( + w) for w in context_words_list if w in self.wv] if not word2_indices: - logger.warning("All the input context words are out-of-vocabulary for the current model.") + logger.warning( + "All the input context words are out-of-vocabulary for the current model.") return None l1 = np.sum(self.wv.vectors[word2_indices], axis=0) @@ -1892,7 +1951,8 @@ def __str__(self): """ return "%s" % ( - self.__class__.__name__, len(self.wv.index_to_key), self.wv.vector_size, self.alpha, + self.__class__.__name__, len( + self.wv.index_to_key), self.wv.vector_size, self.alpha, ) def save(self, *args, **kwargs): @@ -1939,7 +1999,8 @@ def load(cls, *args, rethrow=False, **kwargs): model = super(Word2Vec, cls).load(*args, **kwargs) if not isinstance(model, Word2Vec): rethrow = True - raise AttributeError("Model of type %s can't be loaded by %s" % (type(model), str(cls))) + raise AttributeError( + "Model of type %s can't be loaded by %s" % (type(model), str(cls))) return model except AttributeError as ae: if rethrow: @@ -2020,9 +2081,11 @@ def __iter__(self): line = utils.to_unicode(line) # each file line is a single sentence in the Brown corpus # each token is WORD/POS_TAG - token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2] + token_tags = [ + t.split('/') for t in line.split() if len(t.split('/')) == 2] # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff) - words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()] + words = ["%s/%s" % (token.lower(), tag[:2]) + for token, tag in token_tags if tag[:2].isalpha()] if not words: # don't bother sending out empty sentences continue yield words @@ -2040,14 +2103,17 @@ def __iter__(self): sentence, rest = [], b'' with utils.open(self.fname, 'rb') as fin: while True: - text = rest + fin.read(8192) # avoid loading the entire file (=1 line) into RAM + # avoid loading the entire file (=1 line) into RAM + text = rest + fin.read(8192) if text == rest: # EOF words = utils.to_unicode(text).split() - sentence.extend(words) # return the last chunk of words, too (may be shorter/longer) + # return the last chunk of words, too (may be shorter/longer) + sentence.extend(words) if sentence: yield sentence break - last_token = text.rfind(b' ') # last token may have been split in two... keep for next iteration + # last token may have been split in two... 
keep for next iteration + last_token = text.rfind(b' ') words, rest = (utils.to_unicode(text[:last_token]).split(), text[last_token:].strip()) if last_token >= 0 else ([], text) sentence.extend(words) @@ -2133,18 +2199,25 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): self.limit = limit if os.path.isfile(self.source): - logger.debug('single file given as source, rather than a directory of files') - logger.debug('consider using models.word2vec.LineSentence for a single file') - self.input_files = [self.source] # force code compatibility with list of files + logger.debug( + 'single file given as source, rather than a directory of files') + logger.debug( + 'consider using models.word2vec.LineSentence for a single file') + # force code compatibility with list of files + self.input_files = [self.source] elif os.path.isdir(self.source): - self.source = os.path.join(self.source, '') # ensures os-specific slash at end of path + # ensures os-specific slash at end of path + self.source = os.path.join(self.source, '') logger.info('reading directory %s', self.source) self.input_files = os.listdir(self.source) - self.input_files = [self.source + filename for filename in self.input_files] # make full paths + # make full paths + self.input_files = [self.source + + filename for filename in self.input_files] self.input_files.sort() # makes sure it happens in filename order else: # not a file or a directory, then we can't do anything with it raise ValueError('input is neither a file nor a path') - logger.info('files read into PathLineSentences:%s', '\n'.join(self.input_files)) + logger.info('files read into PathLineSentences:%s', + '\n'.join(self.input_files)) def __iter__(self): """iterate through the files""" @@ -2175,12 +2248,14 @@ def __lt__(self, other): def _build_heap(wv): - heap = list(Heapitem(wv.get_vecattr(i, 'count'), i, None, None) for i in range(len(wv.index_to_key))) + heap = list(Heapitem(wv.get_vecattr(i, 'count'), i, None, None) + for i in range(len(wv.index_to_key))) heapq.heapify(heap) for i in range(len(wv) - 1): min1, min2 = heapq.heappop(heap), heapq.heappop(heap) heapq.heappush( - heap, Heapitem(count=min1.count + min2.count, index=i + len(wv), left=min1, right=min2) + heap, Heapitem(count=min1.count + min2.count, + index=i + len(wv), left=min1, right=min2) ) return heap @@ -2223,9 +2298,12 @@ def _assign_binary_codes(wv): max_depth = max(len(codes), max_depth) else: # inner node => continue recursion - points = np.array(list(points) + [node.index - len(wv)], dtype=np.uint32) - stack.append((node.left, np.array(list(codes) + [0], dtype=np.uint8), points)) - stack.append((node.right, np.array(list(codes) + [1], dtype=np.uint8), points)) + points = np.array( + list(points) + [node.index - len(wv)], dtype=np.uint32) + stack.append((node.left, np.array( + list(codes) + [0], dtype=np.uint8), points)) + stack.append((node.right, np.array( + list(codes) + [1], dtype=np.uint8), points)) logger.info("built huffman tree with maximum node depth %i", max_depth) @@ -2251,10 +2329,14 @@ def _assign_binary_codes(wv): np.seterr(all='raise') # don't ignore numpy errors parser = argparse.ArgumentParser() - parser.add_argument("-train", help="Use text data from file TRAIN to train the model", required=True) - parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors") - parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5) - parser.add_argument("-size", help="Set size of word 
vectors; default is 100", type=int, default=100) + parser.add_argument( + "-train", help="Use text data from file TRAIN to train the model", required=True) + parser.add_argument( + "-output", help="Use file OUTPUT to save the resulting word vectors") + parser.add_argument( + "-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5) + parser.add_argument( + "-size", help="Set size of word vectors; default is 100", type=int, default=100) parser.add_argument( "-sample", help="Set threshold for occurrence of words. " @@ -2270,8 +2352,10 @@ def _assign_binary_codes(wv): "-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", type=int, default=5 ) - parser.add_argument("-threads", help="Use THREADS threads (default 12)", type=int, default=12) - parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5) + parser.add_argument( + "-threads", help="Use THREADS threads (default 12)", type=int, default=12) + parser.add_argument( + "-iter", help="Run more training iterations (default 5)", type=int, default=5) parser.add_argument( "-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, default=5 @@ -2284,7 +2368,8 @@ def _assign_binary_codes(wv): "-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1] ) - parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model") + parser.add_argument( + "-accuracy", help="Use questions from file ACCURACY to evaluate the model") args = parser.parse_args() From 1119705469556854881790f63aada6803bee009a Mon Sep 17 00:00:00 2001 From: funasshi Date: Wed, 22 Jun 2022 16:28:54 +0900 Subject: [PATCH 2/6] Revert "fix loading old gensim model by new model" This reverts commit d398e92291e3de4d6d2a73016150cd9aa2e52714. 
--- gensim/models/word2vec.py | 345 ++++++++++++++------------------------ 1 file changed, 130 insertions(+), 215 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index e8a8f96d96..061dcfc817 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -197,7 +197,7 @@ import numpy as np from gensim.utils import keep_vocab_item, call_on_class_only, deprecated -from gensim.models.keyedvectors import KeyedVectors, pseudorandom_weak_vector, Vocab +from gensim.models.keyedvectors import KeyedVectors, pseudorandom_weak_vector from gensim import utils, matutils from smart_open.compression import get_supported_extensions @@ -223,28 +223,26 @@ CORPUSFILE_VERSION = -1 def train_epoch_sg( - model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, - _work, _neu1, compute_loss, - ): - raise RuntimeError( - "Training with corpus_file argument is not supported") + model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, + _work, _neu1, compute_loss, + ): + raise RuntimeError("Training with corpus_file argument is not supported") def train_epoch_cbow( - model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, - _work, _neu1, compute_loss, - ): - raise RuntimeError( - "Training with corpus_file argument is not supported") + model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, + _work, _neu1, compute_loss, + ): + raise RuntimeError("Training with corpus_file argument is not supported") class Word2Vec(utils.SaveLoad): def __init__( - self, sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5, - max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(), - comment=None, max_final_vocab=None, shrink_windows=True, - ): + self, sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5, + max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, + sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, + trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(), + comment=None, max_final_vocab=None, shrink_windows=True, + ): """Train, use and evaluate neural networks described in https://code.google.com/p/word2vec/. 
Once you're finished training a model (=no more updates, only querying) @@ -412,13 +410,11 @@ def __init__( self.wv = KeyedVectors(vector_size) # EXPERIMENTAL lockf feature; create minimal no-op lockf arrays (1 element of 1.0) # advanced users should directly resize/adjust as desired after any vocab growth - # 0.0 values suppress word-backprop-updates; 1.0 allows - self.wv.vectors_lockf = np.ones(1, dtype=REAL) + self.wv.vectors_lockf = np.ones(1, dtype=REAL) # 0.0 values suppress word-backprop-updates; 1.0 allows self.hashfxn = hashfxn self.seed = seed - # set unless subclass already set (as for Doc2Vec dm_concat mode) - if not hasattr(self, 'layer1_size'): + if not hasattr(self, 'layer1_size'): # set unless subclass already set (as for Doc2Vec dm_concat mode) self.layer1_size = vector_size self.comment = comment @@ -426,10 +422,8 @@ def __init__( self.load = call_on_class_only if corpus_iterable is not None or corpus_file is not None: - self._check_corpus_sanity( - corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=(epochs + 1)) - self.build_vocab(corpus_iterable=corpus_iterable, - corpus_file=corpus_file, trim_rule=trim_rule) + self._check_corpus_sanity(corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=(epochs + 1)) + self.build_vocab(corpus_iterable=corpus_iterable, corpus_file=corpus_file, trim_rule=trim_rule) self.train( corpus_iterable=corpus_iterable, corpus_file=corpus_file, total_examples=self.corpus_count, total_words=self.corpus_total_words, epochs=self.epochs, start_alpha=self.alpha, @@ -450,9 +444,9 @@ def __init__( self.add_lifecycle_event("created", params=str(self)) def build_vocab( - self, corpus_iterable=None, corpus_file=None, update=False, progress_per=10000, - keep_raw_vocab=False, trim_rule=None, **kwargs, - ): + self, corpus_iterable=None, corpus_file=None, update=False, progress_per=10000, + keep_raw_vocab=False, trim_rule=None, **kwargs, + ): """Build vocabulary from a sequence of sentences (can be a once-only generator stream). Parameters @@ -490,23 +484,19 @@ def build_vocab( Keyword arguments propagated to `self.prepare_vocab`. """ - self._check_corpus_sanity( - corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=1) + self._check_corpus_sanity(corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=1) total_words, corpus_count = self.scan_vocab( corpus_iterable=corpus_iterable, corpus_file=corpus_file, progress_per=progress_per, trim_rule=trim_rule) self.corpus_count = corpus_count self.corpus_total_words = total_words - report_values = self.prepare_vocab( - update=update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs) - report_values['memory'] = self.estimate_memory( - vocab_size=report_values['num_retained_words']) + report_values = self.prepare_vocab(update=update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs) + report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words']) self.prepare_weights(update=update) - self.add_lifecycle_event( - "build_vocab", update=update, trim_rule=str(trim_rule)) + self.add_lifecycle_event("build_vocab", update=update, trim_rule=str(trim_rule)) def build_vocab_from_freq( - self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False, - ): + self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False, + ): """Build vocabulary from a dictionary of word frequencies. 
Parameters @@ -549,10 +539,8 @@ def build_vocab_from_freq( self.raw_vocab = raw_vocab # trim by min_count & precalculate downsampling - report_values = self.prepare_vocab( - keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) - report_values['memory'] = self.estimate_memory( - vocab_size=report_values['num_retained_words']) + report_values = self.prepare_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) + report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words']) self.prepare_weights(update=update) # build tables & arrays def _scan_vocab(self, sentences, progress_per, trim_rule): @@ -592,8 +580,7 @@ def scan_vocab(self, corpus_iterable=None, corpus_file=None, progress_per=10000, if corpus_file: corpus_iterable = LineSentence(corpus_file) - total_words, corpus_count = self._scan_vocab( - corpus_iterable, progress_per, trim_rule) + total_words, corpus_count = self._scan_vocab(corpus_iterable, progress_per, trim_rule) logger.info( "collected %i word types from a corpus of %i raw words and %i sentences", @@ -603,9 +590,9 @@ def scan_vocab(self, corpus_iterable=None, corpus_file=None, progress_per=10000, return total_words, corpus_count def prepare_vocab( - self, update=False, keep_raw_vocab=False, trim_rule=None, - min_count=None, sample=None, dry_run=False, - ): + self, update=False, keep_raw_vocab=False, trim_rule=None, + min_count=None, sample=None, dry_run=False, + ): """Apply vocabulary settings for `min_count` (discarding less-frequent words) and `sample` (controlling the downsampling of more-frequent words). @@ -628,8 +615,7 @@ def prepare_vocab( # If max_final_vocab is specified instead of min_count, # pick a min_count which satisfies max_final_vocab as well as possible. if self.max_final_vocab is not None: - sorted_vocab = sorted(self.raw_vocab.keys( - ), key=lambda word: self.raw_vocab[word], reverse=True) + sorted_vocab = sorted(self.raw_vocab.keys(), key=lambda word: self.raw_vocab[word], reverse=True) calc_min_count = 1 if self.max_final_vocab < len(sorted_vocab): @@ -670,8 +656,7 @@ def prepare_vocab( for word in self.wv.index_to_key: self.wv.set_vecattr(word, 'count', self.raw_vocab[word]) original_unique_total = len(retain_words) + drop_unique - retain_unique_pct = len(retain_words) * \ - 100 / max(original_unique_total, 1) + retain_unique_pct = len(retain_words) * 100 / max(original_unique_total, 1) self.add_lifecycle_event( "prepare_vocab", msg=( @@ -705,8 +690,7 @@ def prepare_vocab( new_words.append(word) new_total += v if not dry_run: - self.wv.key_to_index[word] = len( - self.wv.index_to_key) + self.wv.key_to_index[word] = len(self.wv.index_to_key) self.wv.index_to_key.append(word) else: drop_unique += 1 @@ -715,14 +699,10 @@ def prepare_vocab( # now update counts self.wv.allocate_vecattrs(attrs=['count'], types=[type(0)]) for word in self.wv.index_to_key: - self.wv.set_vecattr(word, 'count', self.wv.get_vecattr( - word, 'count') + self.raw_vocab.get(word, 0)) - original_unique_total = len( - pre_exist_words) + len(new_words) + drop_unique - pre_exist_unique_pct = len( - pre_exist_words) * 100 / max(original_unique_total, 1) - new_unique_pct = len(new_words) * 100 / \ - max(original_unique_total, 1) + self.wv.set_vecattr(word, 'count', self.wv.get_vecattr(word, 'count') + self.raw_vocab.get(word, 0)) + original_unique_total = len(pre_exist_words) + len(new_words) + drop_unique + pre_exist_unique_pct = len(pre_exist_words) * 100 / max(original_unique_total, 1) + new_unique_pct = len(new_words) * 100 
/ max(original_unique_total, 1) self.add_lifecycle_event( "prepare_vocab", msg=( @@ -748,8 +728,7 @@ def prepare_vocab( downsample_total, downsample_unique = 0, 0 for w in retain_words: v = self.raw_vocab[w] - word_probability = ( - np.sqrt(v / threshold_count) + 1) * (threshold_count / v) + word_probability = (np.sqrt(v / threshold_count) + 1) * (threshold_count / v) if word_probability < 1.0: downsample_unique += 1 downsample_total += word_probability * v @@ -757,16 +736,13 @@ def prepare_vocab( word_probability = 1.0 downsample_total += v if not dry_run: - self.wv.set_vecattr(w, 'sample_int', np.uint32( - word_probability * (2**32 - 1))) + self.wv.set_vecattr(w, 'sample_int', np.uint32(word_probability * (2**32 - 1))) if not dry_run and not keep_raw_vocab: - logger.info( - "deleting the raw counts dictionary of %i items", len(self.raw_vocab)) + logger.info("deleting the raw counts dictionary of %i items", len(self.raw_vocab)) self.raw_vocab = defaultdict(int) - logger.info("sample=%g downsamples %i most-common words", - sample, downsample_unique) + logger.info("sample=%g downsamples %i most-common words", sample, downsample_unique) self.add_lifecycle_event( "prepare_vocab", msg=( @@ -817,14 +793,11 @@ def estimate_memory(self, vocab_size=None, report=None): vocab_size = vocab_size or len(self.wv) report = report or {} report['vocab'] = vocab_size * (700 if self.hs else 500) - report['vectors'] = vocab_size * \ - self.vector_size * np.dtype(REAL).itemsize + report['vectors'] = vocab_size * self.vector_size * np.dtype(REAL).itemsize if self.hs: - report['syn1'] = vocab_size * \ - self.layer1_size * np.dtype(REAL).itemsize + report['syn1'] = vocab_size * self.layer1_size * np.dtype(REAL).itemsize if self.negative: - report['syn1neg'] = vocab_size * \ - self.layer1_size * np.dtype(REAL).itemsize + report['syn1neg'] = vocab_size * self.layer1_size * np.dtype(REAL).itemsize report['total'] = sum(report.values()) logger.info( "estimated required memory for %i words and %i dimensions: %i bytes", @@ -866,8 +839,7 @@ def make_cum_table(self, domain=2**31 - 1): for word_index in range(vocab_size): count = self.wv.get_vecattr(word_index, 'count') cumulative += count**float(self.ns_exponent) - self.cum_table[word_index] = round( - cumulative / train_words_pow * domain) + self.cum_table[word_index] = round(cumulative / train_words_pow * domain) if len(self.cum_table) > 0: assert self.cum_table[-1] == domain @@ -891,8 +863,7 @@ def init_weights(self): if self.hs: self.syn1 = np.zeros((len(self.wv), self.layer1_size), dtype=REAL) if self.negative: - self.syn1neg = np.zeros( - (len(self.wv), self.layer1_size), dtype=REAL) + self.syn1neg = np.zeros((len(self.wv), self.layer1_size), dtype=REAL) def update_weights(self): """Copy all the existing weights, and reset the weights for the newly added vocabulary.""" @@ -908,8 +879,7 @@ def update_weights(self): gained_vocab = len(self.wv.vectors) - preresize_count if self.hs: - self.syn1 = np.vstack([self.syn1, np.zeros( - (gained_vocab, self.layer1_size), dtype=REAL)]) + self.syn1 = np.vstack([self.syn1, np.zeros((gained_vocab, self.layer1_size), dtype=REAL)]) if self.negative: pad = np.zeros((gained_vocab, self.layer1_size), dtype=REAL) self.syn1neg = np.vstack([self.syn1neg, pad]) @@ -940,9 +910,9 @@ def init_sims(self, replace=False): self.wv.init_sims(replace=replace) def _do_train_epoch( - self, corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch, - total_examples=None, total_words=None, **kwargs, - ): + self, corpus_file, 
thread_id, offset, cython_vocab, thread_private_mem, cur_epoch, + total_examples=None, total_words=None, **kwargs, + ): work, neu1 = thread_private_mem if self.sg: @@ -979,11 +949,9 @@ def _do_train_job(self, sentences, alpha, inits): work, neu1 = inits tally = 0 if self.sg: - tally += train_batch_sg(self, sentences, - alpha, work, self.compute_loss) + tally += train_batch_sg(self, sentences, alpha, work, self.compute_loss) else: - tally += train_batch_cbow(self, sentences, - alpha, work, neu1, self.compute_loss) + tally += train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss) return tally, self._raw_word_count(sentences) def _clear_post_train(self): @@ -991,11 +959,11 @@ def _clear_post_train(self): self.wv.norms = None def train( - self, corpus_iterable=None, corpus_file=None, total_examples=None, total_words=None, - epochs=None, start_alpha=None, end_alpha=None, word_count=0, - queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=(), - **kwargs, - ): + self, corpus_iterable=None, corpus_file=None, total_examples=None, total_words=None, + epochs=None, start_alpha=None, end_alpha=None, word_count=0, + queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=(), + **kwargs, + ): """Update the model's neural weights from a sequence of sentences. Notes @@ -1071,10 +1039,8 @@ def train( self.min_alpha = end_alpha or self.min_alpha self.epochs = epochs - self._check_training_sanity( - epochs=epochs, total_examples=total_examples, total_words=total_words) - self._check_corpus_sanity( - corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=epochs) + self._check_training_sanity(epochs=epochs, total_examples=total_examples, total_words=total_words) + self._check_corpus_sanity(corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=epochs) self.add_lifecycle_event( "train", @@ -1119,8 +1085,7 @@ def train( # Log overall time total_elapsed = default_timer() - start - self._log_train_end( - raw_word_count, trained_word_count, total_elapsed, job_tally) + self._log_train_end(raw_word_count, trained_word_count, total_elapsed, job_tally) self.train_count += 1 # number of times train() has been called self._clear_post_train() @@ -1131,9 +1096,9 @@ def train( return trained_word_count, raw_word_count def _worker_loop_corpusfile( - self, corpus_file, thread_id, offset, cython_vocab, progress_queue, cur_epoch=0, - total_examples=None, total_words=None, **kwargs, - ): + self, corpus_file, thread_id, offset, cython_vocab, progress_queue, cur_epoch=0, + total_examples=None, total_words=None, **kwargs, + ): """Train the model on a `corpus_file` in LineSentence format. 
This function will be called in parallel by multiple workers (threads or processes) to make @@ -1195,11 +1160,9 @@ def _worker_loop(self, job_queue, progress_queue): break # no more jobs => quit this worker data_iterable, alpha = job - tally, raw_tally = self._do_train_job( - data_iterable, alpha, thread_private_mem) + tally, raw_tally = self._do_train_job(data_iterable, alpha, thread_private_mem) - # report back progress - progress_queue.put((len(data_iterable), tally, raw_tally)) + progress_queue.put((len(data_iterable), tally, raw_tally)) # report back progress jobs_processed += 1 logger.debug("worker exiting, processed %i jobs", jobs_processed) @@ -1275,9 +1238,9 @@ def _job_producer(self, data_iterator, job_queue, cur_epoch=0, total_examples=No logger.debug("job loop exiting, total %i jobs", job_no) def _log_epoch_progress( - self, progress_queue=None, job_queue=None, cur_epoch=0, total_examples=None, - total_words=None, report_delay=1.0, is_corpus_file_mode=None, - ): + self, progress_queue=None, job_queue=None, cur_epoch=0, total_examples=None, + total_words=None, report_delay=1.0, is_corpus_file_mode=None, + ): """Get the progress report for a single training epoch. Parameters @@ -1323,8 +1286,7 @@ def _log_epoch_progress( report = progress_queue.get() # blocks if workers too slow if report is None: # a thread reporting that it finished unfinished_worker_count -= 1 - logger.debug( - "worker thread finished; awaiting finish of %i more threads", unfinished_worker_count) + logger.debug("worker thread finished; awaiting finish of %i more threads", unfinished_worker_count) continue examples, trained_words, raw_words = report job_tally += 1 @@ -1350,8 +1312,8 @@ def _log_epoch_progress( return trained_word_count, raw_word_count, job_tally def _train_epoch_corpusfile( - self, corpus_file, cur_epoch=0, total_examples=None, total_words=None, callbacks=(), **kwargs, - ): + self, corpus_file, cur_epoch=0, total_examples=None, total_words=None, callbacks=(), **kwargs, + ): """Train the model for a single epoch. Parameters @@ -1380,13 +1342,11 @@ def _train_epoch_corpusfile( """ if not total_words: - raise ValueError( - "total_words must be provided alongside corpus_file argument.") + raise ValueError("total_words must be provided alongside corpus_file argument.") from gensim.models.word2vec_corpusfile import CythonVocab from gensim.models.fasttext import FastText - cython_vocab = CythonVocab( - self.wv, hs=self.hs, fasttext=isinstance(self, FastText)) + cython_vocab = CythonVocab(self.wv, hs=self.hs, fasttext=isinstance(self, FastText)) progress_queue = Queue() @@ -1400,8 +1360,7 @@ def _train_epoch_corpusfile( threading.Thread( target=self._worker_loop_corpusfile, args=( - corpus_file, thread_id, corpus_file_size / - self.workers * thread_id, cython_vocab, progress_queue + corpus_file, thread_id, corpus_file_size / self.workers * thread_id, cython_vocab, progress_queue ), kwargs=thread_kwargs ) for thread_id in range(self.workers) @@ -1418,9 +1377,9 @@ def _train_epoch_corpusfile( return trained_word_count, raw_word_count, job_tally def _train_epoch( - self, data_iterable, cur_epoch=0, total_examples=None, total_words=None, - queue_factor=2, report_delay=1.0, callbacks=(), - ): + self, data_iterable, cur_epoch=0, total_examples=None, total_words=None, + queue_factor=2, report_delay=1.0, callbacks=(), + ): """Train the model for a single epoch. Parameters @@ -1509,8 +1468,7 @@ def _get_thread_working_mem(self): Each worker threads private work memory. 
""" - work = matutils.zeros_aligned( - self.layer1_size, dtype=REAL) # per-thread private work memory + work = matutils.zeros_aligned(self.layer1_size, dtype=REAL) # per-thread private work memory neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) return work, neu1 @@ -1533,14 +1491,11 @@ def _raw_word_count(self, job): def _check_corpus_sanity(self, corpus_iterable=None, corpus_file=None, passes=1): """Checks whether the corpus parameters make sense.""" if corpus_file is None and corpus_iterable is None: - raise TypeError( - "Either one of corpus_file or corpus_iterable value must be provided") + raise TypeError("Either one of corpus_file or corpus_iterable value must be provided") if corpus_file is not None and corpus_iterable is not None: - raise TypeError( - "Both corpus_file and corpus_iterable must not be provided at the same time") + raise TypeError("Both corpus_file and corpus_iterable must not be provided at the same time") if corpus_iterable is None and not os.path.isfile(corpus_file): - raise TypeError( - "Parameter corpus_file must be a valid path to a file, got %r instead" % corpus_file) + raise TypeError("Parameter corpus_file must be a valid path to a file, got %r instead" % corpus_file) if corpus_iterable is not None and not isinstance(corpus_iterable, Iterable): raise TypeError( "The corpus_iterable must be an iterable of lists of strings, got %r instead" % corpus_iterable) @@ -1579,15 +1534,12 @@ def _check_training_sanity(self, epochs=0, total_examples=None, total_words=None """ if self.alpha > self.min_alpha_yet_reached: - logger.warning( - "Effective 'alpha' higher than previous training cycles") + logger.warning("Effective 'alpha' higher than previous training cycles") if not self.wv.key_to_index: # should be set by `build_vocab` - raise RuntimeError( - "you must first build vocabulary before training the model") + raise RuntimeError("you must first build vocabulary before training the model") if not len(self.wv.vectors): - raise RuntimeError( - "you must initialize vectors before training the model") + raise RuntimeError("you must initialize vectors before training the model") if total_words is None and total_examples is None: raise ValueError( @@ -1597,13 +1549,12 @@ def _check_training_sanity(self, epochs=0, total_examples=None, total_words=None "in the model is sufficient: total_examples=model.corpus_count." ) if epochs is None or epochs <= 0: - raise ValueError( - "You must specify an explicit epochs count. The usual value is epochs=model.epochs.") + raise ValueError("You must specify an explicit epochs count. The usual value is epochs=model.epochs.") def _log_progress( - self, job_queue, progress_queue, cur_epoch, example_count, total_examples, - raw_word_count, total_words, trained_word_count, elapsed - ): + self, job_queue, progress_queue, cur_epoch, example_count, total_examples, + raw_word_count, total_words, trained_word_count, elapsed + ): """Callback used to log progress for long running jobs. Parameters @@ -1654,9 +1605,9 @@ def _log_progress( ) def _log_epoch_end( - self, cur_epoch, example_count, total_examples, raw_word_count, total_words, - trained_word_count, elapsed, is_corpus_file_mode - ): + self, cur_epoch, example_count, total_examples, raw_word_count, total_words, + trained_word_count, elapsed, is_corpus_file_mode + ): """Callback used to log the end of a training epoch. 
Parameters @@ -1765,8 +1716,7 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor ) if not self.wv.key_to_index: - raise RuntimeError( - "you must first build vocabulary before scoring new data") + raise RuntimeError("you must first build vocabulary before scoring new data") if not self.hs: raise RuntimeError( @@ -1776,8 +1726,7 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor def worker_loop(): """Compute log probability for each sentence, lifting lists of sentences from the jobs queue.""" - work = np.zeros( - 1, dtype=REAL) # for sg hs, we actually only need one memory loc (running sum) + work = np.zeros(1, dtype=REAL) # for sg hs, we actually only need one memory loc (running sum) neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) while True: job = job_queue.get() @@ -1800,8 +1749,7 @@ def worker_loop(): job_queue = Queue(maxsize=queue_factor * self.workers) progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers) - workers = [threading.Thread(target=worker_loop) - for _ in range(self.workers)] + workers = [threading.Thread(target=worker_loop) for _ in range(self.workers)] for thread in workers: thread.daemon = True # make interrupting the process with ctrl+c easier thread.start() @@ -1827,16 +1775,13 @@ def worker_loop(): logger.debug("putting job #%i in the queue", job_no) job_queue.put(items) except StopIteration: - logger.info( - "reached end of input; waiting to finish %i outstanding jobs", job_no - done_jobs + 1) + logger.info("reached end of input; waiting to finish %i outstanding jobs", job_no - done_jobs + 1) for _ in range(self.workers): - # give the workers heads up that they can finish -- no more work! - job_queue.put(None) + job_queue.put(None) # give the workers heads up that they can finish -- no more work! 
push_done = True try: while done_jobs < (job_no + 1) or not push_done: - # only block after all jobs pushed - ns = progress_queue.get(push_done) + ns = progress_queue.get(push_done) # only block after all jobs pushed sentence_count += ns done_jobs += 1 elapsed = default_timer() - start @@ -1845,8 +1790,7 @@ def worker_loop(): "PROGRESS: at %.2f%% sentences, %.0f sentences/s", 100.0 * sentence_count, sentence_count / elapsed ) - # don't flood log, wait report_delay seconds - next_report = elapsed + report_delay + next_report = elapsed + report_delay # don't flood log, wait report_delay seconds else: # loop ended by job count; really done break @@ -1890,14 +1834,11 @@ def predict_output_word(self, context_words_list, topn=10): ) if not hasattr(self.wv, 'vectors') or not hasattr(self, 'syn1neg'): - raise RuntimeError( - "Parameters required for predicting the output words not found.") - word2_indices = [self.wv.get_index( - w) for w in context_words_list if w in self.wv] + raise RuntimeError("Parameters required for predicting the output words not found.") + word2_indices = [self.wv.get_index(w) for w in context_words_list if w in self.wv] if not word2_indices: - logger.warning( - "All the input context words are out-of-vocabulary for the current model.") + logger.warning("All the input context words are out-of-vocabulary for the current model.") return None l1 = np.sum(self.wv.vectors[word2_indices], axis=0) @@ -1951,8 +1892,7 @@ def __str__(self): """ return "%s<vocab=%s, vector_size=%s, alpha=%s>" % ( - self.__class__.__name__, len( - self.wv.index_to_key), self.wv.vector_size, self.alpha, + self.__class__.__name__, len(self.wv.index_to_key), self.wv.vector_size, self.alpha, ) def save(self, *args, **kwargs): @@ -1999,8 +1939,7 @@ def load(cls, *args, rethrow=False, **kwargs): model = super(Word2Vec, cls).load(*args, **kwargs) if not isinstance(model, Word2Vec): rethrow = True - raise AttributeError( - "Model of type %s can't be loaded by %s" % (type(model), str(cls))) + raise AttributeError("Model of type %s can't be loaded by %s" % (type(model), str(cls))) return model except AttributeError as ae: if rethrow: @@ -2081,11 -2020,9 @@ def __iter__(self): line = utils.to_unicode(line) # each file line is a single sentence in the Brown corpus # each token is WORD/POS_TAG - token_tags = [ - t.split('/') for t in line.split() if len(t.split('/')) == 2] + token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2] # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff) - words = ["%s/%s" % (token.lower(), tag[:2]) - for token, tag in token_tags if tag[:2].isalpha()] + words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()] if not words: # don't bother sending out empty sentences continue yield words @@ -2103,17 +2040,14 @@ def __iter__(self): sentence, rest = [], b'' with utils.open(self.fname, 'rb') as fin: while True: - # avoid loading the entire file (=1 line) into RAM - text = rest + fin.read(8192) + text = rest + fin.read(8192) # avoid loading the entire file (=1 line) into RAM if text == rest: # EOF words = utils.to_unicode(text).split() - # return the last chunk of words, too (may be shorter/longer) - sentence.extend(words) + sentence.extend(words) # return the last chunk of words, too (may be shorter/longer) if sentence: yield sentence break - # last token may have been split in two... keep for next iteration - last_token = text.rfind(b' ') + last_token = text.rfind(b' ') # last token may have been split in two... 
keep for next iteration words, rest = (utils.to_unicode(text[:last_token]).split(), text[last_token:].strip()) if last_token >= 0 else ([], text) sentence.extend(words) @@ -2199,25 +2133,18 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): self.limit = limit if os.path.isfile(self.source): - logger.debug( - 'single file given as source, rather than a directory of files') - logger.debug( - 'consider using models.word2vec.LineSentence for a single file') - # force code compatibility with list of files - self.input_files = [self.source] + logger.debug('single file given as source, rather than a directory of files') + logger.debug('consider using models.word2vec.LineSentence for a single file') + self.input_files = [self.source] # force code compatibility with list of files elif os.path.isdir(self.source): - # ensures os-specific slash at end of path - self.source = os.path.join(self.source, '') + self.source = os.path.join(self.source, '') # ensures os-specific slash at end of path logger.info('reading directory %s', self.source) self.input_files = os.listdir(self.source) - # make full paths - self.input_files = [self.source + - filename for filename in self.input_files] + self.input_files = [self.source + filename for filename in self.input_files] # make full paths self.input_files.sort() # makes sure it happens in filename order else: # not a file or a directory, then we can't do anything with it raise ValueError('input is neither a file nor a path') - logger.info('files read into PathLineSentences:%s', - '\n'.join(self.input_files)) + logger.info('files read into PathLineSentences:%s', '\n'.join(self.input_files)) def __iter__(self): """iterate through the files""" @@ -2248,14 +2175,12 @@ def __lt__(self, other): def _build_heap(wv): - heap = list(Heapitem(wv.get_vecattr(i, 'count'), i, None, None) - for i in range(len(wv.index_to_key))) + heap = list(Heapitem(wv.get_vecattr(i, 'count'), i, None, None) for i in range(len(wv.index_to_key))) heapq.heapify(heap) for i in range(len(wv) - 1): min1, min2 = heapq.heappop(heap), heapq.heappop(heap) heapq.heappush( - heap, Heapitem(count=min1.count + min2.count, - index=i + len(wv), left=min1, right=min2) + heap, Heapitem(count=min1.count + min2.count, index=i + len(wv), left=min1, right=min2) ) return heap @@ -2298,12 +2223,9 @@ def _assign_binary_codes(wv): max_depth = max(len(codes), max_depth) else: # inner node => continue recursion - points = np.array( - list(points) + [node.index - len(wv)], dtype=np.uint32) - stack.append((node.left, np.array( - list(codes) + [0], dtype=np.uint8), points)) - stack.append((node.right, np.array( - list(codes) + [1], dtype=np.uint8), points)) + points = np.array(list(points) + [node.index - len(wv)], dtype=np.uint32) + stack.append((node.left, np.array(list(codes) + [0], dtype=np.uint8), points)) + stack.append((node.right, np.array(list(codes) + [1], dtype=np.uint8), points)) logger.info("built huffman tree with maximum node depth %i", max_depth) @@ -2329,14 +2251,10 @@ def _assign_binary_codes(wv): np.seterr(all='raise') # don't ignore numpy errors parser = argparse.ArgumentParser() - parser.add_argument( - "-train", help="Use text data from file TRAIN to train the model", required=True) - parser.add_argument( - "-output", help="Use file OUTPUT to save the resulting word vectors") - parser.add_argument( - "-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5) - parser.add_argument( - "-size", help="Set size of word vectors; default is 
100", type=int, default=100) + parser.add_argument("-train", help="Use text data from file TRAIN to train the model", required=True) + parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors") + parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5) + parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100) parser.add_argument( "-sample", help="Set threshold for occurrence of words. " @@ -2352,10 +2270,8 @@ def _assign_binary_codes(wv): "-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", type=int, default=5 ) - parser.add_argument( - "-threads", help="Use THREADS threads (default 12)", type=int, default=12) - parser.add_argument( - "-iter", help="Run more training iterations (default 5)", type=int, default=5) + parser.add_argument("-threads", help="Use THREADS threads (default 12)", type=int, default=12) + parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5) parser.add_argument( "-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, default=5 @@ -2368,8 +2284,7 @@ def _assign_binary_codes(wv): "-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1] ) - parser.add_argument( - "-accuracy", help="Use questions from file ACCURACY to evaluate the model") + parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model") args = parser.parse_args() From 9abc9f53f4d5e027a75b0d5b784126efeaff53ae Mon Sep 17 00:00:00 2001 From: funasshi Date: Wed, 22 Jun 2022 16:31:53 +0900 Subject: [PATCH 3/6] fix loading old gensim model by new model --- gensim/models/word2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 061dcfc817..b4fc26f23f 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -197,7 +197,7 @@ import numpy as np from gensim.utils import keep_vocab_item, call_on_class_only, deprecated -from gensim.models.keyedvectors import KeyedVectors, pseudorandom_weak_vector +from gensim.models.keyedvectors import KeyedVectors, pseudorandom_weak_vector, Vocab from gensim import utils, matutils from smart_open.compression import get_supported_extensions From 805915756bd5ec69e9ba47d3e51c6784f3628266 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 16 Oct 2022 12:57:34 +0900 Subject: [PATCH 4/6] Update word2vec.py --- gensim/models/word2vec.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index b4fc26f23f..a176b1c76a 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -197,9 +197,14 @@ import numpy as np from gensim.utils import keep_vocab_item, call_on_class_only, deprecated -from gensim.models.keyedvectors import KeyedVectors, pseudorandom_weak_vector, Vocab +from gensim.models.keyedvectors import KeyedVectors, pseudorandom_weak_vector from gensim import utils, matutils +# +# This unused import is "somehow" required to fix #3357 +# +from gensim.models.keyedvectors import Vocab # noqa + from smart_open.compression import get_supported_extensions logger = logging.getLogger(__name__) From 40eb25101d4ef6ce1570810443dddff9270486f0 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 16 Oct 2022 22:16:49 +0900 Subject: [PATCH 5/6] 
Update word2vec.py --- gensim/models/word2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index a176b1c76a..bf7516942f 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -201,7 +201,7 @@ from gensim import utils, matutils # -# This unused import is "somehow" required to fix #3357 +# This import is required by pickle to load models stored by Gensim < 4.0, such as Gensim 3.8.3. # from gensim.models.keyedvectors import Vocab # noqa From e8e63c4429701b499c732370377650d2d090acf3 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Mon, 17 Oct 2022 09:47:04 +0900 Subject: [PATCH 6/6] Update word2vec.py --- gensim/models/word2vec.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index bf7516942f..b195293ac2 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -200,9 +200,7 @@ from gensim.models.keyedvectors import KeyedVectors, pseudorandom_weak_vector from gensim import utils, matutils -# # This import is required by pickle to load models stored by Gensim < 4.0, such as Gensim 3.8.3. -# from gensim.models.keyedvectors import Vocab # noqa from smart_open.compression import get_supported_extensions
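
Why the last three patches above are sufficient: pickle records every serialized object's class as a dotted "module.ClassName" path and re-imports that path at load time, so a model saved by Gensim < 4.0 (such as 3.8.3) can only be unpickled while the legacy `Vocab` name still resolves where the old pickle expects to find it. Restoring the seemingly unused `from gensim.models.keyedvectors import Vocab  # noqa` puts that name back in place, and the `# noqa` marker keeps linters from flagging it as unused and inviting its removal again. The sketch below only illustrates the underlying failure mode; `Vocab` here is a local stand-in class rather than Gensim's, and the `del` simulates the name becoming unresolvable:

    import pickle

    class Vocab:  # local stand-in for the legacy class that old pickles reference by name
        def __init__(self, count=0, index=0):
            self.count, self.index = count, index

    blob = pickle.dumps(Vocab(count=5, index=2))  # the pickle stores the dotted path to Vocab, not the class body
    del Vocab                                     # simulate the name no longer being resolvable at load time
    try:
        pickle.loads(blob)                        # pickle re-imports "Vocab" by name and fails to find it
    except AttributeError as err:
        print(err)                                # e.g. "Can't get attribute 'Vocab' on <module '__main__' ...>"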