From 7b73db9ba65c006b16dc100954a633b9a6bf29c1 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 1 Dec 2018 19:17:38 +0100 Subject: [PATCH 001/166] added EnsembleLda --- docs/notebooks/Opinosis.ipynb | 248 ++++++ gensim/corpora/opinosiscorpus.py | 88 ++ gensim/models/__init__.py | 1 + gensim/models/ensemblelda.py | 1368 ++++++++++++++++++++++++++++++ 4 files changed, 1705 insertions(+) create mode 100644 docs/notebooks/Opinosis.ipynb create mode 100644 gensim/corpora/opinosiscorpus.py create mode 100644 gensim/models/ensemblelda.py diff --git a/docs/notebooks/Opinosis.ipynb b/docs/notebooks/Opinosis.ipynb new file mode 100644 index 0000000000..bbcc2f6525 --- /dev/null +++ b/docs/notebooks/Opinosis.ipynb @@ -0,0 +1,248 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-01T17:57:35.858751Z", + "start_time": "2018-12-01T17:57:34.507686Z" + }, + "scrolled": false + }, + "outputs": [], + "source": [ + "import logging\n", + "from gensim.models import EnsembleLda, LdaMulticore\n", + "from gensim.corpora import OpinosisCorpus\n", + "import os" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "enable the ensemble logger to show what it is doing currently" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-01T17:57:36.539523Z", + "start_time": "2018-12-01T17:57:36.534986Z" + } + }, + "outputs": [], + "source": [ + "elda_logger = logging.getLogger(EnsembleLda.__module__)\n", + "elda_logger.setLevel(logging.INFO)\n", + "elda_logger.addHandler(logging.StreamHandler())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Experiments on the Opinosis Dataset\n", + "\n", + "Opinosis is an extremely small corpus that contains 289 product reviews for 51 products, which is why it is hard to extract topics from it. https://github.com/kavgan/opinosis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preparing the corpus\n", + "\n", + "First, download the opinosis dataset. On linux it can be done like this for example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-11-29T20:19:17.891961Z", + "start_time": "2018-11-29T20:19:16.731678Z" + } + }, + "outputs": [], + "source": [ + "!mkdir ~/opinosis\n", + "!wget -P ~/opinosis https://github.com/kavgan/opinosis/raw/master/OpinosisDataset1.0_0.zip\n", + "!unzip ~/opinosis/OpinosisDataset1.0_0.zip -d ~/opinosis" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-01T17:57:39.575650Z", + "start_time": "2018-12-01T17:57:39.571588Z" + } + }, + "outputs": [], + "source": [ + "path = os.path.expanduser('~/opinosis/')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Corpus and id2word mapping can be created using the load_opinosis_data function provided in the package.\n", + "It preprocesses the data using the PorterStemmer and stopwords from the nltk package.\n", + "\n", + "The parameter of the function is the relative path to the folder, into which the zip file was extracted before. That folder contains a 'summaries-gold' subfolder." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-01T17:57:40.842440Z", + "start_time": "2018-12-01T17:57:39.905273Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "data source:\n", + "title:\t\tOpinosis: a graph-based approach to abstractive summarization of highly redundant opinions\n", + "authors:\tGanesan, Kavita and Zhai, ChengXiang and Han, Jiawei\n", + "booktitle:\tProceedings of the 23rd International Conference on Computational Linguistics\n", + "pages:\t\t340-348\n", + "year:\t\t2010\n", + "organization:\tAssociation for Computational Linguistics\n" + ] + } + ], + "source": [ + "opinosis = OpinosisCorpus(path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**parameters**\n", + "\n", + "**topic_model_kind** ldamulticore is highly recommended for EnsembleLda. ensemble_workers and **distance_workers** are used to improve the time needed to train the models, as well as the **masking_method** 'rank'. ldamulticore is not able to fully utilize all cores on this small corpus, so **ensemble_workers** can be set to 3 to get 95 - 100% cpu usage on my i5 3470.\n", + "\n", + "Since the corpus is so small, a high number of **num_models** is needed to extract stable topics. The Opinosis corpus contains 51 categories, however, some of them are quite similar. For example there are 3 categories about the batteries of portable products. There are also multiple categories about cars. So I chose 20 for num_topics, which is smaller than the number of categories.\n", + "\n", + "The default for **min_samples** would be 64, half of the number of models. But since this does not return any topics, or at most 2, I set this to 32." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-01T18:10:13.690983Z", + "start_time": "2018-12-01T17:57:41.089661Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Spawned worker to generate 42 topic models...\n", + "Spawned worker to generate 43 topic models...\n", + "Generating 42 topic models...\n", + "Generating 43 topic models...\n", + "Spawned worker to generate 43 topic models...\n", + "Generating 43 topic models...\n", + "Spawned worker to generate 640 rows of the asymmetric similarity matrix...\n", + "Spawned worker to generate 640 rows of the asymmetric similarity matrix...\n", + "Spawned worker to generate 640 rows of the asymmetric similarity matrix...\n", + "Spawned worker to generate 640 rows of the asymmetric similarity matrix...\n", + "Fitting the clustering model\n", + "Generating stable topics\n", + "Generating classic gensim model representation based on results from the ensemble\n" + ] + } + ], + "source": [ + "elda = EnsembleLda(corpus=opinosis.corpus, id2word=opinosis.id2word, num_models=128, num_topics=20,\n", + " passes=20, iterations=100, ensemble_workers=3, distance_workers=4,\n", + " topic_model_kind='ldamulticore', masking_method='rank', min_samples=32)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2018-12-01T18:10:13.712818Z", + "start_time": "2018-12-01T18:10:13.695844Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "- 0.145 free, 0.043 park, 0.033 coffe, 0.030 wine, 0.027 even, 0.025 morn, 0.022 internet \n", + "\n", + "- 0.161 screen, 0.062 bright, 0.043 clear, 0.027 easi, 0.021 read, 0.019 touch, 0.011 size \n", + "\n", + "- 0.127 staff, 0.075 friendli, 0.074 help, 0.070 servic, 0.016 quick, 0.012 profession, 0.012 good \n", + "\n", + "- 0.123 seat, 0.067 comfort, 0.050 uncomfort, 0.039 front, 0.037 back, 0.036 firm, 0.032 drive \n", + "\n", + "- 0.113 room, 0.104 clean, 0.041 small, 0.037 bathroom, 0.036 comfort, 0.023 size, 0.016 well \n", + "\n" + ] + } + ], + "source": [ + "# pretty print, note that the words are stemmed so they appear chopped off\n", + "for t in elda.print_topics(num_words=7):\n", + " print('-', t[1].replace('*',' ').replace('\"','').replace(' +',','), '\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/gensim/corpora/opinosiscorpus.py b/gensim/corpora/opinosiscorpus.py new file mode 100644 index 0000000000..eee2229849 --- /dev/null +++ b/gensim/corpora/opinosiscorpus.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Author: Tobias B + +import os +from pathlib import Path +import re +from gensim.corpora import Dictionary +from nltk.stem import PorterStemmer +from nltk.corpus import stopwords + + +class OpinosisCorpus(): + """Creates a corpus and dictionary from the opinosis dataset: + + http://kavita-ganesan.com/opinosis-opinion-dataset/#.WyF_JNWxW00 + + This data is organized in folders, each folder 
containing a few short docs.
+
+    The data can be obtained quickly using the following commands in bash:
+
+    mkdir opinosis && cd opinosis
+    wget https://github.com/kavgan/opinosis/raw/master/OpinosisDataset1.0_0.zip
+    unzip OpinosisDataset1.0_0.zip
+
+    Corpus and dictionary can be accessed through the .corpus and .id2word members.
+
+    """
+
+    def __init__(self, path):
+        """
+
+        Parameters
+        ----------
+        path : string
+            Path to the extracted zip file. If 'summaries-gold' is in a folder
+            called 'opinosis', then the path parameter would be 'opinosis',
+            either relative to your current working directory or absolute.
+
+        """
+
+        # citation
+        print("data source:")
+        print("title:\t\tOpinosis: a graph-based approach to abstractive summarization of highly redundant opinions")
+        print("authors:\tGanesan, Kavita and Zhai, ChengXiang and Han, Jiawei")
+        print("booktitle:\tProceedings of the 23rd International Conference on Computational Linguistics")
+        print("pages:\t\t340-348")
+        print("year:\t\t2010")
+        print("organization:\tAssociation for Computational Linguistics")
+
+        path = Path(path).joinpath("summaries-gold")
+        dictionary = Dictionary()
+        corpus = []
+        stemmer = PorterStemmer()
+        # look the stopwords up in a set; calling stopwords.words() once per token would be slow
+        stopword_set = frozenset(stopwords.words('english'))
+
+        for directory, _dirnames, filenames in os.walk(path):
+            # os.walk iterates over folders, so in this
+            # scope a new topic is prepared
+
+            # folders without files (such as the root directory) are skipped
+            if len(filenames) == 0:
+                continue
+
+            # now get the corpus/documents
+            for filename in filenames:
+
+                filepath = directory + os.sep + filename
+                # read one document per file, then tokenize, stem and remove stopwords
+                with open(filepath) as file:
+
+                    doc = file.read()
+                    # a raw string for the regex avoids an invalid escape sequence warning
+                    processed = [
+                        stemmer.stem(token) for token in re.findall(r'\w+', doc.lower())
+                        if token not in stopword_set
+                    ]
+
+                    # add_documents expects a list of documents, hence the wrapping list
+                    dictionary.add_documents([processed])
+                    corpus += [dictionary.doc2bow(processed)]
+
+        # expose the results the same way the other corpus generating classes do
+        self.corpus = corpus
+        self.id2word = dictionary
diff --git a/gensim/models/__init__.py b/gensim/models/__init__.py
index 96ca698b27..382e585c79 100644
--- a/gensim/models/__init__.py
+++ b/gensim/models/__init__.py
@@ -21,6 +21,7 @@
 from .ldaseqmodel import LdaSeqModel  # noqa:F401
 from .fasttext import FastText  # noqa:F401
 from .translation_matrix import TranslationMatrix, BackMappingTranslationMatrix  # noqa:F401
+from .ensemblelda import EnsembleLda  # noqa:F401
 from . import wrappers  # noqa:F401
 from . import deprecated  # noqa:F401
diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
new file mode 100644
index 0000000000..6abf70f5e3
--- /dev/null
+++ b/gensim/models/ensemblelda.py
@@ -0,0 +1,1368 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Authors: Tobias B, Alex Loosley,
+# Stephan Sahm, Alex Salles, Data Reply Munich
+
+"""Ensemble Latent Dirichlet Allocation (LDA), a method of ensembling multiple gensim topic models.
+Their topics are clustered with a DBSCAN variant in which nearby topic mixtures - undesired combinations
+of two or more true topics - may support other topics in becoming cores, but not vice versa.
+
+Usage examples
+--------------
+
+Train an ensemble of LdaModels using a Gensim corpus
+
+..
sourcecode:: pycon + + >>> from gensim.test.utils import common_texts + >>> from gensim.corpora.dictionary import Dictionary + >>> from gensim.models import EnsembleLda, LdaModel + >>> + >>> # Create a corpus from a list of texts + >>> common_dictionary = Dictionary(common_texts) + >>> common_corpus = [common_dictionary.doc2bow(text) for text in common_texts] + >>> + >>> # Train the model on the corpus. corpus has to be provided as a + >>> # keyword argument, as they are passed through to the children. + >>> elda = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=10, num_models=4) + +Save a model to disk, or reload a pre-trained model + +.. sourcecode:: pycon + + >>> from gensim.test.utils import datapath + >>> + >>> # Save model to disk. + >>> temp_file = datapath("model") + >>> elda.save(temp_file) + >>> + >>> # Load a potentially pretrained model from disk. + >>> elda = EnsembleLda.load(temp_file) + +Query, the model using new, unseen documents + +.. sourcecode:: pycon + + >>> # Create a new corpus, made of previously unseen documents. + >>> other_texts = [ + ... ['computer', 'time', 'graph'], + ... ['survey', 'response', 'eps'], + ... ['human', 'system', 'computer'] + ... ] + >>> other_corpus = [common_dictionary.doc2bow(text) for text in other_texts] + >>> + >>> unseen_doc = other_corpus[0] + >>> vector = elda[unseen_doc] # get topic probability distribution for a document + +Increase the ensemble size by adding a new model + +.. sourcecode:: pycon + + >>> elda.add_model(LdaModel(common_corpus, id2word=common_dictionary, num_topics=10)) + >>> vector = elda[unseen_doc] + +To optimize the ensemble for your specific case, the children can be clustered again using +different hyperparameters + +.. sourcecode:: pycon + + >>> elda.recluster(eps=0.2) + +References +---------- +.. [1] REHUREK, Radim and Sojka, PETR, 2010, Software framework for topic + modelling with large corpora. In : THE LREC 2010 WORKSHOP ON NEW + CHALLENGES FOR NLP FRAMEWORKS [online]. Msida : University of Malta. + 2010. p. 45-50. Available from: + http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.695.4595 + +""" + +import logging +import os +from multiprocessing import Process, Pipe, ProcessError +import copy +try: + import cPickle as _pickle +except ImportError: + import pickle as _pickle +import numpy as np +import pandas as pd +from scipy.spatial.distance import cosine +from gensim import utils +from gensim.models import ldamodel, ldamulticore + +logger = logging.getLogger(__name__) + +GENSIM_MODEL = dict( + lda=ldamodel.LdaModel, + ldamulticore=ldamulticore.LdaMulticore +) + +MAX_RANDOM_STATE = 2**32 - 1 + + +class EnsembleLda(): + """Ensemble Latent Dirichlet Allocation (LDA), a method of ensembling multiple gensim topic models. + They are clustered using DBSCAN, for which nearby topic mixtures - which are mixtures of two or more + true topics and therefore undesired - are being used to support topics in becoming cores, + but not vice versa. + + """ + def __init__(self, topic_model_kind="lda", num_models=3, + min_cores=None, # default value from generate_stable_topics() + epsilon=0.1, ensemble_workers=1, memory_friendly_ttda=True, + min_samples=None, masking_method="mass", masking_threshold=None, + distance_workers=1, random_state=None, **gensim_kw_args): + """ + + Parameters + ---------- + topic_model_kind : str, optional + Examples: + 'ldamulticore' (recommended) or 'lda' (default) + num_models : number, optional + How many LDA models to train in this ensemble. 
+ Default: 3 + min_cores : number, optional + Minimum cores a cluster of topics has to contain + so that it is recognized as stable topic. + epsilon : float, optional + Defaults to 0.1. Epsilon for the cbdbscan clustering + that generates the stable topics. + ensemble_workers : number, optional + Spawns that many processes and distributes the + models from the ensemble to those as evenly as + possible. num_models should be a multiple of + ensemble_workers. + + Setting it to 0 or 1 will both use the non- + multiprocessing version. Default:1 + memory_friendly_ttda : boolean, optional + If True, the models in the ensemble are + deleted after training and only the + total topic word distribution is kept + to save memory. + + Defaults to True. When False, trained + models are stored in a list in self.tms, + and no models that are not of a gensim + model type can be added to this ensemble + using the add_model function. + + If False, any topic term matrix can be + supllied to add_model. + min_samples : number, optional + Required number of nearby topics for + a topic to be considered as 'core' in + the CBDBSCAN clustering. + masking_method : {'mass', 'rank}, optional + One of "mass" (default) or "rank" (faster). + masking_threshold : float, optional + Default: None, which uses 0.11 for + masking_method "rank", and 0.95 + for "mass". + distance_workers : number, optional + When distance_workers is None, it defaults to + os.cpu_count() for maximum performance. + Default is 1, which is not multiprocessed. + Set to > 1 to enable multiprocessing. + **gensim_kw_args + All the parameters for the gensim model, + that is used for each model in the ensemble, + can be provided as additional keyword arguments. + https://radimrehurek.com/gensim/models/ldamodel.html + + """ + + if "id2word" not in gensim_kw_args: + gensim_kw_args["id2word"] = None + if "corpus" not in gensim_kw_args: + gensim_kw_args["corpus"] = None + + # dictionary. modified version of from gensim/models/ldamodel.py + # error messages are copied from the original gensim module [1]: + # will create a fake dict if no dict is provided. + if gensim_kw_args["id2word"] is None and not gensim_kw_args["corpus"] is None: + logger.warning("no word id mapping provided; initializing from corpus, assuming identity") + gensim_kw_args["id2word"] = utils.dict_from_corpus(gensim_kw_args["corpus"]) + if gensim_kw_args["id2word"] is None and gensim_kw_args["corpus"] is None: + raise ValueError("at least one of corpus/id2word must be specified, to establish " + "input space dimensionality. Corpus should be provided using the " + "keyword argument corpus.") + + # Store some of the parameters: + # - store topic_model_kind for generate_gensim_representation, + # so that it is made sure that the gensim_kw_args fit into + # the constructor of the gensim model. + self.topic_model_kind = topic_model_kind + self.num_models = num_models + self.gensim_kw_args = gensim_kw_args + + # some settings affecting performance + self.memory_friendly_ttda = memory_friendly_ttda + self.distance_workers = distance_workers + + # this will provide the gensim api to the ensemble basically + self.classic_model_representation = None + + # the ensembles state + self.random_state = utils.get_random_state(random_state) + self.sstats_sum = 0 + self.eta = None + self.tms = [] + self.ttda = None + + # in case the model will not train due to some + # parameters, stop here and don't train. 
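+        # (an ensemble without models, without a corpus, or with zero
+        # iterations/passes could only ever produce an empty ttda)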
+ if num_models <= 0: + return + if ("corpus" not in gensim_kw_args): + return + if gensim_kw_args["corpus"] is None: + return + if "iterations" in gensim_kw_args and gensim_kw_args["iterations"] <= 0: + return + if "passes" in gensim_kw_args and gensim_kw_args["passes"] <= 0: + return + + # training + if ensemble_workers > 1: + # multiprocessed + self.generate_ttda_multiprocessed(num_models, ensemble_workers) + else: + # singlecore + self.generate_topic_models(num_models) + + if not memory_friendly_ttda: + self.generate_ttda() + + self.generate_asymmetric_distance_matrix(workers=self.distance_workers, + threshold=masking_threshold, + method=masking_method) + self.generate_topic_clusters(epsilon, min_samples) + self.generate_stable_topics(min_cores) + + # create model that can provide the usual gensim api to the stable topics from the ensemble + self.generate_gensim_representation() + + def generate_gensim_representation(self): + """creates a gensim-model from the stable topics + + The prior for the gensim model, eta, Can be anything, + (a parameter in object constructor) and won't influence + get_topics(). Note that different eta means different + log_perplexity (the default is the same as the gensim + default eta, which is for example [0.5, 0.5, 0.5]). + + Note, that when the clustering of the topics + was changed using add_model, etc., and when no + topics were detected because of that, the old + classic gensim model will stay in place. + + Returns + ------- + LdaModel + A Gensim LDA Model classic_model_representation for which: + classic_model_representation.get_topics() == self.get_topics() + + """ + + logger.info("Generating classic gensim model representation based on results from the ensemble") + + numberOfWords = self.sstats_sum + # if numberOfWords should be wrong for some fantastic funny reason + # that makes you want to peel your skin off, recreate it (takes a while): + if numberOfWords == 0 and "corpus" in self.gensim_kw_args and not self.gensim_kw_args["corpus"] is None: + for document in self.gensim_kw_args["corpus"]: + for token in document: + numberOfWords += token[1] + self.sstats_sum = numberOfWords + assert numberOfWords != 0 + + stableTopics = self.get_topics() + + nStableTopics = len(stableTopics) + + if nStableTopics == 0: + logger.warning("The model did not detect any stable topic. You can try to adjust epsilon: " + "generate_topic_clusters(eps); generate_stable_topics()") + return + + # create a new gensim model + params = self.gensim_kw_args.copy() + params["eta"] = self.eta + params["num_topics"] = nStableTopics + # adjust params in a way that no training happens + params["iterations"] = 0 # no training + params["passes"] = 0 # no training + + classic_model_representation = GENSIM_MODEL[self.topic_model_kind](**params) + + # when eta was None, use what gensim generates as default eta for the following tasks: + eta = classic_model_representation.eta + + # the following is important for the denormalization + # to generate the proper sstats for the new gensim model: + # transform to dimensionality of stableTopics. axis=1 is summed + eta_sum = 0 + if int == type(eta) or float == type(eta): + eta_sum = [eta * len(stableTopics[0])] * nStableTopics + else: + if len(eta.shape) == 1: # [e1, e2, e3] + eta_sum = [[eta.sum()]] * nStableTopics + if len(eta.shape) > 1: # [[e11, e12, ...], [e21, e22, ...], ...] 
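+                # sum eta over the token axis; [:, None] keeps the result as a column vector with one row per topic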
+ eta_sum = np.array(eta.sum(axis=1)[:, None]) + + # the factor, that will be used when get_topics() is used, for normalization + # will never change, because the sum for eta as well as the sum for sstats is constant. + # Therefore predicting normalization_factor becomes super easy. + # corpus is a mapping of id to occurences + + # so one can also easily calculate the + # right sstats, so that get_topics() will return the stable topics no + # matter eta. + + normalization_factor = np.array([[numberOfWords / nStableTopics]] * nStableTopics) + eta_sum + + if stableTopics.shape[1] != len(eta): + raise ValueError("If load() was used: was the correct dictionary used when loading the model? " + "because stableTopics.shape[0] {} != len(eta) {}".format(stableTopics.shape[1], len(eta))) + + sstats = stableTopics * normalization_factor + sstats -= eta + + # Note that this will cause some stuff to fail if nStableTopics is 1 + # if nStableTopics is 1, the result will be (close to) uniform anyway + # inserting the new sstats + classic_model_representation.state.sstats = sstats.astype("float32") + # fix expElogbeta. + classic_model_representation.sync_state() + + self.classic_model_representation = classic_model_representation + + return classic_model_representation + + def add_model(self, target, generate=True, num_new_models=None): + """Adds the ttda of another model to the ensemble + this way, multiple topic models can be connected + to an ensemble. + + The ttda of another ensemble can also be used, + in that case set num_new_models to the num_models + parameter of the ensemble, that means the number + of classic models in the ensemble that generated + the ttda. This is important, because that information + is used to estimate "min_samples" for + generate_topic_clusters. + + Make sure that all the models use the exact same + dictionary/idword mapping and size. + + If you trained this ensemble in the past with a + certain Dictionary that you want to reuse for other + models, you can get it from: self.id2word + + This will also trigger the generation of + stable topics automatically, but it can be + disabled with the generate parameter. + + Parameters + ---------- + target : {see description} + 1. topic-term-distribution-array + + example: [[0.1, 0.1, 0.8], [...], ...] + + [topic_1, topic_2, ...] + with topic being: + [word_1, word_2, ...] + + sum of words in a single topic sum to one, + therefore, all the words sum to len(ttda) + + You can use this (option 1) only if + memory_friendly_ttda is set to True. + + 2. A single EnsembleLda object + 3. List of EnsembleLda objects + 4. A single Gensim topic model + 5. List of Gensim topic models + generate : boolean, optional + if True, will cluster and generate + stable_topics afterwards. This is the default. + + if False, you have to call: + self.generate_asymmetric_distance_matrix() + self.generate_topic_clusters() + self.generate_stable_topics() + self.generate_gensim_representation() + afterwards manually. + num_new_models : integer, optional + the model keeps track of how many models + were used in this ensemble. Set higher + if ttda contained topics from more than + one model. Default: None, which takes + care of it automatically. + + If target is a 2D-array of float values, + it assumes 1. + + If the ensemble has memory_friendly_ttda + set to False, then it will always use + the number of models in the target + parameter. + + """ + + # If the model has never seen a ttda before, initialize. + # If it has, append. + + # Be flexible. 
Can be a single element or a list of elements + # make sure it's a numpy array + if not isinstance(target, (np.ndarray, list)): + target = np.array([target]) + else: + target = np.array(target) + assert len(target) > 0 + + detected_num_models = None + + if self.memory_friendly_ttda: + # for memory friendly models/ttdas, append the ttdas to itself + + ttda = None + + # ttdas + if isinstance(target.dtype.type(), (np.number, float)): + # multiple ttdas, concatenate first + if len(target.shape) == 3: + detected_num_models = len(target) + ttda = np.concatenate(target) + + # one single ttda + elif len(target.shape) == 2: + detected_num_models = 1 + ttda = target + + # ensemble lda object + else: + if type(target[0]) == EnsembleLda: + # check if it is an ensemble with ttdas + ttda = np.concatenate([model.ttda for model in target], axis=0) + detected_num_models = sum([model.num_models for model in target]) + + # any gensim topic model object + elif isinstance(target[0], ldamodel.LdaModel): + # might be a gensim model, use get_topics() then + ttda = np.concatenate([model.get_topics() for model in target], axis=0) + detected_num_models = len(target) + + # unknown + else: + raise ValueError("target is of unknown type or a list of unknown types: {}".format(type(target[0]))) + + logger.info("Adding {} model(s) with {} topics".format(detected_num_models, len(ttda))) + + # check if ttda array already exists + if self.ttda is None: + self.ttda = ttda + else: + if self.ttda.shape[1] != ttda.shape[1]: + raise ValueError(("target ttda dimensions do not match. Topics must be {} but was" + "{} elements large").format(self.ttda.shape[-1], ttda.shape[-1])) + self.ttda = np.append(self.ttda, ttda, axis=0) + + # new models were added, increase num_models + if num_new_models is None: + self.num_models += detected_num_models + else: + self.num_models += num_new_models + else: + # memory unfriendly ensembles have all the models + # stored in tms. Can be a list of ensembles or + # a list of gensim topic models. + if type(target[0]) == type(self): + detected_num_models = 0 + for ensemble in target: + self.tms += ensemble.tms + detected_num_models += len(ensemble.tms) + else: + # else it is a list of gensim models + # (np.ndarray of len 1 or more, since it was wrapped in the beginning) + self.tms += target.tolist() + detected_num_models = len(target) + + # in this case, len(self.tms) should + # always match self.num_models + self.num_models = len(self.tms) + + if generate: + self.generate_asymmetric_distance_matrix(workers=None) # multiprocessing enabled + self.generate_topic_clusters() + self.generate_stable_topics() + self.generate_gensim_representation() + + def save(self, fname): + """Save the ensemble to a file. + + Parameters + ---------- + fname : str + Path to the system file where the model will be persisted. 
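+
+        Example
+        -------
+        A minimal round trip, reusing the temporary file from the module docstring example:
+
+        .. sourcecode:: pycon
+
+            >>> from gensim.test.utils import datapath
+            >>> temp_file = datapath("model")
+            >>> elda.save(temp_file)
+            >>> elda = EnsembleLda.load(temp_file)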
+ + """ + + # parameter descriptions are similar to [1] in + # order to make the api less confusing + + gensim_kw_args_persist = self.gensim_kw_args.copy() + + payload = { + "ensemble_kw_args": { + "topic_model_kind": self.topic_model_kind, + "memory_friendly_ttda": self.memory_friendly_ttda, + "num_models": self.num_models + }, + "gensim_kw_args": gensim_kw_args_persist, + "ttda": self.ttda, + "amatrix": self.asymmetric_distance_matrix, + "stable_topics": self.stable_topics, + "cluster_model.results": self.cluster_model.results, + "sstats_sum": self.sstats_sum, + "id2word": self.id2word + } + + if not self.memory_friendly_ttda: + payload["topicModels"] = self.tms + + # Version of the save/load api, to eventually make load + # backwards compatible if the save-function changes. + payload["version"] = "1.0" + + with open(fname, 'wb') as f: + _pickle.dump(payload, f) + + logger.info("saved %s object", self.__class__.__name__) + + @staticmethod + def load(fname): + """Load a previously stored ensemble from disk. + + Parameters + ---------- + fname : str + Path to file that contains the needed object. + + Returns + ------- + A previously saved ensembleLda object + + """ + + # message copied from [1] + logger.info("loading %s object from %s", EnsembleLda.__name__, fname) + + with open(fname, 'rb') as f: + data = _pickle.load(f) + + # update() on the dict can be safely used, because they don't have + # keywords in common that overwrite each other + kwArgs = data["ensemble_kw_args"] + kwArgs.update(data["gensim_kw_args"]) + + kwArgs["id2word"] = data["id2word"] + # prevent training + kwArgs["iterations"] = 0 + + # now reconstruct an ensemble LDA object based on the data: + eLDA = EnsembleLda(**kwArgs) + # since no training will take place, num_models will be set to 0. restore: + eLDA.num_models = kwArgs["num_models"] + + # some stuff that was not created automatically, because the "empty" eLDA object did not train + eLDA.ttda = data["ttda"] + eLDA.asymmetric_distance_matrix = data["amatrix"] + eLDA.sstats_sum = data["sstats_sum"] + + # depends on memory_friendly_ttda in the save function: + if "topicModels" in data: + eLDA.tms = [] + for i in range(len(data["topicModels"])): + eLDA.tms += [data["topicModels"][i]] + + # As long as fit is not called on cluster_model, no results will be generated, + # so the CBDBSCAN constructor is safe to call without consuming cpu time. + # No need to provide the original params from the constructor, the result are + # going to be written into the model from the pickle. + eLDA.cluster_model = CBDBSCAN() + eLDA.cluster_model.results = data["cluster_model.results"] + eLDA.stable_topics = data["stable_topics"] + + eLDA.generate_gensim_representation() + + # message copied from [1] + logger.info("loaded %s", fname) + + return eLDA + + def generate_ttda_multiprocessed(self, num_models, ensemble_workers): + """Will make the ensemble multiprocess, which results + in a speedup on multicore machines. Results from the + processes will be piped to the parent and concatenated. + + Parameters + ---------- + num_models : number + how many models to train in the ensemble + ensemble_workers : number + into how many processes to split the models + will be set to max(workers, num_models), to avoid + workers that are supposed to train 0 models. + + to get maximum performance, set to the number + of your cores, if non-parallelized models + are being used in the ensemble (LdaModel). 
+ + For LdaMulticore, the performance gain is small + and gets larger for a significantly smaller corpus. + In that case, ensemble_workers=2 can be used. + + """ + + # the way random_states is handled needs to prevent getting + # different results when multiprocessing is on, or getting + # the same results in every lda children. + # so it is solved by generating a list of state seeds before + # multiprocessing is started. + random_states = [self.random_state.randint(MAX_RANDOM_STATE) for _ in range(num_models)] + + # worker, that trains a bunch of models: + def worker(id, num_models, random_states, pipe): + # print(id,"training") + # since models cannot be piped to the parent because messages need to be pickled for that: + # memory_friendly_ttda needs to be true, which will not store ensemble submodels + logger.info("Spawned worker to generate {} topic models...".format(num_models)) + self.generate_topic_models(num_models, random_states=random_states) + # send the ttda that is in the child/workers version of the memory into the pipe + # available, after generate_topic_models has been called in the worker + if self.memory_friendly_ttda: + pipe.send((id, self.ttda)) # remember that this code is inside the workers memory + else: + pipe.send((id, self.tms)) + + pipe.close() + + # each worker has to work on at least one model. + # Don't spawn idle workers: + workers = min(ensemble_workers, num_models) + + # create worker processes: + # from what I know this is basically forking with a jump to a target function in each child + # so modifying the ensemble object will not modify the one in the parent + # because of no shared memory + processes = [] + pipes = [] + num_models_unhandled = num_models # how many more models need to be trained by workers? + + for i in range(workers): + try: + parentConn, childConn = Pipe() + num_subprocess_models = 0 + if i == workers - 1: # i is a index, hence -1 + # is this the last worker that needs to be created? 
+ # then task that worker with all the remaining models + num_subprocess_models = num_models_unhandled + else: + num_subprocess_models = int(num_models_unhandled / (workers - i)) + + # get the chunk from the random states that is meant to be for those models + random_states_for_worker = random_states[-num_models_unhandled:][:num_subprocess_models] + + p = Process(target=worker, args=(i, num_subprocess_models, random_states_for_worker, childConn)) + + processes += [p] + pipes += [(parentConn, childConn)] + p.start() + + num_models_unhandled -= num_subprocess_models + + except ProcessError: + logger.error("could not start process {}".format(i)) + # close all pipes + for p in pipes: + p[1].close() + p[0].close() + # end all processes + for p in processes: + if p.is_alive(): + p.terminate() + del p + # stop + raise + + # aggregate results + if self.memory_friendly_ttda: + + # collect results from the pipes + # will also block until workers are finished + self.ttda = None + for p in pipes: + answer = p[0].recv() # [0], because that is the parentConn + p[0].close() + # this does basically the same as the generate_topic_models function (concatenate all the ttdas): + if self.ttda is None: + self.ttda = answer[1] # [1] is the ttda ([0] is the worker id) + else: + self.ttda = np.concatenate([self.ttda, answer[1]]) + # print(answer[0], "received and ttda concatenated") + # in [0] the id of the worker that sent the data is stored + + # end all processes + for p in processes: + p.terminate() + + # again, the same thing generate_topic_models does at the end + self.ttda = self.ttda / self.ttda.sum(axis=1)[:, None] + + else: + # now do the same thing for a memory unfriendly ensemble + self.tms = [] + for p in pipes: + answer = p[0].recv() # [0], because that is the parentConn + p[0].close() + self.tms += answer[1] + + # end all processes + for p in processes: + p.terminate() + + def generate_topic_models(self, num_models, random_states=None): + """Will generate the topic models, that form the ensemble. + + Parameters + ---------- + num_models : number + number of models to be generated + random_states : list + list of numbers or np.random.RandomState objects. + Will be autogenerated based on the ensembles RandomState + if None (default). + """ + + if random_states is None: + random_states = [self.random_state.randint(MAX_RANDOM_STATE) for _ in range(num_models)] + + assert len(random_states) == num_models + + logger.info("Generating {} topic models...".format(num_models)) + + kwArgs = self.gensim_kw_args.copy() + + tm = None # remember one of the topic models from the following + # loop, in order to collect some properties from it afterwards. 
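+        # each child model is trained with identical parameters and only
+        # differs by the seed it receives from random_states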
+ + for i in range(num_models): + + kwArgs["random_state"] = random_states[i] + tm = GENSIM_MODEL[self.topic_model_kind](**kwArgs) + + # memory friendly ttda = generate ttda on the run and delete model + if self.memory_friendly_ttda: + if self.ttda is None: + self.ttda = tm.state.get_lambda() + else: + # adds the lambda (that is the unnormalized get_topics) to ttda, which is + # a list of all those lambdas + self.ttda = np.concatenate([self.ttda, tm.state.get_lambda()]) + else: + # only saves the model if it is not "memory friendly" + self.tms += [tm] + + # use one of the tms to get some info that will be needed later + self.sstats_sum = tm.state.sstats.sum() + self.eta = tm.eta + + if self.memory_friendly_ttda: + # normalize all the ttda + # this results in the same that is being displayed on .get_topics() of a gensim LDA model + self.ttda = self.ttda / self.ttda.sum(axis=1)[:, None] + + def generate_ttda(self): + """Generate the topic_term_distribution for all the topics - P(W|T)""" + + logger.info("Generating topic term distributions over all topics over all topic models in the ensemble...") + + lambda_all = np.concatenate([self.tms[x].state.get_lambda() for x in range(self.num_models)], axis=0) + + self.ttda = lambda_all / lambda_all.sum(axis=1)[:, None] + + def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method="mass"): + """Makes the pairwise similarity matrix for all the ttdas + from the ensemble. + + Returns the asymmetric pairwise distance matrix that + is used in the DBSCAN clustering. + + Parameters + ---------- + threshold : float, optional + if threshold is None and method == "rank": + threshold = 0.11 + if threshold is None and method == "mass": + threshold = 0.95 + + threshold keeps by default 95% of the largest terms by + mass. Except the "fast" parameter is "rank", then it + just selects that many of the largest terms. + workers : number, optional + when workers is None, it defaults to os.cpu_count() + for maximum performance. Default is 1, which is not + multiprocessed. Set to > 1 to enable multiprocessing. + + """ + + # singlecore: + if workers is not None and workers <= 1: + logger.info("Generating a {} x {} asymmetric similarity matrix...".format(len(self.ttda), len(self.ttda))) + self.asymmetric_distance_matrix = self.calculate_asymmetric_distance_matrix_chunk(ttda1=self.ttda, + ttda2=self.ttda, + threshold=threshold, + method=method) + return self.asymmetric_distance_matrix + + # multicore: + + # best performance on 2-core machine: 2 workers + if workers is None: + workers = os.cpu_count() + + # worker, that computes the distance to + # all other nodes from a chunk of nodes. 
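+        # the resulting rows are piped back to the parent and concatenated in order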
+        # https://stackoverflow.com/questions/1743293/why-does-my-python-program-average-only-33-cpu-per-process-how-can-i-make-pyth#1743350
+        def worker(worker_id, ttdas_sent, n_ttdas, pipe):
+            logger.info("Spawned worker to generate {} rows of the asymmetric similarity matrix...".format(n_ttdas))
+            # the chunk of ttda that's going to be calculated:
+            ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas]
+            distance_chunk = self.calculate_asymmetric_distance_matrix_chunk(ttda1=ttda1, ttda2=self.ttda,
+                                                                             threshold=threshold,
+                                                                             start_index=ttdas_sent,
+                                                                             method=method)
+            pipe.send((worker_id, distance_chunk))  # remember that this code is inside the workers memory
+            pipe.close()
+
+        # create worker processes:
+        processes = []
+        pipes = []
+        ttdas_sent = 0
+
+        for i in range(workers):
+
+            try:
+                parentConn, childConn = Pipe()
+
+                # figure out how many ttdas to send to the worker
+                # 9 ttdas to 4 workers: 2 2 2 3
+                n_ttdas = 0
+                if i == workers - 1:  # i is an index, hence -1
+                    # if this is the last worker that needs to be created,
+                    # task it with all the remaining models
+                    n_ttdas = len(self.ttda) - ttdas_sent
+                else:
+                    n_ttdas = int((len(self.ttda) - ttdas_sent) / (workers - i))
+
+                p = Process(target=worker, args=(i, ttdas_sent, n_ttdas, childConn))
+                ttdas_sent += n_ttdas
+
+                processes += [p]
+                pipes += [(parentConn, childConn)]
+                p.start()
+
+            except ProcessError:
+                logger.error("could not start process {}".format(i))
+                # close all pipes
+                for p in pipes:
+                    p[1].close()
+                    p[0].close()
+                # end all processes
+                for p in processes:
+                    if p.is_alive():
+                        p.terminate()
+                    del p
+                raise
+
+        distances = []
+        # note that the following loop maintains the order in which the chunks are concatenated,
+        # which is very important. The ordering in ttda has to be the same as when using only one process
+        for p in pipes:
+            answer = p[0].recv()  # [0], because that is the parentConn
+            p[0].close()  # child conn will be closed from inside the worker
+            # collect the chunks in worker order:
+            distances += [answer[1]]
+
+        # end all processes
+        for p in processes:
+            p.terminate()
+
+        self.asymmetric_distance_matrix = np.concatenate(distances)
+
+        return self.asymmetric_distance_matrix
+
+    def calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold=None, start_index=0, method="mass"):
+        """Iterates over ttda1 and calculates the
+        distance to every ttda in ttda2.
+
+        Parameters
+        ----------
+        ttda1, ttda2 : 2D arrays of floats
+            Two ttda matrices that are going to be used for distance calculation.
+            Each row in a ttda corresponds to one topic. Each cell in the resulting
+            matrix corresponds to the distance between a topic pair.
+        threshold : float, optional
+            threshold defaults to: {"mass": 0.95, "rank": 0.11}, depending on the selected
+            method
+        start_index : number
+            needed when this function is used in multiprocessing, because ttda1 is then
+            only a chunk of the complete ttda. start_index would be 0
+            if ttda1 == self.ttda. When self.ttda is split into two pieces of 100 ttdas
+            each, the start_index of the second piece should be 100. Default is 0
+        method : {'mass', 'rank'}, optional
+            method can be "mass" for the original masking method or "rank" for a faster
+            masking method that selects by rank of largest elements in the topic word
+            distribution, to determine which tokens are relevant for the topic.
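+
+        Example
+        -------
+        A rough sketch of what the "rank" masking keeps, with made-up values and a
+        made-up threshold of 0.5 (the actual implementation works on pandas objects):
+
+        .. sourcecode:: pycon
+
+            >>> import numpy as np
+            >>> topic = np.array([0.6, 0.3, 0.05, 0.05])
+            >>> # keep the indices of the largest 50% of the terms:
+            >>> np.argsort(topic)[::-1][:int(len(topic) * 0.5)]
+            array([0, 1])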
+
+        """
+
+        if method not in ["mass", "rank"]:
+            raise ValueError("method {} unknown".format(method))
+
+        if threshold is None:
+            threshold = {"mass": 0.95, "rank": 0.11}[method]
+
+        # select masking method:
+        def mass_masking(a):
+            """original masking method. returns a binary mask"""
+            return (a.sort_values(by=0, ascending=False).cumsum() < threshold).sort_index()[0].values
+
+        def rank_masking(a):  # faster
+            """faster masking method. returns a mask that contains the indices of the tokens to keep"""
+            return a.sort_values(by=0, ascending=False)[0:int(len(a) * threshold)].index
+
+        create_mask = {"mass": mass_masking, "rank": rank_masking}[method]
+
+        # initialize the distance matrix. np.ndarray is faster than np.zeros
+        distances = np.ndarray((len(ttda1), len(ttda2)))
+
+        # now iterate over each topic
+        for i in range(len(ttda1)):
+
+            # prepare that topic and create the mask from it
+            a = pd.DataFrame(ttda1[i])
+            # create a mask from a, that removes noise from a and keeps the largest terms
+            mask = create_mask(a)
+            a_masked = a[0][mask].values
+
+            # now look at every possible pair for topic a:
+            for j in range(len(ttda2)):
+
+                # distance to itself is 0
+                if i + start_index == j:
+                    distances[i][j] = 0
+                    continue
+
+                # now mask b based on a, which will force the shape of a onto b
+                b_masked = ttda2[j][mask]
+
+                distance = 0
+                # is the masked b basically empty? Then forget about it, no similarity, distance is 1
+                # (not 2, because that corresponds to negative values. The maximum distance is 1 here.)
+                # b_masked must not be normalized, otherwise the following threshold would never trigger:
+                if b_masked.sum() <= 0.05:
+                    distance = 1
+                else:
+                    # if there is indeed some non-noise stuff in the masked topic b, look at
+                    # how similar the two topics are. Note that normalizing is not needed for the
+                    # cosine distance, as it only looks at the angle between vectors
+                    distance = cosine(a_masked, b_masked)
+
+                distances[i][j] = distance
+
+        return distances
+
+    def generate_topic_clusters(self, eps=0.1, min_samples=None):
+        """Runs the DBSCAN algorithm on all the detected topics from
+        the models in the ensemble and labels them with label-indices.
+
+        The final approval and generation of stable topics is done in
+        generate_stable_topics().
+
+        Parameters
+        ----------
+        eps : float
+            eps is 0.1 by default.
+        min_samples : number
+            min_samples defaults to int(self.num_models / 2)
+        """
+
+        if min_samples is None:
+            min_samples = int(self.num_models / 2)
+
+        logger.info("Fitting the clustering model")
+
+        self.cluster_model = CBDBSCAN(eps=eps, min_samples=min_samples)
+        self.cluster_model.fit(self.asymmetric_distance_matrix)
+
+    def generate_stable_topics(self, min_cores=None):
+        """Generates stable topics out of the clusters.
+        This function is the last step that has to be done
+        in the ensemble.
+
+        Stable topics can be retrieved afterwards using
+        get_topics().
+
+        Parameters
+        ----------
+        min_cores : number
+            How many cores a cluster has to have
+            to be treated as a stable topic. That means, how many
+            similar-looking topics have to be present, so that their
+            average is used as a stable topic.
+
+            defaults to min_cores = min(3, max(1, int(self.num_models / 4 + 1)))
+
+        """
+
+        logger.info("Generating stable topics")
+
+        # min_cores being 0 makes no sense.
there has to be a core for a cluster + # or there is no cluster + if min_cores == 0: + min_cores = 1 + + if min_cores is None: + # min_cores is a number between 1 and 3, depending on the number of models + min_cores = min(3, max(1, int(self.num_models / 4 + 1))) + + # helper functions + def remove_label_from_colum(label, valid_parent_labels): + """ + Example + ------- + - label = 0 + - valid_parent_labels = {0, 1} + - result: {1} + will overwrite valid_parent_labels + + """ + try: + valid_parent_labels.remove(label) + except AttributeError: + # parent_labes is NaN + pass + except KeyError: + # Key does not exist + pass + + def remove_label_from_sorted_labels(label, grouped_parent_labels): + """Despite the name, note that no sorting of anything is needed + for this to work. + + Example + ------- + - label = 2 + - grouped_parent_labels = [{1, 2}, {1, 2, 3}] + - result: [{1}, {1, 3}] + will overwrite grouped_parent_labels + + """ + for i in grouped_parent_labels: + remove_label_from_colum(label, i) + + def check_content_from_parent_labels(parent_label): + """ + """ + if not isinstance(parent_label, set): + return 0 + else: + return len(parent_label) + + def keep_only_valid_labels(parent_labels): + if not isinstance(parent_labels, set): + return set() + else: + return {label for label in parent_labels if label in valid_labels} + + def validate_core(core): + """Core is an entry in the self.cluster_model.results dataframe + + Will overwrite "is_valid_core" on that core with: + - False, if it is actually not marked as core (core.is_core) + - True, if valid_parents equals the cores label. E.g.: {1} == {1} + + """ + if not core.is_core: + return False + else: + return core.valid_parents == {core["labels"]} + + # counts how many different labels a core has as parents + self.cluster_model.results.loc[self.cluster_model.results.is_core, "amount_parent_labels"] = \ + self.cluster_model.results.loc[self.cluster_model.results.is_core, "parent_labels"].apply( + check_content_from_parent_labels) + + # sorting the cores by the amount of labels in the "parent_labels" + # groups by labels, and aggregates each entry of the group. example: [{1, 2}, {1, 2, 3}] + sorted_labels = self.cluster_model.results.loc[ + self.cluster_model.results.is_core, ["labels", "parent_labels", "amount_parent_labels"]].groupby( + "labels").agg({"amount_parent_labels": max, "parent_labels": lambda x: list(x)}).sort_values( + "amount_parent_labels") + + # removing NaN + sorted_labels.parent_labels = sorted_labels.parent_labels.apply(lambda L: [x for x in L if x is not np.nan]) + + sorted_labels["is_valid"] = np.nan + # sorted_labels["is_not_valid"] = np.nan + + # APPLYING THE RULES 1 to 3 + # iterate over the cluster labels, see which clusters/labels + # are valid to cause the creation of a stable topic + for label in sorted_labels.index: + # sorted_labels.index example: Int64Index([0, 1, 2, 3, 4, 5], dtype='int64', name='labels') + # label is iterating over 0, 1, 3, ... + + # 1. rule - remove if the cores in the cluster have no parents + if sorted_labels.loc[label].amount_parent_labels == 0: + # label is NOT VALID + sorted_labels.loc[label, "is_valid"] = False + sorted_labels.parent_labels.apply( + lambda parent_labels: remove_label_from_sorted_labels(label, parent_labels)) + + # 2. 
rule - remove if it has fewer than min_cores as parents
+                # (parents are always also cores)
+                if len(sorted_labels.loc[label, "parent_labels"]) < min_cores:
+                    sorted_labels.loc[label, "is_valid"] = False
+                    sorted_labels.parent_labels.apply(
+                        lambda parent_labels: remove_label_from_sorted_labels(label, parent_labels))
+
+                # 3. checking for "easy_valid"s
+                # checks if the cluster has at least min_cores cores whose only parent label is the cluster itself
+                if sorted_labels.loc[label].amount_parent_labels >= 1:
+                    if sum(map(lambda x: x == {label}, sorted_labels.loc[label, "parent_labels"])) >= min_cores:
+                        sorted_labels.loc[label, "is_valid"] = True
+
+        # Reapplying rule 3
+        # this is NOT DETERMINISTIC, the order will influence the result.
+        # it happens when there is a close relationship between 2 or more clusters
+        for label in sorted_labels[sorted_labels.is_valid.isnull()].index:
+            # 3. checking for "easy_valid"s
+            # checks if the cluster has at least min_cores cores whose only parent label is the cluster itself
+            if sum(map(lambda x: x == {label}, sorted_labels.loc[label, "parent_labels"])) >= min_cores:
+                sorted_labels.loc[label, "is_valid"] = True
+            else:
+                sorted_labels.loc[label, "is_valid"] = False
+                sorted_labels.parent_labels.apply(
+                    lambda parent_labels: remove_label_from_sorted_labels(label, parent_labels))
+
+        self.sorted_labels = sorted_labels
+
+        valid_labels = self.sorted_labels[self.sorted_labels.is_valid].index.values
+
+        self.cluster_model.results["valid_parents"] = \
+            self.cluster_model.results.parent_labels.apply(keep_only_valid_labels)
+
+        # VALIDATING CORES
+        # only cores whose valid_parents equal their own label can be valid cores
+        self.cluster_model.results["is_valid_core"] = self.cluster_model.results.apply(validate_core, axis=1)
+
+        # transforming the ttda into a pandas DataFrame and adding the labels
+        ttda_pd = pd.DataFrame(self.ttda)
+        ttda_pd["labels"] = self.cluster_model.labels_  # same as self.cluster_model.results["labels"]
+
+        # keeping only VALID cores
+        ttda_pd = ttda_pd.loc[self.cluster_model.results.is_valid_core]
+
+        # averaging the topic term distributions of the cores in the same cluster.
+        # "labels" marks the cluster-id from CBDBSCAN
+        stable_topics = ttda_pd.groupby("labels").mean()
+
+        self.stable_topics = stable_topics
+
+    def recluster(self, eps=0.1, min_samples=None, min_cores=None):
+        """Runs the CBDBSCAN algorithm on all the detected topics from
+        the children of the ensemble.
+
+        Generates stable topics out of the clusters afterwards.
+
+        Finally, stable topics can be retrieved using
+        get_topics().
+
+        Parameters
+        ----------
+        eps : float
+            epsilon for the CBDBSCAN algorithm, having the same
+            meaning as in classic DBSCAN clustering.
+            default: 0.1
+        min_samples : number
+            The minimum number of samples in the neighborhood
+            of a topic to be considered a core in CBDBSCAN.
+            default: int(self.num_models / 2)
+        min_cores : number
+            How many cores a cluster has to have
+            to be treated as a stable topic. That means, how many
+            similar-looking topics have to be present, so that their
+            average is used as a stable topic.
+ default: min(3, max(1, int(self.num_models /4 +1))) + + """ + self.generate_topic_clusters(eps, min_samples) + self.generate_stable_topics(min_cores) + self.generate_gensim_representation() + + # GENSIM API + # to make using the ensemble in place of a gensim model as easy as possible + + def get_topics(self): + """see https://radimrehurek.com/gensim/models/ldamodel.html""" + return self.stable_topics.values + + def __getitem__(self, i): + """see https://radimrehurek.com/gensim/models/ldamodel.html""" + if self.classic_model_representation is None: + raise ValueError("use generate_gensim_representation() first") + return self.classic_model_representation[i] + + def inference(self, *posargs, **kwArgs): + """see https://radimrehurek.com/gensim/models/ldamodel.html""" + if self.classic_model_representation is None: + raise ValueError("use generate_gensim_representation() first") + return self.classic_model_representation.inference(*posargs, **kwArgs) + + def log_perplexity(self, *posargs, **kwArgs): + """see https://radimrehurek.com/gensim/models/ldamodel.html""" + if self.classic_model_representation is None: + raise ValueError("use generate_gensim_representation() first") + return self.classic_model_representation.log_perplexity(*posargs, **kwArgs) + + def print_topics(self, *posargs, **kwArgs): + """see https://radimrehurek.com/gensim/models/ldamodel.html""" + if self.classic_model_representation is None: + raise ValueError("use generate_gensim_representation() first") + return self.classic_model_representation.print_topics(*posargs, **kwArgs) + + @property + def id2word(self): + return self.gensim_kw_args["id2word"] + + +class CBDBSCAN(): + + def __init__(self, eps=0.1, min_samples=4): + + self.eps = eps + self.min_samples = min_samples + + def fit(self, amatrix): + + self.next_label = 0 + self.results = pd.DataFrame(index=range(len(amatrix)), + columns=["labels", "is_core", + "parent_ids", "parent_labels", "num_samples"]) + self.results.is_core = False + + # otherwise sets cannot be written into the dataframe (will be converted to integers). + # The dtype should be object for everything to fix this. + self.results = self.results.astype(dtype="object") + + tmp_amatrix = copy.deepcopy(amatrix) + # to avoid problem about comparing the topic with itself + np.fill_diagonal(tmp_amatrix, 1) + + min_distance_per_topic = pd.Series(tmp_amatrix.min(axis=1)) + min_distance_per_topic_sorted = min_distance_per_topic.sort_values() + + ordered_min_similarity = list(min_distance_per_topic_sorted.index.values) + + def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors=None): + + # count how many neighbors + neighbors = pd.Series(tmp_amatrix[topic_index] < self.eps) + num_samples = sum(neighbors) + + # If the number of neighbors of a topic is large enough, + # it is considered a core. + # This also takes neighbors that already are identified as core in count. + if num_samples >= self.min_samples: + # This topic is a core! + self.results.loc[topic_index, "is_core"] = True + self.results.loc[topic_index, "num_samples"] = num_samples + + # if current_label is none, then this is the first core + # of a new cluster (hence next_label) + if current_label is None: + # next_label is initialized with 0 in + # fit() for the first cluster + current_label = self.next_label + self.next_label += 1 + + else: + # In case the core has a parent, + # that means it has a label, namely the + # label from the parent. 
+ # Check the distance between the + # new core and the parent + + # parent neighbors is the list of neighbors of parent_id + # (the topic_index that called this function recursively) + all_members_of_current_cluster = list(parent_neighbors[parent_neighbors].index.values) + all_members_of_current_cluster.append(parent_id) + + # look if 25% of the members of the current cluster are also + # close to the current/new topic... + + # example: (topic_index, 0), (topic_index, 2), (topic_index, ...) + all_members_of_current_cluster_ix = np.ix_([topic_index], all_members_of_current_cluster) + # use the result of the previous step to index the matrix and see if those distances + # are smaller then epsilon. relations_to_the_cluster is a boolean array, True for close elements + relations_to_the_cluster = tmp_amatrix[all_members_of_current_cluster_ix] < self.eps + + # if less than 25% of the elements are close, then the topic index in question is not a + # core of the current_label, but rather the core of a new cluster + if relations_to_the_cluster[0].mean() < 0.25: + # start new cluster by changing current_label + current_label = self.next_label + self.next_label += 1 + + self.results.loc[topic_index, "labels"] = current_label + + for neighbor in neighbors[neighbors].index: + + if self.results.loc[neighbor, "labels"] is np.nan: + ordered_min_similarity.remove(neighbor) + # try to extend the cluster into the direction + # of the neighbor + scan_topic(neighbor, parent_id=topic_index, + current_label=current_label, + parent_neighbors=neighbors) + + if isinstance(self.results.loc[neighbor, "parent_ids"], set): + self.results.loc[neighbor, "parent_ids"].add(topic_index) + self.results.loc[neighbor, "parent_labels"].add(current_label) + else: + self.results.loc[neighbor, "parent_ids"] = set([topic_index]) + self.results.loc[neighbor, "parent_labels"] = set([current_label]) + + else: + # this topic is not a core! + if current_label is None: + self.results.loc[topic_index, "labels"] = -1 + else: + self.results.loc[topic_index, "labels"] = current_label + + # elements are going to be removed from that array in scan_topic, do until it is empty + while len(ordered_min_similarity) != 0: + next_topic_index = ordered_min_similarity.pop(0) + scan_topic(next_topic_index) + + self.labels_ = self.results["labels"] From a67d5db944e97f5c3c492a6d70cbf281a32873ac Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Fri, 5 Apr 2019 02:31:00 +0200 Subject: [PATCH 002/166] improvements to add_model, various small changes to comments and code --- gensim/corpora/__init__.py | 1 + gensim/models/ensemblelda.py | 269 ++++++++++++++++------------------- 2 files changed, 123 insertions(+), 147 deletions(-) diff --git a/gensim/corpora/__init__.py b/gensim/corpora/__init__.py index 0d51a9b903..7b7044e0b9 100644 --- a/gensim/corpora/__init__.py +++ b/gensim/corpora/__init__.py @@ -15,3 +15,4 @@ from .textcorpus import TextCorpus, TextDirectoryCorpus # noqa:F401 from .ucicorpus import UciCorpus # noqa:F401 from .malletcorpus import MalletCorpus # noqa:F401 +from .opinosiscorpus import OpinosisCorpus # noqa:F401 diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 6abf70f5e3..5e442eb62c 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -55,11 +55,12 @@ >>> unseen_doc = other_corpus[0] >>> vector = elda[unseen_doc] # get topic probability distribution for a document -Increase the ensemble size by adding a new model +Increase the ensemble size by adding a new model. 
Make sure it uses the same dictionary .. sourcecode:: pycon >>> elda.add_model(LdaModel(common_corpus, id2word=common_dictionary, num_topics=10)) + >>> elda.recluster() >>> vector = elda[unseen_doc] To optimize the ensemble for your specific case, the children can be clustered again using @@ -91,7 +92,7 @@ import pandas as pd from scipy.spatial.distance import cosine from gensim import utils -from gensim.models import ldamodel, ldamulticore +from gensim.models import ldamodel, ldamulticore, basemodel logger = logging.getLogger(__name__) @@ -212,7 +213,9 @@ def __init__(self, topic_model_kind="lda", num_models=3, self.sstats_sum = 0 self.eta = None self.tms = [] - self.ttda = None + # initialize empty topic term distribution array + self.ttda = np.empty((0, len(gensim_kw_args["id2word"])), np.float32) + self.asymmetric_distance_matrix_outdated = True # in case the model will not train due to some # parameters, stop here and don't train. @@ -230,14 +233,11 @@ def __init__(self, topic_model_kind="lda", num_models=3, # training if ensemble_workers > 1: # multiprocessed - self.generate_ttda_multiprocessed(num_models, ensemble_workers) + self.generate_topic_models_multiproc(num_models, ensemble_workers) else: # singlecore self.generate_topic_models(num_models) - if not memory_friendly_ttda: - self.generate_ttda() - self.generate_asymmetric_distance_matrix(workers=self.distance_workers, threshold=masking_threshold, method=masking_method) @@ -247,6 +247,11 @@ def __init__(self, topic_model_kind="lda", num_models=3, # create model that can provide the usual gensim api to the stable topics from the ensemble self.generate_gensim_representation() + def convert_to_memory_friendly(self): + """removes the stored gensim models and only keeps their ttdas""" + self.tms = [] + self.memory_friendly_ttda = True + def generate_gensim_representation(self): """creates a gensim-model from the stable topics @@ -344,11 +349,18 @@ def generate_gensim_representation(self): return classic_model_representation - def add_model(self, target, generate=True, num_new_models=None): + def add_model(self, target, num_new_models=None): """Adds the ttda of another model to the ensemble this way, multiple topic models can be connected to an ensemble. + Make sure that all the models use the exact same + dictionary/idword mapping. + + In order to generate new stable topics afterwards, use + self.generate_asymmetric_distance_matrix() + self.recluster() + The ttda of another ensemble can also be used, in that case set num_new_models to the num_models parameter of the ensemble, that means the number @@ -357,48 +369,30 @@ def add_model(self, target, generate=True, num_new_models=None): is used to estimate "min_samples" for generate_topic_clusters. - Make sure that all the models use the exact same - dictionary/idword mapping and size. - If you trained this ensemble in the past with a certain Dictionary that you want to reuse for other - models, you can get it from: self.id2word - - This will also trigger the generation of - stable topics automatically, but it can be - disabled with the generate parameter. + models, you can get it from: self.id2word. Parameters ---------- target : {see description} - 1. topic-term-distribution-array + 1. A single EnsembleLda object + 2. List of EnsembleLda objects + 3. A single Gensim topic model + 4. List of Gensim topic models + + if memory_friendly_ttda is True, target can also be: + 5. topic-term-distribution-array example: [[0.1, 0.1, 0.8], [...], ...] - [topic_1, topic_2, ...] 
- with topic being: - [word_1, word_2, ...] + [topic1, topic2, ...] + with topic being an array of probabilities: + [token1, token2, ...] - sum of words in a single topic sum to one, + token probabilities in a single topic sum to one, therefore, all the words sum to len(ttda) - You can use this (option 1) only if - memory_friendly_ttda is set to True. - - 2. A single EnsembleLda object - 3. List of EnsembleLda objects - 4. A single Gensim topic model - 5. List of Gensim topic models - generate : boolean, optional - if True, will cluster and generate - stable_topics afterwards. This is the default. - - if False, you have to call: - self.generate_asymmetric_distance_matrix() - self.generate_topic_clusters() - self.generate_stable_topics() - self.generate_gensim_representation() - afterwards manually. num_new_models : integer, optional the model keeps track of how many models were used in this ensemble. Set higher @@ -415,7 +409,6 @@ def add_model(self, target, generate=True, num_new_models=None): parameter. """ - # If the model has never seen a ttda before, initialize. # If it has, append. @@ -427,82 +420,77 @@ def add_model(self, target, generate=True, num_new_models=None): target = np.array(target) assert len(target) > 0 - detected_num_models = None - if self.memory_friendly_ttda: # for memory friendly models/ttdas, append the ttdas to itself - ttda = None + detected_num_models = 0 - # ttdas - if isinstance(target.dtype.type(), (np.number, float)): - # multiple ttdas, concatenate first - if len(target.shape) == 3: - detected_num_models = len(target) - ttda = np.concatenate(target) + ttda = [] - # one single ttda - elif len(target.shape) == 2: - detected_num_models = 1 - ttda = target + # 1. ttda array, because that's the only accepted input that contains numbers + if isinstance(target.dtype.type(), (np.number, float)): + ttda = target + detected_num_models = 1 - # ensemble lda object - else: - if type(target[0]) == EnsembleLda: - # check if it is an ensemble with ttdas - ttda = np.concatenate([model.ttda for model in target], axis=0) - detected_num_models = sum([model.num_models for model in target]) - - # any gensim topic model object - elif isinstance(target[0], ldamodel.LdaModel): - # might be a gensim model, use get_topics() then - ttda = np.concatenate([model.get_topics() for model in target], axis=0) - detected_num_models = len(target) - - # unknown - else: - raise ValueError("target is of unknown type or a list of unknown types: {}".format(type(target[0]))) + # 2. list of ensemblelda objects + elif isinstance(target[0], type(self)): + ttda = np.concatenate([ensemble.ttda for ensemble in target], axis=0) + detected_num_models = sum([ensemble.num_models for ensemble in target]) - logger.info("Adding {} model(s) with {} topics".format(detected_num_models, len(ttda))) + # 3. list of gensim models + elif isinstance(target[0], basemodel.BaseTopicModel): + ttda = np.concatenate([model.get_topics() for model in target], axis=0) + detected_num_models = len(target) - # check if ttda array already exists - if self.ttda is None: - self.ttda = ttda + # unknown else: - if self.ttda.shape[1] != ttda.shape[1]: - raise ValueError(("target ttda dimensions do not match. 
Topics must be {} but was"
-                                  "{} elements large").format(self.ttda.shape[-1], ttda.shape[-1]))
-            self.ttda = np.append(self.ttda, ttda, axis=0)
+                raise ValueError("target is of unknown type or a list of unknown types: {}".format(type(target[0])))
 
             # new models were added, increase num_models
+            # if the user didn't provide a custom number to use
             if num_new_models is None:
                 self.num_models += detected_num_models
             else:
                 self.num_models += num_new_models
-        else:
-            # memory unfriendly ensembles have all the models
-            # stored in tms. Can be a list of ensembles or
-            # a list of gensim topic models.
-            if type(target[0]) == type(self):
-                detected_num_models = 0
+
+        else:  # memory unfriendly ensembles
+
+            ttda = []
+
+            # 1. ttda array
+            if isinstance(target.dtype.type(), (np.number, float)):
+                raise ValueError('ttda arrays cannot be added to ensembles that are not memory friendly, '
+                                 'you can call convert_to_memory_friendly to discard the stored gensim models '
+                                 'and only keep the ttdas')
+
+            # 2. list of ensembles
+            elif isinstance(target[0], type(self)):
                 for ensemble in target:
                     self.tms += ensemble.tms
-                    detected_num_models += len(ensemble.tms)
-            else:
-                # else it is a list of gensim models
-                # (np.ndarray of len 1 or more, since it was wrapped in the beginning)
+                ttda = np.concatenate([ensemble.ttda for ensemble in target], axis=0)
+
+            # 3. list of gensim models
+            elif isinstance(target[0], basemodel.BaseTopicModel):
                 self.tms += target.tolist()
-                detected_num_models = len(target)
+                ttda = np.concatenate([model.get_topics() for model in target], axis=0)
+
+            # unknown
+            else:
+                raise ValueError("target is of unknown type or a list of unknown types: {}".format(type(target[0])))
 
             # in this case, len(self.tms) should
             # always match self.num_models
             self.num_models = len(self.tms)
 
-        if generate:
-            self.generate_asymmetric_distance_matrix(workers=None)  # multiprocessing enabled
-            self.generate_topic_clusters()
-            self.generate_stable_topics()
-            self.generate_gensim_representation()
+        logger.info("Ensemble contains {} models and {} topics now".format(self.num_models, len(self.ttda)))
+
+        if self.ttda.shape[1] != ttda.shape[1]:
+            raise ValueError(("target ttda dimensions do not match. Topics must be {} but was "
+                              "{} elements large").format(self.ttda.shape[-1], ttda.shape[-1]))
+        self.ttda = np.append(self.ttda, ttda, axis=0)
+
+        # tell recluster that the distance matrix needs to be regenerated
+        self.asymmetric_distance_matrix_outdated = True
 
     def save(self, fname):
         """Save the ensemble to a file.
@@ -607,7 +595,7 @@ def load(fname):
 
         return eLDA
 
-    def generate_ttda_multiprocessed(self, num_models, ensemble_workers):
+    def generate_topic_models_multiproc(self, num_models, ensemble_workers):
         """Will make the ensemble multiprocess, which results in a
         speedup on multicore machines. Results from the processes will
         be piped to the parent and concatenated.
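To make the add_model / recluster flow from the hunk above concrete, a rough usage sketch (assuming the common_corpus and common_dictionary fixtures from gensim.test.utils and the constructor shown in the module docstring; the parameter values here are made up):

.. sourcecode:: pycon

    >>> from gensim.test.utils import common_corpus, common_dictionary
    >>> from gensim.models import EnsembleLda
    >>> elda = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=4, num_models=2, passes=2)
    >>> other = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=4, num_models=2, passes=2)
    >>> elda.add_model(other)  # appends other.ttda (and other.tms, if not memory friendly)
    >>> elda.recluster()  # regenerates the now outdated distance matrix, then reclusters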
@@ -648,7 +636,9 @@ def worker(id, num_models, random_states, pipe):
             # send the ttda that is in the child/workers version of the memory into the pipe.
             # it is available after generate_topic_models has been called in the worker
             if self.memory_friendly_ttda:
-                pipe.send((id, self.ttda))  # remember that this code is inside the workers memory
+                # remember that this code is inside the worker processes memory,
+                # so self.ttda is the ttda of only a chunk of models
+                pipe.send((id, self.ttda))
             else:
                 pipe.send((id, self.tms))
 
@@ -707,25 +697,14 @@ def worker(id, num_models, random_states, pipe):
 
         # collect results from the pipes
         # will also block until workers are finished
-        self.ttda = None
             for p in pipes:
                 answer = p[0].recv()  # [0], because that is the parentConn
                 p[0].close()
                 # this does basically the same as the generate_topic_models function (concatenate all the ttdas):
-                if self.ttda is None:
-                    self.ttda = answer[1]  # [1] is the ttda ([0] is the worker id)
-                else:
-                    self.ttda = np.concatenate([self.ttda, answer[1]])
+                self.ttda = np.concatenate([self.ttda, answer[1]])
                 # print(answer[0], "received and ttda concatenated")
                 # in [0] the id of the worker that sent the data is stored
 
-            # end all processes
-            for p in processes:
-                p.terminate()
-
-            # again, the same thing generate_topic_models does at the end
-            self.ttda = self.ttda / self.ttda.sum(axis=1)[:, None]
-
         else:  # now do the same thing for a memory unfriendly ensemble
             self.tms = []
@@ -733,13 +712,15 @@ def worker(id, num_models, random_states, pipe):
                 answer = p[0].recv()  # [0], because that is the parentConn
                 p[0].close()
                 self.tms += answer[1]
+                new_ttda = np.concatenate([model.get_topics() for model in answer[1]])
+                self.ttda = np.concatenate([self.ttda, new_ttda])
 
-            # end all processes
-            for p in processes:
-                p.terminate()
+        # end all processes
+        for p in processes:
+            p.terminate()
 
     def generate_topic_models(self, num_models, random_states=None):
-        """Will generate the topic models, that form the ensemble.
+        """Will train the topic models that form the ensemble. 
Parameters ---------- @@ -768,38 +749,20 @@ def generate_topic_models(self, num_models, random_states=None): kwArgs["random_state"] = random_states[i] tm = GENSIM_MODEL[self.topic_model_kind](**kwArgs) - # memory friendly ttda = generate ttda on the run and delete model - if self.memory_friendly_ttda: - if self.ttda is None: - self.ttda = tm.state.get_lambda() - else: - # adds the lambda (that is the unnormalized get_topics) to ttda, which is - # a list of all those lambdas - self.ttda = np.concatenate([self.ttda, tm.state.get_lambda()]) - else: - # only saves the model if it is not "memory friendly" + # adds the lambda (that is the unnormalized get_topics) to ttda, which is + # a list of all those lambdas + self.ttda = np.concatenate([self.ttda, tm.get_topics()]) + + # only saves the model if it is not "memory friendly" + if not self.memory_friendly_ttda: self.tms += [tm] # use one of the tms to get some info that will be needed later self.sstats_sum = tm.state.sstats.sum() self.eta = tm.eta - if self.memory_friendly_ttda: - # normalize all the ttda - # this results in the same that is being displayed on .get_topics() of a gensim LDA model - self.ttda = self.ttda / self.ttda.sum(axis=1)[:, None] - - def generate_ttda(self): - """Generate the topic_term_distribution for all the topics - P(W|T)""" - - logger.info("Generating topic term distributions over all topics over all topic models in the ensemble...") - - lambda_all = np.concatenate([self.tms[x].state.get_lambda() for x in range(self.num_models)], axis=0) - - self.ttda = lambda_all / lambda_all.sum(axis=1)[:, None] - def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method="mass"): - """Makes the pairwise similarity matrix for all the ttdas + """Makes the pairwise distance matrix for all the ttdas from the ensemble. Returns the asymmetric pairwise distance matrix that @@ -823,12 +786,18 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= """ + # matrix is up to date afterwards + self.asymmetric_distance_matrix_outdated = False + + if threshold is None: + threshold = {"mass": 0.95, "rank": 0.11}[method] + # singlecore: if workers is not None and workers <= 1: - logger.info("Generating a {} x {} asymmetric similarity matrix...".format(len(self.ttda), len(self.ttda))) - self.asymmetric_distance_matrix = self.calculate_asymmetric_distance_matrix_chunk(ttda1=self.ttda, - ttda2=self.ttda, + logger.info("Generating a {} x {} asymmetric distance matrix...".format(len(self.ttda), len(self.ttda))) + self.asymmetric_distance_matrix = self.calculate_asymmetric_distance_matrix_chunk(ttda1=self.ttda, ttda2=self.ttda, threshold=threshold, + start_index=0, method=method) return self.asymmetric_distance_matrix @@ -840,10 +809,10 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= # worker, that computes the distance to # all other nodes from a chunk of nodes. 
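        # (Sketch of the chunking scheme with made-up numbers: for len(self.ttda) == 100
        # and 4 workers, each worker receives n_ttdas = 25 rows and computes a 25 x 100
        # slice of the matrix against the full ttda2 = self.ttda; start_index tells the
        # chunk where its rows sit in the complete matrix, and the parent concatenates
        # the received slices in worker order.)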
- # https://stackoverflow.com/questions/1743293/why-does-my-python-program-average-only-33-cpu-per-process-how-can-i-make-pyth#1743350 + # https://stackoverflow.com/a/1743350 def worker(worker_id, ttdas_sent, n_ttdas, pipe): - logger.info("Spawned worker to generate {} rows of the asymmetric similarity matrix...".format(n_ttdas)) - # print(id,"calculating") + logger.info("Spawned worker to generate {} rows of the asymmetric distance matrix...".format(n_ttdas)) + # print(id, "calculating") # the chunk of ttda that's going to be calculated: ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas] distance_chunk = self.calculate_asymmetric_distance_matrix_chunk(ttda1=ttda1, ttda2=self.ttda, @@ -910,7 +879,7 @@ def worker(worker_id, ttdas_sent, n_ttdas, pipe): return self.asymmetric_distance_matrix - def calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold=None, start_index=0, method="mass"): + def calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, start_index, method): """Iterates over ttda1 and calculates the distance to every ttda in tttda2. @@ -933,14 +902,14 @@ def calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold=Non masking method that selects by rank of largest elements in the topic word distribution, to determine which tokens are relevant for the topic. + Returns + ------- + 2D Numpy.numpy.ndarray of floats """ if method not in ["mass", "rank"]: raise ValueError("method {} unknown".format(method)) - if threshold is None: - threshold = {"mass": 0.95, "rank": 0.11}[method] - # select masking method: def mass_masking(a): """original masking method. returns a binary mask""" @@ -1216,6 +1185,11 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None): default: min(3, max(1, int(self.num_models /4 +1))) """ + # if new models were added to the ensemble, the distance + # matrix needs to be generated again + if self.asymmetric_distance_matrix_outdated: + self.generate_asymmetric_distance_matrix() + self.generate_topic_clusters(eps, min_samples) self.generate_stable_topics(min_cores) self.generate_gensim_representation() @@ -1224,7 +1198,6 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None): # to make using the ensemble in place of a gensim model as easy as possible def get_topics(self): - """see https://radimrehurek.com/gensim/models/ldamodel.html""" return self.stable_topics.values def __getitem__(self, i): @@ -1275,11 +1248,13 @@ def fit(self, amatrix): # The dtype should be object for everything to fix this. 
self.results = self.results.astype(dtype="object") - tmp_amatrix = copy.deepcopy(amatrix) + tmp_amatrix = amatrix.copy() + # to avoid problem about comparing the topic with itself np.fill_diagonal(tmp_amatrix, 1) min_distance_per_topic = pd.Series(tmp_amatrix.min(axis=1)) + min_distance_per_topic_sorted = min_distance_per_topic.sort_values() ordered_min_similarity = list(min_distance_per_topic_sorted.index.values) From e27be0af1e8d03375aa0cab07ea14d8013ccf6fa Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 6 Apr 2019 01:02:20 +0200 Subject: [PATCH 003/166] pandas -> numpy: group by label and mean --- gensim/models/ensemblelda.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 5e442eb62c..fa6a2a2152 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1145,16 +1145,22 @@ def validate_core(core): # only cores with the valid_parents as its own label can be a valid core self.cluster_model.results["is_valid_core"] = self.cluster_model.results.apply(validate_core, axis=1) - # trainsforming the ttda into pandas and adding labels - ttda_pd = pd.DataFrame(self.ttda) - ttda_pd["labels"] = self.cluster_model.labels_ # same as self.cluster_model.results["labels"] + # numpy solution # keeping only VALID cores - ttda_pd = ttda_pd.loc[self.cluster_model.results.is_valid_core] + valid_core_mask = self.cluster_model.results.is_valid_core.values + valid_topics = self.ttda[valid_core_mask] + topic_labels = self.cluster_model.results.labels.values[valid_core_mask] + unique_labels = np.unique(topic_labels) - # averaging the ttd of the cores in the same cluster - # "labels" marks the cluster-id from DBSCAN - stable_topics = ttda_pd.groupby("labels").mean() + num_topics = len(unique_labels) + stable_topics = np.empty((num_topics, len(self.id2word)), dtype=np.float32) + + # for each cluster + for l, label in enumerate(unique_labels): + # mean of all the topics that are of that cluster + topics_of_cluster = np.array([topic for t, topic in enumerate(valid_topics) if topic_labels[t] == label]) + stable_topics[l] = topics_of_cluster.mean(axis=0) self.stable_topics = stable_topics @@ -1198,7 +1204,7 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None): # to make using the ensemble in place of a gensim model as easy as possible def get_topics(self): - return self.stable_topics.values + return self.stable_topics def __getitem__(self, i): """see https://radimrehurek.com/gensim/models/ldamodel.html""" From 83de2dd627bb2853dabc79c51f0ea0b0f2517b38 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 7 Apr 2019 00:58:55 +0200 Subject: [PATCH 004/166] pandas -> numpy: generate_stable_topics --- gensim/models/ensemblelda.py | 279 ++++++++++++++++++++--------------- 1 file changed, 163 insertions(+), 116 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index fa6a2a2152..a12d880483 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1014,143 +1014,109 @@ def generate_stable_topics(self, min_cores=None): # min_cores is a number between 1 and 3, depending on the number of models min_cores = min(3, max(1, int(self.num_models / 4 + 1))) - # helper functions - def remove_label_from_colum(label, valid_parent_labels): - """ - Example - ------- - - label = 0 - - valid_parent_labels = {0, 1} - - result: {1} - will overwrite valid_parent_labels - - """ - try: - valid_parent_labels.remove(label) - except AttributeError: - # 
parent_labes is NaN - pass - except KeyError: - # Key does not exist - pass - - def remove_label_from_sorted_labels(label, grouped_parent_labels): - """Despite the name, note that no sorting of anything is needed - for this to work. - - Example - ------- - - label = 2 - - grouped_parent_labels = [{1, 2}, {1, 2, 3}] - - result: [{1}, {1, 3}] - will overwrite grouped_parent_labels - - """ - for i in grouped_parent_labels: - remove_label_from_colum(label, i) - - def check_content_from_parent_labels(parent_label): - """ - """ - if not isinstance(parent_label, set): - return 0 - else: - return len(parent_label) - - def keep_only_valid_labels(parent_labels): - if not isinstance(parent_labels, set): - return set() - else: - return {label for label in parent_labels if label in valid_labels} - - def validate_core(core): - """Core is an entry in the self.cluster_model.results dataframe - - Will overwrite "is_valid_core" on that core with: - - False, if it is actually not marked as core (core.is_core) - - True, if valid_parents equals the cores label. E.g.: {1} == {1} - - """ - if not core.is_core: - return False - else: - return core.valid_parents == {core["labels"]} - # counts how many different labels a core has as parents - self.cluster_model.results.loc[self.cluster_model.results.is_core, "amount_parent_labels"] = \ - self.cluster_model.results.loc[self.cluster_model.results.is_core, "parent_labels"].apply( - check_content_from_parent_labels) - - # sorting the cores by the amount of labels in the "parent_labels" - # groups by labels, and aggregates each entry of the group. example: [{1, 2}, {1, 2, 3}] - sorted_labels = self.cluster_model.results.loc[ - self.cluster_model.results.is_core, ["labels", "parent_labels", "amount_parent_labels"]].groupby( - "labels").agg({"amount_parent_labels": max, "parent_labels": lambda x: list(x)}).sort_values( - "amount_parent_labels") - - # removing NaN - sorted_labels.parent_labels = sorted_labels.parent_labels.apply(lambda L: [x for x in L if x is not np.nan]) - - sorted_labels["is_valid"] = np.nan - # sorted_labels["is_not_valid"] = np.nan + results = self.cluster_model.results.to_dict('index') + for topic in results.values(): + if topic["is_core"]: + topic["amount_parent_labels"] = StableTopicHelpers.len_of_parent_labels(topic["parent_labels"]) + + # TODO write this back to self.cluster_model.results + # it just adds another column, so that's safe + # but atm cluster_model.results is a dataframe so first I need to + # remove pandas from the cluster_model + + # first, group all the learned cores based on the label, + # which was assigned in the cluster_model + grouped_by_labels = {} + for topic in results.values(): + if topic["is_core"]: + label = topic["labels"] # "labels", but it actually means just a single number + if not label in grouped_by_labels: + grouped_by_labels[label] = [] + grouped_by_labels[label].append(topic) + + # then aggregate their amount_parent_labels by maxing and parent_labels by concatenating + # the result is sorted by amount_parent_labels and stored in sorted_clusters + sorted_clusters = [] + for label, group in grouped_by_labels.items(): + amount_parent_labels = 0 + parent_labels = [] # will be a list of sets + for topic in group: + amount_parent_labels = max(topic["amount_parent_labels"], amount_parent_labels) + parent_labels.append(topic["parent_labels"]) + # - here, nan means "not yet evaluated" + # - removing nan from parent_labels + sorted_clusters.append({ + "amount_parent_labels": amount_parent_labels, + "parent_labels": [x 
for x in parent_labels if isinstance(x, set)],
+                "is_valid": np.nan,
+                "label": label
+            })
+        # start with the most significant core
+        sorted_clusters = sorted(sorted_clusters, key=lambda cluster: cluster["amount_parent_labels"], reverse=True)
 
         # APPLYING THE RULES 1 to 3
         # iterate over the cluster labels, see which clusters/labels
         # are valid to cause the creation of a stable topic
-        for label in sorted_labels.index:
-            # sorted_labels.index example: Int64Index([0, 1, 2, 3, 4, 5], dtype='int64', name='labels')
+        for i, cluster in enumerate(sorted_clusters):
+            label = cluster["label"]
             # label is iterating over 0, 1, 3, ...
 
             # 1. rule - remove if the cores in the cluster have no parents
-            if sorted_labels.loc[label].amount_parent_labels == 0:
+            if cluster["amount_parent_labels"] == 0:
                 # label is NOT VALID
-                sorted_labels.loc[label, "is_valid"] = False
-                sorted_labels.parent_labels.apply(
-                    lambda parent_labels: remove_label_from_sorted_labels(label, parent_labels))
+                cluster["is_valid"] = False
+                # removes a label from every set in parent_labels for each core
+                for a in sorted_clusters:
+                    if label in a["parent_labels"]:
+                        a["parent_labels"].remove(label)
 
             # 2. rule - remove if it has less than min_cores as parents
             # (parents are always also cores)
-            if len(sorted_labels.loc[label, "parent_labels"]) < min_cores:
-                sorted_labels.loc[label, "is_valid"] = False
-                sorted_labels.parent_labels.apply(
-                    lambda parent_labels: remove_label_from_sorted_labels(label, parent_labels))
-
+            if len(cluster["parent_labels"]) < min_cores:
+                cluster["is_valid"] = False
+                # removes a label from every set in parent_labels for each core
+                for a in sorted_clusters:
+                    if label in a["parent_labels"]:
+                        a["parent_labels"].remove(label)
+
             # 3. checking for "easy_valid"s
             # checks if the core has at least min_cores of cores with the only label as itself
-            if sorted_labels.loc[label].amount_parent_labels >= 1:
-                if sum(map(lambda x: x == {label}, sorted_labels.loc[label, "parent_labels"])) >= min_cores:
-                    sorted_labels.loc[label, "is_valid"] = True
+            if cluster["amount_parent_labels"] >= 1:
+                if sum(map(lambda x: x == {label}, cluster["parent_labels"])) >= min_cores:
+                    cluster["is_valid"] = True
 
         # Reapplying the rule 3
-        # this is NOT DETERMINISTIC, the order will influence the result
         # this happens when we have a close relationship among 2 or more clusters
-        for label in sorted_labels[sorted_labels.is_valid.isnull()].index:
-            # 3. checking for "easy_valid"s
-            # checks if the core has at least min_cores of cores with the only label as itself
-            if sum(map(lambda x: x == {label}, sorted_labels.loc[label, "parent_labels"])) >= min_cores:
-                sorted_labels.loc[label, "is_valid"] = True
+        for cluster in [cluster for cluster in sorted_clusters if np.isnan(cluster["is_valid"])]:
+
+            # this checks for parent_labels, which are also modified in this function
+            # hence order will influence the result. 
it starts with the most significant + # label (hence "sorted_clusters") + if sum(map(lambda x: x == {label}, cluster["parent_labels"])) >= min_cores: + cluster["is_valid"] = True else: - sorted_labels.loc[label, "is_valid"] = False - sorted_labels.parent_labels.apply( - lambda parent_labels: remove_label_from_sorted_labels(label, parent_labels)) - - self.sorted_labels = sorted_labels - - valid_labels = self.sorted_labels[self.sorted_labels.is_valid].index.values - - self.cluster_model.results["valid_parents"] = \ - self.cluster_model.results.parent_labels.apply(keep_only_valid_labels) - - # VALIDATING CORES - # only cores with the valid_parents as its own label can be a valid core - self.cluster_model.results["is_valid_core"] = self.cluster_model.results.apply(validate_core, axis=1) - - # numpy solution + cluster["is_valid"] = False + # removes a label from every set in parent_labels for each core + for a in sorted_clusters: + if label in a["parent_labels"]: + a["parent_labels"].remove(label) + + # list of all the label numbers that are valid + valid_labels = np.array([cluster["label"] for cluster in sorted_clusters if cluster["is_valid"]]) + + for i in results: + # TODO don't even have nan values, have {}/empty-sets instead + if not isinstance(results[i]["parent_labels"], set): + results[i]["parent_labels"] = set() + results[i]["valid_parents"] = set() + else: + results[i]["valid_parents"] = {label for label in results[i]["parent_labels"] if label in valid_labels} # keeping only VALID cores - valid_core_mask = self.cluster_model.results.is_valid_core.values + valid_core_mask = np.vectorize(StableTopicHelpers.validate_core)([results[i] for i in results]) valid_topics = self.ttda[valid_core_mask] - topic_labels = self.cluster_model.results.labels.values[valid_core_mask] + topic_labels = np.array([results[i]['labels'] for i in results])[valid_core_mask] unique_labels = np.unique(topic_labels) num_topics = len(unique_labels) @@ -1162,6 +1128,7 @@ def validate_core(core): topics_of_cluster = np.array([topic for t, topic in enumerate(valid_topics) if topic_labels[t] == label]) stable_topics[l] = topics_of_cluster.mean(axis=0) + self.sorted_clusters = sorted_clusters self.stable_topics = stable_topics def recluster(self, eps=0.1, min_samples=None, min_cores=None): @@ -1234,6 +1201,86 @@ def print_topics(self, *posargs, **kwArgs): def id2word(self): return self.gensim_kw_args["id2word"] +class StableTopicHelpers(): + """since there are so many helper functions involved, + a class is to be prefered over functions in functions + to be able to run unit tests on them individually""" + + @staticmethod + def remove_label_from_colum(label, valid_parent_labels): + """ + Parameters + ------- + valid_parent_labels : set + label : number + + Example + ------- + - label = 0 + - valid_parent_labels = {0, 1} + - result: {1} + will overwrite valid_parent_labels + + """ + try: + valid_parent_labels.remove(label) + except AttributeError: + # parent_labes is NaN + pass + except KeyError: + # Key does not exist + pass + + @staticmethod + def remove_from_all_sets(label, grouped_parent_labels): + """ + Example + ------- + - label = 2 + - grouped_parent_labels = [{1, 2}, {1, 2, 3}] + - result: [{1}, {1, 3}] + will overwrite grouped_parent_labels + + """ + for valid_parent_labels in grouped_parent_labels: + StableTopicHelpers.remove_label_from_colum(label, valid_parent_labels) + # valid_parent_labels.remove(label) + + @staticmethod + def len_of_parent_labels(parent_label): + """ + + """ + if not 
isinstance(parent_label, set): + return 0 + else: + return len(parent_label) + + @staticmethod + def keep_only_valid_labels(parent_labels, valid_labels): + if not isinstance(parent_labels, set): + return set() + else: + return {label for label in parent_labels if label in valid_labels} + + @staticmethod + def validate_core(core): + """ + + Core is a dict of {is_core, valid_parents, labels} among others + + If not a core returns False + + Only cores with the valid_parents as its own label can be a valid core. Returns + True if that is the case, False otherwise + + """ + # print('validate_core', core) + if not core["is_core"]: + return False + else: + ret = core["valid_parents"] == {core["labels"]} + return ret class CBDBSCAN(): From 2af165863519dd58d543fa5d9e76299c7b59efa9 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 7 Apr 2019 14:17:03 +0200 Subject: [PATCH 005/166] pandas -> numpy: distance matrix creation --- gensim/models/ensemblelda.py | 82 ++++++++++++------------------------ 1 file changed, 26 insertions(+), 56 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index a12d880483..e6b70804b6 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -628,7 +628,6 @@ def generate_topic_models_multiproc(self, num_models, ensemble_workers): # worker, that trains a bunch of models: def worker(id, num_models, random_states, pipe): - # print(id,"training") # since models cannot be piped to the parent because messages need to be pickled for that: # memory_friendly_ttda needs to be true, which will not store ensemble submodels logger.info("Spawned worker to generate {} topic models...".format(num_models)) @@ -702,7 +701,6 @@ def worker(id, num_models, random_states, pipe): p[0].close() # this does basically the same as the generate_topic_models function (concatenate all the ttdas): self.ttda = np.concatenate([self.ttda, answer[1]]) - # print(answer[0], "received and ttda concatenated") # in [0] the id of the worker that sent the data is stored else: @@ -812,7 +810,6 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= # https://stackoverflow.com/a/1743350 def worker(worker_id, ttdas_sent, n_ttdas, pipe): logger.info("Spawned worker to generate {} rows of the asymmetric distance matrix...".format(n_ttdas)) - # print(id, "calculating") # the chunk of ttda that's going to be calculated: ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas] distance_chunk = self.calculate_asymmetric_distance_matrix_chunk(ttda1=ttda1, ttda2=self.ttda, @@ -912,12 +909,15 @@ def calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, st # select masking method: def mass_masking(a): - """original masking method. returns a binary mask""" - return (a.sort_values(by=0, ascending=False).cumsum() < threshold).sort_index()[0].values - - def rank_masking(a): # faster - """faster masking method. returns a mask that contains indices to keep""" - return a.sort_values(by=0, ascending=False)[0:int(len(a) * threshold)].index + """original masking method. returns a new binary mask""" + sorted_a = np.sort(a)[::-1] + largest_mass = sorted_a.cumsum() < threshold + smallest_valid = sorted_a[largest_mass][-1] + return a >= smallest_valid + + def rank_masking(a): + """faster masking method. 
returns a new binary mask""" + return a > np.sort(a)[::-1][int(len(a) * threshold)] create_mask = {"mass": mass_masking, "rank": rank_masking}[method] @@ -927,11 +927,10 @@ def rank_masking(a): # faster # now iterate over each topic for i in range(len(ttda1)): - # prepare that topic and create the mask from it - a = pd.DataFrame(ttda1[i]) # create mask from a, that removes noise from a and keeps the largest terms + a = ttda1[i] mask = create_mask(a) - a_masked = a[0][mask].values + a_masked = a[mask] # now look at every possible pair for topic a: for j in range(len(ttda2)): @@ -944,10 +943,16 @@ def rank_masking(a): # faster # now mask b based on a, which will force the shape of a onto b b_masked = ttda2[j][mask] + # TODO instaed of checking <= 0.05 and cosine, rather make create_mask(b), + # then check how much of mask_a is covered by mask_b. By doing so, the check + # for <= 0.05 falls away which 1. seems rather like a magic number guess + # and 2. jumps the distance to 1 suddenly when this threshold is not reached + # Do experiments for this. + distance = 0 # is the masked b just some empty stuff? Then forget about it, no similarity, distance is 1 # (not 2 because that correspondsto negative values. The maximum distance is 1 here) - # don't normalize b_masked, otherwise the following threshold will never: + # don't normalize b_masked, otherwise the following threshold will never work: if b_masked.sum() <= 0.05: distance = 1 else: @@ -1108,6 +1113,8 @@ def generate_stable_topics(self, min_cores=None): for i in results: # TODO don't even have nan values, have {}/empty-sets instead if not isinstance(results[i]["parent_labels"], set): + # TODO remove that: + print('TODO: nan found in parent_labels:', results[i]) results[i]["parent_labels"] = set() results[i]["valid_parents"] = set() else: @@ -1202,49 +1209,10 @@ def id2word(self): return self.gensim_kw_args["id2word"] class StableTopicHelpers(): - """since there are so many helper functions involved, + """since there are so many helper functions involved + (EDIT: there were more of them before I improved things), a class is to be prefered over functions in functions - to be able to run unit tests on them individually""" - - @staticmethod - def remove_label_from_colum(label, valid_parent_labels): - """ - Parameters - ------- - valid_parent_labels : set - label : number - - Example - ------- - - label = 0 - - valid_parent_labels = {0, 1} - - result: {1} - will overwrite valid_parent_labels - - """ - try: - valid_parent_labels.remove(label) - except AttributeError: - # parent_labes is NaN - pass - except KeyError: - # Key does not exist - pass - - @staticmethod - def remove_from_all_sets(label, grouped_parent_labels): - """ - Example - ------- - - label = 2 - - grouped_parent_labels = [{1, 2}, {1, 2, 3}] - - result: [{1}, {1, 3}] - will overwrite grouped_parent_labels - - """ - for valid_parent_labels in grouped_parent_labels: - StableTopicHelpers.remove_label_from_colum(label, valid_parent_labels) - # valid_parent_labels.remove(label) + to be able to run unit tests on them individually (TODO)""" @staticmethod def len_of_parent_labels(parent_label): @@ -1252,6 +1220,9 @@ def len_of_parent_labels(parent_label): """ if not isinstance(parent_label, set): + # TODO make sure this holds always true. either an empty set + # or a populated set, but no weird stuff pls + print('TODO: parent_label should be a set! 
but is', parent_label) return 0 else: return len(parent_label) @@ -1275,7 +1246,6 @@ def validate_core(core): True if that is the case, False otherwise """ - # print('validate_core', core) if not core["is_core"]: return False else: From 100bbf0cbef105e22b1851a3591e26ca9ffed117 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 7 Apr 2019 17:00:08 +0200 Subject: [PATCH 006/166] pandas -> numpy: CBDBSCAN --- gensim/models/ensemblelda.py | 176 +++++++++++++---------------------- 1 file changed, 64 insertions(+), 112 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index e6b70804b6..116d9d8070 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -83,13 +83,11 @@ import logging import os from multiprocessing import Process, Pipe, ProcessError -import copy try: import cPickle as _pickle except ImportError: import pickle as _pickle import numpy as np -import pandas as pd from scipy.spatial.distance import cosine from gensim import utils from gensim.models import ldamodel, ldamulticore, basemodel @@ -360,7 +358,7 @@ def add_model(self, target, num_new_models=None): In order to generate new stable topics afterwards, use self.generate_asymmetric_distance_matrix() self.recluster() - + The ttda of another ensemble can also be used, in that case set num_new_models to the num_models parameter of the ensemble, that means the number @@ -453,8 +451,8 @@ def add_model(self, target, num_new_models=None): else: self.num_models += num_new_models - else: # memory unfriendly ensembles - + else: # memory unfriendly ensembles + ttda = [] # 1. ttda array @@ -483,7 +481,7 @@ def add_model(self, target, num_new_models=None): self.num_models = len(self.tms) logger.info("Ensemble contains {} models and {} topics now".format(self.num_models, len(self.ttda))) - + if self.ttda.shape[1] != ttda.shape[1]: raise ValueError(("target ttda dimensions do not match. Topics must be {} but was" "{} elements large").format(self.ttda.shape[-1], ttda.shape[-1])) @@ -750,7 +748,7 @@ def generate_topic_models(self, num_models, random_states=None): # adds the lambda (that is the unnormalized get_topics) to ttda, which is # a list of all those lambdas self.ttda = np.concatenate([self.ttda, tm.get_topics()]) - + # only saves the model if it is not "memory friendly" if not self.memory_friendly_ttda: self.tms += [tm] @@ -793,7 +791,8 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= # singlecore: if workers is not None and workers <= 1: logger.info("Generating a {} x {} asymmetric distance matrix...".format(len(self.ttda), len(self.ttda))) - self.asymmetric_distance_matrix = self.calculate_asymmetric_distance_matrix_chunk(ttda1=self.ttda, ttda2=self.ttda, + self.asymmetric_distance_matrix = self.calculate_asymmetric_distance_matrix_chunk(ttda1=self.ttda, + ttda2=self.ttda, threshold=threshold, start_index=0, method=method) @@ -914,7 +913,7 @@ def mass_masking(a): largest_mass = sorted_a.cumsum() < threshold smallest_valid = sorted_a[largest_mass][-1] return a >= smallest_valid - + def rank_masking(a): """faster masking method. 
returns a new binary mask""" return a > np.sort(a)[::-1][int(len(a) * threshold)] @@ -1020,23 +1019,18 @@ def generate_stable_topics(self, min_cores=None): min_cores = min(3, max(1, int(self.num_models / 4 + 1))) # counts how many different labels a core has as parents - results = self.cluster_model.results.to_dict('index') + results = self.cluster_model.results for topic in results.values(): if topic["is_core"]: - topic["amount_parent_labels"] = StableTopicHelpers.len_of_parent_labels(topic["parent_labels"]) - - # TODO write this back to self.cluster_model.results - # it just adds another column, so that's safe - # but atm cluster_model.results is a dataframe so first I need to - # remove pandas from the cluster_model + topic["amount_parent_labels"] = len(topic["parent_labels"]) # first, group all the learned cores based on the label, # which was assigned in the cluster_model grouped_by_labels = {} for topic in results.values(): if topic["is_core"]: - label = topic["labels"] # "labels", but it actually means just a single number - if not label in grouped_by_labels: + label = topic["label"] + if label not in grouped_by_labels: grouped_by_labels[label] = [] grouped_by_labels[label].append(topic) @@ -1045,7 +1039,7 @@ def generate_stable_topics(self, min_cores=None): sorted_clusters = [] for label, group in grouped_by_labels.items(): amount_parent_labels = 0 - parent_labels = [] # will be a list of sets + parent_labels = [] # will be a list of sets for topic in group: amount_parent_labels = max(topic["amount_parent_labels"], amount_parent_labels) parent_labels.append(topic["parent_labels"]) @@ -1053,13 +1047,19 @@ def generate_stable_topics(self, min_cores=None): # - removing nan from parent_labels sorted_clusters.append({ "amount_parent_labels": amount_parent_labels, - "parent_labels": [x for x in parent_labels if isinstance(x, set)], + "parent_labels": [x for x in parent_labels if len(x) > 0], "is_valid": np.nan, "label": label }) # start with the most significant core sorted_clusters = sorted(sorted_clusters, key=lambda cluster: cluster["amount_parent_labels"], reverse=True) + def remove_from_all_sets(label): + # removes a label from every set in parent_labels for each core + for a in sorted_clusters: + if label in a["parent_labels"]: + a["parent_labels"].remove(label) + # APPLYING THE RULES 1 to 3 # iterate over the cluster labels, see which clusters/labels # are valid to cause the creation of a stable topic @@ -1071,20 +1071,14 @@ def generate_stable_topics(self, min_cores=None): if cluster["amount_parent_labels"] == 0: # label is NOT VALID cluster["is_valid"] = False - # removes a label from every set in parent_labels for each core - for a in sorted_clusters: - if label in a["parent_labels"]: - a["parent_labels"].remove(label) + remove_from_all_sets(label) # 2. rule - remove if it has less than min_cores as parents # (parents are always also cores) if len(cluster["parent_labels"]) < min_cores: cluster["is_valid"] = False - # removes a label from every set in parent_labels for each core - for a in sorted_clusters: - if label in a["parent_labels"]: - a["parent_labels"].remove(label) - + remove_from_all_sets(label) + # 3. 
checking for "easy_valid"s # checks if the core has at least min_cores of cores with the only label as itself if cluster["amount_parent_labels"] >= 1: @@ -1094,7 +1088,7 @@ def generate_stable_topics(self, min_cores=None): # Reaplying the rule 3 # this happens when we have a close relationship among 2 or more clusters for cluster in [cluster for cluster in sorted_clusters if np.isnan(cluster["is_valid"])]: - + # this checks for parent_labels, which are also modified in this function # hence order will influence the result. it starts with the most significant # label (hence "sorted_clusters") @@ -1111,23 +1105,27 @@ def generate_stable_topics(self, min_cores=None): valid_labels = np.array([cluster["label"] for cluster in sorted_clusters if cluster["is_valid"]]) for i in results: - # TODO don't even have nan values, have {}/empty-sets instead - if not isinstance(results[i]["parent_labels"], set): - # TODO remove that: - print('TODO: nan found in parent_labels:', results[i]) - results[i]["parent_labels"] = set() - results[i]["valid_parents"] = set() + results[i]["valid_parents"] = {label for label in results[i]["parent_labels"] if label in valid_labels} + + def validate_core(core): + """Core is a dict of {is_core, valid_parents, labels} among others. + If not a core returns False. Only cores with the valid_parents as + its own label can be a valid core. Returns True if that is the case, + False otherwise""" + if not core["is_core"]: + return False else: - results[i]["valid_parents"] = {label for label in results[i]["parent_labels"] if label in valid_labels} + ret = core["valid_parents"] == {core["label"]} + return ret # keeping only VALID cores - valid_core_mask = np.vectorize(StableTopicHelpers.validate_core)([results[i] for i in results]) + valid_core_mask = np.vectorize(validate_core)(list(results.values())) valid_topics = self.ttda[valid_core_mask] - topic_labels = np.array([results[i]['labels'] for i in results])[valid_core_mask] + topic_labels = np.array([results[i]["label"] for i in results])[valid_core_mask] unique_labels = np.unique(topic_labels) - num_topics = len(unique_labels) - stable_topics = np.empty((num_topics, len(self.id2word)), dtype=np.float32) + num_stable_topics = len(unique_labels) + stable_topics = np.empty((num_stable_topics, len(self.id2word)), dtype=np.float32) # for each cluster for l, label in enumerate(unique_labels): @@ -1208,49 +1206,6 @@ def print_topics(self, *posargs, **kwArgs): def id2word(self): return self.gensim_kw_args["id2word"] -class StableTopicHelpers(): - """since there are so many helper functions involved - (EDIT: there were more of them before I improved things), - a class is to be prefered over functions in functions - to be able to run unit tests on them individually (TODO)""" - - @staticmethod - def len_of_parent_labels(parent_label): - """ - - """ - if not isinstance(parent_label, set): - # TODO make sure this holds always true. either an empty set - # or a populated set, but no weird stuff pls - print('TODO: parent_label should be a set! but is', parent_label) - return 0 - else: - return len(parent_label) - - @staticmethod - def keep_only_valid_labels(parent_labels, valid_labels): - if not isinstance(parent_labels, set): - return set() - else: - return {label for label in parent_labels if label in valid_labels} - - @staticmethod - def validate_core(core): - """ - - Core is a dict of {is_core, valid_parents, labels} among others - - If not a core returns False - - Only cores with the valid_parents as its own label can be a valid core. 
Returns - True if that is the case, False otherwise - - """ - if not core["is_core"]: - return False - else: - ret = core["valid_parents"] == {core["labels"]} - return ret class CBDBSCAN(): @@ -1262,39 +1217,40 @@ def __init__(self, eps=0.1, min_samples=4): def fit(self, amatrix): self.next_label = 0 - self.results = pd.DataFrame(index=range(len(amatrix)), - columns=["labels", "is_core", - "parent_ids", "parent_labels", "num_samples"]) - self.results.is_core = False - # otherwise sets cannot be written into the dataframe (will be converted to integers). - # The dtype should be object for everything to fix this. - self.results = self.results.astype(dtype="object") + results = {index: { + "is_core": False, + "parent_labels": set(), + "parent_ids": set(), + "num_samples": 0, + "label": None + } for index in range(len(amatrix))} tmp_amatrix = amatrix.copy() # to avoid problem about comparing the topic with itself np.fill_diagonal(tmp_amatrix, 1) - min_distance_per_topic = pd.Series(tmp_amatrix.min(axis=1)) - - min_distance_per_topic_sorted = min_distance_per_topic.sort_values() + min_distance_per_topic = [(index, distance) for index, distance in enumerate(tmp_amatrix.min(axis=1))] + min_distance_per_topic_sorted = sorted(min_distance_per_topic, key=lambda x: x[1]) + ordered_min_similarity = [index for index, distance in min_distance_per_topic_sorted] - ordered_min_similarity = list(min_distance_per_topic_sorted.index.values) + num_topics = len(amatrix) def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors=None): # count how many neighbors - neighbors = pd.Series(tmp_amatrix[topic_index] < self.eps) - num_samples = sum(neighbors) + # check which indices (arange 0 to num_topics) is closer than eps + neighbors = np.arange(num_topics)[tmp_amatrix[topic_index] < self.eps] + num_samples = len(neighbors) # If the number of neighbors of a topic is large enough, # it is considered a core. # This also takes neighbors that already are identified as core in count. if num_samples >= self.min_samples: # This topic is a core! 
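                # (A tiny made-up example of the neighborhood test above: with
                # eps = 0.1 and tmp_amatrix[topic_index] = [1.0, 0.05, 0.3, 0.08],
                # the comparison against eps yields neighbors = [1, 3], so
                # num_samples = 2 and this branch is taken only if min_samples <= 2.)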
- self.results.loc[topic_index, "is_core"] = True - self.results.loc[topic_index, "num_samples"] = num_samples + results[topic_index]["is_core"] = True + results[topic_index]["num_samples"] = num_samples # if current_label is none, then this is the first core # of a new cluster (hence next_label) @@ -1313,7 +1269,7 @@ def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors # parent neighbors is the list of neighbors of parent_id # (the topic_index that called this function recursively) - all_members_of_current_cluster = list(parent_neighbors[parent_neighbors].index.values) + all_members_of_current_cluster = list(parent_neighbors) all_members_of_current_cluster.append(parent_id) # look if 25% of the members of the current cluster are also @@ -1332,11 +1288,11 @@ def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors current_label = self.next_label self.next_label += 1 - self.results.loc[topic_index, "labels"] = current_label + results[topic_index]["label"] = current_label - for neighbor in neighbors[neighbors].index: + for neighbor in neighbors: - if self.results.loc[neighbor, "labels"] is np.nan: + if results[neighbor]["label"] is None: ordered_min_similarity.remove(neighbor) # try to extend the cluster into the direction # of the neighbor @@ -1344,23 +1300,19 @@ def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors current_label=current_label, parent_neighbors=neighbors) - if isinstance(self.results.loc[neighbor, "parent_ids"], set): - self.results.loc[neighbor, "parent_ids"].add(topic_index) - self.results.loc[neighbor, "parent_labels"].add(current_label) - else: - self.results.loc[neighbor, "parent_ids"] = set([topic_index]) - self.results.loc[neighbor, "parent_labels"] = set([current_label]) + results[neighbor]["parent_ids"].add(topic_index) + results[neighbor]["parent_labels"].add(current_label) else: # this topic is not a core! 
if current_label is None: - self.results.loc[topic_index, "labels"] = -1 + results[topic_index]["label"] = -1 else: - self.results.loc[topic_index, "labels"] = current_label + results[topic_index]["label"] = current_label # elements are going to be removed from that array in scan_topic, do until it is empty while len(ordered_min_similarity) != 0: next_topic_index = ordered_min_similarity.pop(0) scan_topic(next_topic_index) - self.labels_ = self.results["labels"] + self.results = results From aff3287c786ee29947d31754d207250e4d9f59a2 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 7 Apr 2019 17:32:23 +0200 Subject: [PATCH 007/166] fixes for automated checks --- gensim/corpora/opinosiscorpus.py | 11 +++++------ gensim/models/ensemblelda.py | 3 ++- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/gensim/corpora/opinosiscorpus.py b/gensim/corpora/opinosiscorpus.py index eee2229849..446c5e7804 100644 --- a/gensim/corpora/opinosiscorpus.py +++ b/gensim/corpora/opinosiscorpus.py @@ -4,11 +4,10 @@ # Author: Tobias B import os -from pathlib import Path import re from gensim.corpora import Dictionary -from nltk.stem import PorterStemmer -from nltk.corpus import stopwords +from gensim.parsing.porter import PorterStemmer +from gensim.parsing.preprocessing import STOPWORDS class OpinosisCorpus(): @@ -49,7 +48,7 @@ def __init__(self, path): print("year:\t\t2010") print("organization:\tAssociation for Computational Linguistics") - path = Path(path).joinpath("summaries-gold") + path = os.path.join(path, "summaries-gold") dictionary = Dictionary() corpus = [] stemmer = PorterStemmer() @@ -76,8 +75,8 @@ def __init__(self, path): # overwrite dictionary, add corpus to test or train afterwards # the following function takes an array of documents, so wrap it in square braces processed = [ - stemmer.stem(token) for token in re.findall('\w+', doc.lower()) - if token not in stopwords.words('english') + stemmer.stem(token) for token in re.findall(r'\w+', doc.lower()) + if token not in STOPWORDS ] dictionary.add_documents([processed]) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 116d9d8070..e2a439f077 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -17,7 +17,7 @@ >>> from gensim.test.utils import common_texts >>> from gensim.corpora.dictionary import Dictionary - >>> from gensim.models import EnsembleLda, LdaModel + >>> from gensim.models import EnsembleLda >>> >>> # Create a corpus from a list of texts >>> common_dictionary = Dictionary(common_texts) @@ -59,6 +59,7 @@ .. sourcecode:: pycon + >>> from gensim.models import LdaModel >>> elda.add_model(LdaModel(common_corpus, id2word=common_dictionary, num_topics=10)) >>> elda.recluster() >>> vector = elda[unseen_doc] From a545ddf1a12aca42e346e55a8ac8c0b13a17ddd6 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 8 Apr 2019 21:24:23 +0200 Subject: [PATCH 008/166] improvements on logs, comments and variable naming. Changed save function to simply pickling the whole thing. 
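A minimal round-trip sketch of the pickle-based persistence introduced here (the file name is made up):

.. sourcecode:: pycon

    >>> elda.save("elda.pickle")
    >>> loaded = EnsembleLda.load("elda.pickle")
    >>> assert (loaded.get_topics() == elda.get_topics()).all()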
--- gensim/models/ensemblelda.py | 198 ++++++++++++++--------------------- 1 file changed, 78 insertions(+), 120 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index e2a439f077..b8cc0b9e9c 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -229,6 +229,7 @@ def __init__(self, topic_model_kind="lda", num_models=3, if "passes" in gensim_kw_args and gensim_kw_args["passes"] <= 0: return + logger.info("Generating {} topic models...".format(num_models)) # training if ensemble_workers > 1: # multiprocessed @@ -275,29 +276,29 @@ def generate_gensim_representation(self): logger.info("Generating classic gensim model representation based on results from the ensemble") - numberOfWords = self.sstats_sum - # if numberOfWords should be wrong for some fantastic funny reason + number_of_words = self.sstats_sum + # if number_of_words should be wrong for some fantastic funny reason # that makes you want to peel your skin off, recreate it (takes a while): - if numberOfWords == 0 and "corpus" in self.gensim_kw_args and not self.gensim_kw_args["corpus"] is None: + if number_of_words == 0 and "corpus" in self.gensim_kw_args and not self.gensim_kw_args["corpus"] is None: for document in self.gensim_kw_args["corpus"]: for token in document: - numberOfWords += token[1] - self.sstats_sum = numberOfWords - assert numberOfWords != 0 + number_of_words += token[1] + self.sstats_sum = number_of_words + assert number_of_words != 0 - stableTopics = self.get_topics() + stable_topics = self.get_topics() - nStableTopics = len(stableTopics) + num_stable_topics = len(stable_topics) - if nStableTopics == 0: - logger.warning("The model did not detect any stable topic. You can try to adjust epsilon: " - "generate_topic_clusters(eps); generate_stable_topics()") + if num_stable_topics == 0: + logger.error("The model did not detect any stable topic! You can try to adjust epsilon: " + "recluster(eps=...)") return # create a new gensim model params = self.gensim_kw_args.copy() params["eta"] = self.eta - params["num_topics"] = nStableTopics + params["num_topics"] = num_stable_topics # adjust params in a way that no training happens params["iterations"] = 0 # no training params["passes"] = 0 # no training @@ -309,13 +310,13 @@ def generate_gensim_representation(self): # the following is important for the denormalization # to generate the proper sstats for the new gensim model: - # transform to dimensionality of stableTopics. axis=1 is summed + # transform to dimensionality of stable_topics. axis=1 is summed eta_sum = 0 if int == type(eta) or float == type(eta): - eta_sum = [eta * len(stableTopics[0])] * nStableTopics + eta_sum = [eta * len(stable_topics[0])] * num_stable_topics else: if len(eta.shape) == 1: # [e1, e2, e3] - eta_sum = [[eta.sum()]] * nStableTopics + eta_sum = [[eta.sum()]] * num_stable_topics if len(eta.shape) > 1: # [[e11, e12, ...], [e21, e22, ...], ...] eta_sum = np.array(eta.sum(axis=1)[:, None]) @@ -328,18 +329,11 @@ def generate_gensim_representation(self): # right sstats, so that get_topics() will return the stable topics no # matter eta. - normalization_factor = np.array([[numberOfWords / nStableTopics]] * nStableTopics) + eta_sum - - if stableTopics.shape[1] != len(eta): - raise ValueError("If load() was used: was the correct dictionary used when loading the model? 
" - "because stableTopics.shape[0] {} != len(eta) {}".format(stableTopics.shape[1], len(eta))) + normalization_factor = np.array([[number_of_words / num_stable_topics]] * num_stable_topics) + eta_sum - sstats = stableTopics * normalization_factor + sstats = stable_topics * normalization_factor sstats -= eta - # Note that this will cause some stuff to fail if nStableTopics is 1 - # if nStableTopics is 1, the result will be (close to) uniform anyway - # inserting the new sstats classic_model_representation.state.sstats = sstats.astype("float32") # fix expElogbeta. classic_model_representation.sync_state() @@ -458,9 +452,9 @@ def add_model(self, target, num_new_models=None): # 1. ttda array if isinstance(target.dtype.type(), (np.number, float)): - raise ValueError('ttda arrays cannot be added to ensembles that are not memory friendly, ' - 'you can call convert_to_memory_friendly to discard the stored gensim models' - 'and only keep the ttdas') + raise ValueError('ttda arrays cannot be added to ensembles, for which memory_friendly_ttda=False, ' + 'you can call convert_to_memory_friendly, but it will discard the stored gensim' + 'models and only keep the relevant topic term distributions from them.') # 2. list of ensembles elif isinstance(target[0], type(self)): @@ -479,6 +473,9 @@ def add_model(self, target, num_new_models=None): # in this case, len(self.tms) should # always match self.num_models + if num_new_models is not None and num_new_models + self.num_models != len(self.tms): + logger.info('num_new_models will be ignored. num_models should match the number of ' + 'stored models for a memory unfriendly ensemble') self.num_models = len(self.tms) logger.info("Ensemble contains {} models and {} topics now".format(self.num_models, len(self.ttda))) @@ -501,37 +498,10 @@ def save(self, fname): """ - # parameter descriptions are similar to [1] in - # order to make the api less confusing - - gensim_kw_args_persist = self.gensim_kw_args.copy() - - payload = { - "ensemble_kw_args": { - "topic_model_kind": self.topic_model_kind, - "memory_friendly_ttda": self.memory_friendly_ttda, - "num_models": self.num_models - }, - "gensim_kw_args": gensim_kw_args_persist, - "ttda": self.ttda, - "amatrix": self.asymmetric_distance_matrix, - "stable_topics": self.stable_topics, - "cluster_model.results": self.cluster_model.results, - "sstats_sum": self.sstats_sum, - "id2word": self.id2word - } - - if not self.memory_friendly_ttda: - payload["topicModels"] = self.tms - - # Version of the save/load api, to eventually make load - # backwards compatible if the save-function changes. - payload["version"] = "1.0" + logger.info("saving %s object to %d", self.__class__.__name__, fname) with open(fname, 'wb') as f: - _pickle.dump(payload, f) - - logger.info("saved %s object", self.__class__.__name__) + _pickle.dump(self, f) @staticmethod def load(fname): @@ -552,45 +522,7 @@ def load(fname): logger.info("loading %s object from %s", EnsembleLda.__name__, fname) with open(fname, 'rb') as f: - data = _pickle.load(f) - - # update() on the dict can be safely used, because they don't have - # keywords in common that overwrite each other - kwArgs = data["ensemble_kw_args"] - kwArgs.update(data["gensim_kw_args"]) - - kwArgs["id2word"] = data["id2word"] - # prevent training - kwArgs["iterations"] = 0 - - # now reconstruct an ensemble LDA object based on the data: - eLDA = EnsembleLda(**kwArgs) - # since no training will take place, num_models will be set to 0. 
restore: - eLDA.num_models = kwArgs["num_models"] - - # some stuff that was not created automatically, because the "empty" eLDA object did not train - eLDA.ttda = data["ttda"] - eLDA.asymmetric_distance_matrix = data["amatrix"] - eLDA.sstats_sum = data["sstats_sum"] - - # depends on memory_friendly_ttda in the save function: - if "topicModels" in data: - eLDA.tms = [] - for i in range(len(data["topicModels"])): - eLDA.tms += [data["topicModels"][i]] - - # As long as fit is not called on cluster_model, no results will be generated, - # so the CBDBSCAN constructor is safe to call without consuming cpu time. - # No need to provide the original params from the constructor, the result are - # going to be written into the model from the pickle. - eLDA.cluster_model = CBDBSCAN() - eLDA.cluster_model.results = data["cluster_model.results"] - eLDA.stable_topics = data["stable_topics"] - - eLDA.generate_gensim_representation() - - # message copied from [1] - logger.info("loaded %s", fname) + eLDA = _pickle.load(f) return eLDA @@ -629,7 +561,7 @@ def generate_topic_models_multiproc(self, num_models, ensemble_workers): def worker(id, num_models, random_states, pipe): # since models cannot be piped to the parent because messages need to be pickled for that: # memory_friendly_ttda needs to be true, which will not store ensemble submodels - logger.info("Spawned worker to generate {} topic models...".format(num_models)) + logger.info("Spawned worker to generate {} topic models".format(num_models)) self.generate_topic_models(num_models, random_states=random_states) # send the ttda that is in the child/workers version of the memory into the pipe # available, after generate_topic_models has been called in the worker @@ -734,8 +666,6 @@ def generate_topic_models(self, num_models, random_states=None): assert len(random_states) == num_models - logger.info("Generating {} topic models...".format(num_models)) - kwArgs = self.gensim_kw_args.copy() tm = None # remember one of the topic models from the following @@ -744,6 +674,7 @@ def generate_topic_models(self, num_models, random_states=None): for i in range(num_models): kwArgs["random_state"] = random_states[i] + tm = GENSIM_MODEL[self.topic_model_kind](**kwArgs) # adds the lambda (that is the unnormalized get_topics) to ttda, which is @@ -765,6 +696,9 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= Returns the asymmetric pairwise distance matrix that is used in the DBSCAN clustering. + Afterwards, the model needs to be reclustered for + this generated matrix to take effect. + Parameters ---------- threshold : float, optional @@ -789,9 +723,10 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= if threshold is None: threshold = {"mass": 0.95, "rank": 0.11}[method] + logger.info("Generating a {} x {} asymmetric distance matrix...".format(len(self.ttda), len(self.ttda))) + # singlecore: if workers is not None and workers <= 1: - logger.info("Generating a {} x {} asymmetric distance matrix...".format(len(self.ttda), len(self.ttda))) self.asymmetric_distance_matrix = self.calculate_asymmetric_distance_matrix_chunk(ttda1=self.ttda, ttda2=self.ttda, threshold=threshold, @@ -809,7 +744,7 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= # all other nodes from a chunk of nodes. 
# https://stackoverflow.com/a/1743350 def worker(worker_id, ttdas_sent, n_ttdas, pipe): - logger.info("Spawned worker to generate {} rows of the asymmetric distance matrix...".format(n_ttdas)) + logger.info("Spawned worker to generate {} rows of the asymmetric distance matrix".format(n_ttdas)) # the chunk of ttda that's going to be calculated: ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas] distance_chunk = self.calculate_asymmetric_distance_matrix_chunk(ttda1=ttda1, ttda2=self.ttda, @@ -921,6 +856,9 @@ def rank_masking(a): create_mask = {"mass": mass_masking, "rank": rank_masking}[method] + # some help to find a better threshold by useful log messages + avg_mask_size = 0 + # initialize the distance matrix. ndarray is faster than zeros distances = np.ndarray((len(ttda1), len(ttda2))) @@ -932,6 +870,8 @@ def rank_masking(a): mask = create_mask(a) a_masked = a[mask] + avg_mask_size += mask.sum() + # now look at every possible pair for topic a: for j in range(len(ttda2)): @@ -947,7 +887,7 @@ def rank_masking(a): # then check how much of mask_a is covered by mask_b. By doing so, the check # for <= 0.05 falls away which 1. seems rather like a magic number guess # and 2. jumps the distance to 1 suddenly when this threshold is not reached - # Do experiments for this. + # TODO Do experiments for this to see if it makes a lot of difference distance = 0 # is the masked b just some empty stuff? Then forget about it, no similarity, distance is 1 @@ -963,6 +903,17 @@ def rank_masking(a): distances[i][j] = distance + avg_mask_size = avg_mask_size / ttda1.shape[0] / ttda1.shape[1] + percent = round(100 * avg_mask_size, 1) + if avg_mask_size > 0.75: + # if the masks covered more than 75% of tokens on average, + # print a warning. info otherwise. + # The default mass setting of 0.95 makes uniform true masks on the opinosis corpus. 
+ logger.warning('The given threshold of {} covered on average ' + '{}% of tokens, which might be too much!'.format(threshold, percent)) + else: + logger.info('The given threshold of {} covered on average {}% of tokens'.format(threshold, percent)) + return distances def generate_topic_clusters(self, eps=0.1, min_samples=None): @@ -1019,17 +970,18 @@ def generate_stable_topics(self, min_cores=None): # min_cores is a number between 1 and 3, depending on the number of models min_cores = min(3, max(1, int(self.num_models / 4 + 1))) - # counts how many different labels a core has as parents results = self.cluster_model.results - for topic in results.values(): - if topic["is_core"]: - topic["amount_parent_labels"] = len(topic["parent_labels"]) # first, group all the learned cores based on the label, # which was assigned in the cluster_model grouped_by_labels = {} for topic in results.values(): if topic["is_core"]: + topic = topic.copy() + + # counts how many different labels a core has as parents + topic["amount_parent_labels"] = len(topic["parent_labels"]) + label = topic["label"] if label not in grouped_by_labels: grouped_by_labels[label] = [] @@ -1045,7 +997,7 @@ def generate_stable_topics(self, min_cores=None): amount_parent_labels = max(topic["amount_parent_labels"], amount_parent_labels) parent_labels.append(topic["parent_labels"]) # - here, nan means "not yet evaluated" - # - removing nan from parent_labels + # - removing empty sets from parent_labels sorted_clusters.append({ "amount_parent_labels": amount_parent_labels, "parent_labels": [x for x in parent_labels if len(x) > 0], @@ -1056,7 +1008,8 @@ def generate_stable_topics(self, min_cores=None): sorted_clusters = sorted(sorted_clusters, key=lambda cluster: cluster["amount_parent_labels"], reverse=True) def remove_from_all_sets(label): - # removes a label from every set in parent_labels for each core + """removes a label from every set in "parent_labels" for + each core in sorted_clusters.""" for a in sorted_clusters: if label in a["parent_labels"]: a["parent_labels"].remove(label) @@ -1167,6 +1120,7 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None): # if new models were added to the ensemble, the distance # matrix needs to be generated again if self.asymmetric_distance_matrix_outdated: + logger.info("asymmetric distance matrix is outdated due to add_model") self.generate_asymmetric_distance_matrix() self.generate_topic_clusters(eps, min_samples) @@ -1179,28 +1133,33 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None): def get_topics(self): return self.stable_topics + def has_gensim_representation(self): + """checks if stable topics area vailable and if the internal + gensim representation exists. 
If not, raises errors""" + if self.classic_model_representation is None: + if len(self.stable_topics) == 0: + raise ValueError("no stable topic was detected!") + else: + raise ValueError("use generate_gensim_representation() first") + def __getitem__(self, i): """see https://radimrehurek.com/gensim/models/ldamodel.html""" - if self.classic_model_representation is None: - raise ValueError("use generate_gensim_representation() first") + self.has_gensim_representation() return self.classic_model_representation[i] def inference(self, *posargs, **kwArgs): """see https://radimrehurek.com/gensim/models/ldamodel.html""" - if self.classic_model_representation is None: - raise ValueError("use generate_gensim_representation() first") + self.has_gensim_representation() return self.classic_model_representation.inference(*posargs, **kwArgs) def log_perplexity(self, *posargs, **kwArgs): """see https://radimrehurek.com/gensim/models/ldamodel.html""" - if self.classic_model_representation is None: - raise ValueError("use generate_gensim_representation() first") + self.has_gensim_representation() return self.classic_model_representation.log_perplexity(*posargs, **kwArgs) def print_topics(self, *posargs, **kwArgs): """see https://radimrehurek.com/gensim/models/ldamodel.html""" - if self.classic_model_representation is None: - raise ValueError("use generate_gensim_representation() first") + self.has_gensim_representation() return self.classic_model_representation.print_topics(*posargs, **kwArgs) @property @@ -1239,7 +1198,7 @@ def fit(self, amatrix): num_topics = len(amatrix) def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors=None): - + # count how many neighbors # check which indices (arange 0 to num_topics) is closer than eps neighbors = np.arange(num_topics)[tmp_amatrix[topic_index] < self.eps] @@ -1254,7 +1213,7 @@ def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors results[topic_index]["num_samples"] = num_samples # if current_label is none, then this is the first core - # of a new cluster (hence next_label) + # of a new cluster (hence next_label is used) if current_label is None: # next_label is initialized with 0 in # fit() for the first cluster @@ -1262,9 +1221,8 @@ def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors self.next_label += 1 else: - # In case the core has a parent, - # that means it has a label, namely the - # label from the parent. + # In case the core has a parent, that means + # it has a label (= the parents label). 
                # Check the distance between the
                # new core and the parent

From d1a6854a413a92e50a805998cf43e6be03415345 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Tue, 9 Apr 2019 00:17:09 +0200
Subject: [PATCH 009/166] minor fix in log message format

---
 gensim/models/ensemblelda.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index b8cc0b9e9c..7f3f071e77 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -498,7 +498,7 @@ def save(self, fname):
 
         """
 
-        logger.info("saving %s object to %d", self.__class__.__name__, fname)
+        logger.info("saving %s object to %s", self.__class__.__name__, fname)
 
         with open(fname, 'wb') as f:
             _pickle.dump(self, f)
@@ -883,12 +883,6 @@ def rank_masking(a):
                 # now mask b based on a, which will force the shape of a onto b
                 b_masked = ttda2[j][mask]
 
-                # TODO instaed of checking <= 0.05 and cosine, rather make create_mask(b),
-                # then check how much of mask_a is covered by mask_b. By doing so, the check
-                # for <= 0.05 falls away which 1. seems rather like a magic number guess
-                # and 2. jumps the distance to 1 suddenly when this threshold is not reached
-                # TODO Do experiments for this to see if it makes a lot of difference
-
                 distance = 0
                 # is the masked b just some empty stuff? Then forget about it, no similarity, distance is 1
                 # (not 2 because that correspondsto negative values. The maximum distance is 1 here)
@@ -978,7 +972,7 @@ def generate_stable_topics(self, min_cores=None):
         for topic in results.values():
             if topic["is_core"]:
                 topic = topic.copy()
-
+
                 # counts how many different labels a core has as parents
                 topic["amount_parent_labels"] = len(topic["parent_labels"])
@@ -1198,7 +1192,7 @@ def fit(self, amatrix):
         num_topics = len(amatrix)
 
         def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors=None):
-
+
             # count how many neighbors
             # check which indices (arange 0 to num_topics) is closer than eps
             neighbors = np.arange(num_topics)[tmp_amatrix[topic_index] < self.eps]

From 3650895d825765d67aa8a20c0ba3cabe56901fdd Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Tue, 9 Apr 2019 21:13:43 +0200
Subject: [PATCH 010/166] added tests

---
 gensim/test/test_data/ensemblelda |  Bin 0 -> 9923 bytes
 gensim/test/test_ensemblelda.py   | 318 ++++++++++++++++++++++++++++++
 2 files changed, 318 insertions(+)
 create mode 100644 gensim/test/test_data/ensemblelda
 create mode 100644 gensim/test/test_ensemblelda.py

diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
new file mode 100644
index 0000000000000000000000000000000000000000..905aa8bf8a9ab4e3f8c2fc54758af605b910d30f
GIT binary patch
literal 9923
[base85-encoded binary data omitted]

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
new file mode 100644
index 0000000000..97e3b31bf0
+
+
+import logging
+
+# TODO remove this before pushing tests:
+import sys
+sys.path.insert(0, '/run/media/mango/Data/Code/gensim/')
+
+import pandas as pd
+import numpy as np
+from gensim.models import EnsembleLda
+from pathlib import Path
+import unittest
+from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary
+ +num_topics = 2 +num_models = 4 + + +class TestModel(unittest.TestCase): + def setUp(self): + # same configuration for each model to make sure + # the topics are equal + passes = 50 + random_state = 0 + + self.eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + passes=passes, num_models=num_models, random_state=random_state) + + self.eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + passes=passes, num_models=num_models, random_state=random_state, + memory_friendly_ttda=False) + + def check_ttda(self, ensemble): + """tests the integrity of the ttda of any ensemble""" + self.assertGreater(len(ensemble.ttda), 0) + a = ensemble.ttda.sum(axis=1) + b = np.ones(len(ensemble.ttda)).astype(np.float32) + np.testing.assert_allclose(a, b, rtol=1e-04) + + def test_eLDA(self): + # given that the random_state doesn't change, it should + # always be 4 detected topics in this setup. + self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) + self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) + self.check_ttda(self.eLDA) + + # reclustering shouldn't change anything without + # added models or different parameters + self.eLDA.recluster() + + self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) + self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) + self.check_ttda(self.eLDA) + + reference = EnsembleLda.load(datapath('ensemblelda')) + + self.assertEqual(self.eLDA.cluster_model.results, reference.cluster_model.results) + # sorting will not be the same in case the sorting key is equal + # across multiple clusters. hence use assertCountEqual + self.assertCountEqual(self.eLDA.sorted_clusters, reference.sorted_clusters) + np.testing.assert_allclose(self.eLDA.get_topics(), reference.get_topics(), rtol=1e-05) + np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, + reference.asymmetric_distance_matrix) + + def test_memory_unfriendly(self): + # at this point, self.eLDA_mu and self.eLDA are already trained + # in the setUpClass function + # both should be 100% similar (but floats cannot + # be compared that easily, so check for threshold) + self.assertEqual(len(self.eLDA_mu.tms), num_models) + np.testing.assert_allclose(self.eLDA.ttda, self.eLDA_mu.ttda, rtol=1e-05) + np.testing.assert_allclose(self.eLDA.get_topics(), self.eLDA_mu.get_topics(), rtol=1e-05) + self.check_ttda(self.eLDA_mu) + + def test_generate_gensim_rep(self): + gensimModel = self.eLDA.generate_gensim_representation() + pd.DataFrame(gensimModel.get_topics()) # should return the stable topics from the ensemble + topics = gensimModel.get_topics() + np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) + + def assert_cluster_results_equal(self, a, b): + self.assertTrue(isinstance(a, dict)) + self.assertTrue(isinstance(b, dict)) + np.testing.assert_array_equal([row["label"] for row in a.values()], + [row["label"] for row in b.values()]) + np.testing.assert_array_equal([row["is_core"] for row in a.values()], + [row["is_core"] for row in b.values()]) + np.testing.assert_array_equal([row["num_samples"] for row in a.values()], + [row["num_samples"] for row in b.values()]) + + def test_persisting(self): + fname = get_tmpfile('gensim_models_ensemblelda') + self.eLDA.save(fname) + loaded_eLDA = EnsembleLda.load(fname) + # using Path objects instead of strings + self.eLDA.save(Path(fname)) + loaded_eLDA = EnsembleLda.load(Path(fname)) + # storing the ensemble without 
memory_friendy_ttda + self.eLDA_mu.save(fname) + loaded_eLDA_mu = EnsembleLda.load(fname) + + # was it stored and loaded correctly? + # memory friendly + loaded_eLDA_representation = loaded_eLDA.generate_gensim_representation() + topics = loaded_eLDA_representation.get_topics() + ttda = loaded_eLDA.ttda + amatrix = loaded_eLDA.asymmetric_distance_matrix + np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) + np.testing.assert_allclose(self.eLDA.ttda, ttda, rtol=1e-05) + np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, amatrix, rtol=1e-05) + + a = self.eLDA.cluster_model.results + b = loaded_eLDA.cluster_model.results + + self.assert_cluster_results_equal(a, b) + + # memory unfriendly + loaded_eLDA_mu_representation = loaded_eLDA_mu.generate_gensim_representation() + topics = loaded_eLDA_mu_representation.get_topics() + np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) + + def test_multiprocessing(self): + # same configuration + param_ref = self.eLDA_mu.tms[0] + passes = param_ref.passes + iterations = param_ref.iterations + num_topics = param_ref.num_topics + num_models = self.eLDA.num_models + random_state = 0 + + # use 3 processes for the ensemble and the distance, + # so that the 4 models and 16 topics cannot be distributed + # to each worker evenly + workers = 3 + + # memory friendly. contains List of topic word distributions + eLDA_multi = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, + num_topics=num_topics, passes=passes, num_models=num_models, iterations=iterations, + random_state=random_state, ensemble_workers=workers, distance_workers=workers) + + # memory unfriendly. contains List of models + eLDA_multi_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, + num_topics=num_topics, passes=passes, num_models=num_models, iterations=iterations, + random_state=random_state, ensemble_workers=workers, distance_workers=workers, + memory_friendly_ttda=False) + + np.testing.assert_allclose(self.eLDA.get_topics(), eLDA_multi.get_topics(), rtol=1e-05) + np.testing.assert_allclose(self.eLDA_mu.get_topics(), eLDA_multi_mu.get_topics(), rtol=1e-05) + + def test_add_models(self): + # same configuration + num_models = self.eLDA.num_models + + # make sure countings and sizes after adding are correct + # create new models and add other models to them. + + # there are a ton of configurations for the first parameter possible, + # try them all + + # quickly train something that can be used for counting results + num_new_models = 3 + num_new_topics = 3 + + # 1. 
memory friendly + eLDA_base = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, + num_topics=num_new_topics, passes=1, num_models=num_new_models, + iterations=1, random_state=0, topic_model_kind='ldamulticore', + workers=3, ensemble_workers=2) + + # 1.1 ttda + a = len(eLDA_base.ttda) + b = eLDA_base.num_models + eLDA_base.add_model(self.eLDA.ttda) + self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda)) + self.assertEqual(eLDA_base.num_models, b + 1) # defaults to 1 for one ttda matrix + + # 1.2 an ensemble + a = len(eLDA_base.ttda) + b = eLDA_base.num_models + eLDA_base.add_model(self.eLDA, 5) + self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda)) + self.assertEqual(eLDA_base.num_models, b + 5) + + # 1.3 a list of ensembles + a = len(eLDA_base.ttda) + b = eLDA_base.num_models + # it should be totally legit to add a memory unfriendly object to a memory friendly one + eLDA_base.add_model([self.eLDA, self.eLDA_mu]) + self.assertEqual(len(eLDA_base.ttda), a + 2 * len(self.eLDA.ttda)) + self.assertEqual(eLDA_base.num_models, b + 2 * num_models) + + # 1.4 a single gensim model + model = self.eLDA.classic_model_representation + + a = len(eLDA_base.ttda) + b = eLDA_base.num_models + eLDA_base.add_model(model) + self.assertEqual(len(eLDA_base.ttda), a + len(model.get_topics())) + self.assertEqual(eLDA_base.num_models, b + 1) + + # 1.5 a list gensim models + a = len(eLDA_base.ttda) + b = eLDA_base.num_models + eLDA_base.add_model([model, model]) + self.assertEqual(len(eLDA_base.ttda), a + 2 * len(model.get_topics())) + self.assertEqual(eLDA_base.num_models, b + 2) + + self.check_ttda(eLDA_base) + + # 2. memory unfriendly + eLDA_base_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, + num_topics=num_new_topics, passes=1, num_models=num_new_models, + iterations=1, random_state=0, topic_model_kind='ldamulticore', + workers=3, ensemble_workers=2, memory_friendly_ttda=False) + + # 2.1 a single ensemble + a = len(eLDA_base_mu.tms) + b = eLDA_base_mu.num_models + eLDA_base_mu.add_model(self.eLDA_mu) + self.assertEqual(len(eLDA_base_mu.tms), a + num_models) + self.assertEqual(eLDA_base_mu.num_models, b + num_models) + + # 2.2 a list of ensembles + a = len(eLDA_base_mu.tms) + b = eLDA_base_mu.num_models + eLDA_base_mu.add_model([self.eLDA_mu, self.eLDA_mu]) + self.assertEqual(len(eLDA_base_mu.tms), a + 2 * num_models) + self.assertEqual(eLDA_base_mu.num_models, b + 2 * num_models) + + # 2.3 a single gensim model + a = len(eLDA_base_mu.tms) + b = eLDA_base_mu.num_models + eLDA_base_mu.add_model(self.eLDA_mu.tms[0]) + self.assertEqual(len(eLDA_base_mu.tms), a + 1) + self.assertEqual(eLDA_base_mu.num_models, b + 1) + + # 2.4 a list of gensim models + a = len(eLDA_base_mu.tms) + b = eLDA_base_mu.num_models + eLDA_base_mu.add_model(self.eLDA_mu.tms) + self.assertEqual(len(eLDA_base_mu.tms), a + num_models) + self.assertEqual(eLDA_base_mu.num_models, b + num_models) + + # 2.5 topic term distributions should throw errors, because the + # actual models are needed for the memory unfriendly ensemble + a = len(eLDA_base_mu.tms) + b = eLDA_base_mu.num_models + self.assertRaises(ValueError, lambda: eLDA_base_mu.add_model(self.eLDA_mu.tms[0].get_topics())) + # remains unchanged + self.assertEqual(len(eLDA_base_mu.tms), a) + self.assertEqual(eLDA_base_mu.num_models, b) + + self.assertEqual(eLDA_base_mu.num_models, len(eLDA_base_mu.tms)) + self.check_ttda(eLDA_base_mu) + + def test_recluster(self): + # 6.2: see if after adding a model, the model still makes sense + 
num_new_models = 3 + num_new_topics = 3 + + # train + new_eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, + num_topics=num_new_topics, passes=10, num_models=num_new_models, + iterations=30, random_state=1, topic_model_kind='ldamulticore', + distance_workers=4) + new_eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, + num_topics=num_new_topics, passes=10, num_models=num_new_models, + iterations=30, random_state=1, topic_model_kind='ldamulticore', + distance_workers=4, memory_friendly_ttda=False) + # both should be similar + np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) + np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) + # and every next step applied to both should result in similar results + + # 1. adding to ttda and tms + new_eLDA.add_model(self.eLDA) + new_eLDA_mu.add_model(self.eLDA_mu) + np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) + self.assertEqual(len(new_eLDA.ttda), len(self.eLDA.ttda) + num_new_models * num_new_topics) + self.assertEqual(len(new_eLDA_mu.ttda), len(self.eLDA_mu.ttda) + num_new_models * num_new_topics) + self.assertEqual(len(new_eLDA_mu.tms), num_models + num_new_models) + self.check_ttda(new_eLDA) + self.check_ttda(new_eLDA_mu) + + # 2. distance matrix + new_eLDA.generate_asymmetric_distance_matrix() + new_eLDA_mu.generate_asymmetric_distance_matrix() + np.testing.assert_allclose(new_eLDA.asymmetric_distance_matrix, + new_eLDA_mu.asymmetric_distance_matrix) + + # 3. CBDBSCAN results + new_eLDA.generate_topic_clusters() + new_eLDA_mu.generate_topic_clusters() + a = new_eLDA.cluster_model.results + b = new_eLDA_mu.cluster_model.results + self.assert_cluster_results_equal(a, b) + + # 4. finally, the stable topics + new_eLDA.generate_stable_topics() + new_eLDA_mu.generate_stable_topics() + np.testing.assert_allclose(new_eLDA.get_topics(), + new_eLDA_mu.get_topics()) + + new_eLDA.generate_gensim_representation() + new_eLDA_mu.generate_gensim_representation() + + # same random state, hence topics should be still similar + np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) + + +if __name__ == '__main__': + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN) + unittest.main() From 00a06e96b9711a4ba8915a9338b7050b3304c87d Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Tue, 9 Apr 2019 21:24:31 +0200 Subject: [PATCH 011/166] fixed test --- gensim/test/test_ensemblelda.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 97e3b31bf0..c6e12dbad0 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -5,11 +5,6 @@ import logging - -# TODO remove this before pushing tests: -import sys -sys.path.insert(0, '/run/media/mango/Data/Code/gensim/') - import pandas as pd import numpy as np from gensim.models import EnsembleLda @@ -19,13 +14,13 @@ num_topics = 2 num_models = 4 +passes = 50 class TestModel(unittest.TestCase): def setUp(self): # same configuration for each model to make sure # the topics are equal - passes = 50 random_state = 0 self.eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, @@ -67,6 +62,9 @@ def test_eLDA(self): np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, reference.asymmetric_distance_matrix) + 
np.testing.assert_allclose(self.eLDA.classic_model_representation.get_topics(), + self.eLDA.get_topics(), rtol=1e-05) + def test_memory_unfriendly(self): # at this point, self.eLDA_mu and self.eLDA are already trained # in the setUpClass function @@ -126,11 +124,6 @@ def test_persisting(self): def test_multiprocessing(self): # same configuration - param_ref = self.eLDA_mu.tms[0] - passes = param_ref.passes - iterations = param_ref.iterations - num_topics = param_ref.num_topics - num_models = self.eLDA.num_models random_state = 0 # use 3 processes for the ensemble and the distance, @@ -140,12 +133,12 @@ def test_multiprocessing(self): # memory friendly. contains List of topic word distributions eLDA_multi = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, - num_topics=num_topics, passes=passes, num_models=num_models, iterations=iterations, + num_topics=num_topics, passes=passes, num_models=num_models, random_state=random_state, ensemble_workers=workers, distance_workers=workers) # memory unfriendly. contains List of models eLDA_multi_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, - num_topics=num_topics, passes=passes, num_models=num_models, iterations=iterations, + num_topics=num_topics, passes=passes, num_models=num_models, random_state=random_state, ensemble_workers=workers, distance_workers=workers, memory_friendly_ttda=False) @@ -314,5 +307,5 @@ def test_recluster(self): if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN) + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() From f5f1c9c15a1272d03d6160598d033ebf33b8e500 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Tue, 9 Apr 2019 21:34:43 +0200 Subject: [PATCH 012/166] removed some dead leftover pandas code from test --- gensim/test/test_ensemblelda.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index c6e12dbad0..b251fdd8d6 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -5,7 +5,6 @@ import logging -import pandas as pd import numpy as np from gensim.models import EnsembleLda from pathlib import Path @@ -77,7 +76,6 @@ def test_memory_unfriendly(self): def test_generate_gensim_rep(self): gensimModel = self.eLDA.generate_gensim_representation() - pd.DataFrame(gensimModel.get_topics()) # should return the stable topics from the ensemble topics = gensimModel.get_topics() np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) From c32ddad75e9d4dca604de92f94953a73f2f3af21 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Tue, 9 Apr 2019 22:01:21 +0200 Subject: [PATCH 013/166] removed pathlib from test --- gensim/test/test_ensemblelda.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index b251fdd8d6..c4ac16b454 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -7,7 +7,6 @@ import logging import numpy as np from gensim.models import EnsembleLda -from pathlib import Path import unittest from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary @@ -38,7 +37,7 @@ def check_ttda(self, ensemble): def test_eLDA(self): # given that the random_state doesn't change, it should - # always be 4 detected topics in this setup. + # always be 2 detected topics in this setup. 
self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) self.check_ttda(self.eLDA) @@ -93,9 +92,6 @@ def test_persisting(self): fname = get_tmpfile('gensim_models_ensemblelda') self.eLDA.save(fname) loaded_eLDA = EnsembleLda.load(fname) - # using Path objects instead of strings - self.eLDA.save(Path(fname)) - loaded_eLDA = EnsembleLda.load(Path(fname)) # storing the ensemble without memory_friendy_ttda self.eLDA_mu.save(fname) loaded_eLDA_mu = EnsembleLda.load(fname) @@ -125,7 +121,7 @@ def test_multiprocessing(self): random_state = 0 # use 3 processes for the ensemble and the distance, - # so that the 4 models and 16 topics cannot be distributed + # so that the 4 models and 8 topics cannot be distributed # to each worker evenly workers = 3 From dab067f3a2f52e4f294d05b8409133d707b7cb1b Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 13 Apr 2019 00:49:42 +0200 Subject: [PATCH 014/166] tests work in python2 locally now --- gensim/models/ensemblelda.py | 30 +++++++++++++++--------------- gensim/test/test_data/ensemblelda | Bin 9923 -> 9332 bytes gensim/test/test_ensemblelda.py | 19 ++++++++++++++----- 3 files changed, 29 insertions(+), 20 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 7f3f071e77..79b399fc40 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -84,10 +84,6 @@ import logging import os from multiprocessing import Process, Pipe, ProcessError -try: - import cPickle as _pickle -except ImportError: - import pickle as _pickle import numpy as np from scipy.spatial.distance import cosine from gensim import utils @@ -105,9 +101,9 @@ class EnsembleLda(): """Ensemble Latent Dirichlet Allocation (LDA), a method of ensembling multiple gensim topic models. - They are clustered using DBSCAN, for which nearby topic mixtures - which are mixtures of two or more - true topics and therefore undesired - are being used to support topics in becoming cores, - but not vice versa. + They are clustered using a variation of DBSCAN, for which nearby topic mixtures - which are mixtures + of two or more true topics and therefore undesired - are being used to support topics in becoming + cores, but not vice versa. """ def __init__(self, topic_model_kind="lda", num_models=3, @@ -497,11 +493,10 @@ def save(self, fname): Path to the system file where the model will be persisted. """ - + logger.info("saving %s object to %s", self.__class__.__name__, fname) - with open(fname, 'wb') as f: - _pickle.dump(self, f) + utils.pickle(self, fname) @staticmethod def load(fname): @@ -521,8 +516,7 @@ def load(fname): # message copied from [1] logger.info("loading %s object from %s", EnsembleLda.__name__, fname) - with open(fname, 'rb') as f: - eLDA = _pickle.load(f) + eLDA = utils.unpickle(fname) return eLDA @@ -967,7 +961,8 @@ def generate_stable_topics(self, min_cores=None): results = self.cluster_model.results # first, group all the learned cores based on the label, - # which was assigned in the cluster_model + # which was assigned in the cluster_model. 
The result is a
+        # dict of {group: [topic, ...]}
         grouped_by_labels = {}
         for topic in results.values():
             if topic["is_core"]:
@@ -998,8 +993,13 @@ def generate_stable_topics(self, min_cores=None):
                 "is_valid": np.nan,
                 "label": label
             })
-        # start with the most significant core
-        sorted_clusters = sorted(sorted_clusters, key=lambda cluster: cluster["amount_parent_labels"], reverse=True)
+
+        # start with the most significant core.
+        sorted_clusters = sorted(sorted_clusters,
+                                 key=lambda cluster: (
+                                     cluster["amount_parent_labels"],
+                                     cluster["label"]  # makes sorting deterministic
+                                 ), reverse=True)
 
         def remove_from_all_sets(label):
             """removes a label from every set in "parent_labels" for
diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
index 905aa8bf8a9ab4e3f8c2fc54758af605b910d30f..249196da763e8cfe508117c34fa4db5870cd7747 100644
GIT binary patch
delta 6162
[base85-encoded binary delta data omitted]

delta 6764
[base85-encoded binary delta data omitted]

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index c4ac16b454..057ac64bd5 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -3,6 +3,9 @@
 #
 # Author: Tobias B
 
+# TODO remove this before pushing tests:
+import sys
+sys.path.insert(0, '/home/mango/Data/Code/gensim/')
 
 import logging
 import numpy as np
 from gensim.models import EnsembleLda
@@ -53,12 +56,18 @@ def test_eLDA(self):
         reference = EnsembleLda.load(datapath('ensemblelda'))
 
         self.assertEqual(self.eLDA.cluster_model.results, reference.cluster_model.results)
-        # sorting will not be the same in case the sorting key is equal
-        # across multiple clusters. hence use assertCountEqual
-        self.assertCountEqual(self.eLDA.sorted_clusters, reference.sorted_clusters)
+        self.assertEqual(self.eLDA.sorted_clusters, reference.sorted_clusters)
+
+        np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=1e-05)
         np.testing.assert_allclose(self.eLDA.get_topics(), reference.get_topics(), rtol=1e-05)
+
+        # as small values in the distance matrix are subject to rounding differences of
+        # around 1-2% between python 2 and 3, use atol and select a 100000th of the
+        # largest value instead of rtol. Large values of the distance matrix are not prone
+        # to those rounding problems.
From eb9ea2761467406a67bbe0da85576bef37662026 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sat, 13 Apr 2019 01:14:45 +0200
Subject: [PATCH 015/166] updated ensemble test reference model

---
 gensim/test/test_data/ensemblelda | Bin 9332 -> 9332 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
index 249196da763e8cfe508117c34fa4db5870cd7747..cbca32ca7dc6285620cb7c7b53796fd352ed8ec7 100644
GIT binary patch
[base85-encoded binary delta data omitted]
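The file gensim/test/test_data/ensemblelda that this patch replaces is a pickled reference
model, so every behavioural change to training means regenerating it. A sketch of how such
a fixture could be rebuilt; the training parameters here are assumptions and would have to
match whatever the test module pins down:

    from gensim.models import EnsembleLda
    from gensim.test.utils import common_corpus, common_dictionary, datapath

    elda = EnsembleLda(corpus=common_corpus, id2word=common_dictionary,
                       num_topics=2, num_models=4, passes=10,  # assumed values
                       random_state=0)  # fixed seed keeps the fixture reproducible
    elda.save(datapath('ensemblelda'))  # overwrite the pickled reference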
From 6b0dc77c091c3c3a8a644e4d73d6e2b23ef923b1 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sat, 13 Apr 2019 01:15:00 +0200
Subject: [PATCH 016/166] passing tox8

---
 gensim/models/ensemblelda.py    | 10 +++++-----
 gensim/test/test_ensemblelda.py |  8 ++------
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index 79b399fc40..6b96655238 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -287,7 +287,7 @@ def generate_gensim_representation(self):
         num_stable_topics = len(stable_topics)

         if num_stable_topics == 0:
-            logger.error("The model did not detect any stable topic! You can try to adjust epsilon: "
+            logger.error("The model did not detect any stable topic. You can try to adjust epsilon: "
                          "recluster(eps=...)")
             return
@@ -493,7 +493,7 @@ def save(self, fname):
             Path to the system file where the model will be persisted.

         """
-
+        logger.info("saving %s object to %s", self.__class__.__name__, fname)
         utils.pickle(self, fname)
@@ -898,7 +898,7 @@ def rank_masking(a):
                 # print a warning. info otherwise.
                 # The default mass setting of 0.95 makes uniform true masks on the opinosis corpus.
                 logger.warning('The given threshold of {} covered on average '
-                               '{}% of tokens, which might be too much!'.format(threshold, percent))
+                               '{}% of tokens, which might be too much'.format(threshold, percent))
             else:
                 logger.info('The given threshold of {} covered on average {}% of tokens'.format(threshold, percent))
@@ -993,7 +993,7 @@ def generate_stable_topics(self, min_cores=None):
                 "is_valid": np.nan,
                 "label": label
             })
-
+
         # start with the most significant core.
         sorted_clusters = sorted(sorted_clusters, key=lambda cluster: (
@@ -1132,7 +1132,7 @@ def has_gensim_representation(self):
         gensim representation exists. If not, raises errors"""
         if self.classic_model_representation is None:
             if len(self.stable_topics) == 0:
-                raise ValueError("no stable topic was detected!")
+                raise ValueError("no stable topic was detected")
             else:
                 raise ValueError("use generate_gensim_representation() first")

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index 057ac64bd5..fba8ed5acb 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -3,10 +3,6 @@
 #
 # Author: Tobias B

-# TODO remove this before pushing tests:
-import sys
-sys.path.insert(0, '/home/mango/Data/Code/gensim/')
-
 import logging
 import numpy as np
 from gensim.models import EnsembleLda
@@ -57,7 +53,7 @@ def test_eLDA(self):
         self.assertEqual(self.eLDA.cluster_model.results, reference.cluster_model.results)

         self.assertEqual(self.eLDA.sorted_clusters, reference.sorted_clusters)
-
+
         np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=1e-05)

         np.testing.assert_allclose(self.eLDA.get_topics(), reference.get_topics(), rtol=1e-05)
@@ -310,5 +306,5 @@ def test_recluster(self):

 if __name__ == '__main__':
-    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN)
+    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
     unittest.main()
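The reworded warning above belongs to the masking step of the distance computation: for the
default 'mass' method, only the most probable tokens of a topic, up to a cumulative
probability threshold (0.95 by default), take part in the comparison, and the warning fires
when that mask covers suspiciously many tokens. A rough sketch of the masking idea in
isolation, not the exact implementation:

    import numpy as np

    def mass_mask(topic, threshold=0.95):
        # keep the most probable tokens that together reach `threshold` mass
        sorted_ids = np.argsort(topic)[::-1]          # most probable tokens first
        cumulative = np.cumsum(topic[sorted_ids])
        cutoff = np.searchsorted(cumulative, threshold) + 1
        mask = np.zeros(len(topic), dtype=bool)
        mask[sorted_ids[:cutoff]] = True
        return mask

    topic = np.array([0.5, 0.3, 0.15, 0.05])
    print(mass_mask(topic))  # [ True  True  True False] -> 75% of the tokens covered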
From 6dc60014b96dadc2109a805f4d1f1c5c836675c1 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sat, 13 Apr 2019 10:47:41 +0200
Subject: [PATCH 017/166] improved determinism of methods

---
 gensim/models/ensemblelda.py      |  20 ++++++++++----------
 gensim/test/test_data/ensemblelda | Bin 9332 -> 9316 bytes
 gensim/test/test_ensemblelda.py   |  20 ++++++++++----------
 3 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index 6b96655238..ded79117a5 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -964,7 +964,7 @@ def generate_stable_topics(self, min_cores=None):
         # which was assigned in the cluster_model. The result is a
         # dict of {group: [topic, ...]}
         grouped_by_labels = {}
-        for topic in results.values():
+        for topic in results:
             if topic["is_core"]:
                 topic = topic.copy()
@@ -1052,8 +1052,8 @@ def remove_from_all_sets(label):
         # list of all the label numbers that are valid
         valid_labels = np.array([cluster["label"] for cluster in sorted_clusters if cluster["is_valid"]])

-        for i in results:
-            results[i]["valid_parents"] = {label for label in results[i]["parent_labels"] if label in valid_labels}
+        for topic in results:
+            topic["valid_parents"] = {label for label in topic["parent_labels"] if label in valid_labels}

         def validate_core(core):
             """Core is a dict of {is_core, valid_parents, labels} among others.
@@ -1067,9 +1067,9 @@ def validate_core(core):
             return ret

         # keeping only VALID cores
-        valid_core_mask = np.vectorize(validate_core)(list(results.values()))
+        valid_core_mask = np.vectorize(validate_core)(results)
         valid_topics = self.ttda[valid_core_mask]
-        topic_labels = np.array([results[i]["label"] for i in results])[valid_core_mask]
+        topic_labels = np.array([topic["label"] for topic in results])[valid_core_mask]
         unique_labels = np.unique(topic_labels)

         num_stable_topics = len(unique_labels)
@@ -1172,22 +1172,22 @@ def fit(self, amatrix):

         self.next_label = 0

-        results = {index: {
+        results = [{
            "is_core": False,
            "parent_labels": set(),
            "parent_ids": set(),
            "num_samples": 0,
            "label": None
-        } for index in range(len(amatrix))}
+        } for _ in range(len(amatrix))]

         tmp_amatrix = amatrix.copy()

         # to avoid problem about comparing the topic with itself
         np.fill_diagonal(tmp_amatrix, 1)

-        min_distance_per_topic = [(index, distance) for index, distance in enumerate(tmp_amatrix.min(axis=1))]
-        min_distance_per_topic_sorted = sorted(min_distance_per_topic, key=lambda x: x[1])
-        ordered_min_similarity = [index for index, distance in min_distance_per_topic_sorted]
+        min_distance_per_topic = [(distance, index) for index, distance in enumerate(tmp_amatrix.min(axis=1))]
+        min_distance_per_topic_sorted = sorted(min_distance_per_topic, key=lambda x: x)
+        ordered_min_similarity = [index for distance, index in min_distance_per_topic_sorted]

         num_topics = len(amatrix)

diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
index cbca32ca7dc6285620cb7c7b53796fd352ed8ec7..0ccd9d4c0d96da2ce49e3cd6e1af0d22623e8e88 100644
GIT binary patch
[base85-encoded binary delta data omitted]

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index fba8ed5acb..aa419f24f6 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -51,7 +51,10 @@ def test_eLDA(self):

         reference = EnsembleLda.load(datapath('ensemblelda'))

-        self.assertEqual(self.eLDA.cluster_model.results, reference.cluster_model.results)
+        self.assert_cluster_results_equal(self.eLDA.cluster_model.results, reference.cluster_model.results)
+
+        # the following test is quite specific to the current implementation and not part of any api,
+        # but it makes improving those sections of the code easier.
         self.assertEqual(self.eLDA.sorted_clusters, reference.sorted_clusters)

         np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=1e-05)
@@ -84,14 +87,11 @@ def test_generate_gensim_rep(self):
         np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05)

     def assert_cluster_results_equal(self, a, b):
-        self.assertTrue(isinstance(a, dict))
-        self.assertTrue(isinstance(b, dict))
-        np.testing.assert_array_equal([row["label"] for row in a.values()],
-                                      [row["label"] for row in b.values()])
-        np.testing.assert_array_equal([row["is_core"] for row in a.values()],
-                                      [row["is_core"] for row in b.values()])
-        np.testing.assert_array_equal([row["num_samples"] for row in a.values()],
-                                      [row["num_samples"] for row in b.values()])
+        """compares important attributes of the cluster results"""
+        np.testing.assert_array_equal([row["label"] for row in a],
+                                      [row["label"] for row in b])
+        np.testing.assert_array_equal([row["is_core"] for row in a],
+                                      [row["is_core"] for row in b])

     def test_persisting(self):
         fname = get_tmpfile('gensim_models_ensemblelda')
@@ -306,5 +306,5 @@ def test_recluster(self):

 if __name__ == '__main__':
-    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
+    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN)
     unittest.main()
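The sorting change in fit() above is a determinism fix: sorting (distance, index) tuples
makes the tie-breaking rule explicit in the data itself, so equal distances are always
ordered by topic index instead of relying on sort stability and input order. A tiny
illustration:

    distances = [0.3, 0.1, 0.3, 0.2]
    pairs = sorted((d, i) for i, d in enumerate(distances))
    print(pairs)                   # [(0.1, 1), (0.2, 3), (0.3, 0), (0.3, 2)]
    print([i for d, i in pairs])   # deterministic order: [1, 3, 0, 2]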
From 3ec31e7b5a57addca9945d3ed787a42801167f54 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sat, 13 Apr 2019 11:13:37 +0200
Subject: [PATCH 018/166] improved order of assertions

---
 gensim/test/test_ensemblelda.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index aa419f24f6..285fa293c7 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -50,16 +50,7 @@ def test_eLDA(self):
         self.check_ttda(self.eLDA)

         reference = EnsembleLda.load(datapath('ensemblelda'))
-
-        self.assert_cluster_results_equal(self.eLDA.cluster_model.results, reference.cluster_model.results)
-
-        # the following test is quite specific to the current implementation and not part of any api,
-        # but it makes improving those sections of the code easier.
-        self.assertEqual(self.eLDA.sorted_clusters, reference.sorted_clusters)
-
-        np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=1e-05)
-        np.testing.assert_allclose(self.eLDA.get_topics(), reference.get_topics(), rtol=1e-05)
+        np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=1e-05)
         # as small values in the distance matrix are subject to rounding differences of
         # around 1-2% between python 2 and 3, use atol and select a 100000th of the
         # largest value instead of rtol. Large values of the distance matrix are not prone
         # to those rounding problems.
         atol = reference.asymmetric_distance_matrix.max() * 1e-05
         np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix,
                                    reference.asymmetric_distance_matrix, atol=atol)
+        self.assert_cluster_results_equal(self.eLDA.cluster_model.results, reference.cluster_model.results)
+        # the following test is quite specific to the current implementation and not part of any api,
+        # but it makes improving those sections of the code easier as long as sorted_clusters is supposed
+        # to stay the same
+        self.assertEqual(self.eLDA.sorted_clusters, reference.sorted_clusters)
+        np.testing.assert_allclose(self.eLDA.get_topics(), reference.get_topics(), rtol=1e-05)

         np.testing.assert_allclose(self.eLDA.classic_model_representation.get_topics(),
                                    self.eLDA.get_topics(), rtol=1e-05)
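The reordering above checks the raw ttda and the distance matrix before anything derived
from them, so a failure points at the earliest stage of the pipeline that diverged from
the reference. The same upstream-first idea, reduced to a standalone sketch (the helper
name is invented; the attribute names match the model in this PR):

    import numpy as np

    def compare_to_reference(model, reference):
        # hypothetical helper: verify pipeline artifacts from upstream to downstream
        np.testing.assert_allclose(model.ttda, reference.ttda, rtol=1e-05)
        atol = reference.asymmetric_distance_matrix.max() * 1e-05
        np.testing.assert_allclose(model.asymmetric_distance_matrix,
                                   reference.asymmetric_distance_matrix, atol=atol)
        # only after the inputs match, compare the final stable topics:
        np.testing.assert_allclose(model.get_topics(), reference.get_topics(), rtol=1e-05)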
From 7afd192e6c8a0817ca5c5e884776edb81074bb7b Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sat, 13 Apr 2019 13:16:39 +0200
Subject: [PATCH 019/166] trying to achieve higher precision with float64 to
 avoid some sorting differences across architectures

---
 gensim/models/ensemblelda.py      |   4 ++--
 gensim/test/test_data/ensemblelda | Bin 9316 -> 9316 bytes
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index ded79117a5..01b27c8da4 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -330,7 +330,7 @@ def generate_gensim_representation(self):
         sstats = stable_topics * normalization_factor
         sstats -= eta

-        classic_model_representation.state.sstats = sstats.astype("float32")
+        classic_model_representation.state.sstats = sstats.astype(np.float32)

         # fix expElogbeta.
         classic_model_representation.sync_state()
@@ -854,7 +854,7 @@ def rank_masking(a):
         avg_mask_size = 0

         # initialize the distance matrix. ndarray is faster than zeros
-        distances = np.ndarray((len(ttda1), len(ttda2)))
+        distances = np.ndarray((len(ttda1), len(ttda2)), dtype=np.float64)

         # now iterate over each topic
         for i in range(len(ttda1)):

diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
index 0ccd9d4c0d96da2ce49e3cd6e1af0d22623e8e88..b161f268193a28a80ca1950db4081d379df29885 100644
GIT binary patch
[base85-encoded binary delta data omitted]
From 16d035707968f8ccf4ea3ba05f4402f9999e6af2 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sun, 14 Apr 2019 15:33:57 +0200
Subject: [PATCH 020/166] better approach for comparing with pretrained model

---
 gensim/models/ensemblelda.py    |  6 ++--
 gensim/test/test_ensemblelda.py | 58 ++++++++++++++++++++++++++-------
 2 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index 01b27c8da4..d1d5c87254 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -209,7 +209,7 @@ def __init__(self, topic_model_kind="lda", num_models=3,
         self.eta = None
         self.tms = []
         # initialize empty topic term distribution array
-        self.ttda = np.empty((0, len(gensim_kw_args["id2word"])), np.float32)
+        self.ttda = np.empty((0, len(gensim_kw_args["id2word"])))
         self.asymmetric_distance_matrix_outdated = True

         # in case the model will not train due to some
@@ -854,7 +854,7 @@ def rank_masking(a):
         avg_mask_size = 0

         # initialize the distance matrix. ndarray is faster than zeros
-        distances = np.ndarray((len(ttda1), len(ttda2)), dtype=np.float64)
+        distances = np.ndarray((len(ttda1), len(ttda2)))

         # now iterate over each topic
         for i in range(len(ttda1)):
@@ -1073,7 +1073,7 @@ def validate_core(core):
         unique_labels = np.unique(topic_labels)
         num_stable_topics = len(unique_labels)

-        stable_topics = np.empty((num_stable_topics, len(self.id2word)), dtype=np.float32)
+        stable_topics = np.empty((num_stable_topics, len(self.id2word)))

         # for each cluster
         for l, label in enumerate(unique_labels):

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index 285fa293c7..b426a736af 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -8,6 +8,7 @@
 from gensim.models import EnsembleLda
 import unittest
 from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary
+from copy import deepcopy

 num_topics = 2
 num_models = 4
@@ -49,24 +50,57 @@ def test_eLDA(self):
         self.assertEqual(len(self.eLDA.ttda), num_models * num_topics)
         self.check_ttda(self.eLDA)

+        # compare with a pre-trained reference model
         reference = EnsembleLda.load(datapath('ensemblelda'))
         np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=1e-05)
-        # as small values in the distance matrix are subject to rounding differences of
-        # around 1-2% between python 2 and 3, use atol and select a 100000th of the
-        # largest value instead of rtol. Large values of the distance matrix are not prone
-        # to those rounding problems.
+        # small values in the distance matrix tend to vary quite a bit around 2%,
+        # so use some absolute tolerance measurement to check if the matrix is at least
+        # close to the target.
         atol = reference.asymmetric_distance_matrix.max() * 1e-05
         np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix,
                                    reference.asymmetric_distance_matrix, atol=atol)
+
+    def test_clustering(self):
+        # the following test is quite specific to the current implementation and not part of any api,
+        # but it makes improving those sections of the code easier as long as sorted_clusters and the
+        # cluster_model results are supposed to stay the same. Potentially this test will deprecate.
+
+        reference = EnsembleLda.load(datapath('ensemblelda'))
+        cluster_model_results = deepcopy(reference.cluster_model.results)
+        sorted_clusters = deepcopy(reference.sorted_clusters)
+        stable_topics = deepcopy(reference.get_topics())
+
+        # continue training with the distance matrix of the pretrained reference and see if
+        # the generated clusters match.
+        reference.asymmetric_distance_matrix_outdated = True
+        reference.recluster()
+
+        self.assert_cluster_results_equal(reference.cluster_model.results, cluster_model_results)
+        self.assertEqual(reference.sorted_clusters, sorted_clusters)
+        np.testing.assert_allclose(reference.get_topics(), stable_topics, rtol=1e-05)
+
+    def test_not_trained(self):
+        # should not throw errors and no training should happen
+
+        # 0 passes
+        eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics,
+                           passes=0, num_models=num_models, random_state=0)
+        self.assertEqual(len(eLDA.ttda), 0)
+
+        # no corpus
+        eLDA = EnsembleLda(id2word=common_dictionary, num_topics=num_topics,
+                           passes=passes, num_models=num_models, random_state=0)
+        self.assertEqual(len(eLDA.ttda), 0)
+
+        # 0 iterations
+        eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics,
+                           iterations=0, num_models=num_models, random_state=0)
+        self.assertEqual(len(eLDA.ttda), 0)
+
+        # 0 models
+        eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics,
+                           passes=passes, num_models=0, random_state=0)
+        self.assertEqual(len(eLDA.ttda), 0)

     def test_memory_unfriendly(self):
         # at this point, self.eLDA_mu and self.eLDA are already trained
@@ -247,7 +281,7 @@ def test_add_models(self):
         self.assertEqual(eLDA_base_mu.num_models, len(eLDA_base_mu.tms))
         self.check_ttda(eLDA_base_mu)

-    def test_recluster(self):
+    def test_add_and_recluster(self):
         # 6.2: see if after adding a model, the model still makes sense
         num_new_models = 3
         num_new_topics = 3
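The new test_clustering above relies on the fact that a loaded EnsembleLda still carries
everything needed for the clustering stage, so clustering and stable-topic extraction can
be repeated without retraining any LDA models. As a usage sketch (the path and eps value
are placeholders):

    from gensim.models import EnsembleLda

    elda = EnsembleLda.load('ensemblelda')  # some previously saved ensemble
    elda.recluster(eps=0.2)                 # redo clustering with a different epsilon
    topics = elda.get_topics()              # stable topics under the new clustering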
From 01b68e4480596a8185f91a328cf95eee8a92fcc4 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sun, 14 Apr 2019 16:27:43 +0200
Subject: [PATCH 021/166] potentially fixing the tests on windows

---
 gensim/models/ensemblelda.py      |   2 +-
 gensim/test/test_data/ensemblelda | Bin 9316 -> 9820 bytes
 gensim/test/test_ensemblelda.py   |  13 ++++++++++---
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index d1d5c87254..e49f946c21 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -96,7 +96,7 @@
     ldamulticore=ldamulticore.LdaMulticore
 )

-MAX_RANDOM_STATE = 2**32 - 1
+MAX_RANDOM_STATE = np.iinfo(np.int32).max


 class EnsembleLda():

diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
index b161f268193a28a80ca1950db4081d379df29885..9b1df8521e47cfc0c68e80269b5aa0bcf5849454 100644
GIT binary patch
[base85-encoded binary delta data omitted]

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index b426a736af..33f9901af5 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -3,12 +3,19 @@
 #
 # Author: Tobias B

+"""
+Automated tests for checking the EnsembleLda Class
+"""
+
+
 import logging
+import unittest
+
 import numpy as np
+from copy import deepcopy

-from gensim.models import EnsembleLda
-import unittest
+from gensim.models import EnsembleLda
 from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary
-from copy import deepcopy

 num_topics = 2
 num_models = 4
@@ -337,5 +344,5 @@ def test_add_and_recluster(self):

 if __name__ == '__main__':
-    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN)
+    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
     unittest.main()
From 9314cb4c81304bc2a40f080b55a6810dc255daf2 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sun, 14 Apr 2019 17:52:20 +0200
Subject: [PATCH 022/166] potentially fixing the tests on windows

---
 gensim/models/ensemblelda.py      | 133 +++++++++++++++---------------
 gensim/test/test_data/ensemblelda | Bin 9820 -> 9815 bytes
 gensim/test/test_ensemblelda.py   |   8 +-
 3 files changed, 69 insertions(+), 72 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index e49f946c21..1075b63041 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -91,11 +91,7 @@
 logger = logging.getLogger(__name__)

-GENSIM_MODEL = dict(
-    lda=ldamodel.LdaModel,
-    ldamulticore=ldamulticore.LdaMulticore
-)
-
+# nps max random state of 2**32 - 1 is too large for windows:
 MAX_RANDOM_STATE = np.iinfo(np.int32).max


 class EnsembleLda():
@@ -115,9 +111,10 @@ def __init__(self, topic_model_kind="lda", num_models=3,

         Parameters
         ----------
-        topic_model_kind : str, optional
+        topic_model_kind : str, topic model, optional
             Examples:
-                'ldamulticore' (recommended) or 'lda' (default)
+                'ldamulticore' (recommended), 'lda' (default),
+                gensim.models.ldamodel, gensim.models.ldamulticore
         num_models : number, optional
             How many LDA models to train in this ensemble.
             Default: 3
@@ -188,11 +185,15 @@ def __init__(self, topic_model_kind="lda", num_models=3,
                             "input space dimensionality. Corpus should be provided using the "
                             "keyword argument corpus.")

-        # Store some of the parameters:
-        # - store topic_model_kind for generate_gensim_representation,
-        # so that it is made sure that the gensim_kw_args fit into
-        # the constructor of the gensim model.
-        self.topic_model_kind = topic_model_kind
+        if type(topic_model_kind) == str:
+            self.topic_model_kind = {
+                "lda": ldamodel.LdaModel,
+                "ldamulticore": ldamulticore.LdaMulticore
+            }[topic_model_kind]
+        else:
+            self.topic_model_kind = topic_model_kind
+
+        # Store some of the parameters
         self.num_models = num_models
         self.gensim_kw_args = gensim_kw_args
@@ -299,7 +300,7 @@ def generate_gensim_representation(self):
             params["iterations"] = 0  # no training
             params["passes"] = 0  # no training

-        classic_model_representation = GENSIM_MODEL[self.topic_model_kind](**params)
+        classic_model_representation = self.topic_model_kind(**params)

         # when eta was None, use what gensim generates as default eta for the following tasks:
         eta = classic_model_representation.eta
@@ -551,23 +552,6 @@ def generate_topic_models_multiproc(self, num_models, ensemble_workers):
         # multiprocessing is started.
         random_states = [self.random_state.randint(MAX_RANDOM_STATE) for _ in range(num_models)]

-        # worker, that trains a bunch of models:
-        def worker(id, num_models, random_states, pipe):
-            # since models cannot be piped to the parent because messages need to be pickled for that:
-            # memory_friendly_ttda needs to be true, which will not store ensemble submodels
-            logger.info("Spawned worker to generate {} topic models".format(num_models))
-            self.generate_topic_models(num_models, random_states=random_states)
-            # send the ttda that is in the child/workers version of the memory into the pipe
-            # available, after generate_topic_models has been called in the worker
-            if self.memory_friendly_ttda:
-                # remember that this code is inside the worker processes memory,
-                # so self.ttda is the ttda of only a chunk of models
-                pipe.send((id, self.ttda))
-            else:
-                pipe.send((id, self.tms))
-
-            pipe.close()
-
         # each worker has to work on at least one model.
# Don't spawn idle workers: workers = min(ensemble_workers, num_models) @@ -594,7 +578,8 @@ def worker(id, num_models, random_states, pipe): # get the chunk from the random states that is meant to be for those models random_states_for_worker = random_states[-num_models_unhandled:][:num_subprocess_models] - p = Process(target=worker, args=(i, num_subprocess_models, random_states_for_worker, childConn)) + p = Process(target=self.generate_topic_models, + args=(num_subprocess_models, random_states_for_worker, childConn)) processes += [p] pipes += [(parentConn, childConn)] @@ -617,32 +602,23 @@ def worker(id, num_models, random_states, pipe): raise # aggregate results - if self.memory_friendly_ttda: - - # collect results from the pipes - # will also block until workers are finished - for p in pipes: - answer = p[0].recv() # [0], because that is the parentConn - p[0].close() - # this does basically the same as the generate_topic_models function (concatenate all the ttdas): - self.ttda = np.concatenate([self.ttda, answer[1]]) - # in [0] the id of the worker that sent the data is stored - - else: - # now do the same thing for a memory unfriendly ensemble - self.tms = [] - for p in pipes: - answer = p[0].recv() # [0], because that is the parentConn - p[0].close() - self.tms += answer[1] - new_ttda = np.concatenate([model.get_topics() for model in answer[1]]) - self.ttda = np.concatenate([self.ttda, new_ttda]) + # will also block until workers are finished + for p in pipes: + answer = p[0].recv() # [0], because that is the parentConn + p[0].close() + # this does basically the same as the generate_topic_models function (concatenate all the ttdas): + if not self.memory_friendly_ttda: + self.tms += answer + ttda = np.concatenate([model.get_topics() for model in answer]) + else: + ttda = answer + self.ttda = np.concatenate([self.ttda, ttda]) # end all processes for p in processes: p.terminate() - def generate_topic_models(self, num_models, random_states=None): + def generate_topic_models(self, num_models, random_states=None, pipe=None): """Will train the topic models, that form the ensemble. Parameters @@ -653,8 +629,15 @@ def generate_topic_models(self, num_models, random_states=None): list of numbers or np.random.RandomState objects. Will be autogenerated based on the ensembles RandomState if None (default). + pipe : multiprocessing.pipe + Default None. If provided, will send the trained models over + this pipe. If memory friendly, it will only send the ttda. 
+ """ + if pipe is not None: + logger.info("Spawned worker to generate {} topic models".format(num_models)) + if random_states is None: random_states = [self.random_state.randint(MAX_RANDOM_STATE) for _ in range(num_models)] @@ -669,7 +652,7 @@ def generate_topic_models(self, num_models, random_states=None): kwArgs["random_state"] = random_states[i] - tm = GENSIM_MODEL[self.topic_model_kind](**kwArgs) + tm = self.topic_model_kind(**kwArgs) # adds the lambda (that is the unnormalized get_topics) to ttda, which is # a list of all those lambdas @@ -683,6 +666,33 @@ def generate_topic_models(self, num_models, random_states=None): self.sstats_sum = tm.state.sstats.sum() self.eta = tm.eta + if pipe is not None: + # send the ttda that is in the child/workers version of the memory into the pipe + # available, after generate_topic_models has been called in the worker + if self.memory_friendly_ttda: + # remember that this code is inside the worker processes memory, + # so self.ttda is the ttda of only a chunk of models + pipe.send(self.ttda) + else: + pipe.send(self.tms) + + pipe.close() + + def asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pipe, threshold, method): + """ worker, that computes the distance to all other nodes + from a chunk of nodes. https://stackoverflow.com/a/1743350 + """ + + logger.info("Spawned worker to generate {} rows of the asymmetric distance matrix".format(n_ttdas)) + # the chunk of ttda that's going to be calculated: + ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas] + distance_chunk = self.calculate_asymmetric_distance_matrix_chunk(ttda1=ttda1, ttda2=self.ttda, + threshold=threshold, + start_index=ttdas_sent, + method=method) + pipe.send((worker_id, distance_chunk)) # remember that this code is inside the workers memory + pipe.close() + def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method="mass"): """Makes the pairwise distance matrix for all the ttdas from the ensemble. @@ -734,20 +744,6 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= if workers is None: workers = os.cpu_count() - # worker, that computes the distance to - # all other nodes from a chunk of nodes. 
# https://stackoverflow.com/a/1743350
-        def worker(worker_id, ttdas_sent, n_ttdas, pipe):
-            logger.info("Spawned worker to generate {} rows of the asymmetric distance matrix".format(n_ttdas))
-            # the chunk of ttda that's going to be calculated:
-            ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas]
-            distance_chunk = self.calculate_asymmetric_distance_matrix_chunk(ttda1=ttda1, ttda2=self.ttda,
-                                                                             threshold=threshold,
-                                                                             start_index=ttdas_sent,
-                                                                             method=method)
-            pipe.send((worker_id, distance_chunk))  # remember that this code is inside the workers memory
-            pipe.close()
-
         # create worker processes:
         processes = []
         pipes = []
@@ -768,7 +764,8 @@ def worker(worker_id, ttdas_sent, n_ttdas, pipe):
             else:
                 n_ttdas = int((len(self.ttda) - ttdas_sent) / (workers - i))

-            p = Process(target=worker, args=(i, ttdas_sent, n_ttdas, childConn))
+            p = Process(target=self.asymmetric_distance_matrix_worker,
+                        args=(i, ttdas_sent, n_ttdas, childConn, threshold, method))

             ttdas_sent += n_ttdas

             processes += [p]

diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
index 9b1df8521e47cfc0c68e80269b5aa0bcf5849454..3d77142cd210d18ed65595fed46a0f1a5a081e32 100644
GIT binary patch
[base85-encoded binary delta data omitted]

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index 33f9901af5..3f1ea8b0b1 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -14,7 +14,7 @@
 import numpy as np
 from copy import deepcopy

-from gensim.models import EnsembleLda
+from gensim.models import EnsembleLda, LdaMulticore
 from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary

 num_topics = 2
@@ -199,7 +199,7 @@ def test_add_models(self):
         # 1. memory friendly
         eLDA_base = EnsembleLda(corpus=common_corpus, id2word=common_dictionary,
                                 num_topics=num_new_topics, passes=1, num_models=num_new_models,
-                                iterations=1, random_state=0, topic_model_kind='ldamulticore',
+                                iterations=1, random_state=0, topic_model_kind=LdaMulticore,
                                 workers=3, ensemble_workers=2)

         # 1.1 ttda
@@ -245,7 +245,7 @@ def test_add_models(self):
         # 2. memory unfriendly
         eLDA_base_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary,
                                    num_topics=num_new_topics, passes=1, num_models=num_new_models,
-                                   iterations=1, random_state=0, topic_model_kind='ldamulticore',
+                                   iterations=1, random_state=0, topic_model_kind=LdaMulticore,
                                    workers=3, ensemble_workers=2, memory_friendly_ttda=False)

         # 2.1 a single ensemble
@@ -306,5 +306,5 @@ def test_add_and_recluster(self):

 if __name__ == '__main__':
-    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
+    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN)
     unittest.main()
From 0b7febc8ceb86ffb7e8fe6a7a945198a67a80b8b Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sun, 21 Apr 2019 23:26:07 +0200
Subject: [PATCH 023/166] changed citation of opinosis

---
 docs/notebooks/Opinosis.ipynb    | 77 +++++++++++++++++---------------
 gensim/corpora/opinosiscorpus.py | 20 ++++-----
 2 files changed, 50 insertions(+), 47 deletions(-)

diff --git a/docs/notebooks/Opinosis.ipynb b/docs/notebooks/Opinosis.ipynb
index bbcc2f6525..3fc845040c 100644
--- a/docs/notebooks/Opinosis.ipynb
+++ b/docs/notebooks/Opinosis.ipynb
@@ -47,7 +47,10 @@
    "source": [
     "# Experiments on the Opinosis Dataset\n",
     "\n",
-    "Opinosis is an extremely small corpus that contains 289 product reviews for 51 products, which is why it is hard to extract topics from it. https://github.com/kavgan/opinosis"
+    "Opinosis [1] is a small (but redundant) corpus that contains 289 product reviews for 51 products. Since it's so small, the results are rather unstable.\n",
+    "\n",
+    "[1] Kavita Ganesan, ChengXiang Zhai, and Jiawei Han, _Opinosis: a graph-based approach to abstractive summarization of highly redundant opinions,_ Proceedings of the 23rd International Conference on Computational Linguistics, Association for Computational Linguistics, 2010, pp.
340–348.\n", + "http://kavita-ganesan.com/opinosis-opinion-dataset/ https://github.com/kavgan/opinosis" ] }, { @@ -61,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2018-11-29T20:19:17.891961Z", @@ -77,7 +80,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2018-12-01T17:57:39.575650Z", @@ -101,28 +104,14 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2018-12-01T17:57:40.842440Z", "start_time": "2018-12-01T17:57:39.905273Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "data source:\n", - "title:\t\tOpinosis: a graph-based approach to abstractive summarization of highly redundant opinions\n", - "authors:\tGanesan, Kavita and Zhai, ChengXiang and Han, Jiawei\n", - "booktitle:\tProceedings of the 23rd International Conference on Computational Linguistics\n", - "pages:\t\t340-348\n", - "year:\t\t2010\n", - "organization:\tAssociation for Computational Linguistics\n" - ] - } - ], + "outputs": [], "source": [ "opinosis = OpinosisCorpus(path)" ] @@ -149,7 +138,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2018-12-01T18:10:13.690983Z", @@ -161,16 +150,19 @@ "name": "stderr", "output_type": "stream", "text": [ - "Spawned worker to generate 42 topic models...\n", - "Spawned worker to generate 43 topic models...\n", - "Generating 42 topic models...\n", - "Generating 43 topic models...\n", - "Spawned worker to generate 43 topic models...\n", - "Generating 43 topic models...\n", - "Spawned worker to generate 640 rows of the asymmetric similarity matrix...\n", - "Spawned worker to generate 640 rows of the asymmetric similarity matrix...\n", - "Spawned worker to generate 640 rows of the asymmetric similarity matrix...\n", - "Spawned worker to generate 640 rows of the asymmetric similarity matrix...\n", + "Generating 128 topic models...\n", + "Spawned worker to generate 42 topic models\n", + "Spawned worker to generate 43 topic models\n", + "Spawned worker to generate 43 topic models\n", + "Generating a 2560 x 2560 asymmetric distance matrix...\n", + "Spawned worker to generate 640 rows of the asymmetric distance matrix\n", + "Spawned worker to generate 640 rows of the asymmetric distance matrix\n", + "Spawned worker to generate 640 rows of the asymmetric distance matrix\n", + "Spawned worker to generate 640 rows of the asymmetric distance matrix\n", + "The given threshold of 0.11 covered on average 9.9% of tokens\n", + "The given threshold of 0.11 covered on average 9.8% of tokens\n", + "The given threshold of 0.11 covered on average 9.9% of tokens\n", + "The given threshold of 0.11 covered on average 9.8% of tokens\n", "Fitting the clustering model\n", "Generating stable topics\n", "Generating classic gensim model representation based on results from the ensemble\n" @@ -185,7 +177,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2018-12-01T18:10:13.712818Z", @@ -197,15 +189,19 @@ "name": "stdout", "output_type": "stream", "text": [ - "- 0.145 free, 0.043 park, 0.033 coffe, 0.030 wine, 0.027 even, 0.025 morn, 0.022 internet \n", + "- 0.132 staff, 0.082 servic, 0.081 friendli, 0.075 help, 0.021 good, 0.015 quick, 0.010 expens \n", + "\n", + "- 0.166 screen, 0.063 bright, 0.040 clear, 0.022 
easi, 0.020 touch, 0.015 read, 0.014 new \n", + "\n", + "- 0.146 free, 0.043 park, 0.034 coffe, 0.031 wine, 0.025 even, 0.025 morn, 0.022 internet \n", "\n", - "- 0.161 screen, 0.062 bright, 0.043 clear, 0.027 easi, 0.021 read, 0.019 touch, 0.011 size \n", + "- 0.142 batteri, 0.099 life, 0.026 short, 0.020 charg, 0.018 kindl, 0.018 good, 0.017 perform \n", "\n", - "- 0.127 staff, 0.075 friendli, 0.074 help, 0.070 servic, 0.016 quick, 0.012 profession, 0.012 good \n", + "- 0.123 room, 0.111 clean, 0.050 small, 0.035 comfort, 0.033 bathroom, 0.017 nice, 0.016 size \n", "\n", - "- 0.123 seat, 0.067 comfort, 0.050 uncomfort, 0.039 front, 0.037 back, 0.036 firm, 0.032 drive \n", + "- 0.147 seat, 0.086 comfort, 0.058 uncomfort, 0.045 firm, 0.031 long, 0.030 drive, 0.014 time \n", "\n", - "- 0.113 room, 0.104 clean, 0.041 small, 0.037 bathroom, 0.036 comfort, 0.023 size, 0.016 well \n", + "- 0.161 mileag, 0.106 ga, 0.056 good, 0.028 expect, 0.019 hard, 0.018 road, 0.018 high \n", "\n" ] } @@ -216,6 +212,13 @@ " print('-', t[1].replace('*',' ').replace('\"','').replace(' +',','), '\\n')" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -240,7 +243,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.1" + "version": "3.7.2" } }, "nbformat": 4, diff --git a/gensim/corpora/opinosiscorpus.py b/gensim/corpora/opinosiscorpus.py index 446c5e7804..eee1363045 100644 --- a/gensim/corpora/opinosiscorpus.py +++ b/gensim/corpora/opinosiscorpus.py @@ -3,17 +3,25 @@ # # Author: Tobias B +# Opinosis Corpus Source: +# title: Opinosis: a graph-based approach to abstractive summarization of highly redundant opinions +# authors: Ganesan, Kavita and Zhai, ChengXiang and Han, Jiawei +# booktitle: Proceedings of the 23rd International Conference on Computational Linguistics +# pages: 340-348 +# year: 2010 +# organization: Association for Computational Linguistics +# http://kavita-ganesan.com/opinosis-opinion-dataset/ + import os import re from gensim.corpora import Dictionary from gensim.parsing.porter import PorterStemmer from gensim.parsing.preprocessing import STOPWORDS - class OpinosisCorpus(): """Creates a corpus and dictionary from the opinosis dataset: - http://kavita-ganesan.com/opinosis-opinion-dataset/#.WyF_JNWxW00 + http://kavita-ganesan.com/opinosis-opinion-dataset/ This data is organized in folders, each folder containing a few short docs. 
@@ -40,14 +48,6 @@ def __init__(self, path): """ # citation - print("data source:") - print("title:\t\tOpinosis: a graph-based approach to abstractive summarization of highly redundant opinions") - print("authors:\tGanesan, Kavita and Zhai, ChengXiang and Han, Jiawei") - print("booktitle:\tProceedings of the 23rd International Conference on Computational Linguistics") - print("pages:\t\t340-348") - print("year:\t\t2010") - print("organization:\tAssociation for Computational Linguistics") - path = os.path.join(path, "summaries-gold") dictionary = Dictionary() corpus = [] From 60a717df0ac82c5fb1f60dd5903960f1b7afaae0 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 22 Apr 2019 08:50:47 +0200 Subject: [PATCH 024/166] tox8 test passing after small change on opinosis comments/citation --- gensim/corpora/opinosiscorpus.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gensim/corpora/opinosiscorpus.py b/gensim/corpora/opinosiscorpus.py index eee1363045..f7b1abb134 100644 --- a/gensim/corpora/opinosiscorpus.py +++ b/gensim/corpora/opinosiscorpus.py @@ -18,6 +18,7 @@ from gensim.parsing.porter import PorterStemmer from gensim.parsing.preprocessing import STOPWORDS + class OpinosisCorpus(): """Creates a corpus and dictionary from the opinosis dataset: From 2ff60cae79ef62d4e323237ef50df1ef08338173 Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Tue, 25 Jun 2019 20:58:52 +0200 Subject: [PATCH 025/166] Moving max_random_state inside the model as a private variable. --- gensim/models/ensemblelda.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 1075b63041..4880b22674 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -91,9 +91,6 @@ logger = logging.getLogger(__name__) -# nps max random state of 2**32 - 1 is too large for windows: -MAX_RANDOM_STATE = np.iinfo(np.int32).max - class EnsembleLda(): """Ensemble Latent Dirichlet Allocation (LDA), a method of ensembling multiple gensim topic models. @@ -168,6 +165,10 @@ def __init__(self, topic_model_kind="lda", num_models=3, https://radimrehurek.com/gensim/models/ldamodel.html """ + # Set random state + # nps max random state of 2**32 - 1 is too large for windows: + self._MAX_RANDOM_STATE = np.iinfo(np.int32).max + if "id2word" not in gensim_kw_args: gensim_kw_args["id2word"] = None @@ -550,7 +551,7 @@ def generate_topic_models_multiproc(self, num_models, ensemble_workers): # the same results in every lda children. # so it is solved by generating a list of state seeds before # multiprocessing is started. - random_states = [self.random_state.randint(MAX_RANDOM_STATE) for _ in range(num_models)] + random_states = [self.random_state.randint(self._MAX_RANDOM_STATE) for _ in range(num_models)] # each worker has to work on at least one model. 
# Don't spawn idle workers: @@ -639,7 +640,7 @@ def generate_topic_models(self, num_models, random_states=None, pipe=None): logger.info("Spawned worker to generate {} topic models".format(num_models)) if random_states is None: - random_states = [self.random_state.randint(MAX_RANDOM_STATE) for _ in range(num_models)] + random_states = [self.random_state.randint(self._MAX_RANDOM_STATE) for _ in range(num_models)] assert len(random_states) == num_models From d36fe43ba8f4e4824351cfbaa082bea852838a9c Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Tue, 25 Jun 2019 21:07:20 +0200 Subject: [PATCH 026/166] removed whitespace --- gensim/models/ensemblelda.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 4880b22674..1bc96756f7 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -169,7 +169,6 @@ def __init__(self, topic_model_kind="lda", num_models=3, # nps max random state of 2**32 - 1 is too large for windows: self._MAX_RANDOM_STATE = np.iinfo(np.int32).max - if "id2word" not in gensim_kw_args: gensim_kw_args["id2word"] = None if "corpus" not in gensim_kw_args: From 1507adfa3ce0c1e718bfae0c5feacffe9d644769 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Tue, 25 Jun 2019 21:16:14 +0200 Subject: [PATCH 027/166] docstring width --- gensim/models/ensemblelda.py | 221 +++++++++++++---------------------- 1 file changed, 78 insertions(+), 143 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 1bc96756f7..f4453e4b1b 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -116,53 +116,34 @@ def __init__(self, topic_model_kind="lda", num_models=3, How many LDA models to train in this ensemble. Default: 3 min_cores : number, optional - Minimum cores a cluster of topics has to contain - so that it is recognized as stable topic. + Minimum cores a cluster of topics has to contain so that it is recognized as stable topic. epsilon : float, optional - Defaults to 0.1. Epsilon for the cbdbscan clustering - that generates the stable topics. + Defaults to 0.1. Epsilon for the cbdbscan clustering that generates the stable topics. ensemble_workers : number, optional - Spawns that many processes and distributes the - models from the ensemble to those as evenly as - possible. num_models should be a multiple of - ensemble_workers. + Spawns that many processes and distributes the models from the ensemble to those as evenly as possible. + num_models should be a multiple of ensemble_workers. - Setting it to 0 or 1 will both use the non- - multiprocessing version. Default:1 + Setting it to 0 or 1 will both use the nonmultiprocessing version. Default:1 memory_friendly_ttda : boolean, optional - If True, the models in the ensemble are - deleted after training and only the - total topic word distribution is kept - to save memory. - - Defaults to True. When False, trained - models are stored in a list in self.tms, - and no models that are not of a gensim - model type can be added to this ensemble - using the add_model function. - - If False, any topic term matrix can be - supllied to add_model. + If True, the models in the ensemble are deleted after training and only the total topic word distribution + is kept to save memory. + + Defaults to True. When False, trained models are stored in a list in self.tms, and no models that are not + of a gensim model type can be added to this ensemble using the add_model function. 
+ + If False, any topic term matrix can be supllied to add_model. min_samples : number, optional - Required number of nearby topics for - a topic to be considered as 'core' in - the CBDBSCAN clustering. + Required number of nearby topics for a topic to be considered as 'core' in the CBDBSCAN clustering. masking_method : {'mass', 'rank}, optional One of "mass" (default) or "rank" (faster). masking_threshold : float, optional - Default: None, which uses 0.11 for - masking_method "rank", and 0.95 - for "mass". + Default: None, which uses 0.11 for masking_method "rank", and 0.95 for "mass". distance_workers : number, optional - When distance_workers is None, it defaults to - os.cpu_count() for maximum performance. - Default is 1, which is not multiprocessed. - Set to > 1 to enable multiprocessing. + When distance_workers is None, it defaults to os.cpu_count() for maximum performance. Default is 1, which + is not multiprocessed. Set to > 1 to enable multiprocessing. **gensim_kw_args - All the parameters for the gensim model, - that is used for each model in the ensemble, - can be provided as additional keyword arguments. - https://radimrehurek.com/gensim/models/ldamodel.html + All the parameters for the gensim model, that is used for each model in the ensemble, can be provided as + additional keyword arguments. https://radimrehurek.com/gensim/models/ldamodel.html """ # Set random state @@ -252,16 +233,12 @@ def convert_to_memory_friendly(self): def generate_gensim_representation(self): """creates a gensim-model from the stable topics - The prior for the gensim model, eta, Can be anything, - (a parameter in object constructor) and won't influence - get_topics(). Note that different eta means different - log_perplexity (the default is the same as the gensim + The prior for the gensim model, eta, Can be anything, (a parameter in object constructor) and won't influence + get_topics(). Note that different eta means different log_perplexity (the default is the same as the gensim default eta, which is for example [0.5, 0.5, 0.5]). - Note, that when the clustering of the topics - was changed using add_model, etc., and when no - topics were detected because of that, the old - classic gensim model will stay in place. + Note, that when the clustering of the topics was changed using add_model, etc., and when no topics were + detected because of that, the old classic gensim model will stay in place. Returns ------- @@ -340,27 +317,20 @@ def generate_gensim_representation(self): return classic_model_representation def add_model(self, target, num_new_models=None): - """Adds the ttda of another model to the ensemble - this way, multiple topic models can be connected - to an ensemble. + """Adds the ttda of another model to the ensemble this way, multiple topic models can be connected to an + ensemble. - Make sure that all the models use the exact same - dictionary/idword mapping. + Make sure that all the models use the exact same dictionary/idword mapping. In order to generate new stable topics afterwards, use self.generate_asymmetric_distance_matrix() self.recluster() - The ttda of another ensemble can also be used, - in that case set num_new_models to the num_models - parameter of the ensemble, that means the number - of classic models in the ensemble that generated - the ttda. This is important, because that information - is used to estimate "min_samples" for - generate_topic_clusters. 
+ The ttda of another ensemble can also be used, in that case set num_new_models to the num_models parameter + of the ensemble, that means the number of classic models in the ensemble that generated the ttda. This is + important, because that information is used to estimate "min_samples" for generate_topic_clusters. - If you trained this ensemble in the past with a - certain Dictionary that you want to reuse for other + If you trained this ensemble in the past with a certain Dictionary that you want to reuse for other models, you can get it from: self.id2word. Parameters @@ -380,23 +350,16 @@ def add_model(self, target, num_new_models=None): with topic being an array of probabilities: [token1, token2, ...] - token probabilities in a single topic sum to one, - therefore, all the words sum to len(ttda) + token probabilities in a single topic sum to one, therefore, all the words sum to len(ttda) num_new_models : integer, optional - the model keeps track of how many models - were used in this ensemble. Set higher - if ttda contained topics from more than - one model. Default: None, which takes - care of it automatically. + the model keeps track of how many models were used in this ensemble. Set higher if ttda contained topics + from more than one model. Default: None, which takes care of it automatically. - If target is a 2D-array of float values, - it assumes 1. + If target is a 2D-array of float values, it assumes 1. - If the ensemble has memory_friendly_ttda - set to False, then it will always use - the number of models in the target - parameter. + If the ensemble has memory_friendly_ttda set to False, then it will always use the number of models in + the target parameter. """ # If the model has never seen a ttda before, initialize. @@ -522,8 +485,7 @@ def load(fname): return eLDA def generate_topic_models_multiproc(self, num_models, ensemble_workers): - """Will make the ensemble multiprocess, which results - in a speedup on multicore machines. Results from the + """Will make the ensemble multiprocess, which results in a speedup on multicore machines. Results from the processes will be piped to the parent and concatenated. Parameters @@ -531,24 +493,19 @@ def generate_topic_models_multiproc(self, num_models, ensemble_workers): num_models : number how many models to train in the ensemble ensemble_workers : number - into how many processes to split the models - will be set to max(workers, num_models), to avoid - workers that are supposed to train 0 models. + into how many processes to split the models will be set to max(workers, num_models), to avoid workers that + are supposed to train 0 models. - to get maximum performance, set to the number - of your cores, if non-parallelized models - are being used in the ensemble (LdaModel). + to get maximum performance, set to the number of your cores, if non-parallelized models are being used in + the ensemble (LdaModel). - For LdaMulticore, the performance gain is small - and gets larger for a significantly smaller corpus. + For LdaMulticore, the performance gain is small and gets larger for a significantly smaller corpus. In that case, ensemble_workers=2 can be used. """ - # the way random_states is handled needs to prevent getting - # different results when multiprocessing is on, or getting - # the same results in every lda children. 
- # so it is solved by generating a list of state seeds before + # the way random_states is handled needs to prevent getting different results when multiprocessing is on, + # or getting the same results in every lda children. so it is solved by generating a list of state seeds before # multiprocessing is started. random_states = [self.random_state.randint(self._MAX_RANDOM_STATE) for _ in range(num_models)] @@ -558,8 +515,7 @@ def generate_topic_models_multiproc(self, num_models, ensemble_workers): # create worker processes: # from what I know this is basically forking with a jump to a target function in each child - # so modifying the ensemble object will not modify the one in the parent - # because of no shared memory + # so modifying the ensemble object will not modify the one in the parent because of no shared memory processes = [] pipes = [] num_models_unhandled = num_models # how many more models need to be trained by workers? @@ -626,12 +582,11 @@ def generate_topic_models(self, num_models, random_states=None, pipe=None): num_models : number number of models to be generated random_states : list - list of numbers or np.random.RandomState objects. - Will be autogenerated based on the ensembles RandomState - if None (default). + list of numbers or np.random.RandomState objects. Will be autogenerated based on the ensembles + RandomState if None (default). pipe : multiprocessing.pipe - Default None. If provided, will send the trained models over - this pipe. If memory friendly, it will only send the ttda. + Default None. If provided, will send the trained models over this pipe. If memory friendly, it will only + send the ttda. """ @@ -694,14 +649,11 @@ def asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pipe pipe.close() def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method="mass"): - """Makes the pairwise distance matrix for all the ttdas - from the ensemble. + """Makes the pairwise distance matrix for all the ttdas from the ensemble. - Returns the asymmetric pairwise distance matrix that - is used in the DBSCAN clustering. + Returns the asymmetric pairwise distance matrix that is used in the DBSCAN clustering. - Afterwards, the model needs to be reclustered for - this generated matrix to take effect. + Afterwards, the model needs to be reclustered for this generated matrix to take effect. Parameters ---------- @@ -711,12 +663,10 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= if threshold is None and method == "mass": threshold = 0.95 - threshold keeps by default 95% of the largest terms by - mass. Except the "fast" parameter is "rank", then it - just selects that many of the largest terms. + threshold keeps by default 95% of the largest terms by mass. Except the "fast" parameter is "rank", then + it just selects that many of the largest terms. workers : number, optional - when workers is None, it defaults to os.cpu_count() - for maximum performance. Default is 1, which is not + when workers is None, it defaults to os.cpu_count() for maximum performance. Default is 1, which is not multiprocessed. Set to > 1 to enable multiprocessing. """ @@ -809,21 +759,18 @@ def calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, st Parameters ---------- ttda1 and ttda2: 2D arrays of floats - Two ttda matrices that are going to be used for distance calculation. - Each row in ttda corresponds to one topic. Each cell in the resulting - matrix corresponds to the distance between a topic pair. 
+            Two ttda matrices that are going to be used for distance calculation. Each row in ttda corresponds to one
+            topic. Each cell in the resulting matrix corresponds to the distance between a topic pair.
         threshold : float, optional
-            threshold defaults to: {"mass": 0.95, "rank": 0.11}, depending on the selected
-            method
+            threshold defaults to: {"mass": 0.95, "rank": 0.11}, depending on the selected method
         start_index : number
-            this function might be used in multiprocessing, so start_index has to be set as
-            ttda1 is a chunk of the complete ttda in that case. start_index would be 0
-            if ttda1 == self.ttda. When self.ttda is split into two pieces, each 100 ttdas
-            long, then start_index should be be 100. default is 0
+            this function might be used in multiprocessing, so start_index has to be set as ttda1 is a chunk of the
+            complete ttda in that case. start_index would be 0 if ttda1 == self.ttda. When self.ttda is split into two
+            pieces, each 100 ttdas long, then start_index should be 100. default is 0
         method : {'mass', 'rank}, optional
-            method can be "mass" for the original masking method or "rank" for a faster
-            masking method that selects by rank of largest elements in the topic word
-            distribution, to determine which tokens are relevant for the topic.
+            method can be "mass" for the original masking method or "rank" for a faster masking method that selects
+            by rank of largest elements in the topic word distribution, to determine which tokens are relevant for the
+            topic.

         Returns
         -------
         2D Numpy.numpy.ndarray of floats

         """
@@ -902,11 +849,10 @@ def rank_masking(a):
         return distances

     def generate_topic_clusters(self, eps=0.1, min_samples=None):
-        """Runs the DBSCAN algorithm on all the detected topics from
-        the models in the ensemble and labels them with label-indices.
+        """Runs the DBSCAN algorithm on all the detected topics from the models in the ensemble and labels them with
+        label-indices.

-        The final approval and generation of stable topics is done in
-        generate_stable_topics().
+        The final approval and generation of stable topics is done in generate_stable_topics().

         Parameters
         ----------
@@ -925,20 +871,16 @@ def generate_topic_clusters(self, eps=0.1, min_samples=None):
         self.cluster_model.fit(self.asymmetric_distance_matrix)

     def generate_stable_topics(self, min_cores=None):
-        """generates stable topics out of the clusters.
-        This function is the last step that has to be done
-        in the ensemble.
+        """generates stable topics out of the clusters. This function is the last step that has to be done in the
+        ensemble.

-        Stable topics can be retreived afterwards using
-        get_topics().
+        Stable topics can be retrieved afterwards using get_topics().

         Parameters
         ----------
         min_cores : number
-            how many cores a cluster has to have,
-            to be treated as stable topic. That means, how many topics
-            that look similar have to be present, so that the average
-            topic in those is used as stable topic.
+            how many cores a cluster has to have, to be treated as stable topic. That means, how many topics
+            that look similar have to be present, so that the average topic in those is used as stable topic.

             defaults to min_cores = min(3, max(1, int(self.num_models /4 +1)))
@@ -1082,34 +1024,27 @@ def validate_core(core):
         self.stable_topics = stable_topics

     def recluster(self, eps=0.1, min_samples=None, min_cores=None):
-        """Runs the CBDBSCAN algorithm on all the detected topics from
-        the children of the ensemble.
+        """Runs the CBDBSCAN algorithm on all the detected topics from the children of the ensemble.
 Generates stable topics out of the clusters afterwards.

-        Finally, stable topics can be retreived using
-        get_topics().
+        Finally, stable topics can be retrieved using get_topics().

         Parameters
         ----------
         eps : float
-            epsilon for the CBDBSCAN algorithm, having the same
-            meaning as in classic DBSCAN clustering.
+            epsilon for the CBDBSCAN algorithm, having the same meaning as in classic DBSCAN clustering.
             default: 0.1
         min_samples : number
-            The minimum number of sampels in the neighborhood
-            of a topic to be considered a core in CBDBSCAN.
+            The minimum number of samples in the neighborhood of a topic to be considered a core in CBDBSCAN.
             default: int(self.num_models / 2)
         min_cores : number
-            how many cores a cluster has to have,
-            to be treated as stable topic. That means, how many topics
-            that look similar have to be present, so that the average
-            topic in those is used as stable topic.
+            how many cores a cluster has to have, to be treated as stable topic. That means, how many topics
+            that look similar have to be present, so that the average topic in those is used as stable topic.
             default: min(3, max(1, int(self.num_models /4 +1)))
         """
-        # if new models were added to the ensemble, the distance
-        # matrix needs to be generated again
+        # if new models were added to the ensemble, the distance matrix needs to be generated again
         if self.asymmetric_distance_matrix_outdated:
             logger.info("asymmetric distance matrix is outdated due to add_model")
             self.generate_asymmetric_distance_matrix()

From 7577aca98534435751bd01926ba75e19f66ec50f Mon Sep 17 00:00:00 2001
From: Alex Loosley
Date: Tue, 25 Jun 2019 21:26:38 +0200
Subject: [PATCH 028/166] sphinx update

---
 gensim/models/ensemblelda.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index f4453e4b1b..bbac0817b2 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -142,8 +142,7 @@ def __init__(self, topic_model_kind="lda", num_models=3,
         When distance_workers is None, it defaults to os.cpu_count() for maximum performance. Default is 1, which
         is not multiprocessed. Set to > 1 to enable multiprocessing.
     **gensim_kw_args
-        All the parameters for the gensim model, that is used for each model in the ensemble, can be provided as
-        additional keyword arguments. https://radimrehurek.com/gensim/models/ldamodel.html
+        Parameters for each gensim model (e.g. :py:class:`gensim.models.LdaModel`) in the ensemble.
     """

From 301feacacae294691e9acafc22481d450c4c64c8 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Tue, 25 Jun 2019 21:31:09 +0200
Subject: [PATCH 029/166] fixed urls to sphinx notation

---
 gensim/models/ensemblelda.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index f4453e4b1b..11f72e55bc 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -111,6 +111,11 @@ def __init__(self, topic_model_kind="lda", num_models=3,
     topic_model_kind : str, topic model, optional
         Examples: 'ldamulticore' (recommended), 'lda' (default),
+    ensemble_workers : number, optional
+        Spawns that many processes and distributes the models from the ensemble to those as evenly as possible.
+        num_models should be a multiple of ensemble_workers.
+
+        Setting it to 0 or 1 will both use the nonmultiprocessing version.
Default:1 gensim.models.ldamodel, gensim.models.ldamulticore num_models : number, optional How many LDA models to train in this ensemble. @@ -1069,22 +1074,22 @@ def has_gensim_representation(self): raise ValueError("use generate_gensim_representation() first") def __getitem__(self, i): - """see https://radimrehurek.com/gensim/models/ldamodel.html""" + """see :py:class:`gensim.models.LdaModel`""" self.has_gensim_representation() return self.classic_model_representation[i] def inference(self, *posargs, **kwArgs): - """see https://radimrehurek.com/gensim/models/ldamodel.html""" + """see :py:class:`gensim.models.LdaModel`""" self.has_gensim_representation() return self.classic_model_representation.inference(*posargs, **kwArgs) def log_perplexity(self, *posargs, **kwArgs): - """see https://radimrehurek.com/gensim/models/ldamodel.html""" + """see :py:class:`gensim.models.LdaModel`""" self.has_gensim_representation() return self.classic_model_representation.log_perplexity(*posargs, **kwArgs) def print_topics(self, *posargs, **kwArgs): - """see https://radimrehurek.com/gensim/models/ldamodel.html""" + """see :py:class:`gensim.models.LdaModel`""" self.has_gensim_representation() return self.classic_model_representation.print_topics(*posargs, **kwArgs) From 0f4a6b856c9a780a25e3910e95c8a2f95dc57687 Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Tue, 25 Jun 2019 21:31:59 +0200 Subject: [PATCH 030/166] changed doc strings, number --> int + some sphinx --- gensim/models/ensemblelda.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index bbac0817b2..2dbc29fdc5 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -112,14 +112,14 @@ def __init__(self, topic_model_kind="lda", num_models=3, Examples: 'ldamulticore' (recommended), 'lda' (default), gensim.models.ldamodel, gensim.models.ldamulticore - num_models : number, optional + num_models : int, optional How many LDA models to train in this ensemble. Default: 3 - min_cores : number, optional + min_cores : int, optional Minimum cores a cluster of topics has to contain so that it is recognized as stable topic. epsilon : float, optional Defaults to 0.1. Epsilon for the cbdbscan clustering that generates the stable topics. - ensemble_workers : number, optional + ensemble_workers : int, optional Spawns that many processes and distributes the models from the ensemble to those as evenly as possible. num_models should be a multiple of ensemble_workers. @@ -132,13 +132,13 @@ def __init__(self, topic_model_kind="lda", num_models=3, of a gensim model type can be added to this ensemble using the add_model function. If False, any topic term matrix can be supllied to add_model. - min_samples : number, optional - Required number of nearby topics for a topic to be considered as 'core' in the CBDBSCAN clustering. + min_samples : int, optional + Required int of nearby topics for a topic to be considered as 'core' in the CBDBSCAN clustering. masking_method : {'mass', 'rank}, optional One of "mass" (default) or "rank" (faster). masking_threshold : float, optional Default: None, which uses 0.11 for masking_method "rank", and 0.95 for "mass". - distance_workers : number, optional + distance_workers : int, optional When distance_workers is None, it defaults to os.cpu_count() for maximum performance. Default is 1, which is not multiprocessed. Set to > 1 to enable multiprocessing. 
**gensim_kw_args
@@ -337,7 +337,7 @@ def add_model(self, target, num_new_models=None):
         target : {see description}
             1. A single EnsembleLda object
             2. List of EnsembleLda objects
-            3. A single Gensim topic model
+            3. A single Gensim topic model (e.g. :py:class:`gensim.models.LdaModel`)
             4. List of Gensim topic models

         if memory_friendly_ttda is True, target can also be:
@@ -489,9 +489,9 @@ def generate_topic_models_multiproc(self, num_models, ensemble_workers):

         Parameters
         ----------
-        num_models : number
+        num_models : int
             how many models to train in the ensemble
-        ensemble_workers : number
+        ensemble_workers : int
             into how many processes to split the models will be set to max(workers, num_models), to avoid workers that
             are supposed to train 0 models.
@@ -578,7 +578,7 @@ def generate_topic_models(self, num_models, random_states=None, pipe=None):

         Parameters
         ----------
-        num_models : number
+        num_models : int
             number of models to be generated
         random_states : list
             list of numbers or np.random.RandomState objects. Will be autogenerated based on the ensembles
             RandomState if None (default).
@@ -762,7 +762,7 @@ def calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, st
             topic. Each cell in the resulting matrix corresponds to the distance between a topic pair.
         threshold : float, optional
             threshold defaults to: {"mass": 0.95, "rank": 0.11}, depending on the selected method
-        start_index : number
+        start_index : int
             this function might be used in multiprocessing, so start_index has to be set as ttda1 is a chunk of the
             complete ttda in that case. start_index would be 0 if ttda1 == self.ttda. When self.ttda is split into two
             pieces, each 100 ttdas long, then start_index should be 100. default is 0
@@ -857,7 +857,7 @@ def generate_topic_clusters(self, eps=0.1, min_samples=None):
         ----------
         eps : float
             eps is 0.1 by default.
-        min_samples : number
+        min_samples : int
             min_samples is int(self.num_models / 2)
@@ -877,7 +877,7 @@ def generate_stable_topics(self, min_cores=None):

         Parameters
         ----------
-        min_cores : number
+        min_cores : int
             how many cores a cluster has to have, to be treated as stable topic. That means, how many topics
             that look similar have to be present, so that the average topic in those is used as stable topic.
@@ -1034,10 +1034,10 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None):
         eps : float
             epsilon for the CBDBSCAN algorithm, having the same meaning as in classic DBSCAN clustering.
             default: 0.1
-        min_samples : number
+        min_samples : int
             The minimum number of samples in the neighborhood of a topic to be considered a core in CBDBSCAN.
             default: int(self.num_models / 2)
-        min_cores : number
+        min_cores : int
             how many cores a cluster has to have, to be treated as stable topic. That means, how many topics
             that look similar have to be present, so that the average topic in those is used as stable topic.
             default: min(3, max(1, int(self.num_models /4 +1)))

From f915f50edc1074da35f9191e4adf3fc73278f763 Mon Sep 17 00:00:00 2001
From: Alex Loosley
Date: Tue, 25 Jun 2019 21:39:29 +0200
Subject: [PATCH 031/166] Removed hanging indents.
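As a side note on the "mass" masking and the threshold that the docstrings above refer to: the idea behind the asymmetric distance can be sketched standalone. This is an illustration of the concept, not the module's exact implementation; one topic-term distribution masks another before taking the cosine distance, so the result measures whether the first topic is contained in the second:

    import numpy as np
    from scipy.spatial.distance import cosine

    def mass_mask(ttd, threshold=0.95):
        # keep the top-ranked terms until their cumulative mass reaches `threshold`
        order = np.argsort(ttd)[::-1]
        cutoff = np.searchsorted(np.cumsum(ttd[order]), threshold) + 1
        mask = np.zeros(len(ttd), dtype=bool)
        mask[order[:cutoff]] = True
        return mask

    a = np.array([0.49, 0.30, 0.10, 0.06, 0.05])  # toy topic-term distributions
    b = np.array([0.05, 0.06, 0.10, 0.30, 0.49])
    mask = mass_mask(a)

    # force a's shape onto b: compare only the terms that matter for topic a;
    # a nearly empty masked b means no similarity at all (distance 1)
    distance = 1.0 if b[mask].sum() <= 0.05 else cosine(a[mask], b[mask])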
--- gensim/models/ensemblelda.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 2dbc29fdc5..f75665c191 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -161,9 +161,10 @@ def __init__(self, topic_model_kind="lda", num_models=3, logger.warning("no word id mapping provided; initializing from corpus, assuming identity") gensim_kw_args["id2word"] = utils.dict_from_corpus(gensim_kw_args["corpus"]) if gensim_kw_args["id2word"] is None and gensim_kw_args["corpus"] is None: - raise ValueError("at least one of corpus/id2word must be specified, to establish " - "input space dimensionality. Corpus should be provided using the " - "keyword argument corpus.") + raise ValueError( + "at least one of corpus/id2word must be specified, to establish " + "input space dimensionality. Corpus should be provided using the " + "keyword argument corpus.") if type(topic_model_kind) == str: self.topic_model_kind = { @@ -215,9 +216,10 @@ def __init__(self, topic_model_kind="lda", num_models=3, # singlecore self.generate_topic_models(num_models) - self.generate_asymmetric_distance_matrix(workers=self.distance_workers, - threshold=masking_threshold, - method=masking_method) + self.generate_asymmetric_distance_matrix( + workers=self.distance_workers, + threshold=masking_threshold, + method=masking_method) self.generate_topic_clusters(epsilon, min_samples) self.generate_stable_topics(min_cores) @@ -680,11 +682,8 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= # singlecore: if workers is not None and workers <= 1: - self.asymmetric_distance_matrix = self.calculate_asymmetric_distance_matrix_chunk(ttda1=self.ttda, - ttda2=self.ttda, - threshold=threshold, - start_index=0, - method=method) + self.asymmetric_distance_matrix = self.calculate_asymmetric_distance_matrix_chunk( + ttda1=self.ttda, ttda2=self.ttda, threshold=threshold, start_index=0, method=method) return self.asymmetric_distance_matrix # multicore: From a3161cd088c287ca56fb7271f5fcaab88a31d239 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Tue, 25 Jun 2019 21:45:54 +0200 Subject: [PATCH 032/166] improved topic_model_kind type checking --- gensim/models/ensemblelda.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 2a307737ac..95d169a9ba 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -168,15 +168,19 @@ def __init__(self, topic_model_kind="lda", num_models=3, if gensim_kw_args["id2word"] is None and gensim_kw_args["corpus"] is None: raise ValueError("at least one of corpus/id2word must be specified, to establish " "input space dimensionality. 
Corpus should be provided using the "
-                "keyword argument corpus.")
+                "`corpus` keyword argument.")

-        if type(topic_model_kind) == str:
-            self.topic_model_kind = {
+        if isinstance(topic_model_kind, ldamodel.LdaModel):
+            self.topic_model_kind = topic_model_kind
+        else:
+            kinds = {
                 "lda": ldamodel.LdaModel,
                 "ldamulticore": ldamulticore.LdaMulticore
-            }[topic_model_kind]
-        else:
-            self.topic_model_kind = topic_model_kind
+            }
+            if topic_model_kind not in kinds:
+                raise ValueError(
+                    "topic_model_kind should be one of 'lda', 'ldamulticore' or a model inheriting from LdaModel")
+            self.topic_model_kind = kinds[topic_model_kind]

From 52c239bace43fbe246f9c307739c1dd8771b8598 Mon Sep 17 00:00:00 2001
From: Alex Loosley
Date: Tue, 25 Jun 2019 22:06:04 +0200
Subject: [PATCH 033/166] Sphinx and docstring updates.

---
 gensim/models/ensemblelda.py | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index f75665c191..a46542a305 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -232,21 +232,19 @@ def convert_to_memory_friendly(self):
         self.memory_friendly_ttda = True

     def generate_gensim_representation(self):
-        """creates a gensim-model from the stable topics
+        """Creates a gensim model from the stable topics.

-        The prior for the gensim model, eta, Can be anything, (a parameter in object constructor) and won't influence
-        get_topics(). Note that different eta means different log_perplexity (the default is the same as the gensim
-        default eta, which is for example [0.5, 0.5, 0.5]).
+        The returned representation is a Gensim LdaModel (:py:class:`gensim.models.LdaModel`) that has been
+        instantiated with an A-priori belief on word probability, eta, that represents the topic-term distributions of
+        any stable topics that were found by clustering over the ensemble of topic distributions.

-        Note, that when the clustering of the topics was changed using add_model, etc., and when no topics were
-        detected because of that, the old classic gensim model will stay in place.
+        When no stable topics have been detected, None is returned.
 Returns
         -------
-        LdaModel
+        :py:class:`gensim.models.LdaModel`
             A Gensim LDA Model classic_model_representation for which:
             classic_model_representation.get_topics() == self.get_topics()
-
         """
         logger.info("Generating classic gensim model representation based on results from the ensemble")
@@ -1179,9 +1177,10 @@ def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors
                     ordered_min_similarity.remove(neighbor)
                     # try to extend the cluster into the direction
                     # of the neighbor
-                    scan_topic(neighbor, parent_id=topic_index,
-                               current_label=current_label,
-                               parent_neighbors=neighbors)
+                    scan_topic(
+                        neighbor, parent_id=topic_index,
+                        current_label=current_label,
+                        parent_neighbors=neighbors)
                     results[neighbor]["parent_ids"].add(topic_index)
                     results[neighbor]["parent_labels"].add(current_label)

From ffc8e10fe8059fbec8c54fed7f0e6323acbc0e68 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Tue, 25 Jun 2019 22:06:25 +0200
Subject: [PATCH 034/166] review stuff

---
 gensim/models/ensemblelda.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index 7c22739e18..eb1b0eba6c 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -183,7 +183,6 @@ def __init__(self, topic_model_kind="lda", num_models=3,
                 "topic_model_kind should be one of 'lda', 'ldamulticore' or a model inheriting from LdaModel")
             self.topic_model_kind = kinds[topic_model_kind]

-        # Store some of the parameters
         self.num_models = num_models
         self.gensim_kw_args = gensim_kw_args
@@ -277,6 +276,7 @@ def generate_gensim_representation(self):
         if num_stable_topics == 0:
             logger.error("The model did not detect any stable topic. You can try to adjust epsilon: "
                          "recluster(eps=...)")
+            self.classic_model_representation = None
             return

         # create a new gensim model

From 2d269a5f55c65bff69c3e38089d699ef32ce34f7 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Tue, 25 Jun 2019 22:08:43 +0200
Subject: [PATCH 035/166] removed unnecessary comments

---
 gensim/models/ensemblelda.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index a10555855c..aae9897aab 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -136,7 +136,7 @@ def __init__(self, topic_model_kind="lda", num_models=3,
         Defaults to True. When False, trained models are stored in a list in self.tms, and no models that are not
         of a gensim model type can be added to this ensemble using the add_model function.

-        If False, any topic term matrix can be supllied to add_model.
+        If False, any topic term matrix can be supplied to add_model.
     min_samples : int, optional
         Required int of nearby topics for a topic to be considered as 'core' in the CBDBSCAN clustering.
masking_method : {'mass', 'rank}, optional
@@ -186,7 +186,6 @@ def __init__(self, topic_model_kind="lda", num_models=3,
         self.num_models = num_models
         self.gensim_kw_args = gensim_kw_args

-        # some settings affecting performance
         self.memory_friendly_ttda = memory_friendly_ttda
         self.distance_workers = distance_workers

From 0612c4a5b4002b67df8392d4dcb09bf7be4666a5 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Tue, 25 Jun 2019 22:10:30 +0200
Subject: [PATCH 036/166] Update gensim/models/ensemblelda.py

Co-Authored-By: Michael Penkov
---
 gensim/models/ensemblelda.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index 1075b63041..3215ed876c 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -217,7 +217,7 @@ def __init__(self, topic_model_kind="lda", num_models=3,
         # parameters, stop here and don't train.
         if num_models <= 0:
             return
-        if ("corpus" not in gensim_kw_args):
+        if "corpus" not in gensim_kw_args:
             return
         if gensim_kw_args["corpus"] is None:
             return

From 24b34b1116baa5d7272072ee80521a113958659c Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Tue, 25 Jun 2019 22:12:39 +0200
Subject: [PATCH 037/166] removed parentheses

---
 gensim/models/ensemblelda.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index aae9897aab..b660d6601f 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -205,7 +205,7 @@ def __init__(self, topic_model_kind="lda", num_models=3,
         # parameters, stop here and don't train.
         if num_models <= 0:
             return
-        if ("corpus" not in gensim_kw_args):
+        if "corpus" not in gensim_kw_args:
             return
         if gensim_kw_args["corpus"] is None:
             return

From d96e1a139d762d6d19ef30581e248294bd137e24 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Tue, 25 Jun 2019 22:20:01 +0200
Subject: [PATCH 038/166] review

---
 gensim/models/ensemblelda.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index b660d6601f..9d65e80d73 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -205,19 +205,16 @@ def __init__(self, topic_model_kind="lda", num_models=3,
         # parameters, stop here and don't train.
if num_models <= 0: return - if "corpus" not in gensim_kw_args: - return - if gensim_kw_args["corpus"] is None: + if gensim_kw_args.get("corpus") is None: return - if "iterations" in gensim_kw_args and gensim_kw_args["iterations"] <= 0: + if gensim_kw_args.get("iterations", 0) <= 0: return - if "passes" in gensim_kw_args and gensim_kw_args["passes"] <= 0: + if gensim_kw_args.get("passes", 0) <= 0: return logger.info("Generating {} topic models...".format(num_models)) - # training + if ensemble_workers > 1: - # multiprocessed self.generate_topic_models_multiproc(num_models, ensemble_workers) else: # singlecore From a1e3d951ac8b6bad71cc93a0cf2261eb2cd51d47 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Tue, 25 Jun 2019 22:27:51 +0200 Subject: [PATCH 039/166] refactor private, hanging indent --- gensim/models/ensemblelda.py | 66 ++++++++++++++++----------------- gensim/test/test_ensemblelda.py | 12 +++--- 2 files changed, 38 insertions(+), 40 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 9d65e80d73..3bbc73dd5a 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -100,7 +100,7 @@ class EnsembleLda(): """ def __init__(self, topic_model_kind="lda", num_models=3, - min_cores=None, # default value from generate_stable_topics() + min_cores=None, # default value from _generate_stable_topics() epsilon=0.1, ensemble_workers=1, memory_friendly_ttda=True, min_samples=None, masking_method="mass", masking_threshold=None, distance_workers=1, random_state=None, **gensim_kw_args): @@ -215,17 +215,17 @@ def __init__(self, topic_model_kind="lda", num_models=3, logger.info("Generating {} topic models...".format(num_models)) if ensemble_workers > 1: - self.generate_topic_models_multiproc(num_models, ensemble_workers) + self._generate_topic_models_multiproc(num_models, ensemble_workers) else: # singlecore - self.generate_topic_models(num_models) + self._generate_topic_models(num_models) - self.generate_asymmetric_distance_matrix( + self._generate_asymmetric_distance_matrix( workers=self.distance_workers, threshold=masking_threshold, method=masking_method) - self.generate_topic_clusters(epsilon, min_samples) - self.generate_stable_topics(min_cores) + self._generate_topic_clusters(epsilon, min_samples) + self._generate_stable_topics(min_cores) # create model that can provide the usual gensim api to the stable topics from the ensemble self.generate_gensim_representation() @@ -327,12 +327,12 @@ def add_model(self, target, num_new_models=None): Make sure that all the models use the exact same dictionary/idword mapping. In order to generate new stable topics afterwards, use - self.generate_asymmetric_distance_matrix() + self._generate_asymmetric_distance_matrix() self.recluster() The ttda of another ensemble can also be used, in that case set num_new_models to the num_models parameter of the ensemble, that means the number of classic models in the ensemble that generated the ttda. This is - important, because that information is used to estimate "min_samples" for generate_topic_clusters. + important, because that information is used to estimate "min_samples" for _generate_topic_clusters. If you trained this ensemble in the past with a certain Dictionary that you want to reuse for other models, you can get it from: self.id2word. 
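As a usage sketch for the add_model/recluster workflow documented above (toy data; assumes the EnsembleLda version from this branch is installed):

    from gensim.corpora import Dictionary
    from gensim.models import EnsembleLda

    docs = [['cat', 'dog', 'bird'], ['dog', 'wolf', 'cat'],
            ['stock', 'market', 'price'], ['market', 'trade', 'price']] * 25
    dictionary = Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    elda = EnsembleLda(corpus=corpus, id2word=dictionary,
                       num_topics=2, num_models=4, passes=10)
    other = EnsembleLda(corpus=corpus, id2word=dictionary,
                        num_topics=2, num_models=4, passes=10)

    # pool both ensembles' topic-term distributions, then redo the distance
    # matrix, the CBDBSCAN clustering and the stable-topic extraction
    elda.add_model(other)
    elda.recluster()
    stable_topics = elda.get_topics()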
@@ -488,7 +488,7 @@ def load(fname): return eLDA - def generate_topic_models_multiproc(self, num_models, ensemble_workers): + def _generate_topic_models_multiproc(self, num_models, ensemble_workers): """Will make the ensemble multiprocess, which results in a speedup on multicore machines. Results from the processes will be piped to the parent and concatenated. @@ -538,7 +538,7 @@ def generate_topic_models_multiproc(self, num_models, ensemble_workers): # get the chunk from the random states that is meant to be for those models random_states_for_worker = random_states[-num_models_unhandled:][:num_subprocess_models] - p = Process(target=self.generate_topic_models, + p = Process(target=self._generate_topic_models, args=(num_subprocess_models, random_states_for_worker, childConn)) processes += [p] @@ -566,7 +566,7 @@ def generate_topic_models_multiproc(self, num_models, ensemble_workers): for p in pipes: answer = p[0].recv() # [0], because that is the parentConn p[0].close() - # this does basically the same as the generate_topic_models function (concatenate all the ttdas): + # this does basically the same as the _generate_topic_models function (concatenate all the ttdas): if not self.memory_friendly_ttda: self.tms += answer ttda = np.concatenate([model.get_topics() for model in answer]) @@ -578,7 +578,7 @@ def generate_topic_models_multiproc(self, num_models, ensemble_workers): for p in processes: p.terminate() - def generate_topic_models(self, num_models, random_states=None, pipe=None): + def _generate_topic_models(self, num_models, random_states=None, pipe=None): """Will train the topic models, that form the ensemble. Parameters @@ -627,7 +627,7 @@ def generate_topic_models(self, num_models, random_states=None, pipe=None): if pipe is not None: # send the ttda that is in the child/workers version of the memory into the pipe - # available, after generate_topic_models has been called in the worker + # available, after _generate_topic_models has been called in the worker if self.memory_friendly_ttda: # remember that this code is inside the worker processes memory, # so self.ttda is the ttda of only a chunk of models @@ -637,7 +637,7 @@ def generate_topic_models(self, num_models, random_states=None, pipe=None): pipe.close() - def asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pipe, threshold, method): + def _asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pipe, threshold, method): """ worker, that computes the distance to all other nodes from a chunk of nodes. 
https://stackoverflow.com/a/1743350 """ @@ -645,14 +645,12 @@ def asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pipe logger.info("Spawned worker to generate {} rows of the asymmetric distance matrix".format(n_ttdas)) # the chunk of ttda that's going to be calculated: ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas] - distance_chunk = self.calculate_asymmetric_distance_matrix_chunk(ttda1=ttda1, ttda2=self.ttda, - threshold=threshold, - start_index=ttdas_sent, - method=method) + distance_chunk = self._calculate_asymmetric_distance_matrix_chunk( + ttda1=ttda1, ttda2=self.ttda, threshold=threshold, start_index=ttdas_sent, method=method) pipe.send((worker_id, distance_chunk)) # remember that this code is inside the workers memory pipe.close() - def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method="mass"): + def _generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method="mass"): """Makes the pairwise distance matrix for all the ttdas from the ensemble. Returns the asymmetric pairwise distance matrix that is used in the DBSCAN clustering. @@ -685,7 +683,7 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= # singlecore: if workers is not None and workers <= 1: - self.asymmetric_distance_matrix = self.calculate_asymmetric_distance_matrix_chunk( + self.asymmetric_distance_matrix = self._calculate_asymmetric_distance_matrix_chunk( ttda1=self.ttda, ttda2=self.ttda, threshold=threshold, start_index=0, method=method) return self.asymmetric_distance_matrix @@ -715,7 +713,7 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= else: n_ttdas = int((len(self.ttda) - ttdas_sent) / (workers - i)) - p = Process(target=self.asymmetric_distance_matrix_worker, + p = Process(target=self._asymmetric_distance_matrix_worker, args=(i, ttdas_sent, n_ttdas, childConn, threshold, method)) ttdas_sent += n_ttdas @@ -742,7 +740,7 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= for p in pipes: answer = p[0].recv() # [0], because that is the parentConn p[0].close() # child conn will be closed from inside the worker - # this does basically the same as the generate_topic_models function (concatenate all the ttdas): + # this does basically the same as the _generate_topic_models function (concatenate all the ttdas): distances += [answer[1]] # end all processes @@ -753,7 +751,7 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= return self.asymmetric_distance_matrix - def calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, start_index, method): + def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, start_index, method): """Iterates over ttda1 and calculates the distance to every ttda in tttda2. @@ -849,11 +847,11 @@ def rank_masking(a): return distances - def generate_topic_clusters(self, eps=0.1, min_samples=None): + def _generate_topic_clusters(self, eps=0.1, min_samples=None): """Runs the DBSCAN algorithm on all the detected topics from the models in the ensemble and labels them with label-indices. - The final approval and generation of stable topics is done in generate_stable_topics(). + The final approval and generation of stable topics is done in _generate_stable_topics(). 
Parameters ---------- @@ -871,7 +869,7 @@ def generate_topic_clusters(self, eps=0.1, min_samples=None): self.cluster_model = CBDBSCAN(eps=eps, min_samples=min_samples) self.cluster_model.fit(self.asymmetric_distance_matrix) - def generate_stable_topics(self, min_cores=None): + def _generate_stable_topics(self, min_cores=None): """generates stable topics out of the clusters. This function is the last step that has to be done in the ensemble. @@ -1048,10 +1046,10 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None): # if new models were added to the ensemble, the distance matrix needs to be generated again if self.asymmetric_distance_matrix_outdated: logger.info("asymmetric distance matrix is outdated due to add_model") - self.generate_asymmetric_distance_matrix() + self._generate_asymmetric_distance_matrix() - self.generate_topic_clusters(eps, min_samples) - self.generate_stable_topics(min_cores) + self._generate_topic_clusters(eps, min_samples) + self._generate_stable_topics(min_cores) self.generate_gensim_representation() # GENSIM API @@ -1060,7 +1058,7 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None): def get_topics(self): return self.stable_topics - def has_gensim_representation(self): + def _has_gensim_representation(self): """checks if stable topics area vailable and if the internal gensim representation exists. If not, raises errors""" if self.classic_model_representation is None: @@ -1071,22 +1069,22 @@ def has_gensim_representation(self): def __getitem__(self, i): """see :py:class:`gensim.models.LdaModel`""" - self.has_gensim_representation() + self._has_gensim_representation() return self.classic_model_representation[i] def inference(self, *posargs, **kwArgs): """see :py:class:`gensim.models.LdaModel`""" - self.has_gensim_representation() + self._has_gensim_representation() return self.classic_model_representation.inference(*posargs, **kwArgs) def log_perplexity(self, *posargs, **kwArgs): """see :py:class:`gensim.models.LdaModel`""" - self.has_gensim_representation() + self._has_gensim_representation() return self.classic_model_representation.log_perplexity(*posargs, **kwArgs) def print_topics(self, *posargs, **kwArgs): """see :py:class:`gensim.models.LdaModel`""" - self.has_gensim_representation() + self._has_gensim_representation() return self.classic_model_representation.print_topics(*posargs, **kwArgs) @property diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 3f1ea8b0b1..5de7b324c7 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -318,21 +318,21 @@ def test_add_and_recluster(self): self.check_ttda(new_eLDA_mu) # 2. distance matrix - new_eLDA.generate_asymmetric_distance_matrix() - new_eLDA_mu.generate_asymmetric_distance_matrix() + new_eLDA._generate_asymmetric_distance_matrix() + new_eLDA_mu._generate_asymmetric_distance_matrix() np.testing.assert_allclose(new_eLDA.asymmetric_distance_matrix, new_eLDA_mu.asymmetric_distance_matrix) # 3. CBDBSCAN results - new_eLDA.generate_topic_clusters() - new_eLDA_mu.generate_topic_clusters() + new_eLDA._generate_topic_clusters() + new_eLDA_mu._generate_topic_clusters() a = new_eLDA.cluster_model.results b = new_eLDA_mu.cluster_model.results self.assert_cluster_results_equal(a, b) # 4. 
finally, the stable topics - new_eLDA.generate_stable_topics() - new_eLDA_mu.generate_stable_topics() + new_eLDA._generate_stable_topics() + new_eLDA_mu._generate_stable_topics() np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics()) From 5d48c8d9593e84e2e4f966cb434726c7dd678117 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Tue, 25 Jun 2019 22:40:32 +0200 Subject: [PATCH 040/166] typo --- gensim/models/ensemblelda.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 3bbc73dd5a..cb78f28153 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -681,14 +681,13 @@ def _generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method logger.info("Generating a {} x {} asymmetric distance matrix...".format(len(self.ttda), len(self.ttda))) - # singlecore: + # singlecore if workers is not None and workers <= 1: self.asymmetric_distance_matrix = self._calculate_asymmetric_distance_matrix_chunk( ttda1=self.ttda, ttda2=self.ttda, threshold=threshold, start_index=0, method=method) return self.asymmetric_distance_matrix - # multicore: - + # else, if workers > 1 use multiprocessing # best performance on 2-core machine: 2 workers if workers is None: workers = os.cpu_count() @@ -822,7 +821,7 @@ def rank_masking(a): distance = 0 # is the masked b just some empty stuff? Then forget about it, no similarity, distance is 1 - # (not 2 because that correspondsto negative values. The maximum distance is 1 here) + # (not 2 because that corresponds to negative values. The maximum distance is 1 here) # don't normalize b_masked, otherwise the following threshold will never work: if b_masked.sum() <= 0.05: distance = 1 From e556e8cac535de9b3f7a407bed681ef60c354daa Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Tue, 25 Jun 2019 22:42:18 +0200 Subject: [PATCH 041/166] Clarifications to ttda in docstrings and in method docstrings. --- gensim/models/ensemblelda.py | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index a10555855c..2f42d992ed 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -130,8 +130,8 @@ def __init__(self, topic_model_kind="lda", num_models=3, Setting it to 0 or 1 will both use the nonmultiprocessing version. Default:1 memory_friendly_ttda : boolean, optional - If True, the models in the ensemble are deleted after training and only the total topic word distribution - is kept to save memory. + If True, the models in the ensemble are deleted after training and only a concatenation of each model's + topic term distribution (called ttda) is kept to save memory. Defaults to True. When False, trained models are stored in a list in self.tms, and no models that are not of a gensim model type can be added to this ensemble using the add_model function. @@ -758,8 +758,10 @@ def generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method= return self.asymmetric_distance_matrix def calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, start_index, method): - """Iterates over ttda1 and calculates the - distance to every ttda in tttda2. + """Internal method calculating an (asymmetric) distance from each topic term distribution in ttda1 to each topic + term distribution in ttda2 and returns a matrix of distances, size len(ttda1) by len(ttda2). 
+ + This is an internal method and should not be used for two general ttda arrays. Parameters ---------- @@ -774,12 +776,13 @@ def calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, st pieces, each 100 ttdas long, then start_index should be be 100. default is 0 method : {'mass', 'rank}, optional method can be "mass" for the original masking method or "rank" for a faster masking method that selects - by rank of largest elements in the topic word distribution, to determine which tokens are relevant for the + by rank of largest elements in the topic term distribution, to determine which tokens are relevant for the topic. Returns ------- 2D Numpy.numpy.ndarray of floats + Asymmetric distance matrix of size len(ttda1) by len(ttda2). """ if method not in ["mass", "rank"]: @@ -806,39 +809,38 @@ def rank_masking(a): distances = np.ndarray((len(ttda1), len(ttda2))) # now iterate over each topic - for i in range(len(ttda1)): + for ttd1_idx, ttd1 in enumerate(ttda1): - # create mask from a, that removes noise from a and keeps the largest terms - a = ttda1[i] - mask = create_mask(a) - a_masked = a[mask] + # create mask from ttd1 that removes noise from a and keeps the largest terms + mask = create_mask(ttd1) + ttd1_masked = ttd1[mask] avg_mask_size += mask.sum() # now look at every possible pair for topic a: - for j in range(len(ttda2)): + for ttd2_idx, ttd2 in enumerate(ttda2): # distance to itself is 0 - if i + start_index == j: - distances[i][j] = 0 + if ttd1_idx + start_index == ttd2_idx: + distances[ttd1_idx][ttd2_idx] = 0 continue # now mask b based on a, which will force the shape of a onto b - b_masked = ttda2[j][mask] + ttd2_masked = ttd2[mask] distance = 0 # is the masked b just some empty stuff? Then forget about it, no similarity, distance is 1 # (not 2 because that correspondsto negative values. The maximum distance is 1 here) # don't normalize b_masked, otherwise the following threshold will never work: - if b_masked.sum() <= 0.05: + if ttd2_masked.sum() <= 0.05: distance = 1 else: # if there is indeed some non-noise stuff in the masked topic b, look at # how similar the two topics are. note that normalizing is not needed for cosine distance, # as it only looks at the angle between vectors - distance = cosine(a_masked, b_masked) + distance = cosine(ttd1_masked, ttd2_masked) - distances[i][j] = distance + distances[ttd1_idx][ttd2_idx] = distance avg_mask_size = avg_mask_size / ttda1.shape[0] / ttda1.shape[1] percent = round(100 * avg_mask_size, 1) From dc566f3a68c40d9057468ae6efa2816a5c36ea76 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 29 Jul 2019 21:24:08 +0200 Subject: [PATCH 042/166] docstrings, masks explained and mask warning removed --- gensim/models/ensemblelda.py | 48 +++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 1719ebc4a1..133da0b9dc 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -4,9 +4,10 @@ # Authors: Tobias B , Alex Loosley , # Stephan Sahm , Alex Salles , Data Reply Munich -"""Ensemble Latent Dirichlet Allocation (LDA), a method of ensembling multiple gensim topic models. They are clustered -using DBSCAN, for which nearby topic mixtures - which are mixtures of two or more true topics and therefore undesired - -are being used to support topics in becoming cores, but not vice versa. 
+"""Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble and extracting +reliable topics that are consistently learned accross the ensemble. eLDA has the added benefit that the user +does not need to know the exact number of topics the topic model should extract ahead of time. For more details +read our paper (https://www.hip70890b.de/machine_learning/ensemble_LDA/). Usage examples -------------- @@ -93,10 +94,10 @@ class EnsembleLda(): - """Ensemble Latent Dirichlet Allocation (LDA), a method of ensembling multiple gensim topic models. - They are clustered using a variation of DBSCAN, for which nearby topic mixtures - which are mixtures - of two or more true topics and therefore undesired - are being used to support topics in becoming - cores, but not vice versa. + """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble and extracting + reliable topics that are consistently learned accross the ensemble. eLDA has the added benefit that the user + does not need to know the exact number of topics the topic model should extract ahead of time. For more details + read our paper (https://www.hip70890b.de/machine_learning/ensemble_LDA/). """ def __init__(self, topic_model_kind="lda", num_models=3, @@ -123,7 +124,7 @@ def __init__(self, topic_model_kind="lda", num_models=3, min_cores : int, optional Minimum cores a cluster of topics has to contain so that it is recognized as stable topic. epsilon : float, optional - Defaults to 0.1. Epsilon for the cbdbscan clustering that generates the stable topics. + Defaults to 0.1. Epsilon for the CBDBSCAN clustering that generates the stable topics. ensemble_workers : int, optional Spawns that many processes and distributes the models from the ensemble to those as evenly as possible. num_models should be a multiple of ensemble_workers. @@ -139,10 +140,21 @@ def __init__(self, topic_model_kind="lda", num_models=3, If False, any topic term matrix can be suplied to add_model. min_samples : int, optional Required int of nearby topics for a topic to be considered as 'core' in the CBDBSCAN clustering. - masking_method : {'mass', 'rank}, optional - One of "mass" (default) or "rank" (faster). + masking_method : {'mass', 'rank'}, optional + Choose one of "mass" (default) or "rank" (percentile, faster). + + For clustering, distances between topic-term distributions are asymmetric. In particular, the distance + (technically a divergence) from distribution A to B is more of a measure of if A is contained in B. At a + high level, this involves using distribution A to mask distribution B and then calculating the cosine + distance between the two. The masking can be done in two ways: + mass: forms mask by taking the top ranked terms until their cumulative mass reaches the + `masking_threshold` + rank: forms mask by taking the top ranked terms (by mass) until the `masking_threshold` is reached. + For example, a ranking threshold of 0.11 means the top 0.11 terms by weight are used to form a mask. masking_threshold : float, optional - Default: None, which uses 0.11 for masking_method "rank", and 0.95 for "mass". + Default: None, which uses 0.95 for "mass", and 0.11 for masking_method "rank". In general, too small a mask + threshold leads to inaccurate calculations (no signal) and too big a mask leads to noisy distance + calculations. Defaults are often a good sweet spot for this hyperparameter. 
distance_workers : int, optional When distance_workers is None, it defaults to os.cpu_count() for maximum performance. Default is 1, which is not multiprocessed. Set to > 1 to enable multiprocessing. @@ -639,7 +651,7 @@ def _generate_topic_models(self, num_models, random_states=None, pipe=None): pipe.close() def _asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pipe, threshold, method): - """ worker, that computes the distance to all other nodes + """ Worker that computes the distance to all other nodes from a chunk of nodes. https://stackoverflow.com/a/1743350 """ @@ -836,16 +848,8 @@ def rank_masking(a): distances[ttd1_idx][ttd2_idx] = distance - avg_mask_size = avg_mask_size / ttda1.shape[0] / ttda1.shape[1] - percent = round(100 * avg_mask_size, 1) - if avg_mask_size > 0.75: - # if the masks covered more than 75% of tokens on average, - # print a warning. info otherwise. - # The default mass setting of 0.95 makes uniform true masks on the opinosis corpus. - logger.warning('The given threshold of {} covered on average ' - '{}% of tokens, which might be too much'.format(threshold, percent)) - else: - logger.info('The given threshold of {} covered on average {}% of tokens'.format(threshold, percent)) + percent = round(100 * avg_mask_size / ttda1.shape[0] / ttda1.shape[1], 1) + logger.info('The given threshold of {} covered on average {}% of tokens'.format(threshold, percent)) return distances From 9d57533d0210fc2b23fc1c8254ed942b7a4d186d Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 29 Jul 2019 21:45:35 +0200 Subject: [PATCH 043/166] created internal variable for cosine distance calculations --- gensim/models/ensemblelda.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 133da0b9dc..d48ceac61e 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -92,7 +92,6 @@ logger = logging.getLogger(__name__) - class EnsembleLda(): """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble and extracting reliable topics that are consistently learned accross the ensemble. eLDA has the added benefit that the user @@ -162,10 +161,15 @@ def __init__(self, topic_model_kind="lda", num_models=3, Parameters for each gensim model (e.g. :py:class:`gensim.models.LdaModel`) in the ensemble. """ + # INTERNAL PARAMETERS # Set random state # nps max random state of 2**32 - 1 is too large for windows: self._MAX_RANDOM_STATE = np.iinfo(np.int32).max + # _COSINE_DISTANCE_CALCULATION_THRESHOLD is used so that cosine distance calculations can be sped up by skipping + # distance calculations for highly masked topic-term distributions + self._COSINE_DISTANCE_CALCULATION_THRESHOLD = 0.05 + if "id2word" not in gensim_kw_args: gensim_kw_args["id2word"] = None if "corpus" not in gensim_kw_args: @@ -834,16 +838,11 @@ def rank_masking(a): # now mask b based on a, which will force the shape of a onto b ttd2_masked = ttd2[mask] - distance = 0 - # is the masked b just some empty stuff? Then forget about it, no similarity, distance is 1 - # (not 2 because that corresponds to negative values. The maximum distance is 1 here) - # don't normalize b_masked, otherwise the following threshold will never work: - if ttd2_masked.sum() <= 0.05: + # Smart distance calculation avoids calculating cosine distance for highly masked topic-term + # distributions that will have distance values near 1. 
+ if ttd2_masked.sum() <= self._COSINE_DISTANCE_CALCULATION_THRESHOLD: distance = 1 else: - # if there is indeed some non-noise stuff in the masked topic b, look at - # how similar the two topics are. note that normalizing is not needed for cosine distance, - # as it only looks at the angle between vectors distance = cosine(ttd1_masked, ttd2_masked) distances[ttd1_idx][ttd2_idx] = distance From d27cd593c18631f37e89a3786ad08264a56c67c5 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 28 Aug 2019 20:41:25 +0200 Subject: [PATCH 044/166] cbdbscan docstring --- gensim/models/ensemblelda.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index d48ceac61e..41f51e3dbb 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -74,11 +74,13 @@ References ---------- -.. [1] REHUREK, Radim and Sojka, PETR, 2010, Software framework for topic - modelling with large corpora. In : THE LREC 2010 WORKSHOP ON NEW - CHALLENGES FOR NLP FRAMEWORKS [online]. Msida : University of Malta. - 2010. p. 45-50. Available from: - http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.695.4595 +.. [1] REHUREK, Radim and Sojka, PETR, 2010, Software framework for topic modelling with large corpora. In : THE LREC + 2010 WORKSHOP ON NEW CHALLENGES FOR NLP FRAMEWORKS [online]. Msida : University of Malta. 2010. p. 45-50. + Available from: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.695.4595 + +.. [2] BRIGL, Tobias, 2019, Extracting Reliable Topics using Ensemble Latent Dirichlet Allocation: bachelor thesis. + Available from: https://www.hip70890b.de/machine_learning/ensemble_LDA/ + TODO country? thi? loosley, salles, stephan? """ @@ -95,8 +97,7 @@ class EnsembleLda(): """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble and extracting reliable topics that are consistently learned accross the ensemble. eLDA has the added benefit that the user - does not need to know the exact number of topics the topic model should extract ahead of time. For more details - read our paper (https://www.hip70890b.de/machine_learning/ensemble_LDA/). + does not need to know the exact number of topics the topic model should extract ahead of time [2]. """ def __init__(self, topic_model_kind="lda", num_models=3, @@ -1098,14 +1099,15 @@ def id2word(self): class CBDBSCAN(): + """A Variation of the DBSCAN algorithm which limits how much a cluster is allowed to spread out and which works + on an asymmetric distance matrix. + """ def __init__(self, eps=0.1, min_samples=4): - self.eps = eps self.min_samples = min_samples def fit(self, amatrix): - self.next_label = 0 results = [{ From 42aa7ad6695258874bb3e051dad598ef9dd48bbb Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 28 Aug 2019 20:49:05 +0200 Subject: [PATCH 045/166] moved validate_core outside --- gensim/models/ensemblelda.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 41f51e3dbb..c65f94bbdb 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -875,6 +875,13 @@ def _generate_topic_clusters(self, eps=0.1, min_samples=None): self.cluster_model = CBDBSCAN(eps=eps, min_samples=min_samples) self.cluster_model.fit(self.asymmetric_distance_matrix) + def _validate_core(core): + """Core is a dict of {is_core, valid_parents, labels} among others. + If not a core returns False. 
Only cores with the valid_parents as + its own label can be a valid core. Returns True if that is the case, + False otherwise""" + return core["is_core"] and (core["valid_parents"] == {core["label"]}) + def _generate_stable_topics(self, min_cores=None): """generates stable topics out of the clusters. This function is the last step that has to be done in the ensemble. @@ -999,19 +1006,8 @@ def remove_from_all_sets(label): for topic in results: topic["valid_parents"] = {label for label in topic["parent_labels"] if label in valid_labels} - def validate_core(core): - """Core is a dict of {is_core, valid_parents, labels} among others. - If not a core returns False. Only cores with the valid_parents as - its own label can be a valid core. Returns True if that is the case, - False otherwise""" - if not core["is_core"]: - return False - else: - ret = core["valid_parents"] == {core["label"]} - return ret - # keeping only VALID cores - valid_core_mask = np.vectorize(validate_core)(results) + valid_core_mask = np.vectorize(self._validate_core)(results) valid_topics = self.ttda[valid_core_mask] topic_labels = np.array([topic["label"] for topic in results])[valid_core_mask] unique_labels = np.unique(topic_labels) From eaf62d6ced40bae39a570ff32f797b48a3ee267e Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Wed, 28 Aug 2019 21:15:27 +0200 Subject: [PATCH 046/166] added citation note --- gensim/models/ensemblelda.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index c65f94bbdb..3a0f31e814 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -82,6 +82,13 @@ Available from: https://www.hip70890b.de/machine_learning/ensemble_LDA/ TODO country? thi? loosley, salles, stephan? +Citation +-------- +At the moment, there is no paper associated to ensemble LDA but we are working on publicly releasing Tobi Brigl's +bachelor thesis on the topic. In the meantime, please include a mention of us (Brigl, T.; Salles, A.; Loosley, A) and +a link to this file (https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/ensemblelda.py) if you use +this work in a published or open-source project. + """ import logging From 2e2eb1667347a8fc9b14ceec632e9b91a7332158 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 28 Aug 2019 22:15:17 +0200 Subject: [PATCH 047/166] moved more stuff outside of _generate_stable_topics --- gensim/models/ensemblelda.py | 141 ++++++++++++++++++++++------------- 1 file changed, 90 insertions(+), 51 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index c65f94bbdb..1a39ebe01f 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Authors: Tobias B , Alex Loosley , +# Authors: Tobias Brigl , Alex Loosley , # Stephan Sahm , Alex Salles , Data Reply Munich """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble and extracting @@ -875,45 +875,34 @@ def _generate_topic_clusters(self, eps=0.1, min_samples=None): self.cluster_model = CBDBSCAN(eps=eps, min_samples=min_samples) self.cluster_model.fit(self.asymmetric_distance_matrix) - def _validate_core(core): - """Core is a dict of {is_core, valid_parents, labels} among others. - If not a core returns False. Only cores with the valid_parents as - its own label can be a valid core. 
Returns True if that is the case, - False otherwise""" - return core["is_core"] and (core["valid_parents"] == {core["label"]}) - - def _generate_stable_topics(self, min_cores=None): - """generates stable topics out of the clusters. This function is the last step that has to be done in the - ensemble. - - Stable topics can be retreived afterwards using get_topics(). + def _validate_core(self, core): + """If core is not a core returns False. Only cores with the valid_parents as its own label can be a valid core. + Returns True if that is the case, False otherwise. Parameters ---------- - min_cores : int - how many cores a cluster has to have, to be treated as stable topic. That means, how many topics - that look similar have to be present, so that the average topic in those is used as stable topic. - - defaults to min_cores = min(3, max(1, int(self.num_models /4 +1))) - + core : {'is_core', 'valid_parents', 'labels'} + eps is 0.1 by default. """ + return core["is_core"] and (core["valid_parents"] == {core["label"]}) - logger.info("Generating stable topics") - - # min_cores being 0 makes no sense. there has to be a core for a cluster - # or there is no cluster - if min_cores == 0: - min_cores = 1 - - if min_cores is None: - # min_cores is a number between 1 and 3, depending on the number of models - min_cores = min(3, max(1, int(self.num_models / 4 + 1))) + def _group_by_labels(self, results): + """Groups all the learned cores by their label, which was assigned in the cluster_model - results = self.cluster_model.results + Parameters + ---------- + results : {list of {'is_core', 'parent_labels', 'num_samples', 'label'}} + After calling .fit on a CBDBSCAN model, the results can be retrieved from it by accessing the .results + member, which can be used as the argument to this function. It's a list of infos gathered during + the clustering step and each element in the list corresponds to a single topic. - # first, group all the learned cores based on the label, - # which was assigned in the cluster_model. The result is a - # dict of {group: [topic, ...]} + Returns + ------- + dict of (int, list of {'is_core', 'amount_parent_labels', 'parent_labels'}) + A mapping of the label to a list of topics that belong to that particular label. Also adds + a new member to each topic called amount_parent_labels, which is the number of parent_labels of that + topic. + """ grouped_by_labels = {} for topic in results: if topic["is_core"]: @@ -926,22 +915,38 @@ def _generate_stable_topics(self, min_cores=None): if label not in grouped_by_labels: grouped_by_labels[label] = [] grouped_by_labels[label].append(topic) + return grouped_by_labels + + def _aggregate_topics(self, grouped_by_labels): + """Aggregates the topics to a list of clusters - # then aggregate their amount_parent_labels by maxing and parent_labels by concatenating - # the result is sorted by amount_parent_labels and stored in sorted_clusters + Parameters + ---------- + grouped_by_labels : dict of (int, list of {'is_core', 'amount_parent_labels', 'parent_labels'}) + The return value of _group_by_labels. A mapping of the label to a list of each topic which belongs to the + label. + + Returns + ------- + list of {'amount_parent_labels', 'parent_labels', 'label'} + amount_parent_labels is the max number of parent labels among each topic of a given cluster. label refers + to the label identifier of the cluster. parent_labels is a concatenated list of the parent_labels sets of + each topic. Its sorted by amount_parent_labels in descending order. 
There is one single element for each + cluster. + """ sorted_clusters = [] + for label, group in grouped_by_labels.items(): amount_parent_labels = 0 parent_labels = [] # will be a list of sets + for topic in group: amount_parent_labels = max(topic["amount_parent_labels"], amount_parent_labels) parent_labels.append(topic["parent_labels"]) - # - here, nan means "not yet evaluated" - # - removing empty sets from parent_labels + sorted_clusters.append({ "amount_parent_labels": amount_parent_labels, "parent_labels": [x for x in parent_labels if len(x) > 0], - "is_valid": np.nan, "label": label }) @@ -952,31 +957,67 @@ def _generate_stable_topics(self, min_cores=None): cluster["label"] # makes sorting deterministic ), reverse=True) - def remove_from_all_sets(label): - """removes a label from every set in "parent_labels" for - each core in sorted_clusters.""" - for a in sorted_clusters: - if label in a["parent_labels"]: - a["parent_labels"].remove(label) + return sorted_clusters + + def _remove_from_all_sets(self, label, clusters): + """removes a label from every set in "parent_labels" for + each core in clusters""" + for a in clusters: + if label in a["parent_labels"]: + a["parent_labels"].remove(label) + + def _generate_stable_topics(self, min_cores=None): + """generates stable topics out of the clusters. This function is the last step that has to be done in the + ensemble. + + Stable topics can be retreived afterwards using get_topics(). + + Parameters + ---------- + min_cores : int + how many cores a cluster has to have, to be treated as stable topic. That means, how many topics + that look similar have to be present, so that the average topic in those is used as stable topic. + + defaults to min_cores = min(3, max(1, int(self.num_models /4 +1))) + + """ + logger.info("Generating stable topics") + + # min_cores being 0 makes no sense. there has to be a core for a cluster + # or there is no cluster + if min_cores == 0: + min_cores = 1 + + if min_cores is None: + # min_cores is a number between 1 and 3, depending on the number of models + min_cores = min(3, max(1, int(self.num_models / 4 + 1))) + + results = self.cluster_model.results + + grouped_by_labels = self._group_by_labels(results) + + sorted_clusters = self._aggregate_topics(grouped_by_labels) # APPLYING THE RULES 1 to 3 # iterate over the cluster labels, see which clusters/labels # are valid to cause the creation of a stable topic for i, cluster in enumerate(sorted_clusters): + # here, nan means "not yet evaluated" + cluster["is_valid"] = np.nan + label = cluster["label"] # label is iterating over 0, 1, 3, ... # 1. rule - remove if the cores in the cluster have no parents if cluster["amount_parent_labels"] == 0: - # label is NOT VALID cluster["is_valid"] = False - remove_from_all_sets(label) + self._remove_from_all_sets(label, sorted_clusters) # 2. rule - remove if it has less than min_cores as parents # (parents are always also cores) if len(cluster["parent_labels"]) < min_cores: cluster["is_valid"] = False - remove_from_all_sets(label) + self._remove_from_all_sets(label, sorted_clusters) # 3. 
checking for "easy_valid"s # checks if the core has at least min_cores of cores with the only label as itself @@ -984,7 +1025,7 @@ def remove_from_all_sets(label): if sum(map(lambda x: x == {label}, cluster["parent_labels"])) >= min_cores: cluster["is_valid"] = True - # Reaplying the rule 3 + # Reapplying the rule 3 # this happens when we have a close relationship among 2 or more clusters for cluster in [cluster for cluster in sorted_clusters if np.isnan(cluster["is_valid"])]: @@ -1097,8 +1138,8 @@ def id2word(self): class CBDBSCAN(): """A Variation of the DBSCAN algorithm which limits how much a cluster is allowed to spread out and which works on an asymmetric distance matrix. - """ + """ def __init__(self, eps=0.1, min_samples=4): self.eps = eps self.min_samples = min_samples @@ -1126,7 +1167,6 @@ def fit(self, amatrix): num_topics = len(amatrix) def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors=None): - # count how many neighbors # check which indices (arange 0 to num_topics) is closer than eps neighbors = np.arange(num_topics)[tmp_amatrix[topic_index] < self.eps] @@ -1178,7 +1218,6 @@ def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors results[topic_index]["label"] = current_label for neighbor in neighbors: - if results[neighbor]["label"] is None: ordered_min_similarity.remove(neighbor) # try to extend the cluster into the direction From 0a6c1f6162d42ac3dfb84c69ea2b59a8547cca33 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 28 Aug 2019 22:16:36 +0200 Subject: [PATCH 048/166] typos --- gensim/models/ensemblelda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 55ef8d3be0..20d0c22f12 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -977,7 +977,7 @@ def _generate_stable_topics(self, min_cores=None): """generates stable topics out of the clusters. This function is the last step that has to be done in the ensemble. - Stable topics can be retreived afterwards using get_topics(). + Stable topics can be retrieved afterwards using get_topics(). Parameters ---------- @@ -1077,7 +1077,7 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None): Generates stable topics out of the clusters afterwards. - Finally, stable topics can be retreived using get_topics(). + Finally, stable topics can be retrieved using get_topics(). Parameters ---------- From 000298291139e9a549877e8018db411a80b1705b Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Wed, 28 Aug 2019 22:34:20 +0200 Subject: [PATCH 049/166] explained CBDBSCAN --- gensim/models/ensemblelda.py | 90 +++++++++++++++++++++++++----------- 1 file changed, 63 insertions(+), 27 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 3a0f31e814..4727f3b2b0 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1031,6 +1031,9 @@ def remove_from_all_sets(label): self.sorted_clusters = sorted_clusters self.stable_topics = stable_topics + def _get_total_parent_labels(self, cluster): + return sum(map(lambda x: x == {label}, cluster["parent_labels"])) + def recluster(self, eps=0.1, min_samples=None, min_cores=None): """Runs the CBDBSCAN algorithm on all the detected topics from the children of the ensemble. @@ -1044,7 +1047,7 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None): epsilon for the CBDBSCAN algorithm, having the same meaning as in classic DBSCAN clustering. 
             default: 0.1
         min_samples : int
-            The minimum number of sampels in the neighborhood of a topic to be considered a core in CBDBSCAN.
+            The minimum number of samples in the neighborhood of a topic to be considered a core in CBDBSCAN.
             default: int(self.num_models / 2)
         min_cores : int
             how many cores a cluster has to have, to be treated as stable topic. That means, how many topics
@@ -1102,11 +1105,39 @@ def id2word(self):
 
 
 class CBDBSCAN():
-    """A Variation of the DBSCAN algorithm which limits how much a cluster is allowed to spread out and which works
-    on an asymmetric distance matrix.
+    """A Variation of the DBSCAN algorithm called Checkback DBSCAN (CBDBSCAN). The algorithm works based on
+    DBSCAN-like parameters `eps` and `min_samples` that respectively define how far a "nearby" point is, and the
+    minimum number of nearby points needed to label a candidate datapoint a core of a cluster.
+    (See https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html).
+
+    The algorithm works as follows:
+    1. (A)symmetric distance matrix provided at fit-time.
+    For the sake of the example below, assume there are only three topics (amatrix contains distances with dim 3x3),
+    T_1, T_2, and T_3:
+    2. Start by scanning a candidate topic
+    (e.g. T_1)
+    3. Check which topics are nearby and call them neighbours
+    (e.g. assume T_2, and T_3 are nearby and become neighbours)
+    4. If there are more neighbours than `min_samples`, the candidate topic becomes a core candidate for a cluster
+    (e.g. if `min_samples`<=2 then T_1 becomes the first core of a cluster)
+    5. CheckBack (CB) Step: Check if at least 25% of other cluster cores are nearby to core candidate
+    (e.g. at this stage in the example, the cluster is empty, so checkback passes)
+    6. Recursively scan the next nearby topic (e.g. scan T_2) - repeat steps 2.-6.
+
+    The CB step has the effect that it enforces cluster compactness and allows the model to avoid creating clusters
+    for unreliable topics that are compositions of multiple reliable topics (something that often occurs in LDA
+    models and is one cause of unreliable topics).
+
+    Parameters
+    ----------
+    eps : float
+        epsilon for the CBDBSCAN algorithm, having the same meaning as in classic DBSCAN clustering.
+        default: 0.1
+    min_samples : int
+        The minimum number of samples in the neighborhood of a topic to be considered a core in CBDBSCAN.
""" - def __init__(self, eps=0.1, min_samples=4): + def __init__(self, eps, min_samples): self.eps = eps self.min_samples = min_samples @@ -1121,31 +1152,35 @@ def fit(self, amatrix): "label": None } for _ in range(len(amatrix))] - tmp_amatrix = amatrix.copy() + amatrix_copy = amatrix.copy() # to avoid problem about comparing the topic with itself - np.fill_diagonal(tmp_amatrix, 1) + np.fill_diagonal(amatrix_copy, 1) - min_distance_per_topic = [(distance, index) for index, distance in enumerate(tmp_amatrix.min(axis=1))] + min_distance_per_topic = [(distance, index) for index, distance in enumerate(amatrix_copy.min(axis=1))] min_distance_per_topic_sorted = sorted(min_distance_per_topic, key=lambda x: x) ordered_min_similarity = [index for distance, index in min_distance_per_topic_sorted] - num_topics = len(amatrix) - def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors=None): - # count how many neighbors - # check which indices (arange 0 to num_topics) is closer than eps - neighbors = np.arange(num_topics)[tmp_amatrix[topic_index] < self.eps] - num_samples = len(neighbors) - - # If the number of neighbors of a topic is large enough, + # count how many neighbor_indices + # check which indices is closer than eps + neighbouring_topic_indices = np.array( + [ + candidate_topic_index + for candidate_topic_index, is_neighbour in enumerate(amatrix_copy[topic_index] < self.eps) + if is_neighbour + ] + ) + num_neighbouring_topics = len(neighbouring_topic_indices) + + # If the number of neighbor_indices of a topic is large enough, # it is considered a core. - # This also takes neighbors that already are identified as core in count. - if num_samples >= self.min_samples: + # This also takes neighbor_indices that already are identified as core in count. + if num_neighbouring_topics >= self.min_samples: # This topic is a core! results[topic_index]["is_core"] = True - results[topic_index]["num_samples"] = num_samples + results[topic_index]["num_samples"] = num_neighbouring_topics # if current_label is none, then this is the first core # of a new cluster (hence next_label is used) @@ -1161,7 +1196,7 @@ def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors # Check the distance between the # new core and the parent - # parent neighbors is the list of neighbors of parent_id + # parent neighbor_indices is the list of neighbor_indices of parent_id # (the topic_index that called this function recursively) all_members_of_current_cluster = list(parent_neighbors) all_members_of_current_cluster.append(parent_id) @@ -1173,7 +1208,7 @@ def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors all_members_of_current_cluster_ix = np.ix_([topic_index], all_members_of_current_cluster) # use the result of the previous step to index the matrix and see if those distances # are smaller then epsilon. 
relations_to_the_cluster is a boolean array, True for close elements - relations_to_the_cluster = tmp_amatrix[all_members_of_current_cluster_ix] < self.eps + relations_to_the_cluster = amatrix_copy[all_members_of_current_cluster_ix] < self.eps # if less than 25% of the elements are close, then the topic index in question is not a # core of the current_label, but rather the core of a new cluster @@ -1184,19 +1219,20 @@ def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors results[topic_index]["label"] = current_label - for neighbor in neighbors: + for neighbouring_topic_index in neighbouring_topic_indices: - if results[neighbor]["label"] is None: - ordered_min_similarity.remove(neighbor) + if results[neighbouring_topic_index]["label"] is None: + ordered_min_similarity.remove(neighbouring_topic_index) # try to extend the cluster into the direction # of the neighbor scan_topic( - neighbor, parent_id=topic_index, + neighbouring_topic_index, parent_id=topic_index, current_label=current_label, - parent_neighbors=neighbors) + parent_neighbors=neighbouring_topic_indices + ) - results[neighbor]["parent_ids"].add(topic_index) - results[neighbor]["parent_labels"].add(current_label) + results[neighbouring_topic_index]["parent_ids"].add(topic_index) + results[neighbouring_topic_index]["parent_labels"].add(current_label) else: # this topic is not a core! From 9675016e19a706a69071cc093ae6e12ce010fb6a Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Wed, 28 Aug 2019 22:39:42 +0200 Subject: [PATCH 050/166] added extra explanation: --- gensim/models/ensemblelda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 22b1acfbbb..f387918665 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1154,9 +1154,9 @@ class CBDBSCAN(): T_1, T_2, and T_3: 2. Start by scanning a candidate topic (e.g. T_1) - 3. Check which topics are nearby and call them neighbours + 3. Check which topics are nearby using `self.eps` as a threshold and call them neighbours (e.g. assume T_2, and T_3 are nearby and become neighbours) - 4. If there are more neighbours than `min_samples`, the candidate topic becomes a core candidate for a cluster + 4. If there are more neighbours than `self.min_samples`, the candidate topic becomes a core candidate for a cluster (e.g. if `min_samples`<=2 then T_1 becomes the first core of a cluster) 5. CheckBack (CB) Step: Check if at least 25% of other cluster cores are nearby to core candidate (e.g. at this stage in the example, the cluster is empty, so checkback passes) From b53704a8a4305d8fb47fbce5e568f04161c07ded Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 28 Aug 2019 22:44:11 +0200 Subject: [PATCH 051/166] using none instead of nan for unchecked core --- gensim/models/ensemblelda.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 20d0c22f12..d38c615c7e 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1009,8 +1009,8 @@ def _generate_stable_topics(self, min_cores=None): # iterate over the cluster labels, see which clusters/labels # are valid to cause the creation of a stable topic for i, cluster in enumerate(sorted_clusters): - # here, nan means "not yet evaluated" - cluster["is_valid"] = np.nan + # here, None means "not yet evaluated" + cluster["is_valid"] = None label = cluster["label"] # label is iterating over 0, 1, 3, ... 
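Reduced to its essence, the CheckBack test that distinguishes CBDBSCAN from plain DBSCAN behaves as in the following sketch (illustrative names; the real scan_topic interleaves this check with the recursive neighbour scan):

    import numpy as np

    def passes_checkback(amatrix, candidate, cluster_members, eps):
        # a candidate core only extends the current cluster if it is close
        # (distance < eps) to at least 25% of the topics already in it,
        # otherwise it becomes the seed of a new cluster
        if len(cluster_members) == 0:
            return True  # first core of a fresh cluster
        close = amatrix[candidate, cluster_members] < eps
        return close.mean() >= 0.25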
@@ -1034,7 +1034,7 @@ def _generate_stable_topics(self, min_cores=None): # Reapplying the rule 3 # this happens when we have a close relationship among 2 or more clusters - for cluster in [cluster for cluster in sorted_clusters if np.isnan(cluster["is_valid"])]: + for cluster in [cluster for cluster in sorted_clusters if cluster["is_valid"] is None]: # this checks for parent_labels, which are also modified in this function # hence order will influence the result. it starts with the most significant From 4f3de9670adebad4c0166c392ab1c3b7e2cd0e64 Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Wed, 28 Aug 2019 22:54:18 +0200 Subject: [PATCH 052/166] updated docs --- gensim/models/ensemblelda.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index f387918665..a6c94b58e8 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -89,6 +89,11 @@ a link to this file (https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/ensemblelda.py) if you use this work in a published or open-source project. +Other Notes +----------- +.. The adjectives stable and reliable (topics) are used somewhat interchangeably throughout the doc strings and +comments. + """ import logging @@ -1005,9 +1010,8 @@ def _generate_stable_topics(self, min_cores=None): sorted_clusters = self._aggregate_topics(grouped_by_labels) - # APPLYING THE RULES 1 to 3 - # iterate over the cluster labels, see which clusters/labels - # are valid to cause the creation of a stable topic + # Iterate over the cluster labels, see which clusters/labels + # are valid to trigger the creation of a stable topic for i, cluster in enumerate(sorted_clusters): # here, nan means "not yet evaluated" cluster["is_valid"] = np.nan @@ -1073,11 +1077,10 @@ def _generate_stable_topics(self, min_cores=None): self.stable_topics = stable_topics def recluster(self, eps=0.1, min_samples=None, min_cores=None): - """Runs the CBDBSCAN algorithm on all the detected topics from the children of the ensemble. - - Generates stable topics out of the clusters afterwards. + """Runs the CBDBSCAN algorithm on all the topics in the ensemble and uses resulting clusters to identify + reliable topics. - Finally, stable topics can be retrieved using get_topics(). + Stable topics can be retrieved using get_topics(). 
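For reference, the three rules that decide whether a cluster yields a stable topic can be summarized like this (a simplified sketch; the actual pass also removes the labels of invalidated clusters from the remaining parent_labels sets as it goes, which makes the real result order-dependent):

    def is_valid_cluster(cluster, min_cores):
        # rule 1: invalid if the cores in the cluster have no parents at all
        if cluster["amount_parent_labels"] == 0:
            return False
        # rule 2: invalid if fewer than min_cores cores act as parents
        if len(cluster["parent_labels"]) < min_cores:
            return False
        # rule 3: valid if at least min_cores cores have this cluster's own
        # label as their only parent label
        label = cluster["label"]
        return sum(parents == {label} for parents in cluster["parent_labels"]) >= min_cores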
Parameters ---------- @@ -1098,8 +1101,13 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None): logger.info("asymmetric distance matrix is outdated due to add_model") self._generate_asymmetric_distance_matrix() + # Run CBDBSCAN to get topic clusters: self._generate_topic_clusters(eps, min_samples) + + # Interpret the results of CBDBSCAN to identify reliable topics: self._generate_stable_topics(min_cores) + + # Create gensim LdaModel representation of topic model with reliable topics (can be used for inference): self.generate_gensim_representation() # GENSIM API From 2ed2cfe08f94eabb5f0718a76ff0c6e326bb32be Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 28 Aug 2019 23:00:00 +0200 Subject: [PATCH 053/166] refactored kind to class, fixed check how to proceed with topic_model_class --- gensim/models/ensemblelda.py | 18 +++++++++--------- gensim/test/test_ensemblelda.py | 8 ++++---- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index bd844adc22..ae121453f9 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -107,7 +107,7 @@ class EnsembleLda(): does not need to know the exact number of topics the topic model should extract ahead of time [2]. """ - def __init__(self, topic_model_kind="lda", num_models=3, + def __init__(self, topic_model_class="lda", num_models=3, min_cores=None, # default value from _generate_stable_topics() epsilon=0.1, ensemble_workers=1, memory_friendly_ttda=True, min_samples=None, masking_method="mass", masking_threshold=None, @@ -116,7 +116,7 @@ def __init__(self, topic_model_kind="lda", num_models=3, Parameters ---------- - topic_model_kind : str, topic model, optional + topic_model_class : str, topic model, optional Examples: 'ldamulticore' (recommended), 'lda' (default), ensemble_workers : number, optional @@ -195,17 +195,17 @@ def __init__(self, topic_model_kind="lda", num_models=3, "input space dimensionality. 
Corpus should be provided using the " "`corpus` keyword argument.") - if isinstance(topic_model_kind, ldamodel.LdaModel): - self.topic_model_kind = topic_model_kind + if type(topic_model_class) == type and issubclass(topic_model_class, ldamodel.LdaModel): + self.topic_model_class = topic_model_class else: kinds = { "lda": ldamodel.LdaModel, "ldamulticore": ldamulticore.LdaMulticore } - if topic_model_kind not in kinds: + if topic_model_class not in kinds: raise ValueError( - "topic_model_kind should be one of 'lda', 'ldamulticode' or a model inheriting from LdaModel") - self.topic_model_kind = kinds[topic_model_kind] + "topic_model_class should be one of 'lda', 'ldamulticode' or a model inheriting from LdaModel") + self.topic_model_class = kinds[topic_model_class] self.num_models = num_models self.gensim_kw_args = gensim_kw_args @@ -306,7 +306,7 @@ def generate_gensim_representation(self): params["iterations"] = 0 # no training params["passes"] = 0 # no training - classic_model_representation = self.topic_model_kind(**params) + classic_model_representation = self.topic_model_class(**params) # when eta was None, use what gensim generates as default eta for the following tasks: eta = classic_model_representation.eta @@ -636,7 +636,7 @@ def _generate_topic_models(self, num_models, random_states=None, pipe=None): kwArgs["random_state"] = random_states[i] - tm = self.topic_model_kind(**kwArgs) + tm = self.topic_model_class(**kwArgs) # adds the lambda (that is the unnormalized get_topics) to ttda, which is # a list of all those lambdas diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 5de7b324c7..84bc8340d5 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -199,7 +199,7 @@ def test_add_models(self): # 1. memory friendly eLDA_base = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=1, num_models=num_new_models, - iterations=1, random_state=0, topic_model_kind=LdaMulticore, + iterations=1, random_state=0, topic_model_class=LdaMulticore, workers=3, ensemble_workers=2) # 1.1 ttda @@ -245,7 +245,7 @@ def test_add_models(self): # 2. 
memory unfriendly eLDA_base_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=1, num_models=num_new_models, - iterations=1, random_state=0, topic_model_kind=LdaMulticore, + iterations=1, random_state=0, topic_model_class=LdaMulticore, workers=3, ensemble_workers=2, memory_friendly_ttda=False) # 2.1 a single ensemble @@ -296,11 +296,11 @@ def test_add_and_recluster(self): # train new_eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=10, num_models=num_new_models, - iterations=30, random_state=1, topic_model_kind='ldamulticore', + iterations=30, random_state=1, topic_model_class='ldamulticore', distance_workers=4) new_eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=10, num_models=num_new_models, - iterations=30, random_state=1, topic_model_kind='ldamulticore', + iterations=30, random_state=1, topic_model_class='ldamulticore', distance_workers=4, memory_friendly_ttda=False) # both should be similar np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) From 2abce75d391917e09123a320dc31fbaa75903630 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 28 Aug 2019 23:18:05 +0200 Subject: [PATCH 054/166] reverted change that accidentally broke things --- gensim/models/ensemblelda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 65f06dc21c..360b757a30 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -237,9 +237,9 @@ def __init__(self, topic_model_class="lda", num_models=3, if gensim_kw_args.get("corpus") is None: return - if gensim_kw_args.get("iterations", 0) <= 0: + if "iterations" in gensim_kw_args and gensim_kw_args["iterations"] <= 0: return - if gensim_kw_args.get("passes", 0) <= 0: + if "passes" in gensim_kw_args and gensim_kw_args["passes"] <= 0: return logger.info("Generating {} topic models...".format(num_models)) From 511eaa5360dedb7693237734d1a5f9ecc77bd30e Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 8 Sep 2019 19:32:48 +0200 Subject: [PATCH 055/166] fixed tests locally --- gensim/test/test_data/ensemblelda | Bin 9815 -> 9893 bytes gensim/test/test_ensemblelda.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda index 3d77142cd210d18ed65595fed46a0f1a5a081e32..a83e87b8c7965b1871ef711a66f449a1e618b1b4 100644 GIT binary patch delta 4596 zcmZ`-c|26@+c#q$`x2o&YAh9!lxdK?V~L!HvW!^{XJVE!i;QUUM3Rb{$5T{F;;|(W ziWZTSEnAVwQnrdpk+gZwjOX|M-u`&cA9J1i`d;^S?rS;oxxS~zBiA#{%0YwV=@3Bj zb?|cX_9X4{bMSLUWOmHY&kI3m9)pX>27^nekz_8HP9-r|G$@k9f!G{~3o&_Q9-YNR zU1LEjBh$C^$MbrQ(P%WIyV30{X7=0+riliaxu68>qo}E(# zKR`J)dN7EKXrOjVsBaRVO(XMAg(j+yr9o7(0MT-_8wZL_`_lx@E8d{9kOj7{da}hhdduFf=H}jhLO| zPCS3ood|U}6S1NK85(AVNL5x>xYE@wA!f+6M0-DogQCchBq$2v2#{3(4pgMVd}aif z9s?n(J=^6$4F;V_XQ08eVLAyld!IVR5*~WEpP(bggECw6x6b?cF zqylJSzg#{i3KAgO0GR+8bjjzUAQ6z4_<{KhG83@_FcOGIXF!NOAQzyB+R>S)moPFF zLL30B#5LN=Y!;IXA=?3IJ}6BK<06i6h?A&To)GQ~ECChJ*d=ixE^&w}9*}|w00y=L zu&@Il4Q~fz;B9~`yakYh?E!h12q?g|fFisVz`-_v61*Tax5@d&WuHqI|*EK%CPak_4(~9HY zyxCKl4sRFrrXSMSSf8Y-5OwAB&t!V@kK&BXFNZyOT8MIe-Lk%R3Ql?NX4$V_!k2Ho zCH$-Mc3#fM+fgB{L~nFzm&4wY-}8D~&eT zyWiW+4Oi*@^)j%bX5IU|`914{I02#T$1;>1a?0ACGA4qapg2vlqH?TGe6Q4@iGqBZ za=wF-r){~N5ELxHl$TB?&Xawq!kU&ehDuBH-6x7! 
z7Y-tBsoze&zoW2vkzQQC@~XM2gfO4SsRT2I|Cc$^A2ini>*>}?%S) z`kR+mZ{4tBfM*G@TbD1f(e&`?T1#Eht36E@QDV#K&f@n z8T3aw))7x@hcLaB6;IC#vkus7CYrUx5QvW{6d+n%xOlDUH; zPApI2t?)eBcy`8dnwQlr%&&3X#~Dpy-X7x`yvlh~^nS-(=;Fn!JINi(2z-j&B`m%=GM4 zT6FVjUa!-$8{?T7NAl`%A^qkr+p0fuvtNyL(Js2Ja5|UKcXuOwv)QPSU*AAj?FyKapzQMt)^d}^<% zPcPdcp)pn4Vn$zX?-^~(Wo4;fhuLN4b3)Vj!jxKetG*S%kJ z=M!A*Tnz#}7#~d7K)N?fcp3fly=wqyojVb`c0@fU?CDRprx(hm@K{T5c^e^O>P4=f znkLovMwC>rQp1pHZZ-!iG+KNlTzU1W!b&YWW^Z>(vRvOkpB)IsN2`y%ZvMG=Sm1^D z%KLQYL=AjVDE$5$e!7fM(}cP0KDx~9f>cMdLyDPnvfW_|54OF(iSD_BV^z5HH%x^; z_Pf2DD)bo3(L=_P`>qCgynugPTlQwNu<%+nYXzYPL9{~^zN`y^`9FFw3Rk-$+&T{*~{mg=xRv6mO1%P zow+++<45Dnr;E9Xfj@4h0u@T{C8_{eiqq zDmjAn*B&9dn$2XPe(VmR8uM(t=)%$yR#4~Vf$N*qOLw367*^aid1HsM|L|QG?`sxj zTWq!@`YTCYE6f?+?$Q&)B7E=A#<^WKcGFkK_< zexLV)KLYv9)I-PVrw{7BzP0}`!E28>U(tB(o>}r4V{5~< z++$yoo3me*r#5s;nQ0|H3!n&(9>v|icicH~xom1s_&~XKX;8iTcN`d`k@~KmLK`l5 zx<+PTxoPtH+wr$ce;QaoCzSIU8+z^!25q$T^UStOcI5h1t1H%2y-rx1Z%_5MTeb@NQK4Hx*&32n3W;OFk`Me=j=b>8LX?cszJ zx!M(Eh_6ghoT09;_z0-FKu1Xt`$%L?I2XARhm_!9bJ!Br04zZ{DvQJBbCIjTNGU!T zxrVX>0CN~AL*qh4IZ7{4aUBXqD*i<(0co+S3Xl+yvyrq8gAFiyGAY7GAtn zco@08U~y*wx{GEDwWvic?um1U-q)c9*uzNu0&*Wk8qgYwkp~N28yB`dM3F~;q8Ml* z2P4h+h5Hr&CswthXxqQ4#|x@$nAMY+REWeNqw8qo>2Gbi}V6Apct)g7^SuoOvk3*iL^q0;RvwW2BzBmG- pX?-bDT4S`fAHeYAkO4frpd7@*TsRI+fiobOhSEnd4Dl)J{|7)gzwrP7 delta 4376 zcmY*cc|26@7dOT-Gqw@gw`3WSEo)+A$#U(QqDW)rnmaOP?ktL8@@5ICG)l52LefG= zN>aR~l6phE6e?0FT96dKJG{T&`)mF=_nzdG379x5BsI=ctunXtf4POu$^B zEO>~LC5lsGc??!OE|yAWv&r!|SOO49C7~kIgJ4MlfC4Z8ChZ>zl^f3jaWIA?8f6EF z04Ril6%-+H6!i?m-I%awlz^z`h!$CF6!7&N#Y*q?ObXic){B3eF2D>37WLFO#Le`p zE8-JAO=MGKKWezlO(|+b2uOeRz!GGNF5`1EGn6cI94JKaVkh5$HPj3K$NnS zGO{&?i(s&dfP>PtRYgvzmI$j8AUOz6l0?>0naBw>04xa&a>=kJBo6)C2`ynKq*Ech zU=iiCR{J+hM~I06(rFMrgS0a&!PDvV884^e94{yEvAYvm5G_hYtr9#ERW{W1Af_aY zcwA82OTvM16F3@644r-#u+3WmCj+2 zxfBRw!zO+t+)~)(HmwCjNmAe*mU|477AY)->2IOT{!Lggf>uCP2&U0HWzCV+Nz$p7 z!hXXR0*aU+-x8_ux2kJg6#p^63B_?a6tyV9uQvwxoi+=u>*h&K_gU<#ep{fm=Fx|OkpxuJft|V zBOnq&6~^W8*l}Py>;#B~h#{{$4gwMZ$)z|Dk3o)wodMJm#HBMpcq1SYf<<`rSR_gm znF7Ktfap?aSrTO#Z(V2E#^Z2~k%7$oIDptW-O5Ea$gtT2(z;x^9jG>*E&P^i)lVD>aI;lA!qFH%j;Py;b7iFeMy7FwIRhr zb+qK>o*3)wr>zU6?N&GDnB_rJgZGc%($+Ss@PciX#AdCroa{RW3SCvY+nAT(&jtpcRfdBp;lZ8~onTJ%X!hDfLwDJYLMgd!g&tPU>2`&XH)n3E717;|KX* z)<>H=*Sc40JSuh|`n3eAZT$B5q&XpbWc2;+oT!MCr#p#-nqTKL?++yLtjmlFVr%1` ziC*mw2~d9i!80{w^eX+NLW$V%+gWd=(vw)p&FVfDoeUZJr2r3=fc@{ox0e_y=l1{% z=+wPgE04}RvzmAPm3)m`v!Avlnnqc9^3m6#Su;i}_q`nQcKehIJrNrG7P>{nO!lA~Gf=~t&~O*WNeT*nCB?=rBvHatSuhAD>qI=W&0)WL10 zXH8YZScjVW9t}q~9G}UNVRe|~alZr&{p6>BH*mzM8QjiMfQ?@K%Puy+_Ksb~mD3^P|2p z&{TIaMs5Mho^<|-KXP3phJWwT3-O`#){bABn+%GpOCFcGWLaJ5p1IqG31$8ZXol`q#)A$+jgP;1` zyDF~_$c&LoC-u;MXKFCMA{%>F-L~MzPG@QONJOhE=Ei0E`?u1ZY3H7+dA=yn?{+4s zn$7+3<;!K3@1<{CJBi%Z{Hu3t!cQ7OekNl({O4wqR>`k=mi@L7f6>xZu!^M3zo9I8 zD@+So^EL2Gf3LFQfU&o#?nrjvt~++tn+^=FWOy-WM1w`!0vYl}Mk;uDuxBb|)#DZF zPb)MUxoY-%ca&brS}5-~Up-IUZ*`TZCH2cKdt4J3u{oql$#snDIGVI|J_Yv3y%kx? 
z3~XOxwX?;Tb3oHXKlWXlEc*7#hy0KyJ-i$Hod&a_T%M24AG3)`=Ej<~pXR(7t^CIKkHRiAUp{D;O#|Bx7a(@k-Md~=7#KbFMn`^jwia2tE3;eA5Rh{=>voy__vv$qwc4mU?W zZO~SqY_gTn)BGNo*pENe_W?h3ev7W7!(N`nG0infk9X=cZ4Mr)-Rhw*abRbGYX(*> zS0M(_xBB$zM6T;Y4{DW2H11bT5Z`f+Y`|0``OvG$RXeKlD;?*O(vKzIPf%(-v%gCL zeAVv%~OSRRh!gab@qTp04?0^!{v_VA!fX z%V4o!bMKt{(ZQEu29jmwpuzP_47RizSrwc^wy=Y>YcHus-_?7w6U z4T30}9BW+Fe@=|s?mjyqeo6DIo+`hvn*1IOhF%QgH8lM_J$bg`Ip)_WN=p6Kg1q%b z(^Sd7>UWFd{l9dq@F|bFu&}+|FRvXob@X#PpgFvDP?et3eC9#b{S-LEam}jdSjqQ3HdS)}R4o7GhLGqH**Viw6veM{ ziF|)(BWBu2(nP{EJF~map(7(Y1vmwg*0A`gw)Q4vl~4Uihgbm ze#w!H3-Ec_ce2;{agk1q&RDK=K>50+X1dR~+^WiIPr~=FMTu(r*!z1=wp_8-;iGyC z#?r2({jxVcRlPU!BqK-(Z!wwEo9R+cOYeARU0IeHU*gz%KmQJT427=Xza>AYa&g#q zzG3v}LiX$)Txr-x`3%1|lrvuyfQ+P6w@1Uy z&0b|D5?40%)NVc|mN# zKDJ%wqD*A9q}JSSuIb5Dt>~~G!LNFN;E9 z>T^~c7ghd$-Bcmm03*~!*m}p6`KT4c2k6Ti4EZlv5fj0M$ zB+1giSPq>Lv1@k(nN8!s=M&*t0%QTLgA@Tvk_3gxX7M=ig>d*HAsoJh%f=u0u>c7!F_g2WbG%LRTXoCNxcexX@e$_!2_f3`h#iHH6zoX+?^G z5N;xbHeqRy=UWItG!1TFMs6d>9m4V+9l%o1&gG?lBFJ3;D_nAq91eF8mY?qfIH8OG z0D&L=HKz!I+3$^jIp~3rQ_Y z!X+OE?gPX~avUa`3sNH}F+4<$A!4mR9DYVfgr6h&4o~MqAc_`|bblX&kmC&qLJjl{ i2LTi>5q?2{mYpvN5C=+xvY>nrLT)T1gh5`U>3;wk=tWBa diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 84bc8340d5..8479847160 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -14,7 +14,7 @@ import numpy as np from copy import deepcopy -from gensim.models import EnsembleLda, LdaMulticore +from gensim.models import EnsembleLda, LdaMulticore, LdaModel from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary num_topics = 2 From 96d6fbdf06285ff0532d5827fed7d12a854a0559 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 8 Sep 2019 19:55:14 +0200 Subject: [PATCH 056/166] fix code style --- gensim/models/ensemblelda.py | 1 + gensim/test/test_ensemblelda.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 360b757a30..a371357ca4 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -106,6 +106,7 @@ logger = logging.getLogger(__name__) + class EnsembleLda(): """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble and extracting reliable topics that are consistently learned accross the ensemble. 
eLDA has the added benefit that the user diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 8479847160..84bc8340d5 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -14,7 +14,7 @@ import numpy as np from copy import deepcopy -from gensim.models import EnsembleLda, LdaMulticore, LdaModel +from gensim.models import EnsembleLda, LdaMulticore from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary num_topics = 2 From 819a05fa1ea1bdf1829b00a1ecac02b493bc53e9 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Thu, 12 Sep 2019 00:33:55 +0200 Subject: [PATCH 057/166] added _is_easy_valid_cluster --- gensim/models/ensemblelda.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index a371357ca4..14941e6fff 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -979,6 +979,11 @@ def _remove_from_all_sets(self, label, clusters): if label in a["parent_labels"]: a["parent_labels"].remove(label) + def _is_easy_valid_cluster(self, label, cluster, min_cores): + """checks if the cluster has at least min_cores of cores with the only parent-label as itself, + meaning no other cluster influenced those cores, hence the cluster in question is easy valid.""" + return sum(map(lambda x: x == {label}, cluster["parent_labels"])) >= min_cores + def _generate_stable_topics(self, min_cores=None): """generates stable topics out of the clusters. This function is the last step that has to be done in the ensemble. @@ -1032,23 +1037,22 @@ def _generate_stable_topics(self, min_cores=None): self._remove_from_all_sets(label, sorted_clusters) # 3. checking for "easy_valid"s - # checks if the core has at least min_cores of cores with the only label as itself + # checks if the cluster has at least min_cores of cores with the only label as itself if cluster["amount_parent_labels"] >= 1: - if sum(map(lambda x: x == {label}, cluster["parent_labels"])) >= min_cores: + if self._is_easy_valid_cluster(label, cluster, min_cores): cluster["is_valid"] = True # Reapplying the rule 3 # this happens when we have a close relationship among 2 or more clusters for cluster in [cluster for cluster in sorted_clusters if cluster["is_valid"] is None]: - - # this checks for parent_labels, which are also modified in this function - # hence order will influence the result. it starts with the most significant - # label (hence "sorted_clusters") - if sum(map(lambda x: x == {label}, cluster["parent_labels"])) >= min_cores: + # This modifies parent labels + # which is important in _is_easy_valid_cluster, so the result depends on where to start. + # That's why sorted_clusters is sorted, so it starts with the most significant cluster. + if self._is_easy_valid_cluster(label, cluster, min_cores): cluster["is_valid"] = True else: cluster["is_valid"] = False - # removes a label from every set in parent_labels for each core + # removes a label from every set in parent_labels for each core. 
for a in sorted_clusters: if label in a["parent_labels"]: a["parent_labels"].remove(label) From e3025f60113f706f9eb1e0ec214ecb2f6570dfe8 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Thu, 12 Sep 2019 00:39:00 +0200 Subject: [PATCH 058/166] updated thesis reference --- gensim/models/ensemblelda.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 14941e6fff..dc36dc73a3 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -78,9 +78,8 @@ 2010 WORKSHOP ON NEW CHALLENGES FOR NLP FRAMEWORKS [online]. Msida : University of Malta. 2010. p. 45-50. Available from: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.695.4595 -.. [2] BRIGL, Tobias, 2019, Extracting Reliable Topics using Ensemble Latent Dirichlet Allocation: bachelor thesis. - Available from: https://www.hip70890b.de/machine_learning/ensemble_LDA/ - TODO country? thi? loosley, salles, stephan? +.. [2] BRIGL, Tobias, 2019, Extracting Reliable Topics using Ensemble Latent Dirichlet Allocation [Bachelor Thesis]. + Ingolstadt: Tecnische Hochschule. Available from: https://www.hip70890b.de/machine_learning/ensemble_LDA/ Citation -------- From 95e1b7937b6dc610715a209b46c7d0612fbf108c Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Thu, 12 Sep 2019 09:37:00 +0200 Subject: [PATCH 059/166] updated notebook example, typo --- docs/notebooks/Opinosis.ipynb | 126 +++++----------------------------- gensim/models/ensemblelda.py | 2 +- 2 files changed, 18 insertions(+), 110 deletions(-) diff --git a/docs/notebooks/Opinosis.ipynb b/docs/notebooks/Opinosis.ipynb index 3fc845040c..8227f99932 100644 --- a/docs/notebooks/Opinosis.ipynb +++ b/docs/notebooks/Opinosis.ipynb @@ -2,12 +2,8 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { - "ExecuteTime": { - "end_time": "2018-12-01T17:57:35.858751Z", - "start_time": "2018-12-01T17:57:34.507686Z" - }, "scrolled": false }, "outputs": [], @@ -27,13 +23,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "ExecuteTime": { - "end_time": "2018-12-01T17:57:36.539523Z", - "start_time": "2018-12-01T17:57:36.534986Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "elda_logger = logging.getLogger(EnsembleLda.__module__)\n", @@ -64,13 +55,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "ExecuteTime": { - "end_time": "2018-11-29T20:19:17.891961Z", - "start_time": "2018-11-29T20:19:16.731678Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!mkdir ~/opinosis\n", @@ -80,13 +66,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "ExecuteTime": { - "end_time": "2018-12-01T17:57:39.575650Z", - "start_time": "2018-12-01T17:57:39.571588Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "path = os.path.expanduser('~/opinosis/')" @@ -104,13 +85,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "ExecuteTime": { - "end_time": "2018-12-01T17:57:40.842440Z", - "start_time": "2018-12-01T17:57:39.905273Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "opinosis = OpinosisCorpus(path)" @@ -138,93 +114,25 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "ExecuteTime": { - "end_time": "2018-12-01T18:10:13.690983Z", - "start_time": "2018-12-01T17:57:41.089661Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", 
- "text": [ - "Generating 128 topic models...\n", - "Spawned worker to generate 42 topic models\n", - "Spawned worker to generate 43 topic models\n", - "Spawned worker to generate 43 topic models\n", - "Generating a 2560 x 2560 asymmetric distance matrix...\n", - "Spawned worker to generate 640 rows of the asymmetric distance matrix\n", - "Spawned worker to generate 640 rows of the asymmetric distance matrix\n", - "Spawned worker to generate 640 rows of the asymmetric distance matrix\n", - "Spawned worker to generate 640 rows of the asymmetric distance matrix\n", - "The given threshold of 0.11 covered on average 9.9% of tokens\n", - "The given threshold of 0.11 covered on average 9.8% of tokens\n", - "The given threshold of 0.11 covered on average 9.9% of tokens\n", - "The given threshold of 0.11 covered on average 9.8% of tokens\n", - "Fitting the clustering model\n", - "Generating stable topics\n", - "Generating classic gensim model representation based on results from the ensemble\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "elda = EnsembleLda(corpus=opinosis.corpus, id2word=opinosis.id2word, num_models=128, num_topics=20,\n", " passes=20, iterations=100, ensemble_workers=3, distance_workers=4,\n", - " topic_model_kind='ldamulticore', masking_method='rank', min_samples=32)" + " topic_model_class='ldamulticore', masking_method='rank', min_samples=32)" ] }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "ExecuteTime": { - "end_time": "2018-12-01T18:10:13.712818Z", - "start_time": "2018-12-01T18:10:13.695844Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "- 0.132 staff, 0.082 servic, 0.081 friendli, 0.075 help, 0.021 good, 0.015 quick, 0.010 expens \n", - "\n", - "- 0.166 screen, 0.063 bright, 0.040 clear, 0.022 easi, 0.020 touch, 0.015 read, 0.014 new \n", - "\n", - "- 0.146 free, 0.043 park, 0.034 coffe, 0.031 wine, 0.025 even, 0.025 morn, 0.022 internet \n", - "\n", - "- 0.142 batteri, 0.099 life, 0.026 short, 0.020 charg, 0.018 kindl, 0.018 good, 0.017 perform \n", - "\n", - "- 0.123 room, 0.111 clean, 0.050 small, 0.035 comfort, 0.033 bathroom, 0.017 nice, 0.016 size \n", - "\n", - "- 0.147 seat, 0.086 comfort, 0.058 uncomfort, 0.045 firm, 0.031 long, 0.030 drive, 0.014 time \n", - "\n", - "- 0.161 mileag, 0.106 ga, 0.056 good, 0.028 expect, 0.019 hard, 0.018 road, 0.018 high \n", - "\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# pretty print, note that the words are stemmed so they appear chopped off\n", "for t in elda.print_topics(num_words=7):\n", " print('-', t[1].replace('*',' ').replace('\"','').replace(' +',','), '\\n')" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -243,7 +151,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.2" + "version": "3.7.3" } }, "nbformat": 4, diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index dc36dc73a3..84916b25cb 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -79,7 +79,7 @@ Available from: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.695.4595 .. [2] BRIGL, Tobias, 2019, Extracting Reliable Topics using Ensemble Latent Dirichlet Allocation [Bachelor Thesis]. 
-    Ingolstadt: Tecnische Hochschule. Available from: https://www.hip70890b.de/machine_learning/ensemble_LDA/
+    Ingolstadt: Technische Hochschule. Available from: https://www.hip70890b.de/machine_learning/ensemble_LDA/
 
 Citation
 --------

From 3d11649f05daa72d2107ff1c167b61a6944df46f Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sat, 14 Sep 2019 23:59:44 +0200
Subject: [PATCH 060/166] docstring styles, renaming, cleanup, stuff I need to discuss first

---
 gensim/models/ensemblelda.py      | 412 ++++++++++++++----------------
 gensim/test/test_data/ensemblelda | Bin 9893 -> 13705 bytes
 2 files changed, 191 insertions(+), 221 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index 84916b25cb..2bf40afdb5 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -4,10 +4,12 @@
 # Authors: Tobias Brigl , Alex Loosley ,
 # Stephan Sahm , Alex Salles , Data Reply Munich
 
-"""Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble and extracting
-reliable topics that are consistently learned accross the ensemble. eLDA has the added benefit that the user
-does not need to know the exact number of topics the topic model should extract ahead of time. For more details
-read our paper (https://www.hip70890b.de/machine_learning/ensemble_LDA/).
+"""Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble.
+
+Extracts reliable topics that are consistently learned across multiple LDA models. eLDA has the added benefit that
+the user does not need to know the exact number of topics the topic model should extract ahead of time.
+
+For more details read our paper (https://www.hip70890b.de/machine_learning/ensemble_LDA/).
 
 Usage examples
 --------------
@@ -107,17 +109,21 @@
 
 class EnsembleLda():
-    """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble and extracting
-    reliable topics that are consistently learned accross the ensemble. eLDA has the added benefit that the user
-    does not need to know the exact number of topics the topic model should extract ahead of time [2].
+    """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble.
+
+    Extracts reliable topics that are consistently learned across multiple LDA models. eLDA has the added benefit that
+    the user does not need to know the exact number of topics the topic model should extract ahead of time [2].
 
     """
+
     def __init__(self, topic_model_class="lda", num_models=3,
                  min_cores=None,  # default value from _generate_stable_topics()
                  epsilon=0.1, ensemble_workers=1, memory_friendly_ttda=True,
                  min_samples=None, masking_method="mass", masking_threshold=None,
                  distance_workers=1, random_state=None, **gensim_kw_args):
-        """
+        """Create and train a new EnsembleLda model.
+
+        Will start training immediately, except if iterations, passes or num_models is 0 or if the corpus is missing.
 
         Parameters
         ----------
@@ -188,9 +194,6 @@ def __init__(self, topic_model_class="lda", num_models=3,
         if "corpus" not in gensim_kw_args:
             gensim_kw_args["corpus"] = None
 
-        # dictionary. modified version of from gensim/models/ldamodel.py
-        # error messages are copied from the original gensim module [1]:
-        # will create a fake dict if no dict is provided.
if gensim_kw_args["id2word"] is None and not gensim_kw_args["corpus"] is None: logger.warning("no word id mapping provided; initializing from corpus, assuming identity") gensim_kw_args["id2word"] = utils.dict_from_corpus(gensim_kw_args["corpus"]) @@ -216,7 +219,10 @@ def __init__(self, topic_model_class="lda", num_models=3, self.gensim_kw_args = gensim_kw_args self.memory_friendly_ttda = memory_friendly_ttda + self.distance_workers = distance_workers + self.masking_threshold = masking_threshold + self.masking_method = masking_method # this will provide the gensim api to the ensemble basically self.classic_model_representation = None @@ -234,7 +240,6 @@ def __init__(self, topic_model_class="lda", num_models=3, # parameters, stop here and don't train. if num_models <= 0: return - if gensim_kw_args.get("corpus") is None: return if "iterations" in gensim_kw_args and gensim_kw_args["iterations"] <= 0: @@ -242,7 +247,7 @@ def __init__(self, topic_model_class="lda", num_models=3, if "passes" in gensim_kw_args and gensim_kw_args["passes"] <= 0: return - logger.info("Generating {} topic models...".format(num_models)) + logger.info("generating {} topic models...".format(num_models)) if ensemble_workers > 1: self._generate_topic_models_multiproc(num_models, ensemble_workers) @@ -250,10 +255,7 @@ def __init__(self, topic_model_class="lda", num_models=3, # singlecore self._generate_topic_models(num_models) - self._generate_asymmetric_distance_matrix( - workers=self.distance_workers, - threshold=masking_threshold, - method=masking_method) + self._generate_asymmetric_distance_matrix() self._generate_topic_clusters(epsilon, min_samples) self._generate_stable_topics(min_cores) @@ -261,12 +263,12 @@ def __init__(self, topic_model_class="lda", num_models=3, self.generate_gensim_representation() def convert_to_memory_friendly(self): - """removes the stored gensim models and only keeps their ttdas""" + """Remove the stored gensim models and only keep their ttdas.""" self.tms = [] self.memory_friendly_ttda = True def generate_gensim_representation(self): - """Creates a gensim model from the stable topics. + """Create a gensim model from the stable topics. 
         The returned representation is a Gensim LdaModel (:py:class:`gensim.models.LdaModel`) that has been
         instantiated with an A-priori belief on word probability, eta, that represents the topic-term distributions of
@@ -279,26 +281,25 @@ def generate_gensim_representation(self):
         :py:class:`gensim.models.LdaModel`
             A Gensim LDA Model classic_model_representation for which:
             classic_model_representation.get_topics() == self.get_topics()
-        """
 
-        logger.info("Generating classic gensim model representation based on results from the ensemble")
+        """
+        logger.info("generating classic gensim model representation based on results from the ensemble")
 
-        number_of_words = self.sstats_sum
-        # if number_of_words should be wrong for some fantastic funny reason
+        sstats_sum = self.sstats_sum
+        # if sstats_sum (which is the number of words actually) should be wrong for some fantastic funny reason
         # that makes you want to peel your skin off, recreate it (takes a while):
-        if number_of_words == 0 and "corpus" in self.gensim_kw_args and not self.gensim_kw_args["corpus"] is None:
+        if sstats_sum == 0 and "corpus" in self.gensim_kw_args and not self.gensim_kw_args["corpus"] is None:
             for document in self.gensim_kw_args["corpus"]:
                 for token in document:
-                    number_of_words += token[1]
-            self.sstats_sum = number_of_words
-        assert number_of_words != 0
+                    sstats_sum += token[1]
+            self.sstats_sum = sstats_sum
 
         stable_topics = self.get_topics()
 
         num_stable_topics = len(stable_topics)
 
         if num_stable_topics == 0:
-            logger.error("The model did not detect any stable topic. You can try to adjust epsilon: "
+            logger.error("the model did not detect any stable topic. You can try to adjust epsilon: "
                          "recluster(eps=...)")
             self.classic_model_representation = None
             return
@@ -315,7 +316,10 @@ def generate_gensim_representation(self):
 
         # when eta was None, use what gensim generates as default eta for the following tasks:
         eta = classic_model_representation.eta
-
+        if sstats_sum == 0:
+            sstats_sum = classic_model_representation.state.sstats.sum()
+            self.sstats_sum = sstats_sum
+
         # the following is important for the denormalization
         # to generate the proper sstats for the new gensim model:
         # transform to dimensionality of stable_topics. axis=1 is summed
@@ -337,7 +341,7 @@ def generate_gensim_representation(self):
         # right sstats, so that get_topics() will return the stable topics no
         # matter eta.
 
-        normalization_factor = np.array([[number_of_words / num_stable_topics]] * num_stable_topics) + eta_sum
+        normalization_factor = np.array([[sstats_sum / num_stable_topics]] * num_stable_topics) + eta_sum
 
         sstats = stable_topics * normalization_factor
         sstats -= eta
@@ -351,10 +355,10 @@ def generate_gensim_representation(self):
         return classic_model_representation
 
     def add_model(self, target, num_new_models=None):
-        """Adds the ttda of another model to the ensemble this way, multiple topic models can be connected to an
-        ensemble.
-
-        Make sure that all the models use the exact same dictionary/idword mapping.
+        """Add the ttda of another model to the ensemble.
+
+        This way, multiple topic models can be connected to an ensemble manually. Make sure that all the models use
+        the exact same dictionary/id2word mapping.
 
         In order to generate new stable topics afterwards, use
         self._generate_asymmetric_distance_matrix()
@@ -472,7 +476,7 @@ def add_model(self, target, num_new_models=None):
                                  'stored models for a memory unfriendly ensemble')
 
         self.num_models = len(self.tms)
-        logger.info("Ensemble contains {} models and {} topics now".format(self.num_models, len(self.ttda)))
+        logger.info("ensemble contains {} models and {} topics now".format(self.num_models, len(self.ttda)))
 
         if self.ttda.shape[1] != ttda.shape[1]:
             raise ValueError(("target ttda dimensions do not match. Topics must be {} but was"
@@ -491,7 +495,6 @@ def save(self, fname):
             Path to the system file where the model will be persisted.
 
         """
-
         logger.info("saving %s object to %s", self.__class__.__name__, fname)
         utils.pickle(self, fname)
 
@@ -510,8 +513,6 @@ def load(fname):
             A previously saved ensembleLda object
 
         """
-
-        # message copied from [1]
         logger.info("loading %s object from %s", EnsembleLda.__name__, fname)
 
         eLDA = utils.unpickle(fname)
@@ -519,8 +520,9 @@ def load(fname):
         return eLDA
 
     def _generate_topic_models_multiproc(self, num_models, ensemble_workers):
-        """Will make the ensemble multiprocess, which results in a speedup on multicore machines. Results from the
-        processes will be piped to the parent and concatenated.
+        """Generate the topic models to form the ensemble in a multiprocessed way.
+
+        Depending on the used topic model this can result in a speedup.
 
         Parameters
         ----------
@@ -537,7 +539,6 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers):
             In that case, ensemble_workers=2 can be used.
 
         """
-
         # the way random_states is handled needs to prevent getting different results when multiprocessing is on,
         # or getting the same results in every lda child. so it is solved by generating a list of state seeds before
         # multiprocessing is started.
@@ -609,7 +610,7 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers):
                 p.terminate()
 
     def _generate_topic_models(self, num_models, random_states=None, pipe=None):
-        """Will train the topic models, that form the ensemble.
+        """Train the topic models that form the ensemble.
 
         Parameters
         ----------
@@ -623,9 +624,8 @@ def _generate_topic_models(self, num_models, random_states=None, pipe=None):
             send the ttda.
 
         """
-
         if pipe is not None:
-            logger.info("Spawned worker to generate {} topic models".format(num_models))
+            logger.info("spawned worker to generate {} topic models".format(num_models))
 
         if random_states is None:
             random_states = [self.random_state.randint(self._MAX_RANDOM_STATE) for _ in range(num_models)]
@@ -668,11 +668,8 @@ def _generate_topic_models(self, num_models, random_states=None, pipe=None):
             pipe.close()
 
     def _asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pipe, threshold, method):
-        """ Worker that computes the distance to all other nodes
-        from a chunk of nodes.
https://stackoverflow.com/a/1743350 - """ - - logger.info("Spawned worker to generate {} rows of the asymmetric distance matrix".format(n_ttdas)) + """Worker that computes the distance to all other nodes from a chunk of nodes.""" + logger.info("spawned worker to generate {} rows of the asymmetric distance matrix".format(n_ttdas)) # the chunk of ttda that's going to be calculated: ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas] distance_chunk = self._calculate_asymmetric_distance_matrix_chunk( @@ -680,28 +677,17 @@ def _asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pip pipe.send((worker_id, distance_chunk)) # remember that this code is inside the workers memory pipe.close() - def _generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method="mass"): - """Makes the pairwise distance matrix for all the ttdas from the ensemble. + def _generate_asymmetric_distance_matrix(self): + """Calculate the pairwise distance matrix for all the ttdas from the ensemble. Returns the asymmetric pairwise distance matrix that is used in the DBSCAN clustering. Afterwards, the model needs to be reclustered for this generated matrix to take effect. - Parameters - ---------- - threshold : float, optional - if threshold is None and method == "rank": - threshold = 0.11 - if threshold is None and method == "mass": - threshold = 0.95 - - threshold keeps by default 95% of the largest terms by mass. Except the "fast" parameter is "rank", then - it just selects that many of the largest terms. - workers : number, optional - when workers is None, it defaults to os.cpu_count() for maximum performance. Default is 1, which is not - multiprocessed. Set to > 1 to enable multiprocessing. - """ + workers = self.distance_workers + threshold = self.masking_threshold + method = self.masking_method # matrix is up to date afterwards self.asymmetric_distance_matrix_outdated = False @@ -709,7 +695,7 @@ def _generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method if threshold is None: threshold = {"mass": 0.95, "rank": 0.11}[method] - logger.info("Generating a {} x {} asymmetric distance matrix...".format(len(self.ttda), len(self.ttda))) + logger.info("generating a {} x {} asymmetric distance matrix...".format(len(self.ttda), len(self.ttda))) # singlecore if workers is not None and workers <= 1: @@ -728,7 +714,6 @@ def _generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method ttdas_sent = 0 for i in range(workers): - try: parentConn, childConn = Pipe() @@ -781,10 +766,7 @@ def _generate_asymmetric_distance_matrix(self, threshold=None, workers=1, method return self.asymmetric_distance_matrix def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, start_index, method): - """Internal method calculating an (asymmetric) distance from each topic term distribution in ttda1 to each topic - term distribution in ttda2 and returns a matrix of distances, size len(ttda1) by len(ttda2). - - This is an internal method and should not be used for two general ttda arrays. + """Calculate an (asymmetric) distance from each topic in ttda1 to each topic in ttda2. Parameters ---------- @@ -806,21 +788,21 @@ def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, s ------- 2D Numpy.numpy.ndarray of floats Asymmetric distance matrix of size len(ttda1) by len(ttda2). 
-        """
+
+        """
         if method not in ["mass", "rank"]:
             raise ValueError("method {} unknown".format(method))
 
         # select masking method:
         def mass_masking(a):
-            """original masking method. returns a new binary mask"""
+            """Original masking method. Returns a new binary mask."""
            sorted_a = np.sort(a)[::-1]
            largest_mass = sorted_a.cumsum() < threshold
            smallest_valid = sorted_a[largest_mass][-1]
            return a >= smallest_valid
 
         def rank_masking(a):
-            """faster masking method. returns a new binary mask"""
+            """Faster masking method. Returns a new binary mask."""
             return a > np.sort(a)[::-1][int(len(a) * threshold)]
 
         create_mask = {"mass": mass_masking, "rank": rank_masking}[method]
@@ -833,7 +815,6 @@ def rank_masking(a):
 
         # now iterate over each topic
         for ttd1_idx, ttd1 in enumerate(ttda1):
-
             # create mask from ttd1 that removes noise from a and keeps the largest terms
             mask = create_mask(ttd1)
             ttd1_masked = ttd1[mask]
@@ -842,7 +823,6 @@ def rank_masking(a):
 
             # now look at every possible pair for topic a:
             for ttd2_idx, ttd2 in enumerate(ttda2):
-
                 # distance to itself is 0
                 if ttd1_idx + start_index == ttd2_idx:
                     distances[ttd1_idx][ttd2_idx] = 0
@@ -861,13 +841,12 @@ def rank_masking(a):
                 distances[ttd1_idx][ttd2_idx] = distance
 
         percent = round(100 * avg_mask_size / ttda1.shape[0] / ttda1.shape[1], 1)
-        logger.info('The given threshold of {} covered on average {}% of tokens'.format(threshold, percent))
+        logger.info('the given threshold of {} covered on average {}% of tokens'.format(threshold, percent))
 
         return distances
 
     def _generate_topic_clusters(self, eps=0.1, min_samples=None):
-        """Runs the DBSCAN algorithm on all the detected topics from the models in the ensemble and labels them with
-        label-indices.
+        """Run the CBDBSCAN algorithm on all the detected topics and label them with label-indices.
 
         The final approval and generation of stable topics is done in _generate_stable_topics().
 
@@ -877,43 +856,46 @@ def _generate_topic_clusters(self, eps=0.1, min_samples=None):
             eps is 0.1 by default.
         min_samples : int
             min_samples is int(self.num_models / 2)
-        """
+
+        """
         if min_samples is None:
             min_samples = int(self.num_models / 2)
 
-        logger.info("Fitting the clustering model")
+        logger.info("fitting the clustering model")
 
         self.cluster_model = CBDBSCAN(eps=eps, min_samples=min_samples)
         self.cluster_model.fit(self.asymmetric_distance_matrix)
 
     def _validate_core(self, core):
-        """If core is not a core returns False. Only cores with the valid_parents as its own label can be a valid core.
-        Returns True if that is the case, False otherwise.
+        """Check if the core has only a single valid parent, which is also the label assigned to the core.
+
+        If the presumed core is not even a core returns False.
 
         Parameters
         ----------
         core : {'is_core', 'valid_parents', 'labels'}
             The topic to check, as produced by the CBDBSCAN clustering step.
+
         """
         return core["is_core"] and (core["valid_parents"] == {core["label"]})
 
     def _group_by_labels(self, results):
-        """Groups all the learned cores by their label, which was assigned in the cluster_model
+        """Group all the learned cores by their label, which was assigned in the cluster_model.
 
         Parameters
         ----------
-        results : {list of {'is_core', 'parent_labels', 'num_samples', 'label'}}
+        results : {list of {'is_core', 'neighboring_labels', 'num_samples', 'label'}}
             After calling .fit on a CBDBSCAN model, the results can be retrieved from it by accessing the .results
             member, which can be used as the argument to this function. It's a list of infos gathered during the
             clustering step and each element in the list corresponds to a single topic.
 
         Returns
         -------
-        dict of (int, list of {'is_core', 'amount_parent_labels', 'parent_labels'})
+        dict of (int, list of {'is_core', 'num_neighboring_labels', 'neighboring_labels'})
             A mapping of the label to a list of topics that belong to that particular label. Also adds
-            a new member to each topic called amount_parent_labels, which is the number of parent_labels of that
-            topic.
+            a new member to each topic called num_neighboring_labels, which is the number of
+            neighboring_labels of that topic.
+
         """
         grouped_by_labels = {}
         for topic in results:
             if topic["is_core"]:
                 topic = topic.copy()
 
                 # counts how many different labels a core has as parents
-                topic["amount_parent_labels"] = len(topic["parent_labels"])
+                topic["num_neighboring_labels"] = len(topic["neighboring_labels"])
 
                 label = topic["label"]
                 if label not in grouped_by_labels:
@@ -930,64 +912,70 @@ def _group_by_labels(self, results):
         return grouped_by_labels
 
     def _aggregate_topics(self, grouped_by_labels):
-        """Aggregates the topics to a list of clusters
+        """Aggregate the labeled topics to a list of clusters.
 
         Parameters
         ----------
-        grouped_by_labels : dict of (int, list of {'is_core', 'amount_parent_labels', 'parent_labels'})
+        grouped_by_labels : dict of (int, list of {'is_core', 'num_neighboring_labels', 'neighboring_labels'})
             The return value of _group_by_labels. A mapping of the label to a list of each topic which belongs to the
             label.
 
         Returns
         -------
-        list of {'amount_parent_labels', 'parent_labels', 'label'}
-            amount_parent_labels is the max number of parent labels among each topic of a given cluster. label refers
-            to the label identifier of the cluster. parent_labels is a concatenated list of the parent_labels sets of
-            each topic. Its sorted by amount_parent_labels in descending order. There is one single element for each
-            cluster.
+        list of {'max_num_neighboring_labels', 'neighboring_labels', 'label'}
+            max_num_neighboring_labels is the max number of neighboring labels among the topics of a given cluster.
+            label refers to the label identifier of the cluster. neighboring_labels is a concatenated list of the
+            neighboring_labels sets of each topic. It's sorted by max_num_neighboring_labels in descending
+            order. There is one single element for each cluster.
+
         """
         sorted_clusters = []
 
         for label, group in grouped_by_labels.items():
-            amount_parent_labels = 0
-            parent_labels = []  # will be a list of sets
+            max_num_neighboring_labels = 0
+            neighboring_labels = []  # will be a list of sets
 
             for topic in group:
-                amount_parent_labels = max(topic["amount_parent_labels"], amount_parent_labels)
-                parent_labels.append(topic["parent_labels"])
+                max_num_neighboring_labels = max(topic["num_neighboring_labels"], max_num_neighboring_labels)
+                neighboring_labels.append(topic["neighboring_labels"])
+
+            neighboring_labels = [x for x in neighboring_labels if len(x) > 0]
+
+            # TODO why is len(neighboring_labels) equal to num_cores?
 
             sorted_clusters.append({
-                "amount_parent_labels": amount_parent_labels,
-                "parent_labels": [x for x in parent_labels if len(x) > 0],
-                "label": label
+                "max_num_neighboring_labels": max_num_neighboring_labels,
+                "neighboring_labels": neighboring_labels,
+                "label": label,
+                "num_cores": len([topic for topic in group if topic["is_core"]])
             })
 
-        # start with the most significant core.
         sorted_clusters = sorted(sorted_clusters, key=lambda cluster: (
-            cluster["amount_parent_labels"],
-            cluster["label"]  # makes sorting deterministic
+            cluster["max_num_neighboring_labels"],
+            cluster["label"]
         ), reverse=True)
 
         return sorted_clusters
 
     def _remove_from_all_sets(self, label, clusters):
-        """removes a label from every set in "parent_labels" for
-        each core in clusters"""
+        """Remove a label from every set in "neighboring_labels" for each core in clusters."""
         for a in clusters:
-            if label in a["parent_labels"]:
-                a["parent_labels"].remove(label)
+            if label in a["neighboring_labels"]:
+                a["neighboring_labels"].remove(label)
 
-    def _is_easy_valid_cluster(self, label, cluster, min_cores):
-        """checks if the cluster has at least min_cores of cores with the only parent-label as itself,
-        meaning no other cluster influenced those cores, hence the cluster in question is easy valid."""
-        return sum(map(lambda x: x == {label}, cluster["parent_labels"])) >= min_cores
+    def _contains_isolated_cores(self, label, cluster, min_cores):
+        """Check if the cluster has at least min_cores of cores that belong to no other cluster."""
+        # TODO is supposed to check for cores, but checks neighboring_labels.
+        # Funnily, it actually seems to be cores (see output of next line) but it doesn't read that way
+        # print(len(cluster["neighboring_labels"]), cluster["num_cores"])
+        return sum(map(lambda x: x == {label}, cluster["neighboring_labels"])) >= min_cores
 
     def _generate_stable_topics(self, min_cores=None):
-        """generates stable topics out of the clusters. This function is the last step that has to be done in the
-        ensemble.
-
-        Stable topics can be retrieved afterwards using get_topics().
+        """Generate stable topics out of the clusters.
+
+        This function is the last step that has to be done in the ensemble. Stable topics can be retrieved afterwards
+        using get_topics().
 
         Parameters
         ----------
@@ -998,8 +986,6 @@ def _generate_stable_topics(self, min_cores=None):
             defaults to min_cores = min(3, max(1, int(self.num_models /4 +1)))
 
         """
-        logger.info("Generating stable topics")
-
         # min_cores being 0 makes no sense. there has to be a core for a cluster
         # or there is no cluster
         if min_cores == 0:
@@ -1008,6 +994,9 @@ def _generate_stable_topics(self, min_cores=None):
         if min_cores is None:
             # min_cores is a number between 1 and 3, depending on the number of models
             min_cores = min(3, max(1, int(self.num_models / 4 + 1)))
+            logger.info(f"generating stable topics, each cluster needs at least {min_cores} cores")
+        else:
+            logger.info("generating stable topics")
 
         results = self.cluster_model.results
 
@@ -1015,52 +1004,37 @@ def _generate_stable_topics(self, min_cores=None):
 
         sorted_clusters = self._aggregate_topics(grouped_by_labels)
 
-        # Iterate over the cluster labels, see which clusters/labels
-        # are valid to trigger the creation of a stable topic
         for i, cluster in enumerate(sorted_clusters):
-            # here, None means "not yet evaluated"
             cluster["is_valid"] = None
-            label = cluster["label"]
-            # label is iterating over 0, 1, 3, ...
-
-            # 1. rule - remove if the cores in the cluster have no parents
-            if cluster["amount_parent_labels"] == 0:
-                cluster["is_valid"] = False
-                self._remove_from_all_sets(label, sorted_clusters)
+            # TODO removed first rule because when amount_parent_labels is 0, then there is no
+            # topic in the cluster that has a parent, hence the size is either 0 or 1,
+            # which can be covered by the next rule.
 
-            # 2. rule - remove if it has less than min_cores as parents
-            # (parents are always also cores)
-            if len(cluster["parent_labels"]) < min_cores:
+            if cluster["num_cores"] < min_cores:
                 cluster["is_valid"] = False
-                self._remove_from_all_sets(label, sorted_clusters)
+                self._remove_from_all_sets(cluster["label"], sorted_clusters)
 
-            # 3. checking for "easy_valid"s
-            # checks if the cluster has at least min_cores of cores with the only label as itself
-            if cluster["amount_parent_labels"] >= 1:
-                if self._is_easy_valid_cluster(label, cluster, min_cores):
-                    cluster["is_valid"] = True
+            # TODO I don't think it makes a difference if _contains_isolated_cores (formerly easy valid) is done (here
+            # and afterwards) or (only afterwards)
 
-        # Reapplying the rule 3
-        # this happens when we have a close relationship among 2 or more clusters
+        # now that invalid clusters are removed, check which clusters contain enough cores that don't belong to any
+        # other cluster.
         for cluster in [cluster for cluster in sorted_clusters if cluster["is_valid"] is None]:
-            # This modifies parent labels
-            # which is important in _is_easy_valid_cluster, so the result depends on where to start.
-            # That's why sorted_clusters is sorted, so it starts with the most significant cluster.
-            if self._is_easy_valid_cluster(label, cluster, min_cores):
+            label = cluster["label"]
+            if self._contains_isolated_cores(label, cluster, min_cores):
                 cluster["is_valid"] = True
             else:
                 cluster["is_valid"] = False
 
-            # removes a label from every set in parent_labels for each core.
-            for a in sorted_clusters:
-                if label in a["parent_labels"]:
-                    a["parent_labels"].remove(label)
+            # This modifies parent labels which is important in _contains_isolated_cores, so the result depends on
+            # where to start. That's why sorted_clusters is sorted, so it starts with the most cluttered cluster.
+            self._remove_from_all_sets(label, sorted_clusters)
 
         # list of all the label numbers that are valid
         valid_labels = np.array([cluster["label"] for cluster in sorted_clusters if cluster["is_valid"]])
 
         for topic in results:
-            topic["valid_parents"] = {label for label in topic["parent_labels"] if label in valid_labels}
+            topic["valid_parents"] = {label for label in topic["neighboring_labels"] if label in valid_labels}
 
         # keeping only VALID cores
         valid_core_mask = np.vectorize(self._validate_core)(results)
@@ -1081,8 +1055,7 @@ def _generate_stable_topics(self, min_cores=None):
         self.stable_topics = stable_topics
 
     def recluster(self, eps=0.1, min_samples=None, min_cores=None):
-        """Runs the CBDBSCAN algorithm on all the topics in the ensemble and uses resulting clusters to identify
-        reliable topics.
+        """Quickly change the results of the ensemble by reapplying clustering and stable topic generation.
 
         Stable topics can be retrieved using get_topics().
 
@@ -1118,11 +1091,18 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None):
 
     # to make using the ensemble in place of a gensim model as easy as possible
 
     def get_topics(self):
+        """Return only the stable topics from the ensemble.
+
+        Returns
+        -------
+        2D Numpy.numpy.ndarray of floats
+            List of stable topic term distributions
+
+        """
         return self.stable_topics
 
     def _has_gensim_representation(self):
-        """checks if stable topics are available and if the internal
-        gensim representation exists. If not, raises errors"""
+        """Check if stable topics and the internal gensim representation exist. Raise an error if not."""
         if self.classic_model_representation is None:
             if len(self.stable_topics) == 0:
                 raise ValueError("no stable topic was detected")
@@ -1130,32 +1110,35 @@ def _has_gensim_representation(self):
                 raise ValueError("use generate_gensim_representation() first")
 
     def __getitem__(self, i):
-        """see :py:class:`gensim.models.LdaModel`"""
+        """See :py:class:`gensim.models.LdaModel`."""
         self._has_gensim_representation()
         return self.classic_model_representation[i]
 
     def inference(self, *posargs, **kwargs):
-        """see :py:class:`gensim.models.LdaModel`"""
+        """See :py:class:`gensim.models.LdaModel`."""
         self._has_gensim_representation()
         return self.classic_model_representation.inference(*posargs, **kwargs)
 
     def log_perplexity(self, *posargs, **kwargs):
-        """see :py:class:`gensim.models.LdaModel`"""
+        """See :py:class:`gensim.models.LdaModel`."""
         self._has_gensim_representation()
         return self.classic_model_representation.log_perplexity(*posargs, **kwargs)
 
     def print_topics(self, *posargs, **kwargs):
-        """see :py:class:`gensim.models.LdaModel`"""
+        """See :py:class:`gensim.models.LdaModel`."""
         self._has_gensim_representation()
         return self.classic_model_representation.print_topics(*posargs, **kwargs)
 
     @property
     def id2word(self):
+        """Return the :py:class:`gensim.corpora.dictionary.Dictionary` object used in the model."""
         return self.gensim_kw_args["id2word"]
 
 
 class CBDBSCAN():
-    """A Variation of the DBSCAN algorithm called Checkback DBSCAN (CBDBSCAN). The algorithm works based on
+    """A Variation of the DBSCAN algorithm called Checkback DBSCAN (CBDBSCAN).
+
+    The algorithm works based on
     DBSCAN-like parameters `eps` and `min_samples` that respectively define how far a "nearby" point is, and the
     minimum number of nearby points needed to label a candidate datapoint a core of a cluster. (See
     https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html).

@@ -1178,58 +1161,59 @@ class CBDBSCAN():
     unreliable topics made of a composition of multiple reliable topics (something that often occurs in LDA models
     and that is one cause of unreliable topics).
 
-    Parameters
-    ----------
-    eps : float
-        epsilon for the CBDBSCAN algorithm, having the same meaning as in classic DBSCAN clustering.
-        default: 0.1
-    min_samples : int
-        The minimum number of samples in the neighborhood of a topic to be considered a core in CBDBSCAN.
     """
 
     def __init__(self, eps, min_samples):
+        """Create a new CBDBSCAN object. Call fit in order to train it on an asymmetric distance matrix.
+
+        Parameters
+        ----------
+        eps : float
+            epsilon for the CBDBSCAN algorithm, having the same meaning as in classic DBSCAN clustering.
+            default: 0.1
+        min_samples : int
+            The minimum number of samples in the neighborhood of a topic to be considered a core in CBDBSCAN.
+
+        """
         self.eps = eps
         self.min_samples = min_samples
 
     def fit(self, amatrix):
+        """Apply the algorithm to an asymmetric distance matrix."""
         self.next_label = 0
 
         results = [{
             "is_core": False,
-            "parent_labels": set(),
-            "parent_ids": set(),
+            "neighboring_labels": set(),
+            "neighboring_topic_indices": set(),
             "num_samples": 0,
             "label": None
         } for _ in range(len(amatrix))]
 
         amatrix_copy = amatrix.copy()
 
-        # to avoid problem about comparing the topic with itself
+        # to avoid the problem of comparing the topic with itself
         np.fill_diagonal(amatrix_copy, 1)
 
         min_distance_per_topic = [(distance, index) for index, distance in enumerate(amatrix_copy.min(axis=1))]
         min_distance_per_topic_sorted = sorted(min_distance_per_topic, key=lambda x: x)
         ordered_min_similarity = [index for distance, index in min_distance_per_topic_sorted]
 
-        def scan_topic(topic_index, parent_id=None, current_label=None, parent_neighbors=None):
-            # count how many neighbor_indices
-            # check which indices is closer than eps
-            neighbouring_topic_indices = np.array(
-                [
-                    candidate_topic_index
-                    for candidate_topic_index, is_neighbour in enumerate(amatrix_copy[topic_index] < self.eps)
-                    if is_neighbour
-                ]
-            )
-            num_neighbouring_topics = len(neighbouring_topic_indices)
-
-            # If the number of neighbor_indices of a topic is large enough,
-            # it is considered a core.
-            # This also takes neighbor_indices that already are identified as core in count.
-            if num_neighbouring_topics >= self.min_samples:
+        def scan_topic(topic_index, current_label=None, cluster_topic_indices=[]):
+            neighboring_topic_indices = [
+                candidate_topic_index
+                for candidate_topic_index, is_neighbour in enumerate(amatrix_copy[topic_index] < self.eps)
+                if is_neighbour
+            ]
+            num_neighboring_topics = len(neighboring_topic_indices)
+
+            # If the number of neighbor indices of a topic is large enough, it is considered a core.
+            # This also takes neighbor indices that already are identified as core in count.
+            if num_neighboring_topics >= self.min_samples:
                 # This topic is a core!
                 results[topic_index]["is_core"] = True
-                results[topic_index]["num_samples"] = num_neighbouring_topics
+                results[topic_index]["num_samples"] = num_neighboring_topics
+                cluster_topic_indices = cluster_topic_indices.copy()
 
                 # if current_label is none, then this is the first core
                 # of a new cluster (hence next_label is used)
                 if current_label is None:
-                    # next_label is initialized with 0 in
-                    # fit() for the first cluster
+                    # next_label is initialized with 0 in fit() for the first cluster
                     current_label = self.next_label
                     self.next_label += 1
 
                 else:
-                    # In case the core has a parent, that means
-                    # it has a label (= the parents label).
-                    # Check the distance between the
-                    # new core and the parent
-
-                    # parent neighbor_indices is the list of neighbor_indices of parent_id
-                    # (the topic_index that called this function recursively)
-                    all_members_of_current_cluster = list(parent_neighbors)
-                    all_members_of_current_cluster.append(parent_id)
-
-                    # look if 25% of the members of the current cluster are also
-                    # close to the current/new topic...
-
-                    # example: (topic_index, 0), (topic_index, 2), (topic_index, ...)
-                    all_members_of_current_cluster_ix = np.ix_([topic_index], all_members_of_current_cluster)
-                    # use the result of the previous step to index the matrix and see if those distances
-                    # are smaller then epsilon. relations_to_the_cluster is a boolean array, True for close elements
-                    relations_to_the_cluster = amatrix_copy[all_members_of_current_cluster_ix] < self.eps
-
-                    # if less than 25% of the elements are close, then the topic index in question is not a
-                    # core of the current_label, but rather the core of a new cluster
-                    if relations_to_the_cluster[0].mean() < 0.25:
+                    # In case the core has a parent, check the distance to the existing cluster (since the matrix is
+                    # asymmetric, it takes return distances into account here)
+                    # If less than 25% of the elements are close enough, then create a new cluster rather than further
+                    # growing the current cluster in that direction.
+                    close_members_mask = amatrix_copy[topic_index][cluster_topic_indices] < self.eps
+
+                    # TODO changed because it is supposed to check the distance to the existing cluster, not
+                    # to the neighbors of the parent
+
+                    if close_members_mask.mean() < 0.25:
                         # start new cluster by changing current_label
                         current_label = self.next_label
                         self.next_label += 1
+                        cluster_topic_indices = []
 
+            # TODO changed in order to make the parent_id parameter obsolete because (i think) it (the appending)
+            # can easily be done before calling scan_topic as well.
+            cluster_topic_indices.append(topic_index)
             results[topic_index]["label"] = current_label
 
-            for neighbouring_topic_index in neighbouring_topic_indices:
-
-                if results[neighbouring_topic_index]["label"] is None:
-                    ordered_min_similarity.remove(neighbouring_topic_index)
-                    # try to extend the cluster into the direction
-                    # of the neighbor
-                    scan_topic(
-                        neighbouring_topic_index, parent_id=topic_index,
-                        current_label=current_label,
-                        parent_neighbors=neighbouring_topic_indices
-                    )
+            for neighboring_topic_index in neighboring_topic_indices:
+                if results[neighboring_topic_index]["label"] is None:
+                    ordered_min_similarity.remove(neighboring_topic_index)
+                    # try to extend the cluster into the direction of the neighbor
+                    scan_topic(neighboring_topic_index, current_label, cluster_topic_indices)
 
-                results[neighbouring_topic_index]["parent_ids"].add(topic_index)
-                results[neighbouring_topic_index]["parent_labels"].add(current_label)
+                results[neighboring_topic_index]["neighboring_topic_indices"].add(topic_index)
+                results[neighboring_topic_index]["neighboring_labels"].add(current_label)
 
             else:
                 # this topic is not a core!
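To make the reworked 25% checkback rule above concrete, here is a minimal standalone sketch; the eps value and the distances are made up, only the decision criterion mirrors the scan_topic code in this patch:

import numpy as np

eps = 0.1
# hypothetical asymmetric distances from a candidate core to the four topics
# that were already collected into the current cluster
distances_to_cluster = np.array([0.05, 0.30, 0.40, 0.09])
close_members_mask = distances_to_cluster < eps  # [True, False, False, True]

# mirrors `if close_members_mask.mean() < 0.25` above: when fewer than a
# quarter of the cluster members are within eps, the candidate topic starts
# a new cluster instead of growing the current one
if close_members_mask.mean() < 0.25:
    print("start a new cluster")
else:
    print("grow the current cluster")  # 2 of 4 are close here, so it grows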
diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
index a83e87b8c7965b1871ef711a66f449a1e618b1b4..3b9563f08da01a1f9900d7060694ec235af53c2f 100644
GIT binary patch
[base85 payload omitted: literal 13705 (new blob), literal 9893 (old blob); binary test fixture update]

From 71b3825d5429c6a3ce3e5e6f948509dfda1e2d60 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sun, 15 Sep 2019 00:03:56 +0200
Subject: [PATCH 061/166] tox

---
 gensim/models/ensemblelda.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index 2bf40afdb5..ac8c316aab 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -110,12 +110,12 @@

 class EnsembleLda():
     """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble.
-    
+
     Extracts reliable topics that are consistently learned across multiple LDA models. eLDA has the added benefit that
     the user does not need to know the exact number of topics the topic model should extract ahead of time [2].
 
""" - + def __init__(self, topic_model_class="lda", num_models=3, min_cores=None, # default value from _generate_stable_topics() epsilon=0.1, ensemble_workers=1, memory_friendly_ttda=True, @@ -319,7 +319,7 @@ def generate_gensim_representation(self): if sstats_sum == 0: sstats_sum = classic_model_representation.state.sstats.sum() self.sstats_sum = sstats_sum - + # the following is important for the denormalization # to generate the proper sstats for the new gensim model: # transform to dimensionality of stable_topics. axis=1 is summed @@ -356,7 +356,7 @@ def generate_gensim_representation(self): def add_model(self, target, num_new_models=None): """Add the ttda of another model to the ensemble. - + This way, multiple topic models can be connected to an ensemble manually. Make sure that all the models use the exact same dictionary/idword mapping. @@ -521,7 +521,7 @@ def load(fname): def _generate_topic_models_multiproc(self, num_models, ensemble_workers): """Generate the topic models to form the ensemble in a multiprocessed way. - + Depending on the used topic model this can result in a speedup. Parameters @@ -868,7 +868,7 @@ def _generate_topic_clusters(self, eps=0.1, min_samples=None): def _validate_core(self, core): """Check if the core has only a single valid parent, which is also the label assigned to the core. - + If the presumed core is not even a core returns False. Parameters @@ -973,7 +973,7 @@ def _contains_isolated_cores(self, label, cluster, min_cores): def _generate_stable_topics(self, min_cores=None): """Generate stable topics out of the clusters. - + This function is the last step that has to be done in the ensemble.Stable topics can be retrieved afterwards using get_topics(). @@ -1092,7 +1092,7 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None): def get_topics(self): """Return only the stable topics from the ensemble. - + Returns ------- 2D Numpy.numpy.ndarray of floats @@ -1137,7 +1137,7 @@ def id2word(self): class CBDBSCAN(): """A Variation of the DBSCAN algorithm called Checkback DBSCAN (CBDBSCAN). - + The algorithm works based on DBSCAN-like parameters `eps` and `min_samples` that respectively define how far a "nearby" point is, and the minimum number of nearby points needed to label a candidate datapoint a core of a cluster. From 2d184a4507ee16dea7701839f320383502f811e2 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 15 Sep 2019 10:50:38 +0200 Subject: [PATCH 062/166] fixed stuff in CBDBSCAN --- docs/notebooks/Opinosis.ipynb | 31 +++++++++++++++++++++------- gensim/models/ensemblelda.py | 33 +++++++++++++++++++++++------- gensim/test/test_data/ensemblelda | Bin 13705 -> 13683 bytes gensim/test/test_ensemblelda.py | 1 + 4 files changed, 51 insertions(+), 14 deletions(-) diff --git a/docs/notebooks/Opinosis.ipynb b/docs/notebooks/Opinosis.ipynb index 8227f99932..a828b2adf4 100644 --- a/docs/notebooks/Opinosis.ipynb +++ b/docs/notebooks/Opinosis.ipynb @@ -32,6 +32,18 @@ "elda_logger.addHandler(logging.StreamHandler())" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def pretty_print_topics():\n", + " # note that the words are stemmed so they appear chopped off\n", + " for t in elda.print_topics(num_words=7):\n", + " print('-', t[1].replace('*',' ').replace('\"','').replace(' +',','), '\\n')" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -107,9 +119,7 @@ "\n", "**topic_model_kind** ldamulticore is highly recommended for EnsembleLda. 
ensemble_workers and **distance_workers** are used to improve the time needed to train the models, as well as the **masking_method** 'rank'. ldamulticore is not able to fully utilize all cores on this small corpus, so **ensemble_workers** can be set to 3 to get 95 - 100% cpu usage on my i5 3470.\n", "\n", - "Since the corpus is so small, a high number of **num_models** is needed to extract stable topics. The Opinosis corpus contains 51 categories, however, some of them are quite similar. For example there are 3 categories about the batteries of portable products. There are also multiple categories about cars. So I chose 20 for num_topics, which is smaller than the number of categories.\n", - "\n", - "The default for **min_samples** would be 64, half of the number of models. But since this does not return any topics, or at most 2, I set this to 32." + "Since the corpus is so small, a high number of **num_models** is needed to extract stable topics. The Opinosis corpus contains 51 categories, however, some of them are quite similar. For example there are 3 categories about the batteries of portable products. There are also multiple categories about cars. So I chose 20 for num_topics, which is smaller than the number of categories." ] }, { @@ -120,7 +130,15 @@ "source": [ "elda = EnsembleLda(corpus=opinosis.corpus, id2word=opinosis.id2word, num_models=128, num_topics=20,\n", " passes=20, iterations=100, ensemble_workers=3, distance_workers=4,\n", - " topic_model_class='ldamulticore', masking_method='rank', min_samples=32)" + " topic_model_class='ldamulticore', masking_method='rank')\n", + "pretty_print_topics()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The default for **min_samples** would be 64, half of the number of models and **eps** would be 0.1. You basically play around with them until you find a sweetspot that fits for your needs." ] }, { @@ -129,9 +147,8 @@ "metadata": {}, "outputs": [], "source": [ - "# pretty print, note that the words are stemmed so they appear chopped off\n", - "for t in elda.print_topics(num_words=7):\n", - " print('-', t[1].replace('*',' ').replace('\"','').replace(' +',','), '\\n')" + "elda.recluster(min_samples=55, eps=0.14)\n", + "pretty_print_topics()" ] } ], diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index ac8c316aab..2f04fa640b 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -994,7 +994,7 @@ def _generate_stable_topics(self, min_cores=None): if min_cores is None: # min_cores is a number between 1 and 3, depending on the number of models min_cores = min(3, max(1, int(self.num_models / 4 + 1))) - logger.info(f"generating stable topics, each cluster needs at least {min_cores} cores") + logger.info("generating stable topics, each cluster needs at least {} cores".format(min_cores)) else: logger.info("generating stable topics") @@ -1199,7 +1199,20 @@ def fit(self, amatrix): min_distance_per_topic_sorted = sorted(min_distance_per_topic, key=lambda x: x) ordered_min_similarity = [index for distance, index in min_distance_per_topic_sorted] - def scan_topic(topic_index, current_label=None, cluster_topic_indices=[]): + def scan_topic(topic_index, current_label=None): + """Extend the cluster in one direction. + + Results are accumulated in result, a variable outside of this function. + + Parameters + ---------- + topic_index : int + The topic that might be added to the existing cluster, or which might create a new cluster if + neccessary. 
+ current_label : int + The label of the cluster that might be suitable for topic_index + + """ neighboring_topic_indices = [ candidate_topic_index for candidate_topic_index, is_neighbour in enumerate(amatrix_copy[topic_index] < self.eps) @@ -1207,21 +1220,27 @@ def scan_topic(topic_index, current_label=None, cluster_topic_indices=[]): ] num_neighboring_topics = len(neighboring_topic_indices) + # derive a list of all topic_indices that belong to the cluster with the current_label + cluster_topic_indices = [ + i + for i, topic in enumerate(results) + if topic["label"] == current_label and topic["label"] is not None + ] + # If the number of neighbor indices of a topic is large enough, it is considered a core. # This also takes neighbor indices that already are identified as core in count. if num_neighboring_topics >= self.min_samples: # This topic is a core! results[topic_index]["is_core"] = True results[topic_index]["num_samples"] = num_neighboring_topics - cluster_topic_indices = cluster_topic_indices.copy() # if current_label is none, then this is the first core # of a new cluster (hence next_label is used) if current_label is None: - # next_label is initialized with 0 in - # fit() for the first cluster + # next_label is initialized with 0 in fit() for the first cluster current_label = self.next_label self.next_label += 1 + cluster_topic_indices = [] else: # In case the core has a parent, check the distance to the existing cluster (since the matrix is @@ -1240,7 +1259,7 @@ def scan_topic(topic_index, current_label=None, cluster_topic_indices=[]): cluster_topic_indices = [] # TODO changed in order to make the parent_id parameter obsoloete because (i think) it (the appending) - # can easily done before calling scan_topic as well + # can easily done before calling scan_topic as well. 
cluster_topic_indices.append(topic_index) results[topic_index]["label"] = current_label @@ -1248,7 +1267,7 @@ def scan_topic(topic_index, current_label=None, cluster_topic_indices=[]): if results[neighboring_topic_index]["label"] is None: ordered_min_similarity.remove(neighboring_topic_index) # try to extend the cluster into the direction of the neighbor - scan_topic(neighboring_topic_index, current_label, cluster_topic_indices) + scan_topic(neighboring_topic_index, current_label) results[neighboring_topic_index]["neighboring_topic_indices"].add(topic_index) results[neighboring_topic_index]["neighboring_labels"].add(current_label) diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda index 3b9563f08da01a1f9900d7060694ec235af53c2f..717c2916143305036b37fb3a7746d07b59c971b3 100644 GIT binary patch delta 3761 zcmW+(2{)Bp8`hzcN`o@xH79XOdNYLxg~(hgO~!=OK}zJ|IC0Fg9Auv7;h4!<*Yy{^ zfAal-@AfTg8MggA``*`m4fkm*{BObPe@-ik~8c(&k0#CF-eFG3UPAh z?(HX!oY{Ta+oTZnXk#s#o}uP`Sjqz%dDmjR!e-21{~iXo^4gSL*HME#Zk!DWfPsT{ z;ZbpAFmjPwIZ5xhh$S@0`@5xxou`;86T^g%(?Ab@`O*i#gP?AG3<5tQpbf9CJCfq?o@skN(de*@}U<{HSEP z@Lz78Nq+7QgDY3r{h5t6mR8@m%lUKF@t@zSi=Wv(bKvmzvlj|ad89r=mzo{#u_Z{1 zS*M}4$lXFgDMxHzu^`Q!!}t}(5wwJIQ$XCokdFoWF1^Wk#c3FCqYJM1nmQZe(5ONPYr(qzj|w=))lEF^P@@bgJJLXdJnp^B^&>G=d`;{LAI{ z6>Lw(Kq03J&{mNoyZ=hpC$XRkh?KUEb4GFI+=o!;D>b>dxXB5(+d`Qy>BcAPJ>q0Waj!Hx<>(VJq2b@W_4p*j0C*!sKlZuO!%`d)aF zDH?8FN=W5IgmqAKICnz1coX|L!T`okp!p6OoF22uibJTwG8giNoNJUw96l=tFQ>k~j5#&*(dW4yjy*FM z9%YnP!EpW(=U+s#KH0Ct_pZ=poCh>~<*>UA4B1e|uwNpIH>%n5^;pmsr6|f7x3Smqa_O9O(#jVKK8kX3uaNrqsb6u6T9-CP8*J#46JaqN9}Ph`uWQ@2#aOa zF4!wdykXB)6R#w6-NmGhi3bjfpK{h0s|o(t_@bm$p&>)<8D!1Gt(e8wZCfm| z#ov|9=g_H1cNZA-f!_JHSQF_7_#~!4CwR{kaWD^U`PQjFpcBBlI8??B$4jikpdH6e zg)uwgCfyI8qQ>34472WDWw(7XfqsdNo^Ka;l;)veH6$t!+7!+d{@>dSUf6L*9mlV| z=kEvRthYv!X|PzOF_+DO+j}>+!#rf-$@;K>l%PB5!=t7y)*1}HCTe(UE z&>-fzdowswjDOxu3}x*K2Yu01m|b4XH4I_I$>AFtId2=p5e^3OAccJv(U?|AXy}tq zmnBM#wf1?J?~BoDpIdU`I_Ek0nhVv5SUo{;hn|Z{^%*&TI#x1ZNs){7U@?TcfBjR? 
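A note on the CheckBack step that this patch adjusts: patch 062 compares a new core candidate against the members of the existing cluster, while patch 070 further below reverts to comparing against the parent's neighbours; the acceptance rule is the same 25% threshold either way. A hedged sketch of just that rule, with made-up distances:

    import numpy as np

    eps = 0.1

    # distances from the new core candidate to the topics it is checked against
    distances = np.array([0.05, 0.30, 0.40, 0.50, 0.60])

    close_mask = distances < eps
    if close_mask.mean() < 0.25:
        # fewer than 25% are close enough: 1 of 5 here, so this branch is taken
        print("start a new cluster")
    else:
        print("keep growing the current cluster")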
Date: Sun, 15 Sep 2019 11:33:12 +0200
Subject: [PATCH 063/166] removed unused results column and only CB-Distance to
 other cores

---
 gensim/models/ensemblelda.py      |  11 ++++-------
 gensim/test/test_data/ensemblelda | Bin 13683 -> 13705 bytes
 gensim/test/test_ensemblelda.py   |   3 +--
 3 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index 2f04fa640b..dfc90148b7 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -884,7 +884,7 @@ def _group_by_labels(self, results):
 
         Parameters
         ----------
-        results : {list of {'is_core', 'neighboring_labels', 'num_samples', 'label'}}
+        results : {list of {'is_core', 'neighboring_labels', 'label'}}
             After calling .fit on a CBDBSCAN model, the results can be retrieved from it by accessing the .results
             member, which can be used as the argument to this function. It's a list of infos gathered during the
             clustering step and each element in the list corresponds to a single topic.
@@ -1186,7 +1186,6 @@ def fit(self, amatrix):
             "is_core": False,
             "neighboring_labels": set(),
             "neighboring_topic_indices": set(),
-            "num_samples": 0,
             "label": None
         } for _ in range(len(amatrix))]
 
@@ -1220,11 +1219,10 @@ def scan_topic(topic_index, current_label=None):
             ]
             num_neighboring_topics = len(neighboring_topic_indices)
 
-            # derive a list of all topic_indices that belong to the cluster with the current_label
+            # derive a list of all topic_indices that belong to the cluster with the current_label and that are cores
             cluster_topic_indices = [
-                i
-                for i, topic in enumerate(results)
-                if topic["label"] == current_label and topic["label"] is not None
+                index for index, topic in enumerate(results)
+                if topic["label"] == current_label and topic["label"] is not None and topic["is_core"]
             ]
 
             # If the number of neighbor indices of a topic is large enough, it is considered a core.
             # This also takes neighbor indices that already are identified as core in count.
             if num_neighboring_topics >= self.min_samples:
                 # This topic is a core!
                 results[topic_index]["is_core"] = True
-                results[topic_index]["num_samples"] = num_neighboring_topics
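With the hunk above, only topics that are already marked as cores count towards the cluster that the CheckBack fraction is computed against. A small illustration with a toy results list (values are not from the patch):

    results = [
        {"label": 0, "is_core": True},
        {"label": 0, "is_core": False},  # border topic, no longer counted
        {"label": 1, "is_core": True},
        {"label": None, "is_core": False},
    ]
    current_label = 0

    cluster_topic_indices = [
        index for index, topic in enumerate(results)
        if topic["label"] == current_label and topic["label"] is not None and topic["is_core"]
    ]

    print(cluster_topic_indices)  # [0]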
diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
index 717c2916143305036b37fb3a7746d07b59c971b3..286d8dda8bee36a5089ebce4476eed98998ff7e4 100644
GIT binary patch
delta 4758
[base85-encoded binary data omitted]
delta 4766
[base85-encoded binary data omitted]
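The binary file updated above is the pickled reference EnsembleLda that the test suite compares freshly trained ensembles against (see the test_eLDA test further below). Roughly, assuming the gensim test utilities, it is used like this:

    import numpy as np
    from gensim.models import EnsembleLda
    from gensim.test.utils import datapath

    reference = EnsembleLda.load(datapath('ensemblelda'))

    # ttda holds the stacked topic-term distributions of all ensemble members
    print(reference.ttda.shape)

    # a freshly trained eLDA with the same random_state should reproduce it,
    # e.g. np.testing.assert_allclose(elda.ttda, reference.ttda, rtol=1e-05)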
zqEn-*Q(dI0)abq{G1rB+*#{l*y~^)eoi;mOQ(=fy-}P07YkVdeqBwqarCp^Vdihq+ bUNg7rszh$3;f6^fhT|JUEXR!e48Q*cRCz9% delta 4766 zcmY*ccT-f~mTejY1O*i(iDVH0L6j&cC@5g7Ljw&abFCx6e8IoSoKQ`_P#8=iIg58Hq(^GTG|~)NjsH z+)YDP#Rd+v>-Fj?@9>UgTEMGEB3JS%9n(5*aXYngFSjabfmW!U8iS~bSNW(Sl(rs~ zzc=Gq?mNqqSQli^VuC{*TzTL7@k2-MHu99^Q9ZSB6*oPn$~$2R_pET&(i4%JX_gNC zXpmQ5>vPvts-y$nGz|=-0Uojmk5p)AiaTSdrH$qJj2H>H+Uf` z_B>DXc610$s}p#dM&j(LL(hrETr)l@r1?ueDhcq@r)t`vR;Le2Uz(>X?x#7?9cx?E z<`{h;Q7GVVyg>bh@41x*6{^U1IQ)aRPshJ-r2%JNXfNQ_%1QB}UH3BooPiQqPT;$uGGsmJE$jHy|;hP$;~ zC0-13hBvb*A764qCaskvTjs}PinMKBoyU0I0dGd^o-_RtF zBdOD(hI^r8;~@=m`(>J@HQwa$SQ?<(5dKEvRLK)=w8#BC`OKCUuVr54QPnf*K^Df0 zsgH)Zo~Eed8FhNS$yM9)VfMY)prQvnn#wo7CX&h?^{Um}+Q+7gd1gFpu*F9r+`Y162Kj2BKV*fWtO77tx|?a4~(O z_3&rZn!(e?G?o=8mir(cJ<&lX@A*&y&OYsO^HrXshOh5jc>OJ-@;qPhYG!H{)m^4m zI`;5HwvVBuu@v}UJSDRXC`yh+$})JFtMA>U4T}rBV(60sz|8R??WO3KDXwwVo5y#( zs6A8W;DGXNH^oQV8HiFX>Wi)W}Au^7yDM8I3U z-;*{A(QSf5Yg7|KN7l&pl$sBzjYi`ErfHr~KPdf*YH#B@3)oLu91NWZeHQMpvgD|HEqcmHCYjRrTp*u-58|1<@AL4Ry zxr^%ZXe$}$-S9)jt>SN;mbqQz!RNHE0HSu{(|I+i!1E!m^RgjrQzNf^hFE7{u2E!M zG|kc2ZQcR^#xlwup_WI3=`bBNv5%cjD;_-+RO)O?RZ=ecxz0D3mNXSkbdVo=&6Znb z9&yxli>9n(Zgf=el;?bD?O6bAeiW0+sUg+NJ=l^DZ&0ab1(xjBgU-pW%@_G8%y&-Pgtw2cnS8{yP%6G`2)m(I1koiQG4tAfokr%9kd6b8O_$Yzh@~|>20Qee6Dv+b|4a;ez_JL(dT1$>Yi|c6P|F3H z4CF%;<|a+Ou%nK zA~alHz+0jBBVx>{-TyH+=9Ffs5v18$_IJ_0`cV-*GTMmdwleyTra2uVr)?kFdCYTH z{N!aus`!BS7Ties`p`feEY&{}+=84uAER~OTy z1@)KliVgL~@s^w_-K;^U4UXLRGLSm_c@tP$D&^x~&}Ewi>U!@1PuX$@&pFVcRP(&& z6HKFEQ7vtSMMhFnET250E}B7Cnt`c}5Se^!z?&jo7cFR#YgBDyufcF8=s-+GW+6aot;_p270nK)R)(W%n?R&%%ct9O; z7xz&W5;zH+Gx)U=&kHopOAaPqjPqc8r)ZT}3IMddxK497sYeK>{ZN`!1>pRyw_QQ< ziz{52!ec2ph13#GjagjHHQXfVaZ@_?e7?aQV1NOX{G^gvXi<$w&tC&~Oa@0AfhV>1 zdDaLm=R1}LU`#EoRIt{@p4x8ln2eifNhGa%yh|!rwUIa7HC835v|YlLZ>*C%VQHN_^aIWJLZvn} z4fyupY9}>+E8-_Qd^q9{*zKfVRskRb_e5&QNg8P2ieX+ z!S13vpx1&_zdxkqa%zmHrn1laylUv6ho?^;8ub3@fHyALAvs^d(R*e*VPgwAZ#TGl z#gUs-Gy$mv;Nge5oKDVvKyLTIJ>Trmpv8?S#+%_5kO|We1}$v^&kIg+9+rXGr(8e) zjdYNJmtD8zO=`z>El=e0Ah&y5#Ny=BgnX{f$Mde1cl5tLrk)6HlE=_;5TB@NI{I@c z_gGVH5q%Fqh!551pmp3D8Xt`k8sHI;kDed{wJ((D=aoEaN;@*dVHt1ic^eqilMw`S zx)7LoM6V_lWwF4ArA2r<_u}b(fW|iDE-x$*$~>t{jx^1qM;-BCvA`2(taE-5p*-+_ zYTZ8}DpO_!3O|~TFnnQ1^^bWbi6_P4a}_kiT0DZr47kzmx%A}g3(=RZs?Ff_FBi~g z25@#2;~;JxiL{OLaG`FsT%h(;?)1dk9_G zz@E0wpr;K`qXzI;;yu*`YQMvS;%Q!uq`^{zb+`m3H`7vb=@ZbWulo+$-Az{v~sZ1^01vBrm5rpFwf1@H~%aWqwKyq(fu3Ob=QO z;X|jl)b=pvj`iKZ#H-Fg*1DO*0<=eB>uc&Oc||`o&%Xd_T^>=~6YTr+)9VTpRj-2; z$YkO)9zbIN=DK$&(lVIsPTLyBLg z1pj!TtJ6UaDCQf`HU*+wjJO6aN3?166!=i<;fn;;qtVZN^m@u#k!L;RZa{OJG6wJB zIi5)61|@pd_$|5PBk=FxLn}KCRozMB)pO@u!C=kQ=ZAjKYvV!()SXN1>JC+5y}C0& z-K9}?XDU>ivz8XNqNz+KD=7YuUX=0SW4^LT{Rf49$|R0Q|G!Z%z>0KMIHeOX#8~`QFv92mE*Fd?8CWlz))S>$Ofdhc z!&H(}$P}j(M#?(j3}y+!S!wsD4n3XKf*DqH=goBj=P*LYd3=89xFE?$c39|SE@J$t z;*ul=H{?=C1kqBrZ6$60#t&<(=+4{d1Z*)v$7Ote>9CVzBtNd`Wb85i)ZrkB!3}+# z8;-hNCu#RLZa8B_)SY+HNw{K!j;r|m(&2`g&W>w38F!37bzGOkbbc7X4-eh0r?mSU zH@vW-JAOka;EfR~eDL|HLIgyFo05>^hp$fP7RH}CZcB1HKTbi1pKjM*+Wn0m0a(!; z57Y?+VJub_XQr3~xJ8A6Mwy+Sm0w(>Q3^#uFgDYKI~d)+3L$#3?yB3)Vko8xJ=O2i zguBv)>Thx-MC3~G%$1B$0H=4xG4AMS*5PpiZmIS`O0)n;m7w`MZyEfs&Z9% z3R#^HA^lc22$A?E2oEv-Ou?CY;nAkj&sfhcN|Gq77ov3%F&H(%V~m-_ncn{cX&L|n diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index a87af416cf..f50b6fda0b 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -30,8 +30,7 @@ def setUp(self): self.eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, passes=passes, num_models=num_models, random_state=random_state) - 
self.eLDA.save("asdf") - + self.eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, passes=passes, num_models=num_models, random_state=random_state, memory_friendly_ttda=False) From 70a06602a7b51ff05802dd21f9658a196b55770d Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 15 Sep 2019 13:31:19 +0200 Subject: [PATCH 064/166] tox whitespace --- gensim/test/test_ensemblelda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index f50b6fda0b..84bc8340d5 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -30,7 +30,7 @@ def setUp(self): self.eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, passes=passes, num_models=num_models, random_state=random_state) - + self.eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, passes=passes, num_models=num_models, random_state=random_state, memory_friendly_ttda=False) From c8b23ab3f5d578e571ab948e501f6eb71af250f6 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 15 Sep 2019 13:46:04 +0200 Subject: [PATCH 065/166] cleaned obsolete stuff from cbdbscan --- gensim/models/ensemblelda.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index dfc90148b7..64b5a25c17 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1237,7 +1237,6 @@ def scan_topic(topic_index, current_label=None): # next_label is initialized with 0 in fit() for the first cluster current_label = self.next_label self.next_label += 1 - cluster_topic_indices = [] else: # In case the core has a parent, check the distance to the existing cluster (since the matrix is @@ -1253,11 +1252,12 @@ def scan_topic(topic_index, current_label=None): # start new cluster by changing current_label current_label = self.next_label self.next_label += 1 - cluster_topic_indices = [] - # TODO changed in order to make the parent_id parameter obsoloete because (i think) it (the appending) - # can easily done before calling scan_topic as well. - cluster_topic_indices.append(topic_index) + # TODO parent_neighbors and parent_id got obsolete. 1. the appending of parent_id can be done + # before calling scan_topic and 2. using parent_neighbors the way it was used was not conform + # with the algorithm description. scan_topic gets the list of topic_indices of the current cluster + # out of results now. 
+ results[topic_index]["label"] = current_label for neighboring_topic_index in neighboring_topic_indices: From 6f98624b598833676bd9afc7286bff0ef521ece4 Mon Sep 17 00:00:00 2001 From: minze Date: Mon, 28 Oct 2019 00:29:05 +0100 Subject: [PATCH 066/166] idk --- gensim/models/ensemblelda.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 64b5a25c17..20ba1753c3 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1195,7 +1195,7 @@ def fit(self, amatrix): np.fill_diagonal(amatrix_copy, 1) min_distance_per_topic = [(distance, index) for index, distance in enumerate(amatrix_copy.min(axis=1))] - min_distance_per_topic_sorted = sorted(min_distance_per_topic, key=lambda x: x) + min_distance_per_topic_sorted = sorted(min_distance_per_topic, key=lambda x: x[0]) ordered_min_similarity = [index for distance, index in min_distance_per_topic_sorted] def scan_topic(topic_index, current_label=None): @@ -1212,11 +1212,9 @@ def scan_topic(topic_index, current_label=None): The label of the cluster that might be suitable for topic_index """ - neighboring_topic_indices = [ - candidate_topic_index - for candidate_topic_index, is_neighbour in enumerate(amatrix_copy[topic_index] < self.eps) - if is_neighbour - ] + neighbors_sorted = sorted([(distance, index) for index, distance in enumerate(amatrix_copy[topic_index])], key=lambda x: x[0]) + neighboring_topic_indices = [index for distance, index in neighbors_sorted if distance < self.eps] + num_neighboring_topics = len(neighboring_topic_indices) # derive a list of all topic_indices that belong to the cluster with the current_label and that are cores From 799c11284a75d170d4fc29f27a92cca3c1c7bb72 Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Mon, 28 Oct 2019 22:42:41 +0100 Subject: [PATCH 067/166] updated doc-strings to be clearer and better reflect the truth --- gensim/models/ensemblelda.py | 84 ++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 37 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 20ba1753c3..a4ce3b4aa2 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -974,16 +974,18 @@ def _contains_isolated_cores(self, label, cluster, min_cores): def _generate_stable_topics(self, min_cores=None): """Generate stable topics out of the clusters. - This function is the last step that has to be done in the ensemble.Stable topics can be retrieved afterwards - using get_topics(). + The function finds clusters of topics using a variant of DBScan. If a cluster has enough core topics + (c.f. parameter min_cores), then this cluster represents a stable topic. The stable topic is specifically + calculated as the average over all topic-term distributions of the core topics in the cluster. + + This function is the last step that has to be done in the ensemble. After this step is complete, + Stable topics can be retrieved afterwards using the get_topics() method. Parameters ---------- min_cores : int - how many cores a cluster has to have, to be treated as stable topic. That means, how many topics - that look similar have to be present, so that the average topic in those is used as stable topic. - - defaults to min_cores = min(3, max(1, int(self.num_models /4 +1))) + Minimum number of core topics needed to form a cluster that represents a stable topic. 
+ Using None defaults to min_cores = min(3, max(1, int(self.num_models /4 +1))) """ # min_cores being 0 makes no sense. there has to be a core for a cluster @@ -1055,7 +1057,7 @@ def _generate_stable_topics(self, min_cores=None): self.stable_topics = stable_topics def recluster(self, eps=0.1, min_samples=None, min_cores=None): - """Quickly change the results of the ensemble by reapplying clustering and stable topic generation. + """Reapply CBDBSCAN clustering and stable topic generation. Stable topics can be retrieved using get_topics(). @@ -1138,24 +1140,32 @@ def id2word(self): class CBDBSCAN(): """A Variation of the DBSCAN algorithm called Checkback DBSCAN (CBDBSCAN). - The algorithm works based on - DBSCAN-like parameters `eps` and `min_samples` that respectively define how far a "nearby" point is, and the - minimum number of nearby points needed to label a candidate datapoint a core of a cluster. - (See https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html). + The algorithm works based on DBSCAN-like parameters `eps` and `min_samples` that respectively define how far a + "nearby" point is, and the minimum number of nearby points needed to label a candidate datapoint a core of a + cluster. (See https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html). The algorithm works as follows: - 1. (A)symmetric distance matrix provided at fit-time. - For the sake of example below, assume the there are only three topics (amatrix contains distances with dim 3x3), - T_1, T_2, and T_3: - 2. Start by scanning a candidate topic - (e.g. T_1) - 3. Check which topics are nearby using `self.eps` as a threshold and call them neighbours - (e.g. assume T_2, and T_3 are nearby and become neighbours) + 1. (A)symmetric distance matrix provided at fit-time (called `amatrix`). + For the sake of example below, assume the there are only five topics (amatrix contains distances with dim 5x5), + T_1, T_2, T_3, T_4, T_5: + 2. Start by scanning a candidate topic with respect to a parent topic + (e.g. T_1 with respect to parent None) + 3. Check which topics are nearby the candidate topic using `self.eps` as a threshold and call them neighbours + (e.g. assume T_3, T_4, and T_5 are nearby and become neighbours) 4. If there are more neighbours than `self.min_samples`, the candidate topic becomes a core candidate for a cluster - (e.g. if `min_samples`<=2 then T_1 becomes the first core of a cluster) - 5. CheckBack (CB) Step: Check if at least 25% of other cluster cores are nearby to core candidate - (e.g. at this stage in the example, the cluster is empty, so checkback passes) - 6. Recursively scan the next nearby topic (e.g. scan T_2) - repeat step 2.-6. + (e.g. if `min_samples`=1, then T_1 becomes the first core of a cluster) + 5. If candidate is a core, CheckBack (CB) to find the fraction of neighbours that are either the parent or the + parent's neighbours. If this fraction is more than 75%, give the candidate the same label as its parent. + (e.g. in the trivial case there is no parent (or neighbours of that parent), a new incremental label is given) + 6. If candidate is a core, recursively scan the next nearby topic (e.g. scan T_3) labeling the previous topic as + the parent and the previous neighbours as the parent_neighbours - repeat steps + 2.-6. + 2. (e.g. Scan candidate T_3 with respect to parent T_1 that has parent_neighbours T_3, T_4, and T_5) + 3. (e.g. T5 is the only neighbour) + 4. (e.g. number of neighbours is 1, therefore candidate T_3 becomes a core) + 5. (e.g. 
CheckBack finds that two of the four parent and parent neighbours are neighbours of candidate T_3. + Therefore the candidate T_3 does NOT get the same label as its parent T_1) + 6. (e.g. Scan candidate T_5 with respect to parent T_3 that has parent_neighbours T_5) The CB step has the effect that it enforces cluster compactness and allows the model to avoid creating clusters for unreliable topics made of a composition of multiple reliable topics (something that occurs often LDA models that is @@ -1182,7 +1192,7 @@ def fit(self, amatrix): """Apply the algorithm to an asymmetric distance matrix.""" self.next_label = 0 - results = [{ + topic_clustering_results = [{ "is_core": False, "neighboring_labels": set(), "neighboring_topic_indices": set(), @@ -1195,13 +1205,13 @@ def fit(self, amatrix): np.fill_diagonal(amatrix_copy, 1) min_distance_per_topic = [(distance, index) for index, distance in enumerate(amatrix_copy.min(axis=1))] - min_distance_per_topic_sorted = sorted(min_distance_per_topic, key=lambda x: x[0]) + min_distance_per_topic_sorted = sorted(min_distance_per_topic, key=lambda distance: distance[0]) ordered_min_similarity = [index for distance, index in min_distance_per_topic_sorted] def scan_topic(topic_index, current_label=None): """Extend the cluster in one direction. - Results are accumulated in result, a variable outside of this function. + Results are accumulated in topic_clustering_results, a variable outside of this function. Parameters ---------- @@ -1219,7 +1229,7 @@ def scan_topic(topic_index, current_label=None): # derive a list of all topic_indices that belong to the cluster with the current_label and that are cores cluster_topic_indices = [ - index for index, topic in enumerate(results) + index for index, topic in enumerate(topic_clustering_results) if topic["label"] == current_label and topic["label"] is not None and topic["is_core"] ] @@ -1227,7 +1237,7 @@ def scan_topic(topic_index, current_label=None): # This also takes neighbor indices that already are identified as core in count. if num_neighboring_topics >= self.min_samples: # This topic is a core! - results[topic_index]["is_core"] = True + topic_clustering_results[topic_index]["is_core"] = True # if current_label is none, then this is the first core # of a new cluster (hence next_label is used) @@ -1252,31 +1262,31 @@ def scan_topic(topic_index, current_label=None): self.next_label += 1 # TODO parent_neighbors and parent_id got obsolete. 1. the appending of parent_id can be done - # before calling scan_topic and 2. using parent_neighbors the way it was used was not conform - # with the algorithm description. scan_topic gets the list of topic_indices of the current cluster - # out of results now. + # before calling scan_topic and 2. using parent_neighbors the way it was used did not conform + # with the algorithm description. scan_topic gets the list of topic_indices of the current cluster + # out of topic_clustering_results now. 
- results[topic_index]["label"] = current_label + topic_clustering_results[topic_index]["label"] = current_label for neighboring_topic_index in neighboring_topic_indices: - if results[neighboring_topic_index]["label"] is None: + if topic_clustering_results[neighboring_topic_index]["label"] is None: ordered_min_similarity.remove(neighboring_topic_index) # try to extend the cluster into the direction of the neighbor scan_topic(neighboring_topic_index, current_label) - results[neighboring_topic_index]["neighboring_topic_indices"].add(topic_index) - results[neighboring_topic_index]["neighboring_labels"].add(current_label) + topic_clustering_results[neighboring_topic_index]["neighboring_topic_indices"].add(topic_index) + topic_clustering_results[neighboring_topic_index]["neighboring_labels"].add(current_label) else: # this topic is not a core! if current_label is None: - results[topic_index]["label"] = -1 + topic_clustering_results[topic_index]["label"] = -1 else: - results[topic_index]["label"] = current_label + topic_clustering_results[topic_index]["label"] = current_label # elements are going to be removed from that array in scan_topic, do until it is empty while len(ordered_min_similarity) != 0: next_topic_index = ordered_min_similarity.pop(0) scan_topic(next_topic_index) - self.results = results + self.results = topic_clustering_results From 9866ef6a8f38b0941fde1baa33722f6134b91fc1 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 9 Nov 2019 21:29:04 +0900 Subject: [PATCH 068/166] make flake8 happy --- gensim/models/ensemblelda.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index a4ce3b4aa2..d8903a2c62 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1222,7 +1222,13 @@ def scan_topic(topic_index, current_label=None): The label of the cluster that might be suitable for topic_index """ - neighbors_sorted = sorted([(distance, index) for index, distance in enumerate(amatrix_copy[topic_index])], key=lambda x: x[0]) + neighbors_sorted = sorted( + [ + (distance, index) + for index, distance in enumerate(amatrix_copy[topic_index]) + ], + key=lambda x: x[0], + ) neighboring_topic_indices = [index for distance, index in neighbors_sorted if distance < self.eps] num_neighboring_topics = len(neighboring_topic_indices) From ad97ead41fd999b280ea92ae008f398050d3a279 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 9 Nov 2019 21:58:42 +0900 Subject: [PATCH 069/166] fix trailing whitespace --- gensim/models/ensemblelda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index d8903a2c62..7d63f7fefb 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1224,9 +1224,9 @@ def scan_topic(topic_index, current_label=None): """ neighbors_sorted = sorted( [ - (distance, index) + (distance, index) for index, distance in enumerate(amatrix_copy[topic_index]) - ], + ], key=lambda x: x[0], ) neighboring_topic_indices = [index for distance, index in neighbors_sorted if distance < self.eps] From 765b912352250e8539c4500b78811e003490be11 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 11 Nov 2019 01:11:40 +0100 Subject: [PATCH 070/166] reverted some changes --- gensim/models/ensemblelda.py | 128 +++++++++++++++-------------------- 1 file changed, 53 insertions(+), 75 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 7d63f7fefb..ba926c9a5a 
100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -790,58 +790,61 @@ def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, s Asymmetric distance matrix of size len(ttda1) by len(ttda2). """ - if method not in ["mass", "rank"]: - raise ValueError("method {} unknown".format(method)) + # initialize the distance matrix. ndarray is faster than zeros + distances = np.ndarray((len(ttda1), len(ttda2))) - # select masking method: - def mass_masking(a): - """Original masking method. Returns a new binary mask.""" - sorted_a = np.sort(a)[::-1] - largest_mass = sorted_a.cumsum() < threshold - smallest_valid = sorted_a[largest_mass][-1] - return a >= smallest_valid + if ttda1.shape[0] > 0 and ttda2.shape[0] > 0: + # the worker might not have received a ttda because it was chunked up too much + + if method not in ["mass", "rank"]: + raise ValueError("method {} unknown".format(method)) - def rank_masking(a): - """Faster masking method. Returns a new binary mask.""" - return a > np.sort(a)[::-1][int(len(a) * threshold)] + # select masking method: + def mass_masking(a): + """Original masking method. Returns a new binary mask.""" + sorted_a = np.sort(a)[::-1] + largest_mass = sorted_a.cumsum() < threshold + smallest_valid = sorted_a[largest_mass][-1] + return a >= smallest_valid - create_mask = {"mass": mass_masking, "rank": rank_masking}[method] + def rank_masking(a): + """Faster masking method. Returns a new binary mask.""" + return a > np.sort(a)[::-1][int(len(a) * threshold)] - # some help to find a better threshold by useful log messages - avg_mask_size = 0 + create_mask = {"mass": mass_masking, "rank": rank_masking}[method] - # initialize the distance matrix. ndarray is faster than zeros - distances = np.ndarray((len(ttda1), len(ttda2))) + # some help to find a better threshold by useful log messages + avg_mask_size = 0 - # now iterate over each topic - for ttd1_idx, ttd1 in enumerate(ttda1): - # create mask from ttd1 that removes noise from a and keeps the largest terms - mask = create_mask(ttd1) - ttd1_masked = ttd1[mask] + # now iterate over each topic + for ttd1_idx, ttd1 in enumerate(ttda1): + # create mask from ttd1 that removes noise from a and keeps the largest terms + mask = create_mask(ttd1) + ttd1_masked = ttd1[mask] - avg_mask_size += mask.sum() + avg_mask_size += mask.sum() - # now look at every possible pair for topic a: - for ttd2_idx, ttd2 in enumerate(ttda2): - # distance to itself is 0 - if ttd1_idx + start_index == ttd2_idx: - distances[ttd1_idx][ttd2_idx] = 0 - continue + # now look at every possible pair for topic a: + for ttd2_idx, ttd2 in enumerate(ttda2): + # distance to itself is 0 + if ttd1_idx + start_index == ttd2_idx: + distances[ttd1_idx][ttd2_idx] = 0 + continue - # now mask b based on a, which will force the shape of a onto b - ttd2_masked = ttd2[mask] + # now mask b based on a, which will force the shape of a onto b + ttd2_masked = ttd2[mask] - # Smart distance calculation avoids calculating cosine distance for highly masked topic-term - # distributions that will have distance values near 1. - if ttd2_masked.sum() <= self._COSINE_DISTANCE_CALCULATION_THRESHOLD: - distance = 1 - else: - distance = cosine(ttd1_masked, ttd2_masked) + # Smart distance calculation avoids calculating cosine distance for highly masked topic-term + # distributions that will have distance values near 1. 
+ if ttd2_masked.sum() <= self._COSINE_DISTANCE_CALCULATION_THRESHOLD: + distance = 1 + else: + distance = cosine(ttd1_masked, ttd2_masked) - distances[ttd1_idx][ttd2_idx] = distance + distances[ttd1_idx][ttd2_idx] = distance - percent = round(100 * avg_mask_size / ttda1.shape[0] / ttda1.shape[1], 1) - logger.info('the given threshold of {} covered on average {}% of tokens'.format(threshold, percent)) + percent = round(100 * avg_mask_size / ttda1.shape[0] / ttda1.shape[1], 1) + logger.info('the given threshold of {} covered on average {}% of tokens'.format(threshold, percent)) return distances @@ -941,8 +944,6 @@ def _aggregate_topics(self, grouped_by_labels): neighboring_labels = [x for x in neighboring_labels if len(x) > 0] - # TODO why is len(neighboring_labels) equal to num_cores? - sorted_clusters.append({ "max_num_neighboring_labels": max_num_neighboring_labels, "neighboring_labels": neighboring_labels, @@ -954,21 +955,19 @@ def _aggregate_topics(self, grouped_by_labels): key=lambda cluster: ( cluster["max_num_neighboring_labels"], cluster["label"] - ), reverse=True) + ), reverse=False) return sorted_clusters def _remove_from_all_sets(self, label, clusters): """Remove a label from every set in "neighboring_labels" for each core in clusters.""" - for a in clusters: - if label in a["neighboring_labels"]: - a["neighboring_labels"].remove(label) + for cluster in clusters: + for neighboring_labels_set in cluster["neighboring_labels"]: + if label in neighboring_labels_set: + neighboring_labels_set.remove(label) def _contains_isolated_cores(self, label, cluster, min_cores): """Check if the cluster has at least min_cores of cores that belong to no other cluster.""" - # TODO is supposed to check for cores, but checks neighboring_labels. - # Funnily, it actually seems to be cores (see output of next line) but it doesn't read that way - # print(len(cluster["neighboring_labels"]), cluster["num_cores"]) return sum(map(lambda x: x == {label}, cluster["neighboring_labels"])) >= min_cores def _generate_stable_topics(self, min_cores=None): @@ -1009,17 +1008,10 @@ def _generate_stable_topics(self, min_cores=None): for i, cluster in enumerate(sorted_clusters): cluster["is_valid"] = None - # TODO removed first rule because when amount_parent_labels is 0, then there is no - # topic in the cluster than has a parent, hence the size is either 0 or 1, - # which can be covered by the next rule. - if cluster["num_cores"] < min_cores: cluster["is_valid"] = False self._remove_from_all_sets(cluster["label"], sorted_clusters) - # TODO I don't think it makes a difference if _contains_isolated_cores (formerly easy valid) is done (here - # and afterwards) or (only afterwards) - # now that invalid clusters are removed, check which clusters contain enough cores that don't belong to any # other cluster. for cluster in [cluster for cluster in sorted_clusters if cluster["is_valid"] is None]: @@ -1208,7 +1200,7 @@ def fit(self, amatrix): min_distance_per_topic_sorted = sorted(min_distance_per_topic, key=lambda distance: distance[0]) ordered_min_similarity = [index for distance, index in min_distance_per_topic_sorted] - def scan_topic(topic_index, current_label=None): + def scan_topic(topic_index, current_label=None, parent_neighbors=None): """Extend the cluster in one direction. Results are accumulated in topic_clustering_results, a variable outside of this function. 
@@ -1233,12 +1225,6 @@ def scan_topic(topic_index, current_label=None): num_neighboring_topics = len(neighboring_topic_indices) - # derive a list of all topic_indices that belong to the cluster with the current_label and that are cores - cluster_topic_indices = [ - index for index, topic in enumerate(topic_clustering_results) - if topic["label"] == current_label and topic["label"] is not None and topic["is_core"] - ] - # If the number of neighbor indices of a topic is large enough, it is considered a core. # This also takes neighbor indices that already are identified as core in count. if num_neighboring_topics >= self.min_samples: @@ -1253,32 +1239,24 @@ def scan_topic(topic_index, current_label=None): self.next_label += 1 else: - # In case the core has a parent, check the distance to the existing cluster (since the matrix is + # In case the core has a parent, check the distance to the parents neighbors (since the matrix is # asymmetric, it takes return distances into account here) # If less than 25% of the elements are close enough, then create a new cluster rather than further # growing the current cluster in that direction. - close_members_mask = amatrix_copy[topic_index][cluster_topic_indices] < self.eps + close_parent_neighbors_mask = amatrix_copy[topic_index][parent_neighbors] < self.eps - # TODO changed because it is supposed to check the distance to the existing cluster, not - # to the neighbors of the parent - - if close_members_mask.mean() < 0.25: + if close_parent_neighbors_mask.mean() < 0.25: # start new cluster by changing current_label current_label = self.next_label self.next_label += 1 - # TODO parent_neighbors and parent_id got obsolete. 1. the appending of parent_id can be done - # before calling scan_topic and 2. using parent_neighbors the way it was used did not conform - # with the algorithm description. scan_topic gets the list of topic_indices of the current cluster - # out of topic_clustering_results now. - topic_clustering_results[topic_index]["label"] = current_label for neighboring_topic_index in neighboring_topic_indices: if topic_clustering_results[neighboring_topic_index]["label"] is None: ordered_min_similarity.remove(neighboring_topic_index) # try to extend the cluster into the direction of the neighbor - scan_topic(neighboring_topic_index, current_label) + scan_topic(neighboring_topic_index, current_label, neighboring_topic_indices + [topic_index]) topic_clustering_results[neighboring_topic_index]["neighboring_topic_indices"].add(topic_index) topic_clustering_results[neighboring_topic_index]["neighboring_labels"].add(current_label) From 32ff401c60321b31ca17e59d047b1243f3a7b965 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 11 Nov 2019 01:23:34 +0100 Subject: [PATCH 071/166] comma, newline, comment --- gensim/models/ensemblelda.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index ba926c9a5a..ae223f25ad 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -102,6 +102,7 @@ from multiprocessing import Process, Pipe, ProcessError import numpy as np from scipy.spatial.distance import cosine + from gensim import utils from gensim.models import ldamodel, ldamulticore, basemodel @@ -610,7 +611,7 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers): p.terminate() def _generate_topic_models(self, num_models, random_states=None, pipe=None): - """Train the topic models, that form the ensemble. 
+ """Train the topic models that form the ensemble. Parameters ---------- @@ -1005,9 +1006,8 @@ def _generate_stable_topics(self, min_cores=None): sorted_clusters = self._aggregate_topics(grouped_by_labels) - for i, cluster in enumerate(sorted_clusters): + for cluster in sorted_clusters: cluster["is_valid"] = None - if cluster["num_cores"] < min_cores: cluster["is_valid"] = False self._remove_from_all_sets(cluster["label"], sorted_clusters) @@ -1021,7 +1021,7 @@ def _generate_stable_topics(self, min_cores=None): else: cluster["is_valid"] = False # This modifies parent labels which is important in _contains_isolated_cores, so the result depends on - # where to start. That's why sorted_clusters is sorted, so it starts with the most cluttered cluster. + # where to start. self._remove_from_all_sets(label, sorted_clusters) # list of all the label numbers that are valid From b00e010939b6e8f119831b9f60157303a9777db9 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 11 Nov 2019 01:49:08 +0100 Subject: [PATCH 072/166] whitespace --- gensim/models/ensemblelda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index ae223f25ad..305a08437f 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -796,7 +796,7 @@ def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, s if ttda1.shape[0] > 0 and ttda2.shape[0] > 0: # the worker might not have received a ttda because it was chunked up too much - + if method not in ["mass", "rank"]: raise ValueError("method {} unknown".format(method)) From c58c30e863289ea827aad2ffba15671b6a3519a9 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 11 Nov 2019 19:57:01 +0100 Subject: [PATCH 073/166] citation, reference, authors --- gensim/models/ensemblelda.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 305a08437f..38c9d14916 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1,8 +1,8 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Authors: Tobias Brigl , Alex Loosley , -# Stephan Sahm , Alex Salles , Data Reply Munich +# Authors: Tobias Brigl , Alex Salles , +# Alex Loosley , Data Reply Munich """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. @@ -81,14 +81,15 @@ Available from: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.695.4595 .. [2] BRIGL, Tobias, 2019, Extracting Reliable Topics using Ensemble Latent Dirichlet Allocation [Bachelor Thesis]. - Ingolstadt: Technische Hochschule. Available from: https://www.hip70890b.de/machine_learning/ensemble_LDA/ + Technische Hochschule Ingolstadt. Munich: Data Reply GmbH. Available from: + https://www.hip70890b.de/machine_learning/ensemble_LDA/ Citation -------- At the moment, there is no paper associated to ensemble LDA but we are working on publicly releasing Tobi Brigl's bachelor thesis on the topic. In the meantime, please include a mention of us (Brigl, T.; Salles, A.; Loosley, A) and -a link to this file (https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/ensemblelda.py) if you use -this work in a published or open-source project. +a link to this file (https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/ensemblelda.py) if this work +is presented, further developed, or used as the basis for a published result. 
Other Notes ----------- From 637640c816c13748c7f32451ebe02a5eed86cb75 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 11 Nov 2019 21:40:08 +0100 Subject: [PATCH 074/166] potential fix for utils saveload when a class is in __dict__ --- .../{Opinosis.ipynb => ensemble_lda_with_opinosis.ipynb} | 2 +- gensim/utils.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) rename docs/notebooks/{Opinosis.ipynb => ensemble_lda_with_opinosis.ipynb} (99%) diff --git a/docs/notebooks/Opinosis.ipynb b/docs/notebooks/ensemble_lda_with_opinosis.ipynb similarity index 99% rename from docs/notebooks/Opinosis.ipynb rename to docs/notebooks/ensemble_lda_with_opinosis.ipynb index a828b2adf4..72ad2a26ba 100644 --- a/docs/notebooks/Opinosis.ipynb +++ b/docs/notebooks/ensemble_lda_with_opinosis.ipynb @@ -168,7 +168,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/gensim/utils.py b/gensim/utils.py index cf7eca6c90..3831c9a8ca 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -18,7 +18,9 @@ from htmlentitydefs import name2codepoint as n2cp try: import cPickle as _pickle + print('import cpickle') except ImportError: + print('import pickle') import pickle as _pickle import re @@ -542,6 +544,7 @@ def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=fro logger.info("saving %s object under %s, separately %s", self.__class__.__name__, fname, separately) compress, subname = SaveLoad._adapt_by_suffix(fname) + print(compress, subname) restores = self._save_specials(fname, separately, sep_limit, ignore, pickle_protocol, compress, subname) @@ -603,9 +606,10 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, recursive_saveloads = [] restores = [] for attrib, val in iteritems(self.__dict__): - if hasattr(val, '_save_specials'): # better than 'isinstance(val, SaveLoad)' if IPython reloading + if hasattr(val, '_save_specials') and type(val) != type: # better than 'isinstance(val, SaveLoad)' if IPython reloading recursive_saveloads.append(attrib) cfname = '.'.join((fname, attrib)) + print(val, compress, subname) restores.extend(val._save_specials(cfname, None, sep_limit, ignore, pickle_protocol, compress, subname)) try: @@ -688,10 +692,12 @@ def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore= Load object from file. 
""" + print(fname_or_handle, type(fname_or_handle), separately, sep_limit, ignore, pickle_protocol) try: _pickle.dump(self, fname_or_handle, protocol=pickle_protocol) logger.info("saved %s object", self.__class__.__name__) except TypeError: # `fname_or_handle` does not have write attribute + print("failed") self._smart_save(fname_or_handle, separately, sep_limit, ignore, pickle_protocol=pickle_protocol) From d7efe3a12fab4e7a02eb6141ddc8ae74d97a4df8 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 11 Nov 2019 22:01:28 +0100 Subject: [PATCH 075/166] commented out eLDA tests, tox8 --- gensim/models/ensemblelda.py | 38 +- gensim/test/test_ensemblelda.py | 696 ++++++++++++++++---------------- gensim/utils.py | 3 +- 3 files changed, 354 insertions(+), 383 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 38c9d14916..d9fa3c9a1b 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -101,16 +101,18 @@ import logging import os from multiprocessing import Process, Pipe, ProcessError + import numpy as np from scipy.spatial.distance import cosine from gensim import utils from gensim.models import ldamodel, ldamulticore, basemodel +from gensim.utils import SaveLoad logger = logging.getLogger(__name__) -class EnsembleLda(): +class EnsembleLda(SaveLoad): """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. Extracts reliable topics that are consistently learned accross multiple LDA models. eLDA has the added benefit that @@ -182,6 +184,7 @@ def __init__(self, topic_model_class="lda", num_models=3, Parameters for each gensim model (e.g. :py:class:`gensim.models.LdaModel`) in the ensemble. """ + # INTERNAL PARAMETERS # Set random state # nps max random state of 2**32 - 1 is too large for windows: @@ -488,39 +491,6 @@ def add_model(self, target, num_new_models=None): # tell recluster that the distance matrix needs to be regenerated self.asymmetric_distance_matrix_outdated = True - def save(self, fname): - """Save the ensemble to a file. - - Parameters - ---------- - fname : str - Path to the system file where the model will be persisted. - - """ - logger.info("saving %s object to %s", self.__class__.__name__, fname) - - utils.pickle(self, fname) - - @staticmethod - def load(fname): - """Load a previously stored ensemble from disk. - - Parameters - ---------- - fname : str - Path to file that contains the needed object. - - Returns - ------- - A previously saved ensembleLda object - - """ - logger.info("loading %s object from %s", EnsembleLda.__name__, fname) - - eLDA = utils.unpickle(fname) - - return eLDA - def _generate_topic_models_multiproc(self, num_models, ensemble_workers): """Generate the topic models to form the ensemble in a multiprocessed way. 
diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 84bc8340d5..540410e5a0 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -1,348 +1,348 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Author: Tobias B - -""" -Automated tests for checking the EnsembleLda Class -""" - - -import logging -import unittest - -import numpy as np -from copy import deepcopy - -from gensim.models import EnsembleLda, LdaMulticore -from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary - -num_topics = 2 -num_models = 4 -passes = 50 - - -class TestModel(unittest.TestCase): - def setUp(self): - # same configuration for each model to make sure - # the topics are equal - random_state = 0 - - self.eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - passes=passes, num_models=num_models, random_state=random_state) - - self.eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - passes=passes, num_models=num_models, random_state=random_state, - memory_friendly_ttda=False) - - def check_ttda(self, ensemble): - """tests the integrity of the ttda of any ensemble""" - self.assertGreater(len(ensemble.ttda), 0) - a = ensemble.ttda.sum(axis=1) - b = np.ones(len(ensemble.ttda)).astype(np.float32) - np.testing.assert_allclose(a, b, rtol=1e-04) - - def test_eLDA(self): - # given that the random_state doesn't change, it should - # always be 2 detected topics in this setup. - self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) - self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) - self.check_ttda(self.eLDA) - - # reclustering shouldn't change anything without - # added models or different parameters - self.eLDA.recluster() - - self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) - self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) - self.check_ttda(self.eLDA) - - # compare with a pre-trained reference model - reference = EnsembleLda.load(datapath('ensemblelda')) - np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=1e-05) - # small values in the distance matrix tend to vary quite a bit around 2%, - # so use some absolute tolerance measurement to check if the matrix is at least - # close to the target. - atol = reference.asymmetric_distance_matrix.max() * 1e-05 - np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, - reference.asymmetric_distance_matrix, atol=atol) - - def test_clustering(self): - # the following test is quite specific to the current implementation and not part of any api, - # but it makes improving those sections of the code easier as long as sorted_clusters and the - # cluster_model results are supposed to stay the same. Potentially this test will deprecate. - - reference = EnsembleLda.load(datapath('ensemblelda')) - cluster_model_results = deepcopy(reference.cluster_model.results) - sorted_clusters = deepcopy(reference.sorted_clusters) - stable_topics = deepcopy(reference.get_topics()) - - # continue training with the distance matrix of the pretrained reference and see if - # the generated clusters match. 
- reference.asymmetric_distance_matrix_outdated = True - reference.recluster() - - self.assert_cluster_results_equal(reference.cluster_model.results, cluster_model_results) - self.assertEqual(reference.sorted_clusters, sorted_clusters) - np.testing.assert_allclose(reference.get_topics(), stable_topics, rtol=1e-05) - - def test_not_trained(self): - # should not throw errors and no training should happen - - # 0 passes - eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - passes=0, num_models=num_models, random_state=0) - self.assertEqual(len(eLDA.ttda), 0) - - # no corpus - eLDA = EnsembleLda(id2word=common_dictionary, num_topics=num_topics, - passes=passes, num_models=num_models, random_state=0) - self.assertEqual(len(eLDA.ttda), 0) - - # 0 iterations - eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - iterations=0, num_models=num_models, random_state=0) - self.assertEqual(len(eLDA.ttda), 0) - - # 0 models - eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - passes=passes, num_models=0, random_state=0) - self.assertEqual(len(eLDA.ttda), 0) - - def test_memory_unfriendly(self): - # at this point, self.eLDA_mu and self.eLDA are already trained - # in the setUpClass function - # both should be 100% similar (but floats cannot - # be compared that easily, so check for threshold) - self.assertEqual(len(self.eLDA_mu.tms), num_models) - np.testing.assert_allclose(self.eLDA.ttda, self.eLDA_mu.ttda, rtol=1e-05) - np.testing.assert_allclose(self.eLDA.get_topics(), self.eLDA_mu.get_topics(), rtol=1e-05) - self.check_ttda(self.eLDA_mu) - - def test_generate_gensim_rep(self): - gensimModel = self.eLDA.generate_gensim_representation() - topics = gensimModel.get_topics() - np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) - - def assert_cluster_results_equal(self, a, b): - """compares important attributes of the cluster results""" - np.testing.assert_array_equal([row["label"] for row in a], - [row["label"] for row in b]) - np.testing.assert_array_equal([row["is_core"] for row in a], - [row["is_core"] for row in b]) - - def test_persisting(self): - fname = get_tmpfile('gensim_models_ensemblelda') - self.eLDA.save(fname) - loaded_eLDA = EnsembleLda.load(fname) - # storing the ensemble without memory_friendy_ttda - self.eLDA_mu.save(fname) - loaded_eLDA_mu = EnsembleLda.load(fname) - - # was it stored and loaded correctly? 
- # memory friendly - loaded_eLDA_representation = loaded_eLDA.generate_gensim_representation() - topics = loaded_eLDA_representation.get_topics() - ttda = loaded_eLDA.ttda - amatrix = loaded_eLDA.asymmetric_distance_matrix - np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) - np.testing.assert_allclose(self.eLDA.ttda, ttda, rtol=1e-05) - np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, amatrix, rtol=1e-05) - - a = self.eLDA.cluster_model.results - b = loaded_eLDA.cluster_model.results - - self.assert_cluster_results_equal(a, b) - - # memory unfriendly - loaded_eLDA_mu_representation = loaded_eLDA_mu.generate_gensim_representation() - topics = loaded_eLDA_mu_representation.get_topics() - np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) - - def test_multiprocessing(self): - # same configuration - random_state = 0 - - # use 3 processes for the ensemble and the distance, - # so that the 4 models and 8 topics cannot be distributed - # to each worker evenly - workers = 3 - - # memory friendly. contains List of topic word distributions - eLDA_multi = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, - num_topics=num_topics, passes=passes, num_models=num_models, - random_state=random_state, ensemble_workers=workers, distance_workers=workers) - - # memory unfriendly. contains List of models - eLDA_multi_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, - num_topics=num_topics, passes=passes, num_models=num_models, - random_state=random_state, ensemble_workers=workers, distance_workers=workers, - memory_friendly_ttda=False) - - np.testing.assert_allclose(self.eLDA.get_topics(), eLDA_multi.get_topics(), rtol=1e-05) - np.testing.assert_allclose(self.eLDA_mu.get_topics(), eLDA_multi_mu.get_topics(), rtol=1e-05) - - def test_add_models(self): - # same configuration - num_models = self.eLDA.num_models - - # make sure countings and sizes after adding are correct - # create new models and add other models to them. - - # there are a ton of configurations for the first parameter possible, - # try them all - - # quickly train something that can be used for counting results - num_new_models = 3 - num_new_topics = 3 - - # 1. 
memory friendly - eLDA_base = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, - num_topics=num_new_topics, passes=1, num_models=num_new_models, - iterations=1, random_state=0, topic_model_class=LdaMulticore, - workers=3, ensemble_workers=2) - - # 1.1 ttda - a = len(eLDA_base.ttda) - b = eLDA_base.num_models - eLDA_base.add_model(self.eLDA.ttda) - self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda)) - self.assertEqual(eLDA_base.num_models, b + 1) # defaults to 1 for one ttda matrix - - # 1.2 an ensemble - a = len(eLDA_base.ttda) - b = eLDA_base.num_models - eLDA_base.add_model(self.eLDA, 5) - self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda)) - self.assertEqual(eLDA_base.num_models, b + 5) - - # 1.3 a list of ensembles - a = len(eLDA_base.ttda) - b = eLDA_base.num_models - # it should be totally legit to add a memory unfriendly object to a memory friendly one - eLDA_base.add_model([self.eLDA, self.eLDA_mu]) - self.assertEqual(len(eLDA_base.ttda), a + 2 * len(self.eLDA.ttda)) - self.assertEqual(eLDA_base.num_models, b + 2 * num_models) - - # 1.4 a single gensim model - model = self.eLDA.classic_model_representation - - a = len(eLDA_base.ttda) - b = eLDA_base.num_models - eLDA_base.add_model(model) - self.assertEqual(len(eLDA_base.ttda), a + len(model.get_topics())) - self.assertEqual(eLDA_base.num_models, b + 1) - - # 1.5 a list gensim models - a = len(eLDA_base.ttda) - b = eLDA_base.num_models - eLDA_base.add_model([model, model]) - self.assertEqual(len(eLDA_base.ttda), a + 2 * len(model.get_topics())) - self.assertEqual(eLDA_base.num_models, b + 2) - - self.check_ttda(eLDA_base) - - # 2. memory unfriendly - eLDA_base_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, - num_topics=num_new_topics, passes=1, num_models=num_new_models, - iterations=1, random_state=0, topic_model_class=LdaMulticore, - workers=3, ensemble_workers=2, memory_friendly_ttda=False) - - # 2.1 a single ensemble - a = len(eLDA_base_mu.tms) - b = eLDA_base_mu.num_models - eLDA_base_mu.add_model(self.eLDA_mu) - self.assertEqual(len(eLDA_base_mu.tms), a + num_models) - self.assertEqual(eLDA_base_mu.num_models, b + num_models) - - # 2.2 a list of ensembles - a = len(eLDA_base_mu.tms) - b = eLDA_base_mu.num_models - eLDA_base_mu.add_model([self.eLDA_mu, self.eLDA_mu]) - self.assertEqual(len(eLDA_base_mu.tms), a + 2 * num_models) - self.assertEqual(eLDA_base_mu.num_models, b + 2 * num_models) - - # 2.3 a single gensim model - a = len(eLDA_base_mu.tms) - b = eLDA_base_mu.num_models - eLDA_base_mu.add_model(self.eLDA_mu.tms[0]) - self.assertEqual(len(eLDA_base_mu.tms), a + 1) - self.assertEqual(eLDA_base_mu.num_models, b + 1) - - # 2.4 a list of gensim models - a = len(eLDA_base_mu.tms) - b = eLDA_base_mu.num_models - eLDA_base_mu.add_model(self.eLDA_mu.tms) - self.assertEqual(len(eLDA_base_mu.tms), a + num_models) - self.assertEqual(eLDA_base_mu.num_models, b + num_models) - - # 2.5 topic term distributions should throw errors, because the - # actual models are needed for the memory unfriendly ensemble - a = len(eLDA_base_mu.tms) - b = eLDA_base_mu.num_models - self.assertRaises(ValueError, lambda: eLDA_base_mu.add_model(self.eLDA_mu.tms[0].get_topics())) - # remains unchanged - self.assertEqual(len(eLDA_base_mu.tms), a) - self.assertEqual(eLDA_base_mu.num_models, b) - - self.assertEqual(eLDA_base_mu.num_models, len(eLDA_base_mu.tms)) - self.check_ttda(eLDA_base_mu) - - def test_add_and_recluster(self): - # 6.2: see if after adding a model, the model still makes sense - 
num_new_models = 3 - num_new_topics = 3 - - # train - new_eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, - num_topics=num_new_topics, passes=10, num_models=num_new_models, - iterations=30, random_state=1, topic_model_class='ldamulticore', - distance_workers=4) - new_eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, - num_topics=num_new_topics, passes=10, num_models=num_new_models, - iterations=30, random_state=1, topic_model_class='ldamulticore', - distance_workers=4, memory_friendly_ttda=False) - # both should be similar - np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) - np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) - # and every next step applied to both should result in similar results - - # 1. adding to ttda and tms - new_eLDA.add_model(self.eLDA) - new_eLDA_mu.add_model(self.eLDA_mu) - np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) - self.assertEqual(len(new_eLDA.ttda), len(self.eLDA.ttda) + num_new_models * num_new_topics) - self.assertEqual(len(new_eLDA_mu.ttda), len(self.eLDA_mu.ttda) + num_new_models * num_new_topics) - self.assertEqual(len(new_eLDA_mu.tms), num_models + num_new_models) - self.check_ttda(new_eLDA) - self.check_ttda(new_eLDA_mu) - - # 2. distance matrix - new_eLDA._generate_asymmetric_distance_matrix() - new_eLDA_mu._generate_asymmetric_distance_matrix() - np.testing.assert_allclose(new_eLDA.asymmetric_distance_matrix, - new_eLDA_mu.asymmetric_distance_matrix) - - # 3. CBDBSCAN results - new_eLDA._generate_topic_clusters() - new_eLDA_mu._generate_topic_clusters() - a = new_eLDA.cluster_model.results - b = new_eLDA_mu.cluster_model.results - self.assert_cluster_results_equal(a, b) - - # 4. finally, the stable topics - new_eLDA._generate_stable_topics() - new_eLDA_mu._generate_stable_topics() - np.testing.assert_allclose(new_eLDA.get_topics(), - new_eLDA_mu.get_topics()) - - new_eLDA.generate_gensim_representation() - new_eLDA_mu.generate_gensim_representation() - - # same random state, hence topics should be still similar - np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) - - -if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN) - unittest.main() +# #!/usr/bin/env python +# # -*- coding: utf-8 -*- +# # +# # Author: Tobias B +# +# """ +# Automated tests for checking the EnsembleLda Class +# """ +# +# +# import logging +# import unittest +# +# import numpy as np +# from copy import deepcopy +# +# from gensim.models import EnsembleLda, LdaMulticore +# from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary +# +# num_topics = 2 +# num_models = 4 +# passes = 50 +# +# +# class TestModel(unittest.TestCase): +# def setUp(self): +# # same configuration for each model to make sure +# # the topics are equal +# random_state = 0 +# +# self.eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, +# passes=passes, num_models=num_models, random_state=random_state) +# +# self.eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, +# passes=passes, num_models=num_models, random_state=random_state, +# memory_friendly_ttda=False) +# +# def check_ttda(self, ensemble): +# """tests the integrity of the ttda of any ensemble""" +# self.assertGreater(len(ensemble.ttda), 0) +# a = ensemble.ttda.sum(axis=1) +# b = 
np.ones(len(ensemble.ttda)).astype(np.float32) +# np.testing.assert_allclose(a, b, rtol=1e-04) +# +# def test_eLDA(self): +# # given that the random_state doesn't change, it should +# # always be 2 detected topics in this setup. +# self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) +# self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) +# self.check_ttda(self.eLDA) +# +# # reclustering shouldn't change anything without +# # added models or different parameters +# self.eLDA.recluster() +# +# self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) +# self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) +# self.check_ttda(self.eLDA) +# +# # compare with a pre-trained reference model +# reference = EnsembleLda.load(datapath('ensemblelda')) +# np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=1e-05) +# # small values in the distance matrix tend to vary quite a bit around 2%, +# # so use some absolute tolerance measurement to check if the matrix is at least +# # close to the target. +# atol = reference.asymmetric_distance_matrix.max() * 1e-05 +# np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, +# reference.asymmetric_distance_matrix, atol=atol) +# +# def test_clustering(self): +# # the following test is quite specific to the current implementation and not part of any api, +# # but it makes improving those sections of the code easier as long as sorted_clusters and the +# # cluster_model results are supposed to stay the same. Potentially this test will deprecate. +# +# reference = EnsembleLda.load(datapath('ensemblelda')) +# cluster_model_results = deepcopy(reference.cluster_model.results) +# sorted_clusters = deepcopy(reference.sorted_clusters) +# stable_topics = deepcopy(reference.get_topics()) +# +# # continue training with the distance matrix of the pretrained reference and see if +# # the generated clusters match. 
+# reference.asymmetric_distance_matrix_outdated = True +# reference.recluster() +# +# self.assert_cluster_results_equal(reference.cluster_model.results, cluster_model_results) +# self.assertEqual(reference.sorted_clusters, sorted_clusters) +# np.testing.assert_allclose(reference.get_topics(), stable_topics, rtol=1e-05) +# +# def test_not_trained(self): +# # should not throw errors and no training should happen +# +# # 0 passes +# eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, +# passes=0, num_models=num_models, random_state=0) +# self.assertEqual(len(eLDA.ttda), 0) +# +# # no corpus +# eLDA = EnsembleLda(id2word=common_dictionary, num_topics=num_topics, +# passes=passes, num_models=num_models, random_state=0) +# self.assertEqual(len(eLDA.ttda), 0) +# +# # 0 iterations +# eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, +# iterations=0, num_models=num_models, random_state=0) +# self.assertEqual(len(eLDA.ttda), 0) +# +# # 0 models +# eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, +# passes=passes, num_models=0, random_state=0) +# self.assertEqual(len(eLDA.ttda), 0) +# +# def test_memory_unfriendly(self): +# # at this point, self.eLDA_mu and self.eLDA are already trained +# # in the setUpClass function +# # both should be 100% similar (but floats cannot +# # be compared that easily, so check for threshold) +# self.assertEqual(len(self.eLDA_mu.tms), num_models) +# np.testing.assert_allclose(self.eLDA.ttda, self.eLDA_mu.ttda, rtol=1e-05) +# np.testing.assert_allclose(self.eLDA.get_topics(), self.eLDA_mu.get_topics(), rtol=1e-05) +# self.check_ttda(self.eLDA_mu) +# +# def test_generate_gensim_rep(self): +# gensimModel = self.eLDA.generate_gensim_representation() +# topics = gensimModel.get_topics() +# np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) +# +# def assert_cluster_results_equal(self, a, b): +# """compares important attributes of the cluster results""" +# np.testing.assert_array_equal([row["label"] for row in a], +# [row["label"] for row in b]) +# np.testing.assert_array_equal([row["is_core"] for row in a], +# [row["is_core"] for row in b]) +# +# def test_persisting(self): +# fname = get_tmpfile('gensim_models_ensemblelda') +# self.eLDA.save(fname) +# loaded_eLDA = EnsembleLda.load(fname) +# # storing the ensemble without memory_friendy_ttda +# self.eLDA_mu.save(fname) +# loaded_eLDA_mu = EnsembleLda.load(fname) +# +# # was it stored and loaded correctly? 
+# # memory friendly +# loaded_eLDA_representation = loaded_eLDA.generate_gensim_representation() +# topics = loaded_eLDA_representation.get_topics() +# ttda = loaded_eLDA.ttda +# amatrix = loaded_eLDA.asymmetric_distance_matrix +# np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) +# np.testing.assert_allclose(self.eLDA.ttda, ttda, rtol=1e-05) +# np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, amatrix, rtol=1e-05) +# +# a = self.eLDA.cluster_model.results +# b = loaded_eLDA.cluster_model.results +# +# self.assert_cluster_results_equal(a, b) +# +# # memory unfriendly +# loaded_eLDA_mu_representation = loaded_eLDA_mu.generate_gensim_representation() +# topics = loaded_eLDA_mu_representation.get_topics() +# np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) +# +# def test_multiprocessing(self): +# # same configuration +# random_state = 0 +# +# # use 3 processes for the ensemble and the distance, +# # so that the 4 models and 8 topics cannot be distributed +# # to each worker evenly +# workers = 3 +# +# # memory friendly. contains List of topic word distributions +# eLDA_multi = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, +# num_topics=num_topics, passes=passes, num_models=num_models, +# random_state=random_state, ensemble_workers=workers, distance_workers=workers) +# +# # memory unfriendly. contains List of models +# eLDA_multi_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, +# num_topics=num_topics, passes=passes, num_models=num_models, +# random_state=random_state, ensemble_workers=workers, distance_workers=workers, +# memory_friendly_ttda=False) +# +# np.testing.assert_allclose(self.eLDA.get_topics(), eLDA_multi.get_topics(), rtol=1e-05) +# np.testing.assert_allclose(self.eLDA_mu.get_topics(), eLDA_multi_mu.get_topics(), rtol=1e-05) +# +# def test_add_models(self): +# # same configuration +# num_models = self.eLDA.num_models +# +# # make sure countings and sizes after adding are correct +# # create new models and add other models to them. +# +# # there are a ton of configurations for the first parameter possible, +# # try them all +# +# # quickly train something that can be used for counting results +# num_new_models = 3 +# num_new_topics = 3 +# +# # 1. 
memory friendly +# eLDA_base = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, +# num_topics=num_new_topics, passes=1, num_models=num_new_models, +# iterations=1, random_state=0, topic_model_class=LdaMulticore, +# workers=3, ensemble_workers=2) +# +# # 1.1 ttda +# a = len(eLDA_base.ttda) +# b = eLDA_base.num_models +# eLDA_base.add_model(self.eLDA.ttda) +# self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda)) +# self.assertEqual(eLDA_base.num_models, b + 1) # defaults to 1 for one ttda matrix +# +# # 1.2 an ensemble +# a = len(eLDA_base.ttda) +# b = eLDA_base.num_models +# eLDA_base.add_model(self.eLDA, 5) +# self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda)) +# self.assertEqual(eLDA_base.num_models, b + 5) +# +# # 1.3 a list of ensembles +# a = len(eLDA_base.ttda) +# b = eLDA_base.num_models +# # it should be totally legit to add a memory unfriendly object to a memory friendly one +# eLDA_base.add_model([self.eLDA, self.eLDA_mu]) +# self.assertEqual(len(eLDA_base.ttda), a + 2 * len(self.eLDA.ttda)) +# self.assertEqual(eLDA_base.num_models, b + 2 * num_models) +# +# # 1.4 a single gensim model +# model = self.eLDA.classic_model_representation +# +# a = len(eLDA_base.ttda) +# b = eLDA_base.num_models +# eLDA_base.add_model(model) +# self.assertEqual(len(eLDA_base.ttda), a + len(model.get_topics())) +# self.assertEqual(eLDA_base.num_models, b + 1) +# +# # 1.5 a list gensim models +# a = len(eLDA_base.ttda) +# b = eLDA_base.num_models +# eLDA_base.add_model([model, model]) +# self.assertEqual(len(eLDA_base.ttda), a + 2 * len(model.get_topics())) +# self.assertEqual(eLDA_base.num_models, b + 2) +# +# self.check_ttda(eLDA_base) +# +# # 2. memory unfriendly +# eLDA_base_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, +# num_topics=num_new_topics, passes=1, num_models=num_new_models, +# iterations=1, random_state=0, topic_model_class=LdaMulticore, +# workers=3, ensemble_workers=2, memory_friendly_ttda=False) +# +# # 2.1 a single ensemble +# a = len(eLDA_base_mu.tms) +# b = eLDA_base_mu.num_models +# eLDA_base_mu.add_model(self.eLDA_mu) +# self.assertEqual(len(eLDA_base_mu.tms), a + num_models) +# self.assertEqual(eLDA_base_mu.num_models, b + num_models) +# +# # 2.2 a list of ensembles +# a = len(eLDA_base_mu.tms) +# b = eLDA_base_mu.num_models +# eLDA_base_mu.add_model([self.eLDA_mu, self.eLDA_mu]) +# self.assertEqual(len(eLDA_base_mu.tms), a + 2 * num_models) +# self.assertEqual(eLDA_base_mu.num_models, b + 2 * num_models) +# +# # 2.3 a single gensim model +# a = len(eLDA_base_mu.tms) +# b = eLDA_base_mu.num_models +# eLDA_base_mu.add_model(self.eLDA_mu.tms[0]) +# self.assertEqual(len(eLDA_base_mu.tms), a + 1) +# self.assertEqual(eLDA_base_mu.num_models, b + 1) +# +# # 2.4 a list of gensim models +# a = len(eLDA_base_mu.tms) +# b = eLDA_base_mu.num_models +# eLDA_base_mu.add_model(self.eLDA_mu.tms) +# self.assertEqual(len(eLDA_base_mu.tms), a + num_models) +# self.assertEqual(eLDA_base_mu.num_models, b + num_models) +# +# # 2.5 topic term distributions should throw errors, because the +# # actual models are needed for the memory unfriendly ensemble +# a = len(eLDA_base_mu.tms) +# b = eLDA_base_mu.num_models +# self.assertRaises(ValueError, lambda: eLDA_base_mu.add_model(self.eLDA_mu.tms[0].get_topics())) +# # remains unchanged +# self.assertEqual(len(eLDA_base_mu.tms), a) +# self.assertEqual(eLDA_base_mu.num_models, b) +# +# self.assertEqual(eLDA_base_mu.num_models, len(eLDA_base_mu.tms)) +# self.check_ttda(eLDA_base_mu) +# +# def 
test_add_and_recluster(self): +# # 6.2: see if after adding a model, the model still makes sense +# num_new_models = 3 +# num_new_topics = 3 +# +# # train +# new_eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, +# num_topics=num_new_topics, passes=10, num_models=num_new_models, +# iterations=30, random_state=1, topic_model_class='ldamulticore', +# distance_workers=4) +# new_eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, +# num_topics=num_new_topics, passes=10, num_models=num_new_models, +# iterations=30, random_state=1, topic_model_class='ldamulticore', +# distance_workers=4, memory_friendly_ttda=False) +# # both should be similar +# np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) +# np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) +# # and every next step applied to both should result in similar results +# +# # 1. adding to ttda and tms +# new_eLDA.add_model(self.eLDA) +# new_eLDA_mu.add_model(self.eLDA_mu) +# np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) +# self.assertEqual(len(new_eLDA.ttda), len(self.eLDA.ttda) + num_new_models * num_new_topics) +# self.assertEqual(len(new_eLDA_mu.ttda), len(self.eLDA_mu.ttda) + num_new_models * num_new_topics) +# self.assertEqual(len(new_eLDA_mu.tms), num_models + num_new_models) +# self.check_ttda(new_eLDA) +# self.check_ttda(new_eLDA_mu) +# +# # 2. distance matrix +# new_eLDA._generate_asymmetric_distance_matrix() +# new_eLDA_mu._generate_asymmetric_distance_matrix() +# np.testing.assert_allclose(new_eLDA.asymmetric_distance_matrix, +# new_eLDA_mu.asymmetric_distance_matrix) +# +# # 3. CBDBSCAN results +# new_eLDA._generate_topic_clusters() +# new_eLDA_mu._generate_topic_clusters() +# a = new_eLDA.cluster_model.results +# b = new_eLDA_mu.cluster_model.results +# self.assert_cluster_results_equal(a, b) +# +# # 4. 
finally, the stable topics +# new_eLDA._generate_stable_topics() +# new_eLDA_mu._generate_stable_topics() +# np.testing.assert_allclose(new_eLDA.get_topics(), +# new_eLDA_mu.get_topics()) +# +# new_eLDA.generate_gensim_representation() +# new_eLDA_mu.generate_gensim_representation() +# +# # same random state, hence topics should be still similar +# np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) +# +# +# if __name__ == '__main__': +# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN) +# unittest.main() diff --git a/gensim/utils.py b/gensim/utils.py index 3831c9a8ca..e0e54dba3d 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -606,7 +606,8 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, recursive_saveloads = [] restores = [] for attrib, val in iteritems(self.__dict__): - if hasattr(val, '_save_specials') and type(val) != type: # better than 'isinstance(val, SaveLoad)' if IPython reloading + # better than 'isinstance(val, SaveLoad)' if IPython reloading + if hasattr(val, '_save_specials') and type(val) != type: recursive_saveloads.append(attrib) cfname = '.'.join((fname, attrib)) print(val, compress, subname) From 644f2e483a87a43538ca951ba9c1386e1f4c5e5b Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 23 Nov 2019 16:44:44 +0100 Subject: [PATCH 076/166] saving the topic_model_class using a string instead --- gensim/models/ensemblelda.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index d9fa3c9a1b..80c2a88d68 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -101,6 +101,7 @@ import logging import os from multiprocessing import Process, Pipe, ProcessError +import importlib import numpy as np from scipy.spatial.distance import cosine @@ -267,6 +268,26 @@ def __init__(self, topic_model_class="lda", num_models=3, # create model that can provide the usual gensim api to the stable topics from the ensemble self.generate_gensim_representation() + @classmethod + def load(cls, *args, **kwargs): + eLDA = super().load(*args, **kwargs) + try: + eLDA.topic_model_class = importlib.import_module(eLDA.topic_model_class_string) + del eLDA.topic_model_class_string + except: + logger.error(f'Could not import the "{eLDA.topic_model_class_string}" module and set the ' + '"topic_model_class" attribute to it. 
Try setting this manually instead.') + return eLDA + + load.__doc__ = SaveLoad.load.__doc__ + + def save(self, *args, **kwargs): + self.topic_model_class_string = self.topic_model_class.__module__ + kwargs['ignore'] = frozenset(kwargs.get('ignore', ())).union(('topic_model_class', )) + super(EnsembleLda, self).save(*args, **kwargs) + + save.__doc__ = SaveLoad.save.__doc__ + def convert_to_memory_friendly(self): """Remove the stored gensim models and only keep their ttdas.""" self.tms = [] From a7bbfc0a21c308050fb139db49e2e3842b93e168 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 23 Nov 2019 16:51:17 +0100 Subject: [PATCH 077/166] reverted utils --- gensim/utils.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/gensim/utils.py b/gensim/utils.py index e0e54dba3d..70b12d8a88 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -18,9 +18,7 @@ from htmlentitydefs import name2codepoint as n2cp try: import cPickle as _pickle - print('import cpickle') except ImportError: - print('import pickle') import pickle as _pickle import re @@ -57,6 +55,15 @@ PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE) RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE) +NO_CYTHON = RuntimeError( + "Cython extensions are unavailable. " + "Without them, this gensim functionality is disabled. " + "If you've installed from a package, ask the package maintainer to include Cython extensions. " + "If you're building gensim from source yourself, run `python setup.py build_ext --inplace` " + "and retry. " +) +"""An exception that gensim code raises when Cython extensions are unavailable.""" + def get_random_state(seed): """Generate :class:`numpy.random.RandomState` based on input seed. @@ -544,7 +551,6 @@ def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=fro logger.info("saving %s object under %s, separately %s", self.__class__.__name__, fname, separately) compress, subname = SaveLoad._adapt_by_suffix(fname) - print(compress, subname) restores = self._save_specials(fname, separately, sep_limit, ignore, pickle_protocol, compress, subname) @@ -606,11 +612,9 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, recursive_saveloads = [] restores = [] for attrib, val in iteritems(self.__dict__): - # better than 'isinstance(val, SaveLoad)' if IPython reloading - if hasattr(val, '_save_specials') and type(val) != type: + if hasattr(val, '_save_specials'): # better than 'isinstance(val, SaveLoad)' if IPython reloading recursive_saveloads.append(attrib) cfname = '.'.join((fname, attrib)) - print(val, compress, subname) restores.extend(val._save_specials(cfname, None, sep_limit, ignore, pickle_protocol, compress, subname)) try: @@ -693,12 +697,10 @@ def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore= Load object from file. 
""" - print(fname_or_handle, type(fname_or_handle), separately, sep_limit, ignore, pickle_protocol) try: _pickle.dump(self, fname_or_handle, protocol=pickle_protocol) logger.info("saved %s object", self.__class__.__name__) except TypeError: # `fname_or_handle` does not have write attribute - print("failed") self._smart_save(fname_or_handle, separately, sep_limit, ignore, pickle_protocol=pickle_protocol) From 2abdbf72e2c80d19df1be63c9b38ecb5e075443b Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 23 Nov 2019 17:26:00 +0100 Subject: [PATCH 078/166] saving the topic_model_class using a string instead fixes --- gensim/models/ensemblelda.py | 7 +- gensim/test/test_data/ensemblelda | Bin 13705 -> 14251 bytes gensim/test/test_ensemblelda.py | 698 +++++++++++++++--------------- 3 files changed, 355 insertions(+), 350 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 80c2a88d68..514d87c305 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -272,7 +272,9 @@ def __init__(self, topic_model_class="lda", num_models=3, def load(cls, *args, **kwargs): eLDA = super().load(*args, **kwargs) try: - eLDA.topic_model_class = importlib.import_module(eLDA.topic_model_class_string) + module = importlib.import_module(eLDA.topic_model_module_string) + eLDA.topic_model_class = getattr(module, eLDA.topic_model_class_string) + del eLDA.topic_model_module_string del eLDA.topic_model_class_string except: logger.error(f'Could not import the "{eLDA.topic_model_class_string}" module and set the ' @@ -282,7 +284,8 @@ def load(cls, *args, **kwargs): load.__doc__ = SaveLoad.load.__doc__ def save(self, *args, **kwargs): - self.topic_model_class_string = self.topic_model_class.__module__ + self.topic_model_module_string = self.topic_model_class.__module__ + self.topic_model_class_string = self.topic_model_class.__name__ kwargs['ignore'] = frozenset(kwargs.get('ignore', ())).union(('topic_model_class', )) super(EnsembleLda, self).save(*args, **kwargs) diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda index 286d8dda8bee36a5089ebce4476eed98998ff7e4..79525c3aca30b28384bb1089a558222777737cad 100644 GIT binary patch delta 8619 zcmZvC2V7LmmM0lO4EQ04ia9i3M8E(vDk5r&pSB4DwoTJ;$<*y`5rMA}B#9!}B8ZCQ zAQA*bk~(wF?Cj1?*xAW58{W*$rrA83cW2(4`QO{1Z@xGDepdCZI=4>xpHp>DS(h_Z z&{$*lb#ZaYx8)h~3ewHF7U8M$NLh}dOmn^^E66A_w}C781o6Yedl2+#{#p{+zM{O!|Q-(z(l!+6L1os$6y!(1bo4dxI1pbN*70eYDSi~t^ z$k*rm{CncGZoMjj!2pWFVv9JFDs&4Y7E_ioayC^Yrzp5J7_u@Uw>jM+&ZUa)Q&y{E zve{&lEm?QW28;2IIk&)w>GP@Lf-Ma%g*Z7|3ktJL7NLj16@wNUEf(13qQl!GB=}No z%1Vr-XIZSqe3RJ#SLc{voXd_lkKip=&Z+T{XSC#GTI!MP^d`pJE<=su>D0GAw4RkQ^61x17pKihpqL&yq$0O4PL# zZnna7qZM|_1|Tj1%YJy#Ic6~5G0Vjw2ar}AW4>d|T^P!BeDVx-&2#*r=Z1~>bxv`N zxw&aZQ;tOxoKuA{<`zmuQRw*Pok5OpujO{JV59tWqbwVXeTA%ZsS*~G!Gt7bviO?w zl^?#ss+uWH*i`9rjn=IE{o+owDAJ3&HKJHA?kPFLtxA+g#yZKWq6pH(EB{2P1j;hK z_`xddX{t{P3K2-K(td+epz=Cr3|Wohz9b}7hzbcSSL($BWo5rAQKhcz_zSOSd$qD) zjb7ARMO~U|o5ckUW-RfrP92kt`RN6D24Kr-7WGa=_c#al7y!zgT(hsi;2cni3?^%V z>?nL8aA;Ij za*#xxEvnM2sPbGdTA)gDA*|FYInii~6fc};Y}1REK%?}FIDD0+7<39-g*y-MLwd*d z>9rdA5=qn4Km#;FqeBn|n}dw#wJ z^K;ZfuR#8kE9rg+f2Pkgy^nfLdE!Pmmj8@5-X5dQ4C-0IlQwFkmRr<*itD)}f_u2# z&wf(BDeKG)YQw7Y^kVa2s^t-SvFw^p@yV_Hn5VJibFR6>Bg<%rXAbdu`*rR-Pfw|b zhPDR;t~T(9yqvqJlU{NSPx1W={Dr6KJ#~hDpL&TWa<5P|_w!r=-KU}RJhR*LJD=M$ z6r?Moj?AlU?+o7S5lW-`w#4q>XAygnxF?MrT*2eor9RxuP1Hq?6ScYVd_Tk0dWr|Z z_n|NK(YqsC6QcG~?NO@XaqhHiKfo_xkNLgf)O5#{=dK>4nsYqD^EA%mF5F2kph+`V 
z`9I3Em-vh1H2M8N+Q70{r7n@w zAL<+~Q&^VbQC+)Cr7p)6{8!-ih5t%9N6H_n&RK=4i~njl4dQ!B2U@~Z>KaM?EdE+N zsY}eYiV5?S0|DK~9qLZ&W93!qvrJgI_NBy}*C7N{x`zIZs-);7MJ&RV99n zEBHs^_J#d6IY)}WU7Zt!tMX@Y()`hKo+`gL`0r53F_QdQ{#ZPzYvWYnPUq{G(4cty z&a&*nRa0R}aQ;qAxFb=Wr~Dwxk|c+fKP`}6E|8`wkQ$IiO~DtgDe_tQk@DjL>D>ZpBFWONz{SL)Kn;0nb(u1-&T>GK)Qb17*c^X=z@dwCRhNe@xqe=>&;H -# -# """ -# Automated tests for checking the EnsembleLda Class -# """ -# -# -# import logging -# import unittest -# -# import numpy as np -# from copy import deepcopy -# -# from gensim.models import EnsembleLda, LdaMulticore -# from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary -# -# num_topics = 2 -# num_models = 4 -# passes = 50 -# -# -# class TestModel(unittest.TestCase): -# def setUp(self): -# # same configuration for each model to make sure -# # the topics are equal -# random_state = 0 -# -# self.eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, -# passes=passes, num_models=num_models, random_state=random_state) -# -# self.eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, -# passes=passes, num_models=num_models, random_state=random_state, -# memory_friendly_ttda=False) -# -# def check_ttda(self, ensemble): -# """tests the integrity of the ttda of any ensemble""" -# self.assertGreater(len(ensemble.ttda), 0) -# a = ensemble.ttda.sum(axis=1) -# b = np.ones(len(ensemble.ttda)).astype(np.float32) -# np.testing.assert_allclose(a, b, rtol=1e-04) -# -# def test_eLDA(self): -# # given that the random_state doesn't change, it should -# # always be 2 detected topics in this setup. -# self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) -# self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) -# self.check_ttda(self.eLDA) -# -# # reclustering shouldn't change anything without -# # added models or different parameters -# self.eLDA.recluster() -# -# self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) -# self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) -# self.check_ttda(self.eLDA) -# -# # compare with a pre-trained reference model -# reference = EnsembleLda.load(datapath('ensemblelda')) -# np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=1e-05) -# # small values in the distance matrix tend to vary quite a bit around 2%, -# # so use some absolute tolerance measurement to check if the matrix is at least -# # close to the target. -# atol = reference.asymmetric_distance_matrix.max() * 1e-05 -# np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, -# reference.asymmetric_distance_matrix, atol=atol) -# -# def test_clustering(self): -# # the following test is quite specific to the current implementation and not part of any api, -# # but it makes improving those sections of the code easier as long as sorted_clusters and the -# # cluster_model results are supposed to stay the same. Potentially this test will deprecate. -# -# reference = EnsembleLda.load(datapath('ensemblelda')) -# cluster_model_results = deepcopy(reference.cluster_model.results) -# sorted_clusters = deepcopy(reference.sorted_clusters) -# stable_topics = deepcopy(reference.get_topics()) -# -# # continue training with the distance matrix of the pretrained reference and see if -# # the generated clusters match. 
-# reference.asymmetric_distance_matrix_outdated = True -# reference.recluster() -# -# self.assert_cluster_results_equal(reference.cluster_model.results, cluster_model_results) -# self.assertEqual(reference.sorted_clusters, sorted_clusters) -# np.testing.assert_allclose(reference.get_topics(), stable_topics, rtol=1e-05) -# -# def test_not_trained(self): -# # should not throw errors and no training should happen -# -# # 0 passes -# eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, -# passes=0, num_models=num_models, random_state=0) -# self.assertEqual(len(eLDA.ttda), 0) -# -# # no corpus -# eLDA = EnsembleLda(id2word=common_dictionary, num_topics=num_topics, -# passes=passes, num_models=num_models, random_state=0) -# self.assertEqual(len(eLDA.ttda), 0) -# -# # 0 iterations -# eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, -# iterations=0, num_models=num_models, random_state=0) -# self.assertEqual(len(eLDA.ttda), 0) -# -# # 0 models -# eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, -# passes=passes, num_models=0, random_state=0) -# self.assertEqual(len(eLDA.ttda), 0) -# -# def test_memory_unfriendly(self): -# # at this point, self.eLDA_mu and self.eLDA are already trained -# # in the setUpClass function -# # both should be 100% similar (but floats cannot -# # be compared that easily, so check for threshold) -# self.assertEqual(len(self.eLDA_mu.tms), num_models) -# np.testing.assert_allclose(self.eLDA.ttda, self.eLDA_mu.ttda, rtol=1e-05) -# np.testing.assert_allclose(self.eLDA.get_topics(), self.eLDA_mu.get_topics(), rtol=1e-05) -# self.check_ttda(self.eLDA_mu) -# -# def test_generate_gensim_rep(self): -# gensimModel = self.eLDA.generate_gensim_representation() -# topics = gensimModel.get_topics() -# np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) -# -# def assert_cluster_results_equal(self, a, b): -# """compares important attributes of the cluster results""" -# np.testing.assert_array_equal([row["label"] for row in a], -# [row["label"] for row in b]) -# np.testing.assert_array_equal([row["is_core"] for row in a], -# [row["is_core"] for row in b]) -# -# def test_persisting(self): -# fname = get_tmpfile('gensim_models_ensemblelda') -# self.eLDA.save(fname) -# loaded_eLDA = EnsembleLda.load(fname) -# # storing the ensemble without memory_friendy_ttda -# self.eLDA_mu.save(fname) -# loaded_eLDA_mu = EnsembleLda.load(fname) -# -# # was it stored and loaded correctly? 
-# # memory friendly -# loaded_eLDA_representation = loaded_eLDA.generate_gensim_representation() -# topics = loaded_eLDA_representation.get_topics() -# ttda = loaded_eLDA.ttda -# amatrix = loaded_eLDA.asymmetric_distance_matrix -# np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) -# np.testing.assert_allclose(self.eLDA.ttda, ttda, rtol=1e-05) -# np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, amatrix, rtol=1e-05) -# -# a = self.eLDA.cluster_model.results -# b = loaded_eLDA.cluster_model.results -# -# self.assert_cluster_results_equal(a, b) -# -# # memory unfriendly -# loaded_eLDA_mu_representation = loaded_eLDA_mu.generate_gensim_representation() -# topics = loaded_eLDA_mu_representation.get_topics() -# np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) -# -# def test_multiprocessing(self): -# # same configuration -# random_state = 0 -# -# # use 3 processes for the ensemble and the distance, -# # so that the 4 models and 8 topics cannot be distributed -# # to each worker evenly -# workers = 3 -# -# # memory friendly. contains List of topic word distributions -# eLDA_multi = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, -# num_topics=num_topics, passes=passes, num_models=num_models, -# random_state=random_state, ensemble_workers=workers, distance_workers=workers) -# -# # memory unfriendly. contains List of models -# eLDA_multi_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, -# num_topics=num_topics, passes=passes, num_models=num_models, -# random_state=random_state, ensemble_workers=workers, distance_workers=workers, -# memory_friendly_ttda=False) -# -# np.testing.assert_allclose(self.eLDA.get_topics(), eLDA_multi.get_topics(), rtol=1e-05) -# np.testing.assert_allclose(self.eLDA_mu.get_topics(), eLDA_multi_mu.get_topics(), rtol=1e-05) -# -# def test_add_models(self): -# # same configuration -# num_models = self.eLDA.num_models -# -# # make sure countings and sizes after adding are correct -# # create new models and add other models to them. -# -# # there are a ton of configurations for the first parameter possible, -# # try them all -# -# # quickly train something that can be used for counting results -# num_new_models = 3 -# num_new_topics = 3 -# -# # 1. 
memory friendly -# eLDA_base = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, -# num_topics=num_new_topics, passes=1, num_models=num_new_models, -# iterations=1, random_state=0, topic_model_class=LdaMulticore, -# workers=3, ensemble_workers=2) -# -# # 1.1 ttda -# a = len(eLDA_base.ttda) -# b = eLDA_base.num_models -# eLDA_base.add_model(self.eLDA.ttda) -# self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda)) -# self.assertEqual(eLDA_base.num_models, b + 1) # defaults to 1 for one ttda matrix -# -# # 1.2 an ensemble -# a = len(eLDA_base.ttda) -# b = eLDA_base.num_models -# eLDA_base.add_model(self.eLDA, 5) -# self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda)) -# self.assertEqual(eLDA_base.num_models, b + 5) -# -# # 1.3 a list of ensembles -# a = len(eLDA_base.ttda) -# b = eLDA_base.num_models -# # it should be totally legit to add a memory unfriendly object to a memory friendly one -# eLDA_base.add_model([self.eLDA, self.eLDA_mu]) -# self.assertEqual(len(eLDA_base.ttda), a + 2 * len(self.eLDA.ttda)) -# self.assertEqual(eLDA_base.num_models, b + 2 * num_models) -# -# # 1.4 a single gensim model -# model = self.eLDA.classic_model_representation -# -# a = len(eLDA_base.ttda) -# b = eLDA_base.num_models -# eLDA_base.add_model(model) -# self.assertEqual(len(eLDA_base.ttda), a + len(model.get_topics())) -# self.assertEqual(eLDA_base.num_models, b + 1) -# -# # 1.5 a list gensim models -# a = len(eLDA_base.ttda) -# b = eLDA_base.num_models -# eLDA_base.add_model([model, model]) -# self.assertEqual(len(eLDA_base.ttda), a + 2 * len(model.get_topics())) -# self.assertEqual(eLDA_base.num_models, b + 2) -# -# self.check_ttda(eLDA_base) -# -# # 2. memory unfriendly -# eLDA_base_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, -# num_topics=num_new_topics, passes=1, num_models=num_new_models, -# iterations=1, random_state=0, topic_model_class=LdaMulticore, -# workers=3, ensemble_workers=2, memory_friendly_ttda=False) -# -# # 2.1 a single ensemble -# a = len(eLDA_base_mu.tms) -# b = eLDA_base_mu.num_models -# eLDA_base_mu.add_model(self.eLDA_mu) -# self.assertEqual(len(eLDA_base_mu.tms), a + num_models) -# self.assertEqual(eLDA_base_mu.num_models, b + num_models) -# -# # 2.2 a list of ensembles -# a = len(eLDA_base_mu.tms) -# b = eLDA_base_mu.num_models -# eLDA_base_mu.add_model([self.eLDA_mu, self.eLDA_mu]) -# self.assertEqual(len(eLDA_base_mu.tms), a + 2 * num_models) -# self.assertEqual(eLDA_base_mu.num_models, b + 2 * num_models) -# -# # 2.3 a single gensim model -# a = len(eLDA_base_mu.tms) -# b = eLDA_base_mu.num_models -# eLDA_base_mu.add_model(self.eLDA_mu.tms[0]) -# self.assertEqual(len(eLDA_base_mu.tms), a + 1) -# self.assertEqual(eLDA_base_mu.num_models, b + 1) -# -# # 2.4 a list of gensim models -# a = len(eLDA_base_mu.tms) -# b = eLDA_base_mu.num_models -# eLDA_base_mu.add_model(self.eLDA_mu.tms) -# self.assertEqual(len(eLDA_base_mu.tms), a + num_models) -# self.assertEqual(eLDA_base_mu.num_models, b + num_models) -# -# # 2.5 topic term distributions should throw errors, because the -# # actual models are needed for the memory unfriendly ensemble -# a = len(eLDA_base_mu.tms) -# b = eLDA_base_mu.num_models -# self.assertRaises(ValueError, lambda: eLDA_base_mu.add_model(self.eLDA_mu.tms[0].get_topics())) -# # remains unchanged -# self.assertEqual(len(eLDA_base_mu.tms), a) -# self.assertEqual(eLDA_base_mu.num_models, b) -# -# self.assertEqual(eLDA_base_mu.num_models, len(eLDA_base_mu.tms)) -# self.check_ttda(eLDA_base_mu) -# -# def 
test_add_and_recluster(self): -# # 6.2: see if after adding a model, the model still makes sense -# num_new_models = 3 -# num_new_topics = 3 -# -# # train -# new_eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, -# num_topics=num_new_topics, passes=10, num_models=num_new_models, -# iterations=30, random_state=1, topic_model_class='ldamulticore', -# distance_workers=4) -# new_eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, -# num_topics=num_new_topics, passes=10, num_models=num_new_models, -# iterations=30, random_state=1, topic_model_class='ldamulticore', -# distance_workers=4, memory_friendly_ttda=False) -# # both should be similar -# np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) -# np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) -# # and every next step applied to both should result in similar results -# -# # 1. adding to ttda and tms -# new_eLDA.add_model(self.eLDA) -# new_eLDA_mu.add_model(self.eLDA_mu) -# np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) -# self.assertEqual(len(new_eLDA.ttda), len(self.eLDA.ttda) + num_new_models * num_new_topics) -# self.assertEqual(len(new_eLDA_mu.ttda), len(self.eLDA_mu.ttda) + num_new_models * num_new_topics) -# self.assertEqual(len(new_eLDA_mu.tms), num_models + num_new_models) -# self.check_ttda(new_eLDA) -# self.check_ttda(new_eLDA_mu) -# -# # 2. distance matrix -# new_eLDA._generate_asymmetric_distance_matrix() -# new_eLDA_mu._generate_asymmetric_distance_matrix() -# np.testing.assert_allclose(new_eLDA.asymmetric_distance_matrix, -# new_eLDA_mu.asymmetric_distance_matrix) -# -# # 3. CBDBSCAN results -# new_eLDA._generate_topic_clusters() -# new_eLDA_mu._generate_topic_clusters() -# a = new_eLDA.cluster_model.results -# b = new_eLDA_mu.cluster_model.results -# self.assert_cluster_results_equal(a, b) -# -# # 4. 
finally, the stable topics -# new_eLDA._generate_stable_topics() -# new_eLDA_mu._generate_stable_topics() -# np.testing.assert_allclose(new_eLDA.get_topics(), -# new_eLDA_mu.get_topics()) -# -# new_eLDA.generate_gensim_representation() -# new_eLDA_mu.generate_gensim_representation() -# -# # same random state, hence topics should be still similar -# np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) -# -# -# if __name__ == '__main__': -# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN) -# unittest.main() +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Author: Tobias B + +""" +Automated tests for checking the EnsembleLda Class +""" + +import sys +sys.path.append('/mnt/data/Code/ensemble_lda_repo/gensim') + +import logging +import unittest + +import numpy as np +from copy import deepcopy + +from gensim.models import EnsembleLda, LdaMulticore +from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary + +num_topics = 2 +num_models = 4 +passes = 50 + + +class TestModel(unittest.TestCase): + def setUp(self): + # same configuration for each model to make sure + # the topics are equal + random_state = 0 + + self.eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + passes=passes, num_models=num_models, random_state=random_state) + + self.eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + passes=passes, num_models=num_models, random_state=random_state, + memory_friendly_ttda=False) + + def check_ttda(self, ensemble): + """tests the integrity of the ttda of any ensemble""" + self.assertGreater(len(ensemble.ttda), 0) + a = ensemble.ttda.sum(axis=1) + b = np.ones(len(ensemble.ttda)).astype(np.float32) + np.testing.assert_allclose(a, b, rtol=1e-04) + + def test_eLDA(self): + # given that the random_state doesn't change, it should + # always be 2 detected topics in this setup. + self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) + self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) + self.check_ttda(self.eLDA) + + # reclustering shouldn't change anything without + # added models or different parameters + self.eLDA.recluster() + + self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) + self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) + self.check_ttda(self.eLDA) + + # compare with a pre-trained reference model + reference = EnsembleLda.load(datapath('ensemblelda')) + np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=1e-05) + # small values in the distance matrix tend to vary quite a bit around 2%, + # so use some absolute tolerance measurement to check if the matrix is at least + # close to the target. + atol = reference.asymmetric_distance_matrix.max() * 1e-05 + np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, + reference.asymmetric_distance_matrix, atol=atol) + + def test_clustering(self): + # the following test is quite specific to the current implementation and not part of any api, + # but it makes improving those sections of the code easier as long as sorted_clusters and the + # cluster_model results are supposed to stay the same. Potentially this test will deprecate. 
+ + reference = EnsembleLda.load(datapath('ensemblelda')) + cluster_model_results = deepcopy(reference.cluster_model.results) + sorted_clusters = deepcopy(reference.sorted_clusters) + stable_topics = deepcopy(reference.get_topics()) + + # continue training with the distance matrix of the pretrained reference and see if + # the generated clusters match. + reference.asymmetric_distance_matrix_outdated = True + reference.recluster() + + self.assert_cluster_results_equal(reference.cluster_model.results, cluster_model_results) + self.assertEqual(reference.sorted_clusters, sorted_clusters) + np.testing.assert_allclose(reference.get_topics(), stable_topics, rtol=1e-05) + + def test_not_trained(self): + # should not throw errors and no training should happen + + # 0 passes + eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + passes=0, num_models=num_models, random_state=0) + self.assertEqual(len(eLDA.ttda), 0) + + # no corpus + eLDA = EnsembleLda(id2word=common_dictionary, num_topics=num_topics, + passes=passes, num_models=num_models, random_state=0) + self.assertEqual(len(eLDA.ttda), 0) + + # 0 iterations + eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + iterations=0, num_models=num_models, random_state=0) + self.assertEqual(len(eLDA.ttda), 0) + + # 0 models + eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + passes=passes, num_models=0, random_state=0) + self.assertEqual(len(eLDA.ttda), 0) + + def test_memory_unfriendly(self): + # at this point, self.eLDA_mu and self.eLDA are already trained + # in the setUpClass function + # both should be 100% similar (but floats cannot + # be compared that easily, so check for threshold) + self.assertEqual(len(self.eLDA_mu.tms), num_models) + np.testing.assert_allclose(self.eLDA.ttda, self.eLDA_mu.ttda, rtol=1e-05) + np.testing.assert_allclose(self.eLDA.get_topics(), self.eLDA_mu.get_topics(), rtol=1e-05) + self.check_ttda(self.eLDA_mu) + + def test_generate_gensim_rep(self): + gensimModel = self.eLDA.generate_gensim_representation() + topics = gensimModel.get_topics() + np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) + + def assert_cluster_results_equal(self, a, b): + """compares important attributes of the cluster results""" + np.testing.assert_array_equal([row["label"] for row in a], + [row["label"] for row in b]) + np.testing.assert_array_equal([row["is_core"] for row in a], + [row["is_core"] for row in b]) + + def test_persisting(self): + fname = get_tmpfile('gensim_models_ensemblelda') + self.eLDA.save(fname) + loaded_eLDA = EnsembleLda.load(fname) + # storing the ensemble without memory_friendy_ttda + self.eLDA_mu.save(fname) + loaded_eLDA_mu = EnsembleLda.load(fname) + + # was it stored and loaded correctly? 
+ # memory friendly + loaded_eLDA_representation = loaded_eLDA.generate_gensim_representation() + topics = loaded_eLDA_representation.get_topics() + ttda = loaded_eLDA.ttda + amatrix = loaded_eLDA.asymmetric_distance_matrix + np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) + np.testing.assert_allclose(self.eLDA.ttda, ttda, rtol=1e-05) + np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, amatrix, rtol=1e-05) + + a = self.eLDA.cluster_model.results + b = loaded_eLDA.cluster_model.results + + self.assert_cluster_results_equal(a, b) + + # memory unfriendly + loaded_eLDA_mu_representation = loaded_eLDA_mu.generate_gensim_representation() + topics = loaded_eLDA_mu_representation.get_topics() + np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) + + def test_multiprocessing(self): + # same configuration + random_state = 0 + + # use 3 processes for the ensemble and the distance, + # so that the 4 models and 8 topics cannot be distributed + # to each worker evenly + workers = 3 + + # memory friendly. contains List of topic word distributions + eLDA_multi = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, + num_topics=num_topics, passes=passes, num_models=num_models, + random_state=random_state, ensemble_workers=workers, distance_workers=workers) + + # memory unfriendly. contains List of models + eLDA_multi_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, + num_topics=num_topics, passes=passes, num_models=num_models, + random_state=random_state, ensemble_workers=workers, distance_workers=workers, + memory_friendly_ttda=False) + + np.testing.assert_allclose(self.eLDA.get_topics(), eLDA_multi.get_topics(), rtol=1e-05) + np.testing.assert_allclose(self.eLDA_mu.get_topics(), eLDA_multi_mu.get_topics(), rtol=1e-05) + + def test_add_models(self): + # same configuration + num_models = self.eLDA.num_models + + # make sure countings and sizes after adding are correct + # create new models and add other models to them. + + # there are a ton of configurations for the first parameter possible, + # try them all + + # quickly train something that can be used for counting results + num_new_models = 3 + num_new_topics = 3 + + # 1. 
memory friendly + eLDA_base = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, + num_topics=num_new_topics, passes=1, num_models=num_new_models, + iterations=1, random_state=0, topic_model_class=LdaMulticore, + workers=3, ensemble_workers=2) + + # 1.1 ttda + a = len(eLDA_base.ttda) + b = eLDA_base.num_models + eLDA_base.add_model(self.eLDA.ttda) + self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda)) + self.assertEqual(eLDA_base.num_models, b + 1) # defaults to 1 for one ttda matrix + + # 1.2 an ensemble + a = len(eLDA_base.ttda) + b = eLDA_base.num_models + eLDA_base.add_model(self.eLDA, 5) + self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda)) + self.assertEqual(eLDA_base.num_models, b + 5) + + # 1.3 a list of ensembles + a = len(eLDA_base.ttda) + b = eLDA_base.num_models + # it should be totally legit to add a memory unfriendly object to a memory friendly one + eLDA_base.add_model([self.eLDA, self.eLDA_mu]) + self.assertEqual(len(eLDA_base.ttda), a + 2 * len(self.eLDA.ttda)) + self.assertEqual(eLDA_base.num_models, b + 2 * num_models) + + # 1.4 a single gensim model + model = self.eLDA.classic_model_representation + + a = len(eLDA_base.ttda) + b = eLDA_base.num_models + eLDA_base.add_model(model) + self.assertEqual(len(eLDA_base.ttda), a + len(model.get_topics())) + self.assertEqual(eLDA_base.num_models, b + 1) + + # 1.5 a list gensim models + a = len(eLDA_base.ttda) + b = eLDA_base.num_models + eLDA_base.add_model([model, model]) + self.assertEqual(len(eLDA_base.ttda), a + 2 * len(model.get_topics())) + self.assertEqual(eLDA_base.num_models, b + 2) + + self.check_ttda(eLDA_base) + + # 2. memory unfriendly + eLDA_base_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, + num_topics=num_new_topics, passes=1, num_models=num_new_models, + iterations=1, random_state=0, topic_model_class=LdaMulticore, + workers=3, ensemble_workers=2, memory_friendly_ttda=False) + + # 2.1 a single ensemble + a = len(eLDA_base_mu.tms) + b = eLDA_base_mu.num_models + eLDA_base_mu.add_model(self.eLDA_mu) + self.assertEqual(len(eLDA_base_mu.tms), a + num_models) + self.assertEqual(eLDA_base_mu.num_models, b + num_models) + + # 2.2 a list of ensembles + a = len(eLDA_base_mu.tms) + b = eLDA_base_mu.num_models + eLDA_base_mu.add_model([self.eLDA_mu, self.eLDA_mu]) + self.assertEqual(len(eLDA_base_mu.tms), a + 2 * num_models) + self.assertEqual(eLDA_base_mu.num_models, b + 2 * num_models) + + # 2.3 a single gensim model + a = len(eLDA_base_mu.tms) + b = eLDA_base_mu.num_models + eLDA_base_mu.add_model(self.eLDA_mu.tms[0]) + self.assertEqual(len(eLDA_base_mu.tms), a + 1) + self.assertEqual(eLDA_base_mu.num_models, b + 1) + + # 2.4 a list of gensim models + a = len(eLDA_base_mu.tms) + b = eLDA_base_mu.num_models + eLDA_base_mu.add_model(self.eLDA_mu.tms) + self.assertEqual(len(eLDA_base_mu.tms), a + num_models) + self.assertEqual(eLDA_base_mu.num_models, b + num_models) + + # 2.5 topic term distributions should throw errors, because the + # actual models are needed for the memory unfriendly ensemble + a = len(eLDA_base_mu.tms) + b = eLDA_base_mu.num_models + self.assertRaises(ValueError, lambda: eLDA_base_mu.add_model(self.eLDA_mu.tms[0].get_topics())) + # remains unchanged + self.assertEqual(len(eLDA_base_mu.tms), a) + self.assertEqual(eLDA_base_mu.num_models, b) + + self.assertEqual(eLDA_base_mu.num_models, len(eLDA_base_mu.tms)) + self.check_ttda(eLDA_base_mu) + + def test_add_and_recluster(self): + # 6.2: see if after adding a model, the model still makes sense + 
num_new_models = 3 + num_new_topics = 3 + + # train + new_eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, + num_topics=num_new_topics, passes=10, num_models=num_new_models, + iterations=30, random_state=1, topic_model_class='ldamulticore', + distance_workers=4) + new_eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, + num_topics=num_new_topics, passes=10, num_models=num_new_models, + iterations=30, random_state=1, topic_model_class='ldamulticore', + distance_workers=4, memory_friendly_ttda=False) + # both should be similar + np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) + np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) + # and every next step applied to both should result in similar results + + # 1. adding to ttda and tms + new_eLDA.add_model(self.eLDA) + new_eLDA_mu.add_model(self.eLDA_mu) + np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) + self.assertEqual(len(new_eLDA.ttda), len(self.eLDA.ttda) + num_new_models * num_new_topics) + self.assertEqual(len(new_eLDA_mu.ttda), len(self.eLDA_mu.ttda) + num_new_models * num_new_topics) + self.assertEqual(len(new_eLDA_mu.tms), num_models + num_new_models) + self.check_ttda(new_eLDA) + self.check_ttda(new_eLDA_mu) + + # 2. distance matrix + new_eLDA._generate_asymmetric_distance_matrix() + new_eLDA_mu._generate_asymmetric_distance_matrix() + np.testing.assert_allclose(new_eLDA.asymmetric_distance_matrix, + new_eLDA_mu.asymmetric_distance_matrix) + + # 3. CBDBSCAN results + new_eLDA._generate_topic_clusters() + new_eLDA_mu._generate_topic_clusters() + a = new_eLDA.cluster_model.results + b = new_eLDA_mu.cluster_model.results + self.assert_cluster_results_equal(a, b) + + # 4. finally, the stable topics + new_eLDA._generate_stable_topics() + new_eLDA_mu._generate_stable_topics() + np.testing.assert_allclose(new_eLDA.get_topics(), + new_eLDA_mu.get_topics()) + + new_eLDA.generate_gensim_representation() + new_eLDA_mu.generate_gensim_representation() + + # same random state, hence topics should be still similar + np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) + + +if __name__ == '__main__': + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN) + unittest.main() From 254ca60fa71d97c57d1f2682384b48efbe61f571 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 23 Nov 2019 18:03:18 +0100 Subject: [PATCH 079/166] tox --- gensim/models/ensemblelda.py | 11 ++++++++--- gensim/test/test_ensemblelda.py | 3 --- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 514d87c305..66d7ce2489 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -276,9 +276,14 @@ def load(cls, *args, **kwargs): eLDA.topic_model_class = getattr(module, eLDA.topic_model_class_string) del eLDA.topic_model_module_string del eLDA.topic_model_class_string - except: - logger.error(f'Could not import the "{eLDA.topic_model_class_string}" module and set the ' - '"topic_model_class" attribute to it. Try setting this manually instead.') + except ModuleNotFoundError: + logger.error(f'Could not import the "{eLDA.topic_model_class_string}" module in order to provide the ' + f'"{eLDA.topic_model_class_string}" class as "topic_model_class" attribute. 
' + 'Try setting this manually instead after loading.') + except AttributeError: + logger.error(f'Could not import the "{eLDA.topic_model_class_string}" class from the ' + f'{eLDA.topic_model_module_string} module in order to set the "topic_model_class" attribute. ' + 'Try setting this manually instead after loading.') return eLDA load.__doc__ = SaveLoad.load.__doc__ diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index e416de9644..f2b05bb72e 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -7,9 +7,6 @@ Automated tests for checking the EnsembleLda Class """ -import sys -sys.path.append('/mnt/data/Code/ensemble_lda_repo/gensim') - import logging import unittest From aec58c9173ccf64c5c6592c04c20dbe7fd898694 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 23 Nov 2019 18:06:55 +0100 Subject: [PATCH 080/166] quotes in logger.error --- gensim/models/ensemblelda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 66d7ce2489..da58e199cb 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -282,7 +282,7 @@ def load(cls, *args, **kwargs): 'Try setting this manually instead after loading.') except AttributeError: logger.error(f'Could not import the "{eLDA.topic_model_class_string}" class from the ' - f'{eLDA.topic_model_module_string} module in order to set the "topic_model_class" attribute. ' + f'"{eLDA.topic_model_module_string}" module in order to set the "topic_model_class" attribute. ' 'Try setting this manually instead after loading.') return eLDA From f7de190083f3175c727aa2e5037836fa7a86cb62 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 23 Nov 2019 18:14:41 +0100 Subject: [PATCH 081/166] multiline string --- gensim/models/ensemblelda.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index da58e199cb..938eea5c5b 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -277,13 +277,13 @@ def load(cls, *args, **kwargs): del eLDA.topic_model_module_string del eLDA.topic_model_class_string except ModuleNotFoundError: - logger.error(f'Could not import the "{eLDA.topic_model_class_string}" module in order to provide the ' - f'"{eLDA.topic_model_class_string}" class as "topic_model_class" attribute. ' - 'Try setting this manually instead after loading.') + logger.error(f'Could not import the "{eLDA.topic_model_class_string}" module in order to provide the \ + "{eLDA.topic_model_class_string}" class as "topic_model_class" attribute. \ + Try setting this manually instead after loading.') except AttributeError: - logger.error(f'Could not import the "{eLDA.topic_model_class_string}" class from the ' - f'"{eLDA.topic_model_module_string}" module in order to set the "topic_model_class" attribute. ' - 'Try setting this manually instead after loading.') + logger.error(f'Could not import the "{eLDA.topic_model_class_string}" class from the \ + "{eLDA.topic_model_module_string}" module in order to set the "topic_model_class" attribute. 
\
+                Try setting this manually instead after loading.')

         return eLDA

     load.__doc__ = SaveLoad.load.__doc__

From dfc97a08d29d5c07f1e50e98cb92d97dd48c36a5 Mon Sep 17 00:00:00 2001
From: sezanzeb <proxima@hip70890b.de>
Date: Sat, 23 Nov 2019 18:28:17 +0100
Subject: [PATCH 082/166] python 3.5 format strings

---
 gensim/models/ensemblelda.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index 938eea5c5b..d975835351 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -277,13 +277,13 @@ def load(cls, *args, **kwargs):
             del eLDA.topic_model_module_string
             del eLDA.topic_model_class_string
         except ModuleNotFoundError:
-            logger.error(f'Could not import the "{eLDA.topic_model_class_string}" module in order to provide the \
-                "{eLDA.topic_model_class_string}" class as "topic_model_class" attribute. \
-                Try setting this manually instead after loading.')
+            logger.error('Could not import the "{}" module in order to provide the "{}" class as "topic_model_class" '
+                         'attribute. Try setting this manually instead after loading.'
+                         .format(eLDA.topic_model_class_string, eLDA.topic_model_class_string))
         except AttributeError:
-            logger.error(f'Could not import the "{eLDA.topic_model_class_string}" class from the \
-                "{eLDA.topic_model_module_string}" module in order to set the "topic_model_class" attribute. \
-                Try setting this manually instead after loading.')
+            logger.error('Could not import the "{}" class from the "{}" module in order to set the '
+                         '"topic_model_class" attribute. Try setting this manually instead after loading.'
+                         .format(eLDA.topic_model_class_string, eLDA.topic_model_module_string))

         return eLDA

     load.__doc__ = SaveLoad.load.__doc__

From 88b338b1dbb4cf0ba27cfb238d58021531012dfd Mon Sep 17 00:00:00 2001
From: sezanzeb <proxima@hip70890b.de>
Date: Sat, 23 Nov 2019 19:23:49 +0100
Subject: [PATCH 083/166] ModuleNotFoundError: No module named 'numpy.random._pickle'

---
 gensim/test/test_data/ensemblelda | Bin 14251 -> 14238 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
index 79525c3aca30b28384bb1089a558222777737cad..85cf7b33f1966b8fdac4027524a82863b2552592 100644
GIT binary patch
[base85-encoded binary delta omitted: this commit regenerates the pickled test
 fixture gensim/test/test_data/ensemblelda (14251 -> 14238 bytes). The follow-up
 commit by sezanzeb, "[PATCH 084/166] ModuleNotFoundError: No module named
 'numpy.random._pickle' x2" (Sat, 23 Nov 2019 20:00:46 +0100), replaces the same
 fixture once more (Bin 14238 -> 10082 bytes,
 index 85cf7b33f1966b8fdac4027524a82863b2552592..127d00888c3c5184ca18d49e7117e0d31352d7c9);
 its binary payload is omitted as well.]
zl1L-+b6KLfg6ofYu8Nf4$3n7^Ac^3bNboa$uA@yF=Kc;);1<@$u6$V5ts zA4|-0TOvph34X@U9kfYf+?7ZYaW$y$0#JO(s?%r@T@@zW&}oyQ$yq7C)r0 zTc9ive=DK*RX5h&0}xQ`V?UC?zNAk36rXZ!Dy~L_?W@&36T75z;rAgQ{Bb>fOxsDv_+z&5R&6ld@bF%| zNf(2D9QW+xsVJD>n3*w!d$-e%fZ1xwRB5Up*4S-bw3Dgw6q2f)RJEPon^M)3L~Kx{ zOf)UB3=iY(p0Y~Q<}=b2>(rb~!yL}BS?B4V&HK+NJVHVU(~8K6mVfjF}N#4q9U zO+PBpW{UlX(f`XsNWV|n89^mdh(GALZTkI*HVZ9y Date: Sat, 14 Dec 2019 11:38:39 +0100 Subject: [PATCH 085/166] fixed inference --- gensim/models/ensemblelda.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index d975835351..f6dfb1b588 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -98,6 +98,8 @@ """ +print('asdf') + import logging import os from multiprocessing import Process, Pipe, ProcessError @@ -343,8 +345,8 @@ def generate_gensim_representation(self): params["eta"] = self.eta params["num_topics"] = num_stable_topics # adjust params in a way that no training happens - params["iterations"] = 0 # no training params["passes"] = 0 # no training + # iterations is needed for inference, pass it to the model classic_model_representation = self.topic_model_class(**params) From 766f562a065f08b3dfdc0754e138ea2a15e38090 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 14 Dec 2019 11:41:43 +0100 Subject: [PATCH 086/166] removed print asdf --- gensim/models/ensemblelda.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index f6dfb1b588..d3f9299b74 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -98,8 +98,6 @@ """ -print('asdf') - import logging import os from multiprocessing import Process, Pipe, ProcessError From 46b7cf6d4f2f098da4e16d86498942fe0df5d401 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 14 Dec 2019 11:51:13 +0100 Subject: [PATCH 087/166] added spec for inference --- gensim/test/test_ensemblelda.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index f2b05bb72e..4cefd7daeb 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -7,6 +7,9 @@ Automated tests for checking the EnsembleLda Class """ +import sys +sys.path = ['/mnt/data/Code/ensemble_lda_repo/gensim'] + sys.path + import logging import unittest @@ -341,6 +344,16 @@ def test_add_and_recluster(self): # same random state, hence topics should be still similar np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) + def test_inference(self): + import numpy as np + # get the most likely token id from topic 0 + max_id = np.argmax(self.eLDA.get_topics()[0,:]) + self.assertGreater(self.eLDA.classic_model_representation.iterations, 0) + # topic 0 should be dominant in the inference. 
+        # the difference between the probabilities should be significant and larger than 0.3
+        inferred = self.eLDA[[(max_id, 1)]]
+        self.assertGreater(inferred[0][1] - 0.3, inferred[1][1])
+

 if __name__ == '__main__':
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN)
     unittest.main()

From 6689eaee892b6a0504e78972ae24c047f9ba3dd5 Mon Sep 17 00:00:00 2001
From: sezanzeb <proxima@hip70890b.de>
Date: Sat, 14 Dec 2019 12:18:04 +0100
Subject: [PATCH 088/166] tox

---
 gensim/test/test_ensemblelda.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index 4cefd7daeb..825c0862b7 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -7,9 +7,6 @@
 Automated tests for checking the EnsembleLda Class
 """

-import sys
-sys.path = ['/mnt/data/Code/ensemble_lda_repo/gensim'] + sys.path
-
 import logging
 import unittest

@@ -347,7 +344,7 @@ def test_add_and_recluster(self):
     def test_inference(self):
         import numpy as np
         # get the most likely token id from topic 0
-        max_id = np.argmax(self.eLDA.get_topics()[0,:])
+        max_id = np.argmax(self.eLDA.get_topics()[0, :])
         self.assertGreater(self.eLDA.classic_model_representation.iterations, 0)
         # topic 0 should be dominant in the inference.
         # the difference between the probabilities should be significant and larger than 0.3

From 80f4f045f29bf1b0634c9d40a17cbb7d5c0e878c Mon Sep 17 00:00:00 2001
From: sezanzeb <proxima@hip70890b.de>
Date: Thu, 23 Jan 2020 21:32:32 +0100
Subject: [PATCH 089/166] lazy loading topic_model_class

---
 gensim/models/ensemblelda.py    | 46 ++++++++++++++++-----------------
 gensim/test/test_ensemblelda.py | 11 ++++++--
 2 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index d3f9299b74..cda0b0f168 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -110,6 +110,8 @@
 from gensim.models import ldamodel, ldamulticore, basemodel
 from gensim.utils import SaveLoad

+print('#')
+
 logger = logging.getLogger(__name__)


@@ -268,29 +270,27 @@ def __init__(self, topic_model_class="lda", num_models=3,
         # create model that can provide the usual gensim api to the stable topics from the ensemble
         self.generate_gensim_representation()

-    @classmethod
-    def load(cls, *args, **kwargs):
-        eLDA = super().load(*args, **kwargs)
-        try:
-            module = importlib.import_module(eLDA.topic_model_module_string)
-            eLDA.topic_model_class = getattr(module, eLDA.topic_model_class_string)
-            del eLDA.topic_model_module_string
-            del eLDA.topic_model_class_string
-        except ModuleNotFoundError:
-            logger.error('Could not import the "{}" module in order to provide the "{}" class as "topic_model_class" '
-                         'attribute. Try setting this manually instead after loading.'
-                         .format(eLDA.topic_model_class_string, eLDA.topic_model_class_string))
-        except AttributeError:
-            logger.error('Could not import the "{}" class from the "{}" module in order to set the '
-                         '"topic_model_class" attribute. Try setting this manually instead after loading.'
- .format(eLDA.topic_model_class_string, eLDA.topic_model_module_string)) - return eLDA - - load.__doc__ = SaveLoad.load.__doc__ + def get_topic_model_class(self): + if self.topic_model_class is None: + try: + module = importlib.import_module(self.topic_model_module_string) + self.topic_model_class = getattr(module, self.topic_model_class_string) + del self.topic_model_module_string + del self.topic_model_class_string + except ModuleNotFoundError: + logger.error('Could not import the "{}" module in order to provide the "{}" class as "topic_model_class" ' + 'attribute. Try setting this manually instead after loading.' + .format(self.topic_model_class_string, self.topic_model_class_string)) + except AttributeError: + logger.error('Could not import the "{}" class from the "{}" module in order to set the ' + '"topic_model_class" attribute. Try setting this manually instead after loading.' + .format(self.topic_model_class_string, self.topic_model_module_string)) + return self.topic_model_class def save(self, *args, **kwargs): - self.topic_model_module_string = self.topic_model_class.__module__ - self.topic_model_class_string = self.topic_model_class.__name__ + if self.get_topic_model_class() is not None: + self.topic_model_module_string = self.topic_model_class.__module__ + self.topic_model_class_string = self.topic_model_class.__name__ kwargs['ignore'] = frozenset(kwargs.get('ignore', ())).union(('topic_model_class', )) super(EnsembleLda, self).save(*args, **kwargs) @@ -346,7 +346,7 @@ def generate_gensim_representation(self): params["passes"] = 0 # no training # iterations is needed for inference, pass it to the model - classic_model_representation = self.topic_model_class(**params) + classic_model_representation = self.get_topic_model_class()(**params) # when eta was None, use what gensim generates as default eta for the following tasks: eta = classic_model_representation.eta @@ -642,7 +642,7 @@ def _generate_topic_models(self, num_models, random_states=None, pipe=None): kwArgs["random_state"] = random_states[i] - tm = self.topic_model_class(**kwArgs) + tm = self.get_topic_model_class()(**kwArgs) # adds the lambda (that is the unnormalized get_topics) to ttda, which is # a list of all those lambdas diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 825c0862b7..fbf084417f 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -13,7 +13,7 @@ import numpy as np from copy import deepcopy -from gensim.models import EnsembleLda, LdaMulticore +from gensim.models import EnsembleLda, LdaMulticore, LdaModel from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary num_topics = 2 @@ -138,9 +138,16 @@ def test_persisting(self): self.eLDA_mu.save(fname) loaded_eLDA_mu = EnsembleLda.load(fname) + # topic_model_class will be lazy loaded and should be None first + assert loaded_eLDA.topic_model_class is None + # was it stored and loaded correctly? - # memory friendly + # memory friendly. 
loaded_eLDA_representation = loaded_eLDA.generate_gensim_representation() + + # generating the representation also lazily loads the topic_model_class + assert loaded_eLDA.topic_model_class == LdaModel + topics = loaded_eLDA_representation.get_topics() ttda = loaded_eLDA.ttda amatrix = loaded_eLDA.asymmetric_distance_matrix From b05bf11861176d0d1eb62d5f80dab8bb152cbccb Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Thu, 23 Jan 2020 21:38:02 +0100 Subject: [PATCH 090/166] tox --- gensim/models/ensemblelda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index cda0b0f168..fa8c3b4791 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -278,8 +278,8 @@ def get_topic_model_class(self): del self.topic_model_module_string del self.topic_model_class_string except ModuleNotFoundError: - logger.error('Could not import the "{}" module in order to provide the "{}" class as "topic_model_class" ' - 'attribute. Try setting this manually instead after loading.' + logger.error('Could not import the "{}" module in order to provide the "{}" class as ' + '"topic_model_class" attribute. Try setting this manually instead after loading.' .format(self.topic_model_class_string, self.topic_model_class_string)) except AttributeError: logger.error('Could not import the "{}" class from the "{}" module in order to set the ' From 39a9b62f88f97fb9f3fb93f1f5ef3484b876a553 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Thu, 23 Jan 2020 21:41:08 +0100 Subject: [PATCH 091/166] removed debug thing --- gensim/models/ensemblelda.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index fa8c3b4791..17703b6bbb 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -110,8 +110,6 @@ from gensim.models import ldamodel, ldamulticore, basemodel from gensim.utils import SaveLoad -print('#') - logger = logging.getLogger(__name__) From 09ded139d6a60230436ddd61611b1c5d3e21897b Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Thu, 23 Jan 2020 21:49:06 +0100 Subject: [PATCH 092/166] Documents now compile --- docs/src/apiref.rst | 1 + docs/src/auto_examples/index.rst | 82 ++++++++++++++++---------------- gensim/models/ensemblelda.py | 54 +++++++++++---------- 3 files changed, 71 insertions(+), 66 deletions(-) diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst index 6218336b06..4158dcc6a7 100644 --- a/docs/src/apiref.rst +++ b/docs/src/apiref.rst @@ -29,6 +29,7 @@ Modules: corpora/wikicorpus models/ldamodel models/ldamulticore + models/ensemblelda models/nmf models/lsimodel models/ldaseqmodel diff --git a/docs/src/auto_examples/index.rst b/docs/src/auto_examples/index.rst index 5566611a8b..62dba3d50a 100644 --- a/docs/src/auto_examples/index.rst +++ b/docs/src/auto_examples/index.rst @@ -13,7 +13,7 @@ If you're thinking about contributing documentation, please see :ref:`sphx_glr_a .. raw:: html -

+
@@ -33,9 +33,9 @@ Understanding this functionality is vital for using gensim effectively. .. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_core_concepts_thumb.png + .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_core_concepts_thumb.png - :ref:`sphx_glr_auto_examples_core_run_core_concepts.py` + :ref:`sphx_glr_auto_examples_core_run_core_concepts.py` .. raw:: html @@ -53,9 +53,9 @@ Understanding this functionality is vital for using gensim effectively. .. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_corpora_and_vector_spaces_thumb.png + .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_corpora_and_vector_spaces_thumb.png - :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` + :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` .. raw:: html @@ -73,9 +73,9 @@ Understanding this functionality is vital for using gensim effectively. .. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_topics_and_transformations_thumb.png + .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_topics_and_transformations_thumb.png - :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py` + :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py` .. raw:: html @@ -93,9 +93,9 @@ Understanding this functionality is vital for using gensim effectively. .. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_similarity_queries_thumb.png + .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_similarity_queries_thumb.png - :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py` + :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py` .. raw:: html @@ -108,7 +108,7 @@ Understanding this functionality is vital for using gensim effectively. /auto_examples/core/run_similarity_queries .. raw:: html -
+
@@ -127,9 +127,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_word2vec_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_word2vec_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` + :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` .. raw:: html @@ -147,9 +147,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_doc2vec_lee_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_doc2vec_lee_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` + :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` .. raw:: html @@ -167,9 +167,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` + :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` .. raw:: html @@ -187,9 +187,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` + :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` .. raw:: html @@ -207,9 +207,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` + :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` .. raw:: html @@ -227,9 +227,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_distance_metrics_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_distance_metrics_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_distance_metrics.py` + :ref:`sphx_glr_auto_examples_tutorials_run_distance_metrics.py` .. raw:: html @@ -247,9 +247,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` + :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` .. raw:: html @@ -267,9 +267,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_summarization_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_summarization_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_summarization.py` + :ref:`sphx_glr_auto_examples_tutorials_run_summarization.py` .. raw:: html @@ -287,9 +287,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_pivoted_doc_norm_thumb.png + .. 
figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_pivoted_doc_norm_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_pivoted_doc_norm.py` + :ref:`sphx_glr_auto_examples_tutorials_run_pivoted_doc_norm.py` .. raw:: html @@ -302,7 +302,7 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod /auto_examples/tutorials/run_pivoted_doc_norm .. raw:: html -
+
@@ -321,9 +321,9 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_downloader_api_thumb.png + .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_downloader_api_thumb.png - :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py` + :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py` .. raw:: html @@ -341,9 +341,9 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc_thumb.png + .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc_thumb.png - :ref:`sphx_glr_auto_examples_howtos_run_doc.py` + :ref:`sphx_glr_auto_examples_howtos_run_doc.py` .. raw:: html @@ -361,9 +361,9 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc2vec_imdb_thumb.png + .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc2vec_imdb_thumb.png - :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py` + :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py` .. raw:: html @@ -381,9 +381,9 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_compare_lda_thumb.png + .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_compare_lda_thumb.png - :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py` + :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py` .. raw:: html @@ -396,7 +396,7 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u /auto_examples/howtos/run_compare_lda .. raw:: html -
+
@@ -440,7 +440,7 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from .. raw:: html -
+
@@ -452,13 +452,13 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from .. container:: sphx-glr-download - :download:`Download all examples in Python source code: auto_examples_python.zip ` + :download:`Download all examples in Python source code: auto_examples_python.zip ` .. container:: sphx-glr-download - :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` + :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` .. only:: html diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index d3f9299b74..6fb15abde8 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -3,6 +3,8 @@ # # Authors: Tobias Brigl , Alex Salles , # Alex Loosley , Data Reply Munich +# + """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. @@ -93,11 +95,10 @@ Other Notes ----------- -.. The adjectives stable and reliable (topics) are used somewhat interchangeably throughout the doc strings and +The adjectives stable and reliable (topics) are used somewhat interchangeably throughout the doc strings and comments. """ - import logging import os from multiprocessing import Process, Pipe, ProcessError @@ -170,10 +171,13 @@ def __init__(self, topic_model_class="lda", num_models=3, (technically a divergence) from distribution A to B is more of a measure of if A is contained in B. At a high level, this involves using distribution A to mask distribution B and then calculating the cosine distance between the two. The masking can be done in two ways: - mass: forms mask by taking the top ranked terms until their cumulative mass reaches the - `masking_threshold` - rank: forms mask by taking the top ranked terms (by mass) until the `masking_threshold` is reached. - For example, a ranking threshold of 0.11 means the top 0.11 terms by weight are used to form a mask. + + 1. mass: forms mask by taking the top ranked terms until their cumulative mass reaches the + 'masking_threshold' + + 2. rank: forms mask by taking the top ranked terms (by mass) until the 'masking_threshold' is reached. + For example, a ranking threshold of 0.11 means the top 0.11 terms by weight are used to form a mask. + masking_threshold : float, optional Default: None, which uses 0.95 for "mass", and 0.11 for masking_method "rank". In general, too small a mask threshold leads to inaccurate calculations (no signal) and too big a mask leads to noisy distance @@ -1132,32 +1136,32 @@ def id2word(self): class CBDBSCAN(): """A Variation of the DBSCAN algorithm called Checkback DBSCAN (CBDBSCAN). - The algorithm works based on DBSCAN-like parameters `eps` and `min_samples` that respectively define how far a + The algorithm works based on DBSCAN-like parameters 'eps' and 'min_samples' that respectively define how far a "nearby" point is, and the minimum number of nearby points needed to label a candidate datapoint a core of a cluster. (See https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html). The algorithm works as follows: - 1. (A)symmetric distance matrix provided at fit-time (called `amatrix`). - For the sake of example below, assume the there are only five topics (amatrix contains distances with dim 5x5), - T_1, T_2, T_3, T_4, T_5: + + 1. (A)symmetric distance matrix provided at fit-time (called 'amatrix'). + For the sake of example below, assume the there are only five topics (amatrix contains distances with dim 5x5), + T_1, T_2, T_3, T_4, T_5: 2. 
Start by scanning a candidate topic with respect to a parent topic - (e.g. T_1 with respect to parent None) - 3. Check which topics are nearby the candidate topic using `self.eps` as a threshold and call them neighbours - (e.g. assume T_3, T_4, and T_5 are nearby and become neighbours) - 4. If there are more neighbours than `self.min_samples`, the candidate topic becomes a core candidate for a cluster - (e.g. if `min_samples`=1, then T_1 becomes the first core of a cluster) + (e.g. T_1 with respect to parent None) + 3. Check which topics are nearby the candidate topic using 'self.eps' as a threshold and call them neighbours + (e.g. assume T_3, T_4, and T_5 are nearby and become neighbours) + 4. If there are more neighbours than 'self.min_samples', the candidate topic becomes a core candidate for a cluster + (e.g. if 'min_samples'=1, then T_1 becomes the first core of a cluster) 5. If candidate is a core, CheckBack (CB) to find the fraction of neighbours that are either the parent or the - parent's neighbours. If this fraction is more than 75%, give the candidate the same label as its parent. - (e.g. in the trivial case there is no parent (or neighbours of that parent), a new incremental label is given) + parent's neighbours. If this fraction is more than 75%, give the candidate the same label as its parent. + (e.g. in the trivial case there is no parent (or neighbours of that parent), a new incremental label is given) 6. If candidate is a core, recursively scan the next nearby topic (e.g. scan T_3) labeling the previous topic as - the parent and the previous neighbours as the parent_neighbours - repeat steps - 2.-6. - 2. (e.g. Scan candidate T_3 with respect to parent T_1 that has parent_neighbours T_3, T_4, and T_5) - 3. (e.g. T5 is the only neighbour) - 4. (e.g. number of neighbours is 1, therefore candidate T_3 becomes a core) - 5. (e.g. CheckBack finds that two of the four parent and parent neighbours are neighbours of candidate T_3. - Therefore the candidate T_3 does NOT get the same label as its parent T_1) - 6. (e.g. Scan candidate T_5 with respect to parent T_3 that has parent_neighbours T_5) + the parent and the previous neighbours as the parent_neighbours - repeat steps 2-6: + 2. (e.g. Scan candidate T_3 with respect to parent T_1 that has parent_neighbours T_3, T_4, and T_5) + 3. (e.g. T5 is the only neighbour) + 4. (e.g. number of neighbours is 1, therefore candidate T_3 becomes a core) + 5. (e.g. CheckBack finds that two of the four parent and parent neighbours are neighbours of candidate T_3. + \ \ \ Therefore the candidate T_3 does NOT get the same label as its parent T_1) + 6. (e.g. Scan candidate T_5 with respect to parent T_3 that has parent_neighbours T_5) The CB step has the effect that it enforces cluster compactness and allows the model to avoid creating clusters for unreliable topics made of a composition of multiple reliable topics (something that occurs often LDA models that is From 4bd7cf7516b397ae18cf8d35cfc6a1626d4c0d5d Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Thu, 23 Jan 2020 21:59:02 +0100 Subject: [PATCH 093/166] escape sequence thing indent fix --- gensim/models/ensemblelda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index bad7a16fe2..7f043de437 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1158,7 +1158,7 @@ class CBDBSCAN(): 3. (e.g. T5 is the only neighbour) 4. (e.g. 
number of neighbours is 1, therefore candidate T_3 becomes a core)
       5. (e.g. CheckBack finds that two of the four parent and parent neighbours are neighbours of candidate T_3.
-           \ \ \ Therefore the candidate T_3 does NOT get the same label as its parent T_1)
+           Therefore the candidate T_3 does NOT get the same label as its parent T_1)
       6. (e.g. Scan candidate T_5 with respect to parent T_3 that has parent_neighbours T_5)

    The CB step has the effect that it enforces cluster compactness and allows the model to avoid creating clusters for

From 49253df75c4d15627657dd7af8dfd3e912511dab Mon Sep 17 00:00:00 2001
From: Alex Loosley 
Date: Thu, 23 Jan 2020 22:06:07 +0100
Subject: [PATCH 094/166] Better document rendering and added opinosiscorpus to apirefs

---
 docs/src/apiref.rst             | 1 +
 docs/src/models/ensemblelda.rst | 9 +++++++++
 gensim/models/ensemblelda.py    | 3 ++-
 3 files changed, 12 insertions(+), 1 deletion(-)
 create mode 100644 docs/src/models/ensemblelda.rst

diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst
index 4158dcc6a7..996a6079d9 100644
--- a/docs/src/apiref.rst
+++ b/docs/src/apiref.rst
@@ -22,6 +22,7 @@ Modules:
    corpora/malletcorpus
    corpora/mmcorpus
    corpora/_mmreader
+   corpora/opinosiscorpus
    corpora/sharded_corpus
    corpora/svmlightcorpus
    corpora/textcorpus
diff --git a/docs/src/models/ensemblelda.rst b/docs/src/models/ensemblelda.rst
new file mode 100644
index 0000000000..42e63c94be
--- /dev/null
+++ b/docs/src/models/ensemblelda.rst
@@ -0,0 +1,9 @@
+:mod:`models.ensemblelda` -- Ensemble Latent Dirichlet Allocation
+=================================================================
+
+.. automodule:: gensim.models.ensemblelda
+   :synopsis: Ensemble Latent Dirichlet Allocation
+   :members:
+   :inherited-members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index bad7a16fe2..411ba6e18b 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -1154,11 +1154,12 @@ class CBDBSCAN():
       (e.g. in the trivial case there is no parent (or neighbours of that parent), a new incremental label is given)
    6. If candidate is a core, recursively scan the next nearby topic (e.g. scan T_3) labeling the previous topic as
       the parent and the previous neighbours as the parent_neighbours - repeat steps 2-6:
+
       2. (e.g. Scan candidate T_3 with respect to parent T_1 that has parent_neighbours T_3, T_4, and T_5)
       3. (e.g. T5 is the only neighbour)
       4. (e.g. number of neighbours is 1, therefore candidate T_3 becomes a core)
       5. (e.g. CheckBack finds that two of the four parent and parent neighbours are neighbours of candidate T_3.
-           \ \ \ Therefore the candidate T_3 does NOT get the same label as its parent T_1)
+           Therefore the candidate T_3 does NOT get the same label as its parent T_1)
       6. (e.g. 
Scan candidate T_5 with respect to parent T_3 that has parent_neighbours T_5) The CB step has the effect that it enforces cluster compactness and allows the model to avoid creating clusters for From 5669f5866e5b2c6308885379334e78688f5a6bf0 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Thu, 23 Jan 2020 22:56:56 +0100 Subject: [PATCH 095/166] citation opinosis --- docs/notebooks/ensemble_lda_with_opinosis.ipynb | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/notebooks/ensemble_lda_with_opinosis.ipynb b/docs/notebooks/ensemble_lda_with_opinosis.ipynb index 72ad2a26ba..c79f2a1416 100644 --- a/docs/notebooks/ensemble_lda_with_opinosis.ipynb +++ b/docs/notebooks/ensemble_lda_with_opinosis.ipynb @@ -52,8 +52,7 @@ "\n", "Opinosis [1] is a small (but redundant) corpus that contains 289 product reviews for 51 products. Since it's so small, the results are rather unstable.\n", "\n", - "[1] Kavita Ganesan, ChengXiang Zhai, and Jiawei Han, _Opinosis: a graph-based approach to abstractive summarization of highly redundant opinions,_ Proceedings of the 23rd International Conference on Computational Linguistics, Association for Computational Linguistics, 2010, pp. 340–348.\n", - "http://kavita-ganesan.com/opinosis-opinion-dataset/ https://github.com/kavgan/opinosis" + "[1] Kavita Ganesan, ChengXiang Zhai, and Jiawei Han, _Opinosis: a graph-based approach to abstractive summarization of highly redundant opinions [online],_ Proceedings of the 23rd International Conference on Computational Linguistics, Association for Computational Linguistics, 2010, pp. 340–348. Available from: https://kavita-ganesan.com/opinosis/" ] }, { @@ -168,7 +167,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.8.1" } }, "nbformat": 4, From 2aabe8f1f929f41aee979115658e820942bc0866 Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Fri, 24 Jan 2020 15:03:04 +0100 Subject: [PATCH 096/166] missing opinosiscorpus.rst file committed --- docs/src/corpora/opinosiscorpus.rst | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 docs/src/corpora/opinosiscorpus.rst diff --git a/docs/src/corpora/opinosiscorpus.rst b/docs/src/corpora/opinosiscorpus.rst new file mode 100644 index 0000000000..3f62454677 --- /dev/null +++ b/docs/src/corpora/opinosiscorpus.rst @@ -0,0 +1,9 @@ +:mod:`corpora.opinosiscorpus` -- Topic related review sentences +=============================================================== + +.. 
automodule:: gensim.corpora.opinosiscorpus + :synopsis: Topic related review sentences + :members: + :inherited-members: + :undoc-members: + :show-inheritance: From 9c25cf5abd666a57d0106bb115f60391d6d54177 Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Thu, 6 Feb 2020 20:35:07 +0100 Subject: [PATCH 097/166] p names refactored to be descriptive, now using append for appending singletons to list --- gensim/models/ensemblelda.py | 66 ++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 1132141a32..5833c5a41c 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -573,34 +573,34 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers): # get the chunk from the random states that is meant to be for those models random_states_for_worker = random_states[-num_models_unhandled:][:num_subprocess_models] - p = Process(target=self._generate_topic_models, + process = Process(target=self._generate_topic_models, args=(num_subprocess_models, random_states_for_worker, childConn)) - processes += [p] - pipes += [(parentConn, childConn)] - p.start() + processes.append(process) + pipes.append((parentConn, childConn)) + process.start() num_models_unhandled -= num_subprocess_models except ProcessError: logger.error("could not start process {}".format(i)) # close all pipes - for p in pipes: - p[1].close() - p[0].close() + for pipe in pipes: + pipe[1].close() + pipe[0].close() # end all processes - for p in processes: - if p.is_alive(): - p.terminate() - del p + for process in processes: + if process.is_alive(): + process.terminate() + del process # stop raise # aggregate results # will also block until workers are finished - for p in pipes: - answer = p[0].recv() # [0], because that is the parentConn - p[0].close() + for pipe in pipes: + answer = pipe[0].recv() # [0], because that is the parentConn + pipe[0].close() # this does basically the same as the _generate_topic_models function (concatenate all the ttdas): if not self.memory_friendly_ttda: self.tms += answer @@ -610,8 +610,8 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers): self.ttda = np.concatenate([self.ttda, ttda]) # end all processes - for p in processes: - p.terminate() + for process in processes: + process.terminate() def _generate_topic_models(self, num_models, random_states=None, pipe=None): """Train the topic models that form the ensemble. @@ -731,39 +731,39 @@ def _generate_asymmetric_distance_matrix(self): else: n_ttdas = int((len(self.ttda) - ttdas_sent) / (workers - i)) - p = Process(target=self._asymmetric_distance_matrix_worker, + process = Process(target=self._asymmetric_distance_matrix_worker, args=(i, ttdas_sent, n_ttdas, childConn, threshold, method)) ttdas_sent += n_ttdas - processes += [p] - pipes += [(parentConn, childConn)] - p.start() + processes.append(process) + pipes.append((parentConn, childConn)) + process.start() except ProcessError: logger.error("could not start process {}".format(i)) # close all pipes - for p in pipes: - p[1].close() - p[0].close() + for pipe in pipes: + pipe[1].close() + pipe[0].close() # end all processes - for p in processes: - if p.is_alive(): - p.terminate() - del p + for process in processes: + if process.is_alive(): + process.terminate() + del process raise distances = [] # note, that the following loop maintains order in how the ttda will be concatenated # which is very important. 
Ordering in ttda has to be the same as when using only one process - for p in pipes: - answer = p[0].recv() # [0], because that is the parentConn - p[0].close() # child conn will be closed from inside the worker + for pipe in pipes: + answer = pipe[0].recv() # [0], because that is the parentConn + pipe[0].close() # child conn will be closed from inside the worker # this does basically the same as the _generate_topic_models function (concatenate all the ttdas): - distances += [answer[1]] + distances.append(answer[1]) # end all processes - for p in processes: - p.terminate() + for process in processes: + process.terminate() self.asymmetric_distance_matrix = np.concatenate(distances) From 4079c27dbb75ee3f9fff8791c1e83d2398aabea8 Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Thu, 6 Feb 2020 20:48:30 +0100 Subject: [PATCH 098/166] Changing to hanging indents where they were not used before --- gensim/models/ensemblelda.py | 37 ++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 5833c5a41c..b0e40a6f6c 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -280,11 +280,13 @@ def get_topic_model_class(self): del self.topic_model_module_string del self.topic_model_class_string except ModuleNotFoundError: - logger.error('Could not import the "{}" module in order to provide the "{}" class as ' + logger.error( + 'Could not import the "{}" module in order to provide the "{}" class as ' '"topic_model_class" attribute. Try setting this manually instead after loading.' .format(self.topic_model_class_string, self.topic_model_class_string)) except AttributeError: - logger.error('Could not import the "{}" class from the "{}" module in order to set the ' + logger.error( + 'Could not import the "{}" class from the "{}" module in order to set the ' '"topic_model_class" attribute. Try setting this manually instead after loading.' .format(self.topic_model_class_string, self.topic_model_module_string)) return self.topic_model_class @@ -487,9 +489,10 @@ def add_model(self, target, num_new_models=None): # 1. ttda array if isinstance(target.dtype.type(), (np.number, float)): - raise ValueError('ttda arrays cannot be added to ensembles, for which memory_friendly_ttda=False, ' - 'you can call convert_to_memory_friendly, but it will discard the stored gensim' - 'models and only keep the relevant topic term distributions from them.') + raise ValueError( + 'ttda arrays cannot be added to ensembles, for which memory_friendly_ttda=False, ' + 'you can call convert_to_memory_friendly, but it will discard the stored gensim ' + 'models and only keep the relevant topic term distributions from them.') # 2. list of ensembles elif isinstance(target[0], type(self)): @@ -509,15 +512,17 @@ def add_model(self, target, num_new_models=None): # in this case, len(self.tms) should # always match self.num_models if num_new_models is not None and num_new_models + self.num_models != len(self.tms): - logger.info('num_new_models will be ignored. num_models should match the number of ' - 'stored models for a memory unfriendly ensemble') + logger.info( + 'num_new_models will be ignored. num_models should match the number of ' + 'stored models for a memory unfriendly ensemble') self.num_models = len(self.tms) logger.info("ensemble contains {} models and {} topics now".format(self.num_models, len(self.ttda))) if self.ttda.shape[1] != ttda.shape[1]: - raise ValueError(("target ttda dimensions do not match. 
Topics must be {} but was" - "{} elements large").format(self.ttda.shape[-1], ttda.shape[-1])) + raise ValueError( + "target ttda dimensions do not match. Topics must be {} but was {} elements large" + .format(self.ttda.shape[-1], ttda.shape[-1])) self.ttda = np.append(self.ttda, ttda, axis=0) # tell recluster that the distance matrix needs to be regenerated @@ -636,16 +641,16 @@ def _generate_topic_models(self, num_models, random_states=None, pipe=None): assert len(random_states) == num_models - kwArgs = self.gensim_kw_args.copy() + kwargs = self.gensim_kw_args.copy() tm = None # remember one of the topic models from the following # loop, in order to collect some properties from it afterwards. for i in range(num_models): - kwArgs["random_state"] = random_states[i] + kwargs["random_state"] = random_states[i] - tm = self.get_topic_model_class()(**kwArgs) + tm = self.get_topic_model_class()(**kwargs) # adds the lambda (that is the unnormalized get_topics) to ttda, which is # a list of all those lambdas @@ -721,8 +726,7 @@ def _generate_asymmetric_distance_matrix(self): try: parentConn, childConn = Pipe() - # figure out how many ttdas to send to the worker - # 9 ttdas to 4 workers: 2 2 2 3 + # Load Balancing, for example if there are 9 ttdas and 4 workers, the load will be balanced 2, 2, 2, 3. n_ttdas = 0 if i == workers - 1: # i is a index, hence -1 # is this the last worker that needs to be created? @@ -741,15 +745,16 @@ def _generate_asymmetric_distance_matrix(self): except ProcessError: logger.error("could not start process {}".format(i)) - # close all pipes + for pipe in pipes: pipe[1].close() pipe[0].close() - # end all processes + for process in processes: if process.is_alive(): process.terminate() del process + raise distances = [] From f5379ff7485b77de05c102d453a8c3e35a51546a Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Thu, 6 Feb 2020 21:32:38 +0100 Subject: [PATCH 099/166] Adding :meth: and `` `` styling for RST --- gensim/models/ensemblelda.py | 68 ++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index b0e40a6f6c..0f7fec955b 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -8,7 +8,7 @@ """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. -Extracts reliable topics that are consistently learned accross multiple LDA models. eLDA has the added benefit that +Extracts reliable topics that are consistently learned across multiple LDA models. eLDA has the added benefit that the user does not need to know the exact number of topics the topic model should extract ahead of time For more details read our paper (https://www.hip70890b.de/machine_learning/ensemble_LDA/). @@ -319,7 +319,7 @@ def generate_gensim_representation(self): ------- :py:class:`gensim.models.LdaModel` A Gensim LDA Model classic_model_representation for which: - classic_model_representation.get_topics() == self.get_topics() + ``classic_model_representation.get_topics() == self.get_topics()`` """ logger.info("generating classic gensim model representation based on results from the ensemble") @@ -374,7 +374,7 @@ def generate_gensim_representation(self): # the factor, that will be used when get_topics() is used, for normalization # will never change, because the sum for eta as well as the sum for sstats is constant. # Therefore predicting normalization_factor becomes super easy. 
- # corpus is a mapping of id to occurences + # corpus is a mapping of id to occurrences # so one can also easily calculate the # right sstats, so that get_topics() will return the stable topics no @@ -394,7 +394,7 @@ def generate_gensim_representation(self): return classic_model_representation def add_model(self, target, num_new_models=None): - """Add the ttda of another model to the ensemble. + """Add the topic term distribution array (ttda) of another model to the ensemble. This way, multiple topic models can be connected to an ensemble manually. Make sure that all the models use the exact same dictionary/idword mapping. @@ -403,12 +403,12 @@ def add_model(self, target, num_new_models=None): self._generate_asymmetric_distance_matrix() self.recluster() - The ttda of another ensemble can also be used, in that case set num_new_models to the num_models parameter - of the ensemble, that means the number of classic models in the ensemble that generated the ttda. This is - important, because that information is used to estimate "min_samples" for _generate_topic_clusters. + The ttda of another ensemble can also be used, in that case set ``num_new_models`` to the ``num_models`` + parameter of the ensemble, that means the number of classic models in the ensemble that generated the ttda. + This is important, because that information is used to estimate "min_samples" for _generate_topic_clusters. If you trained this ensemble in the past with a certain Dictionary that you want to reuse for other - models, you can get it from: self.id2word. + models, you can get it from: ``self.id2word``. Parameters ---------- @@ -435,7 +435,7 @@ def add_model(self, target, num_new_models=None): If target is a 2D-array of float values, it assumes 1. - If the ensemble has memory_friendly_ttda set to False, then it will always use the number of models in + If the ensemble has ``memory_friendly_ttda`` set to False, then it will always use the number of models in the target parameter. """ @@ -775,7 +775,7 @@ def _generate_asymmetric_distance_matrix(self): return self.asymmetric_distance_matrix def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, start_index, method): - """Calculate an (asymmetric) distance from each topic in ttda1 to each topic in ttda2. + """Calculate an (asymmetric) distance from each topic in ``ttda1`` to each topic in ``ttda2``. Parameters ---------- @@ -783,10 +783,10 @@ def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, s Two ttda matrices that are going to be used for distance calculation. Each row in ttda corresponds to one topic. Each cell in the resulting matrix corresponds to the distance between a topic pair. threshold : float, optional - threshold defaults to: {"mass": 0.95, "rank": 0.11}, depending on the selected method + threshold defaults to: ``{"mass": 0.95, "rank": 0.11}``, depending on the selected method start_index : int this function might be used in multiprocessing, so start_index has to be set as ttda1 is a chunk of the - complete ttda in that case. start_index would be 0 if ttda1 == self.ttda. When self.ttda is split into two + complete ttda in that case. start_index would be 0 if ``ttda1 == self.ttda``. When self.ttda is split into two pieces, each 100 ttdas long, then start_index should be be 100. 
default is 0 method : {'mass', 'rank}, optional method can be "mass" for the original masking method or "rank" for a faster masking method that selects @@ -796,7 +796,7 @@ def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, s Returns ------- 2D Numpy.numpy.ndarray of floats - Asymmetric distance matrix of size len(ttda1) by len(ttda2). + Asymmetric distance matrix of size ``len(ttda1)`` by ``len(ttda2)``. """ # initialize the distance matrix. ndarray is faster than zeros @@ -860,14 +860,15 @@ def rank_masking(a): def _generate_topic_clusters(self, eps=0.1, min_samples=None): """Run the CBDBSCAN algorithm on all the detected topics and label them with label-indices. - The final approval and generation of stable topics is done in _generate_stable_topics(). + The final approval and generation of stable topics is done in ``_generate_stable_topics()``. Parameters ---------- eps : float - eps is 0.1 by default. - min_samples : int - min_samples is int(self.num_models / 2) + dbscan distance scale + min_samples : int, optional + defaults to ``int(self.num_models / 2)``, dbscan min neighbours threshold which corresponds to finding + stable topics and should scale with the number of models, ``self.num_models`` """ if min_samples is None: @@ -878,18 +879,16 @@ def _generate_topic_clusters(self, eps=0.1, min_samples=None): self.cluster_model = CBDBSCAN(eps=eps, min_samples=min_samples) self.cluster_model.fit(self.asymmetric_distance_matrix) - def _validate_core(self, core): - """Check if the core has only a single valid parent, which is also the label assigned to the core. - - If the presumed core is not even a core returns False. + def _is_valid_core(self, topic): + """Check if the topic is a valid core. Parameters ---------- - core : {'is_core', 'valid_parents', 'labels'} - eps is 0.1 by default. + topic : {'is_core', 'valid_parents', 'label'} + topic to validate """ - return core["is_core"] and (core["valid_parents"] == {core["label"]}) + return topic["is_core"] and (topic["valid_parents"] == {topic["label"]}) def _group_by_labels(self, results): """Group all the learned cores by their label, which was assigned in the cluster_model. @@ -969,31 +968,32 @@ def _aggregate_topics(self, grouped_by_labels): return sorted_clusters def _remove_from_all_sets(self, label, clusters): - """Remove a label from every set in "neighboring_labels" for each core in clusters.""" + """Remove a label from every set in "neighboring_labels" for each core in ``clusters``.""" for cluster in clusters: for neighboring_labels_set in cluster["neighboring_labels"]: if label in neighboring_labels_set: neighboring_labels_set.remove(label) def _contains_isolated_cores(self, label, cluster, min_cores): - """Check if the cluster has at least min_cores of cores that belong to no other cluster.""" + """Check if the cluster has at least ``min_cores`` of cores that belong to no other cluster.""" return sum(map(lambda x: x == {label}, cluster["neighboring_labels"])) >= min_cores def _generate_stable_topics(self, min_cores=None): """Generate stable topics out of the clusters. The function finds clusters of topics using a variant of DBScan. If a cluster has enough core topics - (c.f. parameter min_cores), then this cluster represents a stable topic. The stable topic is specifically + (c.f. parameter ``min_cores``), then this cluster represents a stable topic. The stable topic is specifically calculated as the average over all topic-term distributions of the core topics in the cluster. 
This function is the last step that has to be done in the ensemble. After this step is complete, - Stable topics can be retrieved afterwards using the get_topics() method. + Stable topics can be retrieved afterwards using the :meth:`gensim.models.ensemblelda.EnsembleLda.get_topics` + method. Parameters ---------- min_cores : int Minimum number of core topics needed to form a cluster that represents a stable topic. - Using None defaults to min_cores = min(3, max(1, int(self.num_models /4 +1))) + Using ``None`` defaults to ``min_cores = min(3, max(1, int(self.num_models /4 +1)))`` """ # min_cores being 0 makes no sense. there has to be a core for a cluster @@ -1039,7 +1039,7 @@ def _generate_stable_topics(self, min_cores=None): topic["valid_parents"] = {label for label in topic["neighboring_labels"] if label in valid_labels} # keeping only VALID cores - valid_core_mask = np.vectorize(self._validate_core)(results) + valid_core_mask = np.vectorize(self._is_valid_core)(results) valid_topics = self.ttda[valid_core_mask] topic_labels = np.array([topic["label"] for topic in results])[valid_core_mask] unique_labels = np.unique(topic_labels) @@ -1059,20 +1059,20 @@ def _generate_stable_topics(self, min_cores=None): def recluster(self, eps=0.1, min_samples=None, min_cores=None): """Reapply CBDBSCAN clustering and stable topic generation. - Stable topics can be retrieved using get_topics(). + Stable topics can be retrieved using :meth:`gensim.models.ensemblelda.EnsembleLda.get_topics`. Parameters ---------- eps : float epsilon for the CBDBSCAN algorithm, having the same meaning as in classic DBSCAN clustering. - default: 0.1 + default: ``0.1`` min_samples : int The minimum number of samples in the neighborhood of a topic to be considered a core in CBDBSCAN. - default: int(self.num_models / 2) + default: ``int(self.num_models / 2)`` min_cores : int how many cores a cluster has to have, to be treated as stable topic. That means, how many topics that look similar have to be present, so that the average topic in those is used as stable topic. 
- default: min(3, max(1, int(self.num_models /4 +1))) + default: ``min(3, max(1, int(self.num_models /4 +1)))`` """ # if new models were added to the ensemble, the distance matrix needs to be generated again From 591cf77bffddf35ad1badae408b4ac007a8b042b Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Thu, 6 Feb 2020 21:37:50 +0100 Subject: [PATCH 100/166] a bunch of reviews --- .../ensemble_lda_with_opinosis.ipynb | 16 +++++++--- gensim/corpora/opinosiscorpus.py | 30 ++++++------------- gensim/models/ensemblelda.py | 14 +++++---- 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/docs/notebooks/ensemble_lda_with_opinosis.ipynb b/docs/notebooks/ensemble_lda_with_opinosis.ipynb index c79f2a1416..7380a87f71 100644 --- a/docs/notebooks/ensemble_lda_with_opinosis.ipynb +++ b/docs/notebooks/ensemble_lda_with_opinosis.ipynb @@ -2,11 +2,19 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "scrolled": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "####\n" + ] + } + ], "source": [ "import logging\n", "from gensim.models import EnsembleLda, LdaMulticore\n", @@ -23,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -34,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ diff --git a/gensim/corpora/opinosiscorpus.py b/gensim/corpora/opinosiscorpus.py index ffc0da8eba..1a68ec670c 100644 --- a/gensim/corpora/opinosiscorpus.py +++ b/gensim/corpora/opinosiscorpus.py @@ -54,33 +54,21 @@ def __init__(self, path): stemmer = PorterStemmer() for directory, b, filenames in os.walk(path): - # iterates over folders, so in this - # scope a new topic is prepared - - # root directory? - if len(filenames) == 0: - continue - - # folder = directory.split(os.sep)[-1] - # print("processing folder ", "'"+folder+"'", "...") - + # each subdirectory of path is one collection of reviews to a specific product # now get the corpus/documents for filename in filenames: - filepath = directory + os.sep + filename # write down the document and the topicId and split into train and testdata with open(filepath) as file: - doc = file.read() - # overwrite dictionary, add corpus to test or train afterwards - # the following function takes an array of documents, so wrap it in square braces - processed = [ - stemmer.stem(token) for token in re.findall(r'\w+', doc.lower()) - if token not in STOPWORDS - ] - - dictionary.add_documents([processed]) - corpus += [dictionary.doc2bow(processed)] + + preprocessed_doc = [ + stemmer.stem(token) for token in re.findall(r'\w+', doc.lower()) + if token not in STOPWORDS + ] + + dictionary.add_documents([preprocessed_doc]) + corpus += [dictionary.doc2bow(preprocessed_doc)] # and return the results the same way the other corpus generating functions do self.corpus = corpus diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 1132141a32..ba80a4ab3c 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -113,6 +113,8 @@ logger = logging.getLogger(__name__) +print('####') + class EnsembleLda(SaveLoad): """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. 
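The preprocessing flow that the opinosiscorpus.py hunk earlier in this patch settles on is easy to lose in the diff noise, so here it is as a self-contained sketch; the `preprocess` wrapper and the `docs` iterable are illustrative only and not part of the patch, and the imports assume gensim's bundled Porter stemmer and stopword list:

    import re

    from gensim.corpora import Dictionary
    from gensim.parsing.porter import PorterStemmer
    from gensim.parsing.preprocessing import STOPWORDS

    def preprocess(docs):
        # lowercase, tokenize on word characters, drop stopwords, stem,
        # then map every document to a bag-of-words vector
        stemmer = PorterStemmer()
        dictionary = Dictionary()
        corpus = []
        for doc in docs:
            tokens = [
                stemmer.stem(token) for token in re.findall(r'\w+', doc.lower())
                if token not in STOPWORDS
            ]
            dictionary.add_documents([tokens])
            corpus.append(dictionary.doc2bow(tokens))
        return corpus, dictionary
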
@@ -274,6 +276,8 @@ def __init__(self, topic_model_class="lda", num_models=3, def get_topic_model_class(self): """Get the class that is used for :meth:`gensim.models.EnsembleLda.generate_gensim_representation`.""" if self.topic_model_class is None: + instruction = 'Try setting topic_model_class manually to what the individual models were based on, ' \ + 'e.g. LdaMulticore.' try: module = importlib.import_module(self.topic_model_module_string) self.topic_model_class = getattr(module, self.topic_model_class_string) @@ -281,12 +285,12 @@ def get_topic_model_class(self): del self.topic_model_class_string except ModuleNotFoundError: logger.error('Could not import the "{}" module in order to provide the "{}" class as ' - '"topic_model_class" attribute. Try setting this manually instead after loading.' - .format(self.topic_model_class_string, self.topic_model_class_string)) + '"topic_model_class" attribute. {}' + .format(self.topic_model_class_string, self.topic_model_class_string, instruction)) except AttributeError: logger.error('Could not import the "{}" class from the "{}" module in order to set the ' - '"topic_model_class" attribute. Try setting this manually instead after loading.' - .format(self.topic_model_class_string, self.topic_model_module_string)) + '"topic_model_class" attribute. {}' + .format(self.topic_model_class_string, self.topic_model_module_string, instruction)) return self.topic_model_class def save(self, *args, **kwargs): @@ -361,7 +365,7 @@ def generate_gensim_representation(self): # to generate the proper sstats for the new gensim model: # transform to dimensionality of stable_topics. axis=1 is summed eta_sum = 0 - if int == type(eta) or float == type(eta): + if isinstance(eta, (int, float)): eta_sum = [eta * len(stable_topics[0])] * num_stable_topics else: if len(eta.shape) == 1: # [e1, e2, e3] From f1aba3e7aa5c2c1f992e9f42ff6fae03dbde2b30 Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Thu, 6 Feb 2020 21:58:25 +0100 Subject: [PATCH 101/166] * Changed ensemblelda default to use ldamulticore instead of old lda model. * Added more :meth: and `` `` styling for RST, fixed a few typos --- gensim/models/ensemblelda.py | 38 +++++++++++++++++------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 0f7fec955b..b9e6129403 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -122,7 +122,7 @@ class EnsembleLda(SaveLoad): """ - def __init__(self, topic_model_class="lda", num_models=3, + def __init__(self, topic_model_class="ldamulticore", num_models=3, min_cores=None, # default value from _generate_stable_topics() epsilon=0.1, ensemble_workers=1, memory_friendly_ttda=True, min_samples=None, masking_method="mass", masking_threshold=None, @@ -135,13 +135,13 @@ def __init__(self, topic_model_class="lda", num_models=3, ---------- topic_model_class : str, topic model, optional Examples: - 'ldamulticore' (recommended), 'lda' (default), + * 'ldamulticore' (default, recommended) + * 'lda' ensemble_workers : number, optional Spawns that many processes and distributes the models from the ensemble to those as evenly as possible. num_models should be a multiple of ensemble_workers. - Setting it to 0 or 1 will both use the nonmultiprocessing version. Default:1 - gensim.models.ldamodel, gensim.models.ldamulticore + Setting it to 0 or 1 will both use the non-multiprocessing version. Default: 1 num_models : int, optional How many LDA models to train in this ensemble. 
Default: 3 @@ -153,7 +153,7 @@ def __init__(self, topic_model_class="lda", num_models=3, Spawns that many processes and distributes the models from the ensemble to those as evenly as possible. num_models should be a multiple of ensemble_workers. - Setting it to 0 or 1 will both use the nonmultiprocessing version. Default:1 + Setting it to 0 or 1 will both use the nonmultiprocessing version. Default: 1 memory_friendly_ttda : boolean, optional If True, the models in the ensemble are deleted after training and only a concatenation of each model's topic term distribution (called ttda) is kept to save memory. @@ -179,12 +179,12 @@ def __init__(self, topic_model_class="lda", num_models=3, For example, a ranking threshold of 0.11 means the top 0.11 terms by weight are used to form a mask. masking_threshold : float, optional - Default: None, which uses 0.95 for "mass", and 0.11 for masking_method "rank". In general, too small a mask - threshold leads to inaccurate calculations (no signal) and too big a mask leads to noisy distance - calculations. Defaults are often a good sweet spot for this hyperparameter. + Default: None, which uses ``0.95`` for "mass", and ``0.11`` for masking_method "rank". In general, too + small a mask threshold leads to inaccurate calculations (no signal) and too big a mask leads to noisy + distance calculations. Defaults are often a good sweet spot for this hyperparameter. distance_workers : int, optional - When distance_workers is None, it defaults to os.cpu_count() for maximum performance. Default is 1, which - is not multiprocessed. Set to > 1 to enable multiprocessing. + When ``distance_workers`` is ``None``, it defaults to ``os.cpu_count()`` for maximum performance. Default is + 1, which is not multiprocessed. Set to ``> 1`` to enable multiprocessing. **gensim_kw_args Parameters for each gensim model (e.g. :py:class:`gensim.models.LdaModel`) in the ensemble. @@ -399,9 +399,9 @@ def add_model(self, target, num_new_models=None): This way, multiple topic models can be connected to an ensemble manually. Make sure that all the models use the exact same dictionary/idword mapping. - In order to generate new stable topics afterwards, use - self._generate_asymmetric_distance_matrix() - self.recluster() + In order to generate new stable topics afterwards, use: + 1. ``self._generate_asymmetric_distance_matrix()`` + 2. ``self.``:meth:`~gensim.models.ensemblelda.EnsembleLda.recluster` The ttda of another ensemble can also be used, in that case set ``num_new_models`` to the ``num_models`` parameter of the ensemble, that means the number of classic models in the ensemble that generated the ttda. @@ -986,7 +986,7 @@ def _generate_stable_topics(self, min_cores=None): calculated as the average over all topic-term distributions of the core topics in the cluster. This function is the last step that has to be done in the ensemble. After this step is complete, - Stable topics can be retrieved afterwards using the :meth:`gensim.models.ensemblelda.EnsembleLda.get_topics` + Stable topics can be retrieved afterwards using the :meth:`~gensim.models.ensemblelda.EnsembleLda.get_topics` method. Parameters @@ -1059,7 +1059,7 @@ def _generate_stable_topics(self, min_cores=None): def recluster(self, eps=0.1, min_samples=None, min_cores=None): """Reapply CBDBSCAN clustering and stable topic generation. - Stable topics can be retrieved using :meth:`gensim.models.ensemblelda.EnsembleLda.get_topics`. + Stable topics can be retrieved using :meth:`~gensim.models.ensemblelda.EnsembleLda.get_topics`. 
Parameters ---------- @@ -1181,7 +1181,6 @@ def __init__(self, eps, min_samples): ---------- eps : float epsilon for the CBDBSCAN algorithm, having the same meaning as in classic DBSCAN clustering. - default: 0.1 min_samples : int The minimum number of samples in the neighborhood of a topic to be considered a core in CBDBSCAN. @@ -1212,15 +1211,14 @@ def fit(self, amatrix): def scan_topic(topic_index, current_label=None, parent_neighbors=None): """Extend the cluster in one direction. - Results are accumulated in topic_clustering_results, a variable outside of this function. + Results are accumulated to ``self.results``. Parameters ---------- topic_index : int - The topic that might be added to the existing cluster, or which might create a new cluster if - neccessary. + The topic that might be added to the existing cluster, or which might create a new cluster if necessary. current_label : int - The label of the cluster that might be suitable for topic_index + The label of the cluster that might be suitable for ``topic_index`` """ neighbors_sorted = sorted( From a9428de68d97652f90f0bd602cd21d59f19b3072 Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Thu, 6 Feb 2020 22:02:47 +0100 Subject: [PATCH 102/166] More docstring polish --- gensim/models/ensemblelda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 29c7f7fee8..1529d80b6b 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -137,7 +137,7 @@ def __init__(self, topic_model_class="ldamulticore", num_models=3, Examples: * 'ldamulticore' (default, recommended) * 'lda' - ensemble_workers : number, optional + ensemble_workers : int, optional Spawns that many processes and distributes the models from the ensemble to those as evenly as possible. num_models should be a multiple of ensemble_workers. @@ -164,7 +164,7 @@ def __init__(self, topic_model_class="ldamulticore", num_models=3, If False, any topic term matrix can be suplied to add_model. min_samples : int, optional Required int of nearby topics for a topic to be considered as 'core' in the CBDBSCAN clustering. - masking_method : {'mass', 'rank'}, optional + masking_method : str, optional Choose one of "mass" (default) or "rank" (percentile, faster). For clustering, distances between topic-term distributions are asymmetric. In particular, the distance From 3bdeaf262bf6bf6a36af4fd1c89ffcc673d73504 Mon Sep 17 00:00:00 2001 From: Alex Loosley Date: Thu, 6 Feb 2020 22:15:13 +0100 Subject: [PATCH 103/166] removing some camel-case vars for pep8 compliance. --- gensim/models/ensemblelda.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 1529d80b6b..4cb384c0b1 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -568,7 +568,7 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers): for i in range(workers): try: - parentConn, childConn = Pipe() + parent_conn, child_conn = Pipe() num_subprocess_models = 0 if i == workers - 1: # i is a index, hence -1 # is this the last worker that needs to be created? 
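The variables renamed in this commit all belong to the standard multiprocessing parent/child idiom; stripped of the ensemble-specific logic, the pattern looks roughly like this (a minimal sketch; `_square` and its payload are illustrative, not code from the patch):

    from multiprocessing import Pipe, Process

    def _square(number, child_conn):
        # the worker computes its share and sends the result back through its end of the pipe
        child_conn.send(number * number)
        child_conn.close()

    if __name__ == '__main__':
        parent_conn, child_conn = Pipe()
        process = Process(target=_square, args=(7, child_conn))
        process.start()
        result = parent_conn.recv()  # blocks until the worker has sent its result
        process.join()
        parent_conn.close()
        print(result)  # 49
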
@@ -581,10 +581,10 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers): random_states_for_worker = random_states[-num_models_unhandled:][:num_subprocess_models] process = Process(target=self._generate_topic_models, - args=(num_subprocess_models, random_states_for_worker, childConn)) + args=(num_subprocess_models, random_states_for_worker, child_conn)) processes.append(process) - pipes.append((parentConn, childConn)) + pipes.append((parent_conn, child_conn)) process.start() num_models_unhandled -= num_subprocess_models @@ -726,7 +726,7 @@ def _generate_asymmetric_distance_matrix(self): for i in range(workers): try: - parentConn, childConn = Pipe() + parent_conn, child_conn = Pipe() # Load Balancing, for example if there are 9 ttdas and 4 workers, the load will be balanced 2, 2, 2, 3. n_ttdas = 0 @@ -738,11 +738,11 @@ def _generate_asymmetric_distance_matrix(self): n_ttdas = int((len(self.ttda) - ttdas_sent) / (workers - i)) process = Process(target=self._asymmetric_distance_matrix_worker, - args=(i, ttdas_sent, n_ttdas, childConn, threshold, method)) + args=(i, ttdas_sent, n_ttdas, child_conn, threshold, method)) ttdas_sent += n_ttdas processes.append(process) - pipes.append((parentConn, childConn)) + pipes.append((parent_conn, child_conn)) process.start() except ProcessError: From 806952ec97ccd59928a5aa1ef6d2c1c61591a9eb Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Thu, 6 Feb 2020 22:49:11 +0100 Subject: [PATCH 104/166] a bunch of reviews --- gensim/models/ensemblelda.py | 80 +++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 37 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 2db339522c..5f8a697926 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -530,6 +530,31 @@ def add_model(self, target, num_new_models=None): # tell recluster that the distance matrix needs to be regenerated self.asymmetric_distance_matrix_outdated = True + def _teardown(self, pipes, processes, i): + """close pipes and terminate processes + + Parameters + ---------- + pipes : {list of :class:`multiprocessing.Pipe`} + list of pipes that the processes use to communicate with the parent + processes : {list of :class:`multiprocessing.Process`} + list of worker processes + i : int + index of the process that could not be started + + """ + logger.error("could not start process {}".format(i)) + + for pipe in pipes: + pipe[1].close() + pipe[0].close() + + for process in processes: + if process.is_alive(): + process.terminate() + del process + + def _generate_topic_models_multiproc(self, num_models, ensemble_workers): """Generate the topic models to form the ensemble in a multiprocessed way. @@ -567,21 +592,22 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers): num_models_unhandled = num_models # how many more models need to be trained by workers? for i in range(workers): - try: - parentConn, childConn = Pipe() - num_subprocess_models = 0 - if i == workers - 1: # i is a index, hence -1 - # is this the last worker that needs to be created? - # then task that worker with all the remaining models - num_subprocess_models = num_models_unhandled - else: - num_subprocess_models = int(num_models_unhandled / (workers - i)) + parentConn, childConn = Pipe() + num_subprocess_models = 0 + if i == workers - 1: # i is a index, hence -1 + # is this the last worker that needs to be created? 
+ # then task that worker with all the remaining models + num_subprocess_models = num_models_unhandled + else: + num_subprocess_models = int(num_models_unhandled / (workers - i)) - # get the chunk from the random states that is meant to be for those models - random_states_for_worker = random_states[-num_models_unhandled:][:num_subprocess_models] + # get the chunk from the random states that is meant to be for those models + random_states_for_worker = random_states[-num_models_unhandled:][:num_subprocess_models] - process = Process(target=self._generate_topic_models, - args=(num_subprocess_models, random_states_for_worker, childConn)) + try: + process = Process( + target=self._generate_topic_models, + args=(num_subprocess_models, random_states_for_worker, childConn)) processes.append(process) pipes.append((parentConn, childConn)) @@ -590,18 +616,8 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers): num_models_unhandled -= num_subprocess_models except ProcessError: - logger.error("could not start process {}".format(i)) - # close all pipes - for pipe in pipes: - pipe[1].close() - pipe[0].close() - # end all processes - for process in processes: - if process.is_alive(): - process.terminate() - del process - # stop - raise + self._teardown(pipes, processes, i) + raise ProcessError # aggregate results # will also block until workers are finished @@ -746,18 +762,8 @@ def _generate_asymmetric_distance_matrix(self): process.start() except ProcessError: - logger.error("could not start process {}".format(i)) - - for pipe in pipes: - pipe[1].close() - pipe[0].close() - - for process in processes: - if process.is_alive(): - process.terminate() - del process - - raise + self._teardown(pipes, processes, i) + raise ProcessError distances = [] # note, that the following loop maintains order in how the ttda will be concatenated From 3d24f62ef229a7cb4acb884811c032e19120e077 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 9 Feb 2020 20:27:52 +0100 Subject: [PATCH 105/166] fixed linter --- gensim/models/ensemblelda.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 6ce0289c76..e793a9edcc 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -522,9 +522,8 @@ def add_model(self, target, num_new_models=None): logger.info("ensemble contains {} models and {} topics now".format(self.num_models, len(self.ttda))) if self.ttda.shape[1] != ttda.shape[1]: - raise ValueError( - "target ttda dimensions do not match. Topics must be {} but was {} elements large" - .format(self.ttda.shape[-1], ttda.shape[-1])) + raise ValueError("target ttda dimensions do not match. Topics must be {} but was {} elements large" + .format(self.ttda.shape[-1], ttda.shape[-1])) self.ttda = np.append(self.ttda, ttda, axis=0) # tell recluster that the distance matrix needs to be regenerated @@ -554,7 +553,6 @@ def _teardown(self, pipes, processes, i): process.terminate() del process - def _generate_topic_models_multiproc(self, num_models, ensemble_workers): """Generate the topic models to form the ensemble in a multiprocessed way. @@ -794,8 +792,8 @@ def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, s threshold defaults to: ``{"mass": 0.95, "rank": 0.11}``, depending on the selected method start_index : int this function might be used in multiprocessing, so start_index has to be set as ttda1 is a chunk of the - complete ttda in that case. 
start_index would be 0 if ``ttda1 == self.ttda``. When self.ttda is split into two
-            pieces, each 100 ttdas long, then start_index should be be 100. default is 0
+            complete ttda in that case. start_index would be 0 if ``ttda1 == self.ttda``. When self.ttda is split into
+            two pieces, each 100 ttdas long, then start_index should be 100. default is 0
         method : {'mass', 'rank}, optional
             method can be "mass" for the original masking method or "rank" for a faster masking method that selects
             by rank of largest elements in the topic term distribution, to determine which tokens are relevant for the

From 619922d28d34ad113a340a8beb22439887902501 Mon Sep 17 00:00:00 2001
From: sezanzeb 
Date: Sat, 24 Oct 2020 11:57:00 +0200
Subject: [PATCH 106/166] less precision for windows

---
 gensim/test/test_ensemblelda.py | 34 ++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index 6794cb71a0..84ca9804cb 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -7,6 +7,7 @@
 Automated tests for checking the EnsembleLda Class
 """

+import os
 import logging
 import unittest

@@ -20,6 +21,9 @@
 num_models = 4
 passes = 50

+# windows tests fail due to the required assertion precision being too high
+rtol = 1e-04 if os.name == 'nt' else 1e-05
+

 class TestModel(unittest.TestCase):
     def setUp(self):
@@ -59,7 +63,7 @@ def test_eLDA(self):

         # compare with a pre-trained reference model
         reference = EnsembleLda.load(datapath('ensemblelda'))
-        np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=1e-05)
+        np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=rtol)
         # small values in the distance matrix tend to vary quite a bit around 2%,
         # so use some absolute tolerance measurement to check if the matrix is at least
         # close to the target.
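The platform-dependent `rtol` introduced above changes what counts as "equal" in every assertion of this test module; as a rough standalone illustration (the sample numbers are invented for the example):

    import os

    import numpy as np

    rtol = 1e-04 if os.name == 'nt' else 1e-05

    actual = np.array([0.1000009, 0.2000000])
    desired = np.array([0.1000000, 0.1999998])

    # the relative error here is below 1e-05, so this passes on every platform,
    # while numpy's default rtol of 1e-07 would raise an AssertionError
    np.testing.assert_allclose(actual, desired, rtol=rtol)
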
@@ -84,7 +88,7 @@ def test_clustering(self): self.assert_cluster_results_equal(reference.cluster_model.results, cluster_model_results) self.assertEqual(reference.sorted_clusters, sorted_clusters) - np.testing.assert_allclose(reference.get_topics(), stable_topics, rtol=1e-05) + np.testing.assert_allclose(reference.get_topics(), stable_topics, rtol=rtol) def test_not_trained(self): # should not throw errors and no training should happen @@ -115,14 +119,14 @@ def test_memory_unfriendly(self): # both should be 100% similar (but floats cannot # be compared that easily, so check for threshold) self.assertEqual(len(self.eLDA_mu.tms), num_models) - np.testing.assert_allclose(self.eLDA.ttda, self.eLDA_mu.ttda, rtol=1e-05) - np.testing.assert_allclose(self.eLDA.get_topics(), self.eLDA_mu.get_topics(), rtol=1e-05) + np.testing.assert_allclose(self.eLDA.ttda, self.eLDA_mu.ttda, rtol=rtol) + np.testing.assert_allclose(self.eLDA.get_topics(), self.eLDA_mu.get_topics(), rtol=rtol) self.check_ttda(self.eLDA_mu) def test_generate_gensim_rep(self): gensimModel = self.eLDA.generate_gensim_representation() topics = gensimModel.get_topics() - np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) + np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=rtol) def assert_cluster_results_equal(self, a, b): """compares important attributes of the cluster results""" @@ -152,9 +156,9 @@ def test_persisting(self): topics = loaded_eLDA_representation.get_topics() ttda = loaded_eLDA.ttda amatrix = loaded_eLDA.asymmetric_distance_matrix - np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) - np.testing.assert_allclose(self.eLDA.ttda, ttda, rtol=1e-05) - np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, amatrix, rtol=1e-05) + np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=rtol) + np.testing.assert_allclose(self.eLDA.ttda, ttda, rtol=rtol) + np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, amatrix, rtol=rtol) a = self.eLDA.cluster_model.results b = loaded_eLDA.cluster_model.results @@ -164,7 +168,7 @@ def test_persisting(self): # memory unfriendly loaded_eLDA_mu_representation = loaded_eLDA_mu.generate_gensim_representation() topics = loaded_eLDA_mu_representation.get_topics() - np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=1e-05) + np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=rtol) def test_multiprocessing(self): # same configuration @@ -186,8 +190,8 @@ def test_multiprocessing(self): random_state=random_state, ensemble_workers=workers, distance_workers=workers, memory_friendly_ttda=False) - np.testing.assert_allclose(self.eLDA.get_topics(), eLDA_multi.get_topics(), rtol=1e-05) - np.testing.assert_allclose(self.eLDA_mu.get_topics(), eLDA_multi_mu.get_topics(), rtol=1e-05) + np.testing.assert_allclose(self.eLDA.get_topics(), eLDA_multi.get_topics(), rtol=rtol) + np.testing.assert_allclose(self.eLDA_mu.get_topics(), eLDA_multi_mu.get_topics(), rtol=rtol) def test_add_models(self): # same configuration @@ -310,14 +314,14 @@ def test_add_and_recluster(self): iterations=30, random_state=1, topic_model_class='ldamulticore', distance_workers=4, memory_friendly_ttda=False) # both should be similar - np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) - np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) + np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=rtol) + np.testing.assert_allclose(new_eLDA.get_topics(), 
new_eLDA_mu.get_topics(), rtol=rtol) # and every next step applied to both should result in similar results # 1. adding to ttda and tms new_eLDA.add_model(self.eLDA) new_eLDA_mu.add_model(self.eLDA_mu) - np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=1e-05) + np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=rtol) self.assertEqual(len(new_eLDA.ttda), len(self.eLDA.ttda) + num_new_models * num_new_topics) self.assertEqual(len(new_eLDA_mu.ttda), len(self.eLDA_mu.ttda) + num_new_models * num_new_topics) self.assertEqual(len(new_eLDA_mu.tms), num_models + num_new_models) @@ -347,7 +351,7 @@ def test_add_and_recluster(self): new_eLDA_mu.generate_gensim_representation() # same random state, hence topics should be still similar - np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=1e-05) + np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=rtol) def test_inference(self): import numpy as np From 3f0041434eb9c33ab37c7ef108f39e5219b74c4d Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 24 Oct 2020 12:10:45 +0200 Subject: [PATCH 107/166] hanging indents --- gensim/models/ensemblelda.py | 68 +++++++++++------ gensim/test/test_ensemblelda.py | 126 ++++++++++++++++++++------------ 2 files changed, 123 insertions(+), 71 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index e793a9edcc..a501fb01df 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -122,11 +122,13 @@ class EnsembleLda(SaveLoad): """ - def __init__(self, topic_model_class="ldamulticore", num_models=3, - min_cores=None, # default value from _generate_stable_topics() - epsilon=0.1, ensemble_workers=1, memory_friendly_ttda=True, - min_samples=None, masking_method="mass", masking_threshold=None, - distance_workers=1, random_state=None, **gensim_kw_args): + def __init__( + self, topic_model_class="ldamulticore", num_models=3, + min_cores=None, # default value from _generate_stable_topics() + epsilon=0.1, ensemble_workers=1, memory_friendly_ttda=True, + min_samples=None, masking_method="mass", masking_threshold=None, + distance_workers=1, random_state=None, **gensim_kw_args, + ): """Create and train a new EnsembleLda model. Will start training immediatelly, except if iterations, passes or num_models is 0 or if the corpus is missing. @@ -210,7 +212,8 @@ def __init__(self, topic_model_class="ldamulticore", num_models=3, raise ValueError( "at least one of corpus/id2word must be specified, to establish " "input space dimensionality. Corpus should be provided using the " - "`corpus` keyword argument.") + "`corpus` keyword argument." 
+                "`corpus` keyword argument."
+            )

         if type(topic_model_class) == type and issubclass(topic_model_class, ldamodel.LdaModel):
             self.topic_model_class = topic_model_class
@@ -221,7 +224,9 @@ def __init__(self, topic_model_class="ldamulticore", num_models=3,
             }
             if topic_model_class not in kinds:
                 raise ValueError(
-                    "topic_model_class should be one of 'lda', 'ldamulticode' or a model inheriting from LdaModel")
+                    "topic_model_class should be one of 'lda', 'ldamulticore' or a model "
+                    "inheriting from LdaModel"
+                )
             self.topic_model_class = kinds[topic_model_class]

         self.num_models = num_models
@@ -274,8 +279,10 @@ def __init__(self, topic_model_class="ldamulticore", num_models=3,
     def get_topic_model_class(self):
         """Get the class that is used for :meth:`gensim.models.EnsembleLda.generate_gensim_representation`."""
         if self.topic_model_class is None:
-            instruction = 'Try setting topic_model_class manually to what the individual models were based on, ' \
-                'e.g. LdaMulticore.'
+            instruction = (
+                'Try setting topic_model_class manually to what the individual models were based on, '
+                'e.g. LdaMulticore.'
+            )
             try:
                 module = importlib.import_module(self.topic_model_module_string)
                 self.topic_model_class = getattr(module, self.topic_model_class_string)
@@ -285,12 +292,14 @@ def get_topic_model_class(self):
                 logger.error(
                     'Could not import the "{}" module in order to provide the "{}" class as '
                     '"topic_model_class" attribute. {}'
-                    .format(self.topic_model_class_string, self.topic_model_class_string, instruction))
+                    .format(self.topic_model_module_string, self.topic_model_class_string, instruction)
+                )
             except AttributeError:
                 logger.error(
                     'Could not import the "{}" class from the "{}" module in order to set the '
                     '"topic_model_class" attribute. {}'
-                    .format(self.topic_model_class_string, self.topic_model_module_string, instruction))
+                    .format(self.topic_model_class_string, self.topic_model_module_string, instruction)
+                )
         return self.topic_model_class

     def save(self, *args, **kwargs):
@@ -516,14 +525,18 @@ def add_model(self, target, num_new_models=None):
             if num_new_models is not None and num_new_models + self.num_models != len(self.tms):
                 logger.info(
                     'num_new_models will be ignored. num_models should match the number of '
-                    'stored models for a memory unfriendly ensemble')
+                    'stored models for a memory unfriendly ensemble'
+                )
             self.num_models = len(self.tms)

         logger.info("ensemble contains {} models and {} topics now".format(self.num_models, len(self.ttda)))

         if self.ttda.shape[1] != ttda.shape[1]:
-            raise ValueError("target ttda dimensions do not match. Topics must be {} but was {} elements large"
-                             .format(self.ttda.shape[-1], ttda.shape[-1]))
+            raise ValueError(
+                "target ttda dimensions do not match. Topics must be {} but was {} elements large".format(
+                    self.ttda.shape[-1], ttda.shape[-1],
+                )
+            )
         self.ttda = np.append(self.ttda, ttda, axis=0)

         # tell recluster that the distance matrix needs to be regenerated
@@ -605,7 +618,8 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers):
             try:
                 process = Process(
                     target=self._generate_topic_models,
-                    args=(num_subprocess_models, random_states_for_worker, child_conn))
+                    args=(num_subprocess_models, random_states_for_worker, child_conn),
+                )

                 processes.append(process)
                 pipes.append((parent_conn, child_conn))
@@ -663,7 +677,6 @@ def _generate_topic_models(self, num_models, random_states=None, pipe=None):
        # loop, in order to collect some properties from it afterwards.
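        # each model is trained with its own entry from random_states, which is what keeps separate runs reproducible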
for i in range(num_models): - kwargs["random_state"] = random_states[i] tm = self.get_topic_model_class()(**kwargs) @@ -698,7 +711,8 @@ def _asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pip # the chunk of ttda that's going to be calculated: ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas] distance_chunk = self._calculate_asymmetric_distance_matrix_chunk( - ttda1=ttda1, ttda2=self.ttda, threshold=threshold, start_index=ttdas_sent, method=method) + ttda1=ttda1, ttda2=self.ttda, threshold=threshold, start_index=ttdas_sent, method=method, + ) pipe.send((worker_id, distance_chunk)) # remember that this code is inside the workers memory pipe.close() @@ -725,7 +739,8 @@ def _generate_asymmetric_distance_matrix(self): # singlecore if workers is not None and workers <= 1: self.asymmetric_distance_matrix = self._calculate_asymmetric_distance_matrix_chunk( - ttda1=self.ttda, ttda2=self.ttda, threshold=threshold, start_index=0, method=method) + ttda1=self.ttda, ttda2=self.ttda, threshold=threshold, start_index=0, method=method, + ) return self.asymmetric_distance_matrix # else, if workers > 1 use multiprocessing @@ -751,8 +766,10 @@ def _generate_asymmetric_distance_matrix(self): else: n_ttdas = int((len(self.ttda) - ttdas_sent) / (workers - i)) - process = Process(target=self._asymmetric_distance_matrix_worker, - args=(i, ttdas_sent, n_ttdas, child_conn, threshold, method)) + process = Process( + target=self._asymmetric_distance_matrix_worker, + args=(i, ttdas_sent, n_ttdas, child_conn, threshold, method), + ) ttdas_sent += n_ttdas processes.append(process) @@ -962,14 +979,17 @@ def _aggregate_topics(self, grouped_by_labels): "max_num_neighboring_labels": max_num_neighboring_labels, "neighboring_labels": neighboring_labels, "label": label, - "num_cores": len([topic for topic in group if topic["is_core"]]) + "num_cores": len([topic for topic in group if topic["is_core"]]), }) - sorted_clusters = sorted(sorted_clusters, + sorted_clusters = sorted( + sorted_clusters, key=lambda cluster: ( cluster["max_num_neighboring_labels"], - cluster["label"] - ), reverse=False) + cluster["label"], + ), + reverse=False, + ) return sorted_clusters @@ -1202,7 +1222,7 @@ def fit(self, amatrix): "is_core": False, "neighboring_labels": set(), "neighboring_topic_indices": set(), - "label": None + "label": None, } for _ in range(len(amatrix))] amatrix_copy = amatrix.copy() diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 84ca9804cb..d637af9616 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -31,13 +31,17 @@ def setUp(self): # the topics are equal random_state = 0 - self.eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - passes=passes, num_models=num_models, random_state=random_state, - topic_model_class=LdaModel) - - self.eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - passes=passes, num_models=num_models, random_state=random_state, - memory_friendly_ttda=False, topic_model_class=LdaModel) + self.eLDA = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + passes=passes, num_models=num_models, random_state=random_state, + topic_model_class=LdaModel, + ) + + self.eLDA_mu = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + passes=passes, num_models=num_models, random_state=random_state, + memory_friendly_ttda=False, topic_model_class=LdaModel, + ) def 
check_ttda(self, ensemble): """tests the integrity of the ttda of any ensemble""" @@ -68,8 +72,10 @@ def test_eLDA(self): # so use some absolute tolerance measurement to check if the matrix is at least # close to the target. atol = reference.asymmetric_distance_matrix.max() * 1e-05 - np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, - reference.asymmetric_distance_matrix, atol=atol) + np.testing.assert_allclose( + self.eLDA.asymmetric_distance_matrix, + reference.asymmetric_distance_matrix, atol=atol, + ) def test_clustering(self): # the following test is quite specific to the current implementation and not part of any api, @@ -94,23 +100,31 @@ def test_not_trained(self): # should not throw errors and no training should happen # 0 passes - eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - passes=0, num_models=num_models, random_state=0) + eLDA = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + passes=0, num_models=num_models, random_state=0, + ) self.assertEqual(len(eLDA.ttda), 0) # no corpus - eLDA = EnsembleLda(id2word=common_dictionary, num_topics=num_topics, - passes=passes, num_models=num_models, random_state=0) + eLDA = EnsembleLda( + id2word=common_dictionary, num_topics=num_topics, + passes=passes, num_models=num_models, random_state=0, + ) self.assertEqual(len(eLDA.ttda), 0) # 0 iterations - eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - iterations=0, num_models=num_models, random_state=0) + eLDA = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + iterations=0, num_models=num_models, random_state=0, + ) self.assertEqual(len(eLDA.ttda), 0) # 0 models - eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - passes=passes, num_models=0, random_state=0) + eLDA = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, + passes=passes, num_models=0, random_state=0 + ) self.assertEqual(len(eLDA.ttda), 0) def test_memory_unfriendly(self): @@ -130,10 +144,14 @@ def test_generate_gensim_rep(self): def assert_cluster_results_equal(self, a, b): """compares important attributes of the cluster results""" - np.testing.assert_array_equal([row["label"] for row in a], - [row["label"] for row in b]) - np.testing.assert_array_equal([row["is_core"] for row in a], - [row["is_core"] for row in b]) + np.testing.assert_array_equal( + [row["label"] for row in a], + [row["label"] for row in b], + ) + np.testing.assert_array_equal( + [row["is_core"] for row in a], + [row["is_core"] for row in b], + ) def test_persisting(self): fname = get_tmpfile('gensim_models_ensemblelda') @@ -180,15 +198,19 @@ def test_multiprocessing(self): workers = 3 # memory friendly. contains List of topic word distributions - eLDA_multi = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, topic_model_class=LdaModel, - num_topics=num_topics, passes=passes, num_models=num_models, - random_state=random_state, ensemble_workers=workers, distance_workers=workers) + eLDA_multi = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, topic_model_class=LdaModel, + num_topics=num_topics, passes=passes, num_models=num_models, + random_state=random_state, ensemble_workers=workers, distance_workers=workers, + ) # memory unfriendly. 
contains List of models - eLDA_multi_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, topic_model_class=LdaModel, - num_topics=num_topics, passes=passes, num_models=num_models, - random_state=random_state, ensemble_workers=workers, distance_workers=workers, - memory_friendly_ttda=False) + eLDA_multi_mu = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, topic_model_class=LdaModel, + num_topics=num_topics, passes=passes, num_models=num_models, + random_state=random_state, ensemble_workers=workers, distance_workers=workers, + memory_friendly_ttda=False, + ) np.testing.assert_allclose(self.eLDA.get_topics(), eLDA_multi.get_topics(), rtol=rtol) np.testing.assert_allclose(self.eLDA_mu.get_topics(), eLDA_multi_mu.get_topics(), rtol=rtol) @@ -208,10 +230,12 @@ def test_add_models(self): num_new_topics = 3 # 1. memory friendly - eLDA_base = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, - num_topics=num_new_topics, passes=1, num_models=num_new_models, - iterations=1, random_state=0, topic_model_class=LdaMulticore, - workers=3, ensemble_workers=2) + eLDA_base = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, + num_topics=num_new_topics, passes=1, num_models=num_new_models, + iterations=1, random_state=0, topic_model_class=LdaMulticore, + workers=3, ensemble_workers=2, + ) # 1.1 ttda a = len(eLDA_base.ttda) @@ -254,10 +278,12 @@ def test_add_models(self): self.check_ttda(eLDA_base) # 2. memory unfriendly - eLDA_base_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, - num_topics=num_new_topics, passes=1, num_models=num_new_models, - iterations=1, random_state=0, topic_model_class=LdaMulticore, - workers=3, ensemble_workers=2, memory_friendly_ttda=False) + eLDA_base_mu = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, + num_topics=num_new_topics, passes=1, num_models=num_new_models, + iterations=1, random_state=0, topic_model_class=LdaMulticore, + workers=3, ensemble_workers=2, memory_friendly_ttda=False, + ) # 2.1 a single ensemble a = len(eLDA_base_mu.tms) @@ -305,14 +331,18 @@ def test_add_and_recluster(self): num_new_topics = 3 # train - new_eLDA = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, - num_topics=num_new_topics, passes=10, num_models=num_new_models, - iterations=30, random_state=1, topic_model_class='ldamulticore', - distance_workers=4) - new_eLDA_mu = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, - num_topics=num_new_topics, passes=10, num_models=num_new_models, - iterations=30, random_state=1, topic_model_class='ldamulticore', - distance_workers=4, memory_friendly_ttda=False) + new_eLDA = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, + num_topics=num_new_topics, passes=10, num_models=num_new_models, + iterations=30, random_state=1, topic_model_class='ldamulticore', + distance_workers=4, + ) + new_eLDA_mu = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, + num_topics=num_new_topics, passes=10, num_models=num_new_models, + iterations=30, random_state=1, topic_model_class='ldamulticore', + distance_workers=4, memory_friendly_ttda=False, + ) # both should be similar np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=rtol) np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=rtol) @@ -331,8 +361,10 @@ def test_add_and_recluster(self): # 2. 
distance matrix new_eLDA._generate_asymmetric_distance_matrix() new_eLDA_mu._generate_asymmetric_distance_matrix() - np.testing.assert_allclose(new_eLDA.asymmetric_distance_matrix, - new_eLDA_mu.asymmetric_distance_matrix) + np.testing.assert_allclose( + new_eLDA.asymmetric_distance_matrix, + new_eLDA_mu.asymmetric_distance_matrix, + ) # 3. CBDBSCAN results new_eLDA._generate_topic_clusters() @@ -345,7 +377,7 @@ def test_add_and_recluster(self): new_eLDA._generate_stable_topics() new_eLDA_mu._generate_stable_topics() np.testing.assert_allclose(new_eLDA.get_topics(), - new_eLDA_mu.get_topics()) + new_eLDA_mu.get_topics()) new_eLDA.generate_gensim_representation() new_eLDA_mu.generate_gensim_representation() From a15bb42728401f66e6d9f3c792896d741f82fcfd Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 24 Oct 2020 12:19:48 +0200 Subject: [PATCH 108/166] somehow recognized some old versions of unrelated files as current changes --- CHANGELOG.md | 69 +++++++++++++++++++++++++++- docs/src/conf.py | 115 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 164 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cc0ce80727..52bf5a989f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,73 @@ Changes ======= +## Unreleased + +This release contains a major refactoring. + +### :+1: Improvements + +* Refactor ldamulticore to serialize less data (PR [#2300](https://github.com/RaRe-Technologies/gensim/pull/2300), __[@horpto](https://github.com/horpto)__) +* KeyedVectors & X2Vec API streamlining, consistency (PR [#2698](https://github.com/RaRe-Technologies/gensim/pull/2698), __[@gojomo](https://github.com/gojomo)__) +* No more wheels for x32 platforms (if you need x32 binaries, please build them yourself). + (__[menshikh-iv](https://github.com/menshikh-iv)__, [#6](https://github.com/RaRe-Technologies/gensim-wheels/pull/6)) +* Speed up random number generation in word2vec model (PR [#2864](https://github.com/RaRe-Technologies/gensim/pull/2864), __[@zygm0nt](https://github.com/zygm0nt)__) +* Fix deprecations in SoftCosineSimilarity (PR [#2940](https://github.com/RaRe-Technologies/gensim/pull/2940), __[@Witiko](https://github.com/Witiko)__) +* Remove Keras dependency (PR [#2937](https://github.com/RaRe-Technologies/gensim/pull/2937), __[@piskvorky](https://github.com/piskvorky)__) +* Bump minimum Python version to 3.6 (PR [#2947](https://github.com/RaRe-Technologies/gensim/pull/2947), __[@gojomo](https://github.com/gojomo)__) + +### :books: Tutorial and doc improvements + + * Clear up LdaModel documentation - remove claim that it accepts CSC matrix as input (PR [#2832](https://github.com/RaRe-Technologies/gensim/pull/2832), [@FyzHsn](https://github.com/FyzHsn)) + * Fix "generator" language in word2vec docs (PR [#2935](https://github.com/RaRe-Technologies/gensim/pull/2935), __[@polm](https://github.com/polm)__) + +### :warning: Removed functionality + + * Remove gensim.summarization subpackage, docs and test data (PR [#2958](https://github.com/RaRe-Technologies/gensim/pull/2958), __[@mpenkov](https://github.com/mpenkov)__) + +## :warning: 3.8.x will be the last gensim version to support Py2.7. Starting with 4.0.0, gensim will only support Py3.5 and above + +## 3.8.3, 2020-05-03 + +This is primarily a bugfix release to bring back Py2.7 compatibility to gensim 3.8. 
+ +### :red_circle: Bug fixes + +* Bring back Py27 support (PR [#2812](https://github.com/RaRe-Technologies/gensim/pull/2812), __[@mpenkov](https://github.com/mpenkov)__) +* Fix wrong version reported by setup.py (Issue [#2796](https://github.com/RaRe-Technologies/gensim/issues/2796)) +* Fix missing C extensions (Issues [#2794](https://github.com/RaRe-Technologies/gensim/issues/2794) and [#2802](https://github.com/RaRe-Technologies/gensim/issues/2802)) + +### :+1: Improvements + +* Wheels for Python 3.8 (__[@menshikh-iv](https://github.com/menshikh-iv)__) +* Prepare for removal of deprecated `lxml.etree.cElementTree` (PR [#2777](https://github.com/RaRe-Technologies/gensim/pull/2777), __[@tirkarthi](https://github.com/tirkarthi)__) + +### :books: Tutorial and doc improvements + +* Update test instructions in README (PR [#2814](https://github.com/RaRe-Technologies/gensim/pull/2814), __[@piskvorky](https://github.com/piskvorky)__) + +### :warning: Deprecations (will be removed in the next major release) + +* Remove + - `gensim.models.FastText.load_fasttext_format`: use load_facebook_vectors to load embeddings only (faster, less CPU/memory usage, does not support training continuation) and load_facebook_model to load full model (slower, more CPU/memory intensive, supports training continuation) + - `gensim.models.wrappers.fasttext` (obsoleted by the new native `gensim.models.fasttext` implementation) + - `gensim.examples` + - `gensim.nosy` + - `gensim.scripts.word2vec_standalone` + - `gensim.scripts.make_wiki_lemma` + - `gensim.scripts.make_wiki_online` + - `gensim.scripts.make_wiki_online_lemma` + - `gensim.scripts.make_wiki_online_nodebug` + - `gensim.scripts.make_wiki` (all of these obsoleted by the new native `gensim.scripts.segment_wiki` implementation) + - "deprecated" functions and attributes + +* Move + - `gensim.scripts.make_wikicorpus` ➡ `gensim.scripts.make_wiki.py` + - `gensim.summarization` ➡ `gensim.models.summarization` + - `gensim.topic_coherence` ➡ `gensim.models._coherence` + - `gensim.utils` ➡ `gensim.utils.utils` (old imports will continue to work) + - `gensim.parsing.*` ➡ `gensim.utils.text_utils` + ## 3.8.2, 2020-04-10 ### :red_circle: Bug fixes @@ -71,8 +138,6 @@ Changes ## 3.8.0, 2019-07-08 -## :warning: 3.8.x will be the last gensim version to support Py2.7. Starting with 4.0.0, gensim will only support Py3.5 and above - ### :star2: New Features * Enable online training of Poincare models (__[koiizukag](https://github.com/koiizukag)__, [#2505](https://github.com/RaRe-Technologies/gensim/pull/2505)) diff --git a/docs/src/conf.py b/docs/src/conf.py index 84d2722c27..61ebaa76b1 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -21,11 +21,17 @@ # -- General configuration ----------------------------------------------------- -html_theme = 'gensim_theme' +html_theme = 'sphinx_rtd_theme' # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinxcontrib.napoleon', 'sphinx.ext.imgmath', 'sphinxcontrib.programoutput'] +extensions = [ + 'sphinx.ext.autodoc', + 'sphinxcontrib.napoleon', + 'sphinx.ext.imgmath', + 'sphinxcontrib.programoutput', + 'sphinx_gallery.gen_gallery', +] autoclass_content = "both" napoleon_google_docstring = False # Disable support for google-style docstring @@ -48,16 +54,16 @@ # General information about the project. 
project = u'gensim' -copyright = u'2009-now, Radim Řehůřek ' +copyright = u'2009-now' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = '3.8' +version = '4.0' # The full version, including alpha/beta/rc tags. -release = '3.8.2' +release = '4.0.0.dev0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -76,6 +82,8 @@ # for source files. exclude_trees = ['_build'] +exclude_patterns = ['gallery/README.rst'] + # The reST default role (used for this markup: `text`) to use for all documents. # default_role = None @@ -109,19 +117,19 @@ # main_colour = "#ffbbbb" html_theme_options = { -# "rightsidebar": "false", -# "stickysidebar": "true", -# "bodyfont": "'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', 'Verdana', 'sans-serif'", -# "headfont": "'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', 'Verdana', 'sans-serif'", -# "sidebarbgcolor": "fuckyou", -# "footerbgcolor": "#771111", -# "relbarbgcolor": "#993333", -# "sidebartextcolor": "#000000", -# "sidebarlinkcolor": "#330000", -# "codebgcolor": "#fffff0", -# "headtextcolor": "#000080", -# "headbgcolor": "#f0f0ff", -# "bgcolor": "#ffffff", + # "rightsidebar": "false", + # "stickysidebar": "true", + # "bodyfont": "'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', 'Verdana', 'sans-serif'", + # "headfont": "'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', 'Verdana', 'sans-serif'", + # "sidebarbgcolor": "fuckyou", + # "footerbgcolor": "#771111", + # "relbarbgcolor": "#993333", + # "sidebartextcolor": "#000000", + # "sidebarlinkcolor": "#330000", + # "codebgcolor": "#fffff0", + # "headtextcolor": "#000080", + # "headbgcolor": "#f0f0ff", + # "bgcolor": "#ffffff", } @@ -149,6 +157,15 @@ # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] +# These paths are either relative to html_static_path +# or fully qualified paths (eg. https://...) +# html_css_files = [ +# 'erp/css/global.css', +# 'erp/css/structure.css', +# 'erp/css/erp2.css', +# 'erp/css/custom.css', +# ] + # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. html_last_updated_fmt = '%b %d, %Y' @@ -218,3 +235,65 @@ # latex_use_modindex = True suppress_warnings = ['image.nonlocal_uri', 'ref.citation', 'ref.footnote'] + + +def sort_key(source_dir): + """Sorts tutorials and guides in a predefined order. + + If the predefined order doesn't include the filename we're looking for, + fallback to alphabetical order. 
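+
+    Used below as the ``within_subsection_order`` of ``sphinx_gallery_conf``.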
+ """ + core_order = [ + 'run_core_concepts.py', + 'run_corpora_and_vector_spaces.py', + 'run_topics_and_transformations.py', + 'run_similarity_queries.py', + ] + + tutorials_order = [ + 'run_word2vec.py', + 'run_doc2vec_lee.py', + 'run_fasttext.py', + 'run_annoy.py', + 'run_lda.py', + 'run_wmd.py', + 'run_summarization.py', + ] + + howto_order = [ + 'run_downloader_api.py', + 'run_binder.py', + 'run_doc.py', + 'run_doc2vec_imdb.py', + 'run_news_classification.py', + 'run_compare_lda.py', + ] + + order = core_order + tutorials_order + howto_order + files = sorted(os.listdir(source_dir)) + + def key(arg): + try: + return order.index(arg) + except ValueError: + return files.index(arg) + + return key + + +import sphinx_gallery.sorting +sphinx_gallery_conf = { + 'examples_dirs': 'gallery', # path to your example scripts + 'gallery_dirs': 'auto_examples', # path where to save gallery generated examples + 'show_memory': True, + 'filename_pattern': 'run', + 'subsection_order': sphinx_gallery.sorting.ExplicitOrder( + [ + 'gallery/core', + 'gallery/tutorials', + 'gallery/howtos', + 'gallery/other', + ], + ), + 'within_subsection_order': sort_key, +} From 17e3e1179f42edb48c262dc33bccbd40d4fb7732 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 24 Oct 2020 12:20:57 +0200 Subject: [PATCH 109/166] fixed autoformat of IDEA --- docs/src/conf.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/src/conf.py b/docs/src/conf.py index 61ebaa76b1..13d26e254b 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -117,19 +117,19 @@ # main_colour = "#ffbbbb" html_theme_options = { - # "rightsidebar": "false", - # "stickysidebar": "true", - # "bodyfont": "'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', 'Verdana', 'sans-serif'", - # "headfont": "'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', 'Verdana', 'sans-serif'", - # "sidebarbgcolor": "fuckyou", - # "footerbgcolor": "#771111", - # "relbarbgcolor": "#993333", - # "sidebartextcolor": "#000000", - # "sidebarlinkcolor": "#330000", - # "codebgcolor": "#fffff0", - # "headtextcolor": "#000080", - # "headbgcolor": "#f0f0ff", - # "bgcolor": "#ffffff", +# "rightsidebar": "false", +# "stickysidebar": "true", +# "bodyfont": "'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', 'Verdana', 'sans-serif'", +# "headfont": "'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', 'Verdana', 'sans-serif'", +# "sidebarbgcolor": "fuckyou", +# "footerbgcolor": "#771111", +# "relbarbgcolor": "#993333", +# "sidebartextcolor": "#000000", +# "sidebarlinkcolor": "#330000", +# "codebgcolor": "#fffff0", +# "headtextcolor": "#000080", +# "headbgcolor": "#f0f0ff", +# "bgcolor": "#ffffff", } From 1bb0194ccdc4e298934259b37a198c26c5513053 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 24 Oct 2020 12:22:11 +0200 Subject: [PATCH 110/166] same thing for tox.ini --- tox.ini | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/tox.ini b/tox.ini index f30bdc068d..1d0c0b0e09 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,6 @@ [tox] minversion = 2.0 -envlist = {py27,py35,py36,py37}-{win,linux}, flake8, docs, docs-upload, download-wheels, upload-wheels, test-pypi +envlist = {py36,py37,py38}-{win,linux}, flake8, docs, docs-upload, download-wheels, upload-wheels, test-pypi skipsdist = True platform = linux: linux win: win64 @@ -17,25 +17,15 @@ ignore = F821 ; TODO remove me when all examples in docstrings will be executab exclude=.venv, .git, .tox, dist, doc, build, gensim/models/deprecated [pytest] 
-addopts = -rfxEXs --durations=20 --showlocals --reruns 3 --reruns-delay 1 +addopts = -rfxEXs --durations=20 --showlocals [testenv] recreate = True -install_command = python most_recent_pip_install.py {env:TOX_PIP_OPTS:} {opts} {packages} +install_command = python -m pip install --timeout=60 {env:TOX_PIP_OPTS:} {opts} {packages} deps = pip>=19.1.1 - py37: numpy==1.14.5 - py37: scipy==1.1.0 - - py27: numpy==1.11.3 - py27: scipy==1.0.0 - py35: numpy==1.11.3 - py35: scipy==1.0.0 - py36: numpy==1.11.3 - py36: scipy==1.0.0 - linux: .[test] win: .[test-win] @@ -47,46 +37,50 @@ setenv = MALLET_HOME={env:MALLET_HOME:} SKIP_NETWORK_TESTS={env:SKIP_NETWORK_TESTS:} BOTO_CONFIG={env:BOTO_CONFIG:} + PIPELINE_WORKSPACE={env:PIPELINE_WORKSPACE:} PYTHONHASHSEED=1 + TOX_PARALLEL_NO_SPINNER=1 commands = python --version pip --version - python -c "from gensim.models.word2vec import FAST_VERSION; print(FAST_VERSION)" python setup.py build_ext --inplace - python -c "from gensim.models.word2vec import FAST_VERSION; print(FAST_VERSION)" pytest {posargs:gensim/test} [testenv:flake8] recreate = True -deps = flake8 +deps = + flake8==3.7.9 # 3.8.0 triggers "AttributeError: 'Namespace' object has no attribute 'output_file'" commands = flake8 gensim/ {posargs} [testenv:flake8-docs] recreate = True -deps = flake8-rst==0.4.3 +deps = + flake8-rst==0.4.3 + flake8==3.7.9 commands = flake8-rst gensim/ docs/ {posargs} [testenv:compile] -basepython = python2 +basepython = python3 recreate = True -deps = numpy==1.11.3 +deps = numpy commands = python setup.py build_ext --inplace [testenv:docs] -basepython = python2 +basepython = python3 recreate = True whitelist_externals = make deps = .[docs] -changedir = docs/src -commands = make clean html +commands = + python setup.py build_ext --inplace + make -C docs/src clean html [testenv:docs-upload] From 9baa79fce0c9f473e71934f4ecf698767b97ee29 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 24 Oct 2020 14:16:32 +0200 Subject: [PATCH 111/166] hanging indents in opinosis notebook --- .../ensemble_lda_with_opinosis.ipynb | 26 +++++++------------ 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/docs/notebooks/ensemble_lda_with_opinosis.ipynb b/docs/notebooks/ensemble_lda_with_opinosis.ipynb index 7380a87f71..cf96d3d74a 100644 --- a/docs/notebooks/ensemble_lda_with_opinosis.ipynb +++ b/docs/notebooks/ensemble_lda_with_opinosis.ipynb @@ -2,19 +2,11 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "scrolled": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "####\n" - ] - } - ], + "outputs": [], "source": [ "import logging\n", "from gensim.models import EnsembleLda, LdaMulticore\n", @@ -31,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -42,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -135,9 +127,11 @@ "metadata": {}, "outputs": [], "source": [ - "elda = EnsembleLda(corpus=opinosis.corpus, id2word=opinosis.id2word, num_models=128, num_topics=20,\n", - " passes=20, iterations=100, ensemble_workers=3, distance_workers=4,\n", - " topic_model_class='ldamulticore', masking_method='rank')\n", + "elda = EnsembleLda(\n", + " corpus=opinosis.corpus, id2word=opinosis.id2word, num_models=128, num_topics=20,\n", + " passes=20, iterations=100, ensemble_workers=3, distance_workers=4,\n", + " topic_model_class='ldamulticore', 
masking_method='rank',\n",
+    ")\n",
    "pretty_print_topics()"
   ]
  },
@@ -175,7 +169,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.1"
+   "version": "3.8.3"
  }
 },
 "nbformat": 4,

From 1fa37292bb79af26afb561ea55d1563947e995b2 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sat, 24 Oct 2020 14:45:06 +0200
Subject: [PATCH 112/166] I hate windows

---
 gensim/test/test_ensemblelda.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index d637af9616..f297e2e6e6 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -22,7 +22,7 @@
 passes = 50

 # windows tests fail due to the required assertion precision being too high
-rtol = 1e-04 if os.name == 'nt' else 1e-05
+rtol = 1e-03 if os.name == 'nt' else 1e-05

 class TestModel(unittest.TestCase):

From d3a615198dd2e78c43879530f8290ee288206ee2 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sat, 24 Oct 2020 14:55:32 +0200
Subject: [PATCH 113/166] test for LdaMulticore ensemble similarity

---
 gensim/test/test_ensemblelda.py | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index f297e2e6e6..966269783d 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -22,7 +22,7 @@
 passes = 50

 # windows tests fail due to the required assertion precision being too high
-rtol = 1e-03 if os.name == 'nt' else 1e-05
+rtol = 1e-04 if os.name == 'nt' else 1e-05

 class TestModel(unittest.TestCase):
@@ -325,8 +325,25 @@ def test_add_models(self):
         self.assertEqual(eLDA_base_mu.num_models, len(eLDA_base_mu.tms))
         self.check_ttda(eLDA_base_mu)

+    def test_ldamulticore_reproducibility(self):
+        # Two LdaMulticore ensembles should be similar, but since LdaMulticore has some
+        # non-determinism due to scheduling, rtol is lower
+        a = EnsembleLda(
+            corpus=common_corpus, id2word=common_dictionary,
+            num_topics=3, passes=10, num_models=3, workers=3,
+            iterations=30, random_state=2, topic_model_class='ldamulticore',
+            distance_workers=4,
+        )
+        b = EnsembleLda(
+            corpus=common_corpus, id2word=common_dictionary,
+            num_topics=3, passes=10, num_models=3, workers=3,
+            iterations=30, random_state=2, topic_model_class=LdaMulticore,
+            distance_workers=4, memory_friendly_ttda=False,
+        )
+        np.testing.assert_allclose(a.ttda, b.ttda, rtol=(rtol * 10))

     def test_add_and_recluster(self):
-        # 6.2: see if after adding a model, the model still makes sense
+        # See if after adding a model, the model still makes sense
         num_new_models = 3

From 216ae64b9e73e34841f92efa0fc3a874bede59e8 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sat, 24 Oct 2020 15:37:19 +0200
Subject: [PATCH 114/166] no

---
 gensim/test/test_ensemblelda.py | 17 
----------------- 1 file changed, 17 deletions(-) diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 966269783d..68afaba170 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -325,23 +325,6 @@ def test_add_models(self): self.assertEqual(eLDA_base_mu.num_models, len(eLDA_base_mu.tms)) self.check_ttda(eLDA_base_mu) - def test_ldamulticore_reproducibility(self): - # Two LdaMulticore ensembles should be similar, but since LdaMulticore has some - # non-determinism due to scheduling rtol is lower - a = EnsembleLda( - corpus=common_corpus, id2word=common_dictionary, - num_topics=3, passes=10, num_models=3, workers=3, - iterations=30, random_state=2, topic_model_class='ldamulticore', - distance_workers=4, - ) - b = EnsembleLda( - corpus=common_corpus, id2word=common_dictionary, - num_topics=3, passes=10, num_models=3, workers=3, - iterations=30, random_state=2, topic_model_class=LdaMulticore, - distance_workers=4, memory_friendly_ttda=False, - ) - np.testing.assert_allclose(a.ttda, b.ttda, rtol=(rtol * 10)) - def test_add_and_recluster(self): # See if after adding a model, the model still makes sense num_new_models = 3 From e238fc9c26736b42f2d0b10ea345cc47f3277fa1 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 24 Oct 2020 23:59:20 +0200 Subject: [PATCH 115/166] fixed wrong max calculation in loop --- gensim/models/ensemblelda.py | 41 ++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index a501fb01df..4aeb7629b8 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -349,8 +349,10 @@ def generate_gensim_representation(self): num_stable_topics = len(stable_topics) if num_stable_topics == 0: - logger.error("the model did not detect any stable topic. You can try to adjust epsilon: " - "recluster(eps=...)") + logger.error( + "the model did not detect any stable topic. You can try to adjust epsilon: " + "recluster(eps=...)" + ) self.classic_model_representation = None return @@ -411,7 +413,6 @@ def add_model(self, target, num_new_models=None): the exact same dictionary/idword mapping. In order to generate new stable topics afterwards, use: - 1. ``self._generate_asymmetric_distance_matrix()`` 2. 
``self.``:meth:`~gensim.models.ensemblelda.EnsembleLda.recluster` The ttda of another ensemble can also be used, in that case set ``num_new_models`` to the ``num_models`` @@ -966,11 +967,11 @@ def _aggregate_topics(self, grouped_by_labels): sorted_clusters = [] for label, group in grouped_by_labels.items(): - num_neighboring_labels = 0 + max_num_neighboring_labels = 0 neighboring_labels = [] # will be a list of sets for topic in group: - max_num_neighboring_labels = max(topic["num_neighboring_labels"], num_neighboring_labels) + max_num_neighboring_labels = max(topic["num_neighboring_labels"], max_num_neighboring_labels) neighboring_labels.append(topic["neighboring_labels"]) neighboring_labels = [x for x in neighboring_labels if len(x) > 0] @@ -982,15 +983,6 @@ def _aggregate_topics(self, grouped_by_labels): "num_cores": len([topic for topic in group if topic["is_core"]]), }) - sorted_clusters = sorted( - sorted_clusters, - key=lambda cluster: ( - cluster["max_num_neighboring_labels"], - cluster["label"], - ), - reverse=False, - ) - return sorted_clusters def _remove_from_all_sets(self, label, clusters): @@ -1038,7 +1030,20 @@ def _generate_stable_topics(self, min_cores=None): grouped_by_labels = self._group_by_labels(results) - sorted_clusters = self._aggregate_topics(grouped_by_labels) + clusters = self._aggregate_topics(grouped_by_labels) + + # Start with the one that is the easiest to verify for sufficient isolated cores. + # Over time, invalid clusters will drop out and therefore clusters that once had a few (noise) neighbors + # will become more isolated, which makes it possible to mark those as stable. + sorted_clusters = sorted( + clusters, + key=lambda cluster: ( + cluster["max_num_neighboring_labels"], + cluster["num_cores"], + cluster["label"], + ), + reverse=False, + ) for cluster in sorted_clusters: cluster["is_valid"] = None @@ -1054,8 +1059,6 @@ def _generate_stable_topics(self, min_cores=None): cluster["is_valid"] = True else: cluster["is_valid"] = False - # This modifies parent labels which is important in _contains_isolated_cores, so the result depends on - # where to start. self._remove_from_all_sets(label, sorted_clusters) # list of all the label numbers that are valid @@ -1082,6 +1085,8 @@ def _generate_stable_topics(self, min_cores=None): self.sorted_clusters = sorted_clusters self.stable_topics = stable_topics + logger.info("found %s stable topics", len(stable_topics)) + def recluster(self, eps=0.1, min_samples=None, min_cores=None): """Reapply CBDBSCAN clustering and stable topic generation. @@ -1163,7 +1168,7 @@ def id2word(self): return self.gensim_kw_args["id2word"] -class CBDBSCAN(): +class CBDBSCAN: """A Variation of the DBSCAN algorithm called Checkback DBSCAN (CBDBSCAN). The algorithm works based on DBSCAN-like parameters 'eps' and 'min_samples' that respectively define how far a From 68d5814a58c513e11f53e88e9e1e10c2b5c7f960 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 25 Oct 2020 00:02:36 +0200 Subject: [PATCH 116/166] improved comment for sorting clusters --- gensim/models/ensemblelda.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 4aeb7629b8..a5ecb5d314 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -1032,9 +1032,9 @@ def _generate_stable_topics(self, min_cores=None): clusters = self._aggregate_topics(grouped_by_labels) - # Start with the one that is the easiest to verify for sufficient isolated cores. 
-        # Over time, invalid clusters will drop out and therefore clusters that once had a few (noise) neighbors
-        # will become more isolated, which makes it possible to mark those as stable.
+        # Clusters with noisy invalid neighbors may have a harder time being marked as stable, so start with the
+        # easy ones and potentially already remove some noise by also sorting smaller clusters to the front.
+        # This clears up the clusters a bit before checking the ones with many neighbors.
         sorted_clusters = sorted(
             clusters,
             key=lambda cluster: (
                 cluster["max_num_neighboring_labels"],
                 cluster["num_cores"],
                 cluster["label"],
             ),
             reverse=False,
         )

From d299f07091bd761f7ffd11242839f060dcd15a9f Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sun, 25 Oct 2020 00:57:42 +0200
Subject: [PATCH 117/166] added ensemblelda tutorial

---
 docs/src/gallery/tutorials/run_ensemblelda.py | 155 ++++++++++++++++++
 1 file changed, 155 insertions(+)
 create mode 100644 docs/src/gallery/tutorials/run_ensemblelda.py

diff --git a/docs/src/gallery/tutorials/run_ensemblelda.py b/docs/src/gallery/tutorials/run_ensemblelda.py
new file mode 100644
index 0000000000..a50e14e355
--- /dev/null
+++ b/docs/src/gallery/tutorials/run_ensemblelda.py
@@ -0,0 +1,155 @@
+r"""
+Ensemble LDA
+============
+
+Introduces Gensim's EnsembleLda model
+
+"""
+
+import logging
+logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+
+###############################################################################
+# This tutorial will explain how to use the EnsembleLDA model class.
+#
+# EnsembleLda is a method of finding and generating stable topics from the results of multiple topic models.
+# It can therefore be used to remove topics from your results that are noise and are not reproducible.
+#
+
+###############################################################################
+# Corpus
+# ------
+# We will use the gensim downloader API to get a small corpus for training our ensemble.
+#
+# The preprocessing is similar to :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py`,
+# so it won't be explained again in detail.
+#
+
+import gensim.downloader as api
+from gensim.corpora import Dictionary
+from nltk.stem.wordnet import WordNetLemmatizer
+
+lemmatizer = WordNetLemmatizer()
+docs = api.load('text8')
+docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]
+dictionary = Dictionary(docs)
+dictionary.filter_extremes(no_below=1, no_above=0.5)
+corpus = [dictionary.doc2bow(doc) for doc in docs]
+
+###############################################################################
+# Training
+# --------
+#
+# Training the ensemble works very similarly to training a single model.
+#
+# You can use any model that is based on LdaModel, such as LdaMulticore, to train the ensemble.
+# In experiments, LdaMulticore showed better results.
+#
+
+from gensim.models import LdaModel
+topic_model_class = LdaModel
+
+###############################################################################
+# Any arbitrary number of models can be used, but it should be a multiple of your workers so that the
+# load can be distributed properly. In this example, 4 processes will train 2 models each.
+#
+
+ensemble_workers = 4
+num_models = 8
+
+###############################################################################
+# After training all the models, some distance computations are required which can take quite some
+# time as well. You can speed this up by using workers for that step, too.
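+# The distances are computed between every pair of topics in the ensemble, so with 8 models of
+# 20 topics each this amounts to a 160 x 160 distance matrix (see further below).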
+#
+
+distance_workers = 4
+
+###############################################################################
+# All other parameters that are unknown to EnsembleLda are forwarded to each LDA model, such as:
+#
+num_topics = 20
+passes = 2
+
+###############################################################################
+# Now start the training.
+#
+# Since 20 topics were trained on each of the 8 models, we expect there to be 160 different topics.
+# The number of stable topics which are clustered from all those topics is smaller.
+#
+
+from gensim.models import EnsembleLda
+ensemble = EnsembleLda(
+    corpus=corpus,
+    id2word=dictionary,
+    num_topics=num_topics,
+    passes=passes,
+    num_models=num_models,
+    topic_model_class=LdaModel,
+    ensemble_workers=ensemble_workers,
+    distance_workers=distance_workers,
+)
+
+print(len(ensemble.ttda))
+print(len(ensemble.get_topics()))
+
+###############################################################################
+# Tuning
+# ------
+#
+# Different from LdaModel, the number of resulting topics varies greatly depending on the clustering parameters.
+#
+# You can provide those in the ``recluster()`` function or the ``EnsembleLda`` constructor. Play around
+# until you get as many topics as you desire, though forcing a specific number may reduce their quality.
+# If your ensemble doesn't have enough topics to begin with, make sure it is large enough.
+#
+# Having an epsilon that is smaller than the smallest distance doesn't make sense.
+# Make sure to choose one that is within the range of distance values.
+#
+
+import numpy as np
+shape = ensemble.asymmetric_distance_matrix.shape
+without_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0], dtype=bool)].reshape(shape[0], -1)
+without_diagonal.min(), without_diagonal.mean(), without_diagonal.max()
+
+ensemble.recluster(eps=0.1, min_samples=2, min_cores=2)
+
+print(len(ensemble.get_topics()))
+
+###############################################################################
+# Increasing the Size
+# -------------------
+#
+# If you have some models lying around that were trained on a corpus based on the same dictionary,
+# they are compatible and you can add them to the ensemble.
+#
+# By setting num_models of the EnsembleLda constructor to 0 you can also create an ensemble that is
+# entirely made out of your existing topic models with the following method.
+#
+# Afterwards the number and quality of stable topics might be different depending on your added topics and parameters.
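+# The sketch below trains two more models with arbitrary topic counts and merges them in;
+# besides trained models, ``add_model`` also accepts the ttda of another ensemble.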
+# + +from gensim.models import LdaMulticore + +model1 = LdaMulticore( + corpus=corpus, + id2word=dictionary, + num_topics=9, + passes=4, +) + +model2 = LdaModel( + corpus=corpus, + id2word=dictionary, + num_topics=11, + passes=2, +) + +# add_model supports various types of input, check out its docstring +ensemble.add_model(model1) +ensemble.add_model(model2) + +ensemble.recluster() + +print(len(ensemble.ttda)) +print(len(ensemble.get_topics())) From 149339626a0ca7e186cd07fa23c760e56849e0b3 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 25 Oct 2020 00:58:06 +0200 Subject: [PATCH 118/166] added test that starts with an empty ensemble --- gensim/test/test_ensemblelda.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 68afaba170..01c1ffa085 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -215,6 +215,20 @@ def test_multiprocessing(self): np.testing.assert_allclose(self.eLDA.get_topics(), eLDA_multi.get_topics(), rtol=rtol) np.testing.assert_allclose(self.eLDA_mu.get_topics(), eLDA_multi_mu.get_topics(), rtol=rtol) + def test_add_models_to_empty(self): + ensemble = EnsembleLda(id2word=common_dictionary, num_models=0) + ensemble.add_model(self.eLDA.ttda[0:1]) + ensemble.add_model(self.eLDA.ttda[1:]) + ensemble.recluster() + np.testing.assert_allclose(ensemble.get_topics(), self.eLDA.get_topics(), rtol=rtol) + + # persisting an ensemble that is entirely built from existing ttdas + fname = get_tmpfile('gensim_models_ensemblelda') + ensemble.save(fname) + loaded_ensemble = EnsembleLda.load(fname) + np.testing.assert_allclose(loaded_ensemble.get_topics(), self.eLDA.get_topics(), rtol=rtol) + self.test_inference(loaded_ensemble) + def test_add_models(self): # same configuration num_models = self.eLDA.num_models @@ -376,8 +390,10 @@ def test_add_and_recluster(self): # 4. finally, the stable topics new_eLDA._generate_stable_topics() new_eLDA_mu._generate_stable_topics() - np.testing.assert_allclose(new_eLDA.get_topics(), - new_eLDA_mu.get_topics()) + np.testing.assert_allclose( + new_eLDA.get_topics(), + new_eLDA_mu.get_topics(), + ) new_eLDA.generate_gensim_representation() new_eLDA_mu.generate_gensim_representation() @@ -385,14 +401,16 @@ def test_add_and_recluster(self): # same random state, hence topics should be still similar np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=rtol) - def test_inference(self): - import numpy as np + def test_inference(self, eLDA=None): + if eLDA is None: + eLDA = self.eLDA + # get the most likely token id from topic 0 - max_id = np.argmax(self.eLDA.get_topics()[0, :]) - self.assertGreater(self.eLDA.classic_model_representation.iterations, 0) + max_id = np.argmax(eLDA.get_topics()[0, :]) + self.assertGreater(eLDA.classic_model_representation.iterations, 0) # topic 0 should be dominant in the inference. 
# the difference between the probabilities should be significant and larger than 0.3 - infered = self.eLDA[[(max_id, 1)]] + infered = eLDA[[(max_id, 1)]] self.assertGreater(infered[0][1] - 0.3, infered[1][1]) From 87a11c5000b4d3d807760667ad3d7c867dd67a04 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 25 Oct 2020 00:58:55 +0200 Subject: [PATCH 119/166] some logging of auto parameters --- gensim/models/ensemblelda.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index a5ecb5d314..417b43e441 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -466,7 +466,6 @@ def add_model(self, target, num_new_models=None): # for memory friendly models/ttdas, append the ttdas to itself detected_num_models = 0 - ttda = [] # 1. ttda array, because that's the only accepted input that contains numbers @@ -496,7 +495,6 @@ def add_model(self, target, num_new_models=None): self.num_models += num_new_models else: # memory unfriendly ensembles - ttda = [] # 1. ttda array @@ -891,14 +889,15 @@ def _generate_topic_clusters(self, eps=0.1, min_samples=None): eps : float dbscan distance scale min_samples : int, optional - defaults to ``int(self.num_models / 2)``, dbscan min neighbours threshold which corresponds to finding - stable topics and should scale with the number of models, ``self.num_models`` + defaults to ``int(self.num_models / 2)``, dbscan min neighbours threshold required to consider + a topic to be a core. Should scale with the number of models, ``self.num_models`` """ if min_samples is None: min_samples = int(self.num_models / 2) - - logger.info("fitting the clustering model") + logger.info("fitting the clustering model, using %s for min_samples", min_samples) + else: + logger.info("fitting the clustering model") self.cluster_model = CBDBSCAN(eps=eps, min_samples=min_samples) self.cluster_model.fit(self.asymmetric_distance_matrix) @@ -964,7 +963,7 @@ def _aggregate_topics(self, grouped_by_labels): order. There is one single element for each cluster. 
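            Each element is a dict with the keys 'max_num_neighboring_labels', 'neighboring_labels', 'label' and 'num_cores'.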
""" - sorted_clusters = [] + clusters = [] for label, group in grouped_by_labels.items(): max_num_neighboring_labels = 0 @@ -976,14 +975,16 @@ def _aggregate_topics(self, grouped_by_labels): neighboring_labels = [x for x in neighboring_labels if len(x) > 0] - sorted_clusters.append({ + clusters.append({ "max_num_neighboring_labels": max_num_neighboring_labels, "neighboring_labels": neighboring_labels, "label": label, "num_cores": len([topic for topic in group if topic["is_core"]]), }) - return sorted_clusters + logger.info("found %s clusters", len(clusters)) + + return clusters def _remove_from_all_sets(self, label, clusters): """Remove a label from every set in "neighboring_labels" for each core in ``clusters``.""" @@ -1022,7 +1023,7 @@ def _generate_stable_topics(self, min_cores=None): if min_cores is None: # min_cores is a number between 1 and 3, depending on the number of models min_cores = min(3, max(1, int(self.num_models / 4 + 1))) - logger.info("generating stable topics, each cluster needs at least {} cores".format(min_cores)) + logger.info("generating stable topics, using %s for min_cores", min_cores) else: logger.info("generating stable topics") From beedbd3822ec76a8f931a93ec1733cdab9633433 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 25 Oct 2020 01:15:37 +0200 Subject: [PATCH 120/166] update auto_examples --- docs/src/auto_examples/index.rst | 37 +- .../thumb/sphx_glr_run_ensemblelda_thumb.png | Bin 0 -> 26786 bytes .../tutorials/run_ensemblelda.ipynb | 205 +++++++++ .../tutorials/run_ensemblelda.py | 155 +++++++ .../tutorials/run_ensemblelda.py.md5 | 1 + .../tutorials/run_ensemblelda.rst | 420 ++++++++++++++++++ .../tutorials/run_ensemblelda_codeobj.pickle | Bin 0 -> 8641 bytes .../tutorials/sg_execution_times.rst | 30 +- docs/src/gallery/tutorials/run_ensemblelda.py | 2 +- 9 files changed, 827 insertions(+), 23 deletions(-) create mode 100644 docs/src/auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png create mode 100644 docs/src/auto_examples/tutorials/run_ensemblelda.ipynb create mode 100644 docs/src/auto_examples/tutorials/run_ensemblelda.py create mode 100644 docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 create mode 100644 docs/src/auto_examples/tutorials/run_ensemblelda.rst create mode 100644 docs/src/auto_examples/tutorials/run_ensemblelda_codeobj.pickle diff --git a/docs/src/auto_examples/index.rst b/docs/src/auto_examples/index.rst index ca3c1ec019..dfee557a6c 100644 --- a/docs/src/auto_examples/index.rst +++ b/docs/src/auto_examples/index.rst @@ -71,7 +71,7 @@ Understanding this functionality is vital for using gensim effectively. .. raw:: html -
+
.. only:: html @@ -92,7 +92,7 @@ Understanding this functionality is vital for using gensim effectively. .. raw:: html -
+
.. only:: html @@ -169,7 +169,7 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. raw:: html -
+
.. only:: html @@ -190,7 +190,28 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. raw:: html -
+
+ +.. only:: html + + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png + :alt: Ensemble LDA + + :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /auto_examples/tutorials/run_ensemblelda + +.. raw:: html + +
.. only:: html @@ -267,7 +288,7 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. raw:: html -
+
.. only:: html @@ -288,7 +309,7 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. raw:: html -
+
.. only:: html @@ -405,13 +426,13 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from .. container:: sphx-glr-download sphx-glr-download-python - :download:`Download all examples in Python source code: auto_examples_python.zip ` + :download:`Download all examples in Python source code: auto_examples_python.zip ` .. container:: sphx-glr-download sphx-glr-download-jupyter - :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` + :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` .. only:: html diff --git a/docs/src/auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png b/docs/src/auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png new file mode 100644 index 0000000000000000000000000000000000000000..233f8e605efca4bef384a7c603d53fdc385428bc GIT binary patch literal 26786 zcmdRV^;2BU^L2u2U~zXzaCgrl!EKS?!Gl|HcbDK2+!l9tx8Uwhki}V?FVCmmf8zb& zR^6I=Yo=zVr~8~U-QmiL(eC?7t2K>sEKRQ>SbW8C{gfQ0bg;;5VB{^5g#S%F4jS*yo)sIw9jjnBhR8wuiv*@haoMbsRi`-;s) z9j%)m&qag`YmLl;-VH-wNFilp)`}=RE6iLb8P1ZXot7lPS$|;j zQ6%{N?}NV#OoGh&8zR;Jmp{}RVNldC0&_43DEv|>!G?j^q;OxAyX7Gs_2A9hI-55o zJ)F=U@-pAz+w*?0hK)p%z(s{qJ1Q+Wv_8Z|z3!m1Cs=&-?)J_5Y815AaDH?$jYLqo(p$kO2iW|9P%S0_{dC|2h=c{Q!&bPfpfv0!@c%J z#_Pjh7|{hTI0kIgMg zxdOWkb%vZV^4lVL61EU=I9L*Tl+?h$RKCIz*L82w5j+y@957yB-O`t_fpVnvVkw>7 zP^nvW>L7(+HJXphZ(3gXtZxLHZqdPVywxY{9!|2K&AEn|@;JMhAZj7?4y>5SAxJrx zurYaog3=|Jtznf^wPDLtdAwz{-~VzM&<1f=cKIarDeaAWbp~AUPnA+mO6+`%gWKCS zLp?PEPTuCPmyhdAo0>J$4)AJv7RQniq@AMH$$_|;sq({O!`Uktc6UT#*dpB361N~I z%}5Y5rnnv`0pPZ?4^O==rd+Ct+d=y*e@vt1VbH@M#e;)=cJ$F$(v2B^Vh2r+Vx~*+Z!G1Zgmq&GRTTsGq2_{@Nw>D#DWRt=`t`6c z7T8oA;ZHP!1QLB9M_mrUm>*io2P8vwgJhjCQtU#iZyN}MKEQUSOs}$gCgi)#1GuGMe*f#+Z&G1I+vYi6K9g)e*Db>=_;17hfr#kK~YJsMnHz>ujQG zarIZk$|9PqN6mLW?3l6YSILh*)(}TG6B!@=Y&v9B#NjQ@cyJA{AT*Rq{1RO3KfRt& zITYih)&2p&c2`~JyxOI7rZ8o2AVPcMJ0duaT+CTY?{Jzelx=^QZxo4TA}5e|0FnJx z2h9h%+q97bj6^`YgpWV zZE^bI*Om;wM(XMc%-aGKa}>axr_VZ=;zDWt8O2-J^Bxv}Ut2%&^MW1SJF|SbFz&kG zy&cWX=>0)&urGzaisqDY!NnC_{E{vDTOHp*`@+VJID&^uD425yvR;&V7JOge@>7l;gt``i}FMDWyMVa`YI-%kD!E>S> zL|JS2ysS@b3@)Qb=BIHHjmnEDA|!p~!={`J0RJ#+a8dEKKlk|iw9a|2w`u3eeq7J2 zf-BAe-~bY*&0(nx$VAm(#iNTWBKx>hl>cQ(d|miS=*dXB;Oo=bVeiP)u35s(Z6STU zHpM@WqxIr<<7Z_^I!@Fq-s0HMekIN5ZdZuA8p<$z5M*`7l{e(44IGp#EYwEGo9l@@ z6GbbJ5wt%PcTH`mE9=xg_LPL9p}aIOXebdn84NVw&M*s5qmw zSAf$>HX1)|Ed8?UGO(gY6zH<4agAW*RQ-Ux7>@~#J zmS$%j>>%DF4Y=98ql77up`%Kpu;DPvUrv*EhV?47yMlP0yl6hzZ|fFAuTWA614#IsaO$_k zcURoeoZI7P`+2*MMC}ycnKbq zdhu&vPwa4VxmvM1`j?s0O}oTP&vV-WMtq+90RkmnkO&!Yvc#TgvQ`BFLbdrz?S;T~ zehN^!jvpxcNekpZ#eH3Pq#LsoY`b@>QN$aAV{~(&#mAEtd`K+bsZ4IGv=%(zq*3Xs z;3Rfjc zkktoX6uL9`be86_LQik`_Ya-b3(;-vd_U$lnA*wemDJ|?HO9AdUpJdGrMmL_VXdOB z53f;FLk`&=fS`S&PG^o`)InjY@-!!OH&@QVDqhZ(i_7rANfL{LH&pQ<#6nd=k@R&QX0zW~~Wq*_`b9rNZYq=4x|Qbh_5fW+|0jZXKA2Gglg!KYS4dpoJq9281$ ztI5PC9ZkFni;lbsSRI+`dV_<2vI#7bicpH+{d`{I&a(w2bj5n;~YRsn$5{c$bbM zLp`gi<}PqEx5=fu{@$`b8|&0pik;)ojkp*SGu9J-aY?~8d}E6YnF?L^Q*N$;h)+)M zM%+AaC)QY@X5SI#N78RF95e}jrx?i{u%sUH( z&F-em&Q+v^3Hfvted*?*6eORtd44he9$m_2uf_-u6H^xFve=)EF>h`gg3$ic6_)ea zQjmCUv-pK|(_$10b)X&)yZeaX`^%g0HIP?RAgl7^kiyoT z2Sc+M8OsBmIm<3h^&9K>>%hL~oMI^=-xk0PY9Hx^Ewq?q#4JliEENe7 zE(%!vQ~SSRB+U}ICT{;Xs%G|W-``$ zTF?jz$B)r(#XFeWV2$xsxW0HgSK7HZ@UUu-{kkE@Lky5%Pt6>X5`q95F(?F5Eu70V z!Y>OcUe9E%E7;u4;(y!~BRy<>z1^!Gdg6;?irc1Kk2vwYfn1HrnS{p1m`s{7WIG?q zAmwgD`j59UQbjRcnPP5pEhm5`vOhBW)AUYPuYPW_hIU&}F7HbX3fy+AX{6>soT1g# zD2<)GielaDxB>of?4Ec<(TnLceijD3Ze7h0suV%Imdj$OeLqMG6_R z2~|YF_uo{q{8~vyb`SpCf4k4BOkZ)C9Z#YMq^*{p-nuS;<;qOG_UAM1&feCCiddBx 
diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb b/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb new file mode 100644 index 0000000000..f4aea41496 --- /dev/null +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb @@ -0,0 +1,205 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n# Ensemble LDA\n\nIntroduces Gensim's EnsembleLda model\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import logging\nlogging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This tutorial will explain how to use the EnsembleLda model class.\n\nEnsembleLda is a method of finding and generating stable topics from the results of multiple topic models.\nIt can therefore be used to remove topics from your results that are noise and are not reproducible.\n\n\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Corpus\nWe will use the gensim downloader api to get a small corpus for training our ensemble.\n\nThe preprocessing is similar to `sphx_glr_auto_examples_tutorials_run_word2vec.py`,\nso it won't be explained again in detail.\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import gensim.downloader as api\nfrom gensim.corpora import Dictionary\nfrom nltk.stem.wordnet import WordNetLemmatizer\n\nlemmatizer = WordNetLemmatizer()\ndocs = api.load('text8')\ndocs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]\ndictionary = 
Dictionary(docs)\ndictionary.filter_extremes(no_below=20, no_above=0.5)\ncorpus = [dictionary.doc2bow(doc) for doc in docs]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training\n\nTraining the ensemble works very similarly to training a single model.\n\nYou can use any model that is based on LdaModel, such as LdaMulticore, to train the Ensemble.\nIn experiments, LdaMulticore showed better results.\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from gensim.models import LdaModel\ntopic_model_class = LdaModel" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Any number of models can be used, but it should be a multiple of your workers so that the\nload can be distributed properly. In this example, 4 worker processes will train the 8 models, i.e. 2 models each.\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ensemble_workers = 4\nnum_models = 8" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After training all the models, some distance computations are required which can take quite some\ntime as well. You can speed this up by using workers for that as well.\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "distance_workers = 4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All other parameters that are unknown to EnsembleLda are forwarded to each LDA model, such as ``num_topics`` and ``passes``:\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "num_topics = 20\npasses = 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now start the training.\n\nSince each of the 8 models is trained with 20 topics, we expect there to be 160 different topics.\nThe number of stable topics which are clustered from all those topics is smaller.\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from gensim.models import EnsembleLda\nensemble = EnsembleLda(\n    corpus=corpus,\n    id2word=dictionary,\n    num_topics=num_topics,\n    passes=passes,\n    num_models=num_models,\n    topic_model_class=LdaModel,\n    ensemble_workers=ensemble_workers,\n    distance_workers=distance_workers\n)\n\nprint(len(ensemble.ttda))\nprint(len(ensemble.get_topics()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tuning\n\nUnlike with LdaModel, the number of resulting topics varies greatly depending on the clustering parameters.\n\nYou can provide those in the ``recluster()`` function or the ``EnsembleLda`` constructor. 
Play around\nuntil you get as many topics as you desire, though forcing a specific number this way may reduce their quality.\nIf your ensemble doesn't have enough topics to begin with, make sure the ensemble itself is large enough.\n\nHaving an epsilon that is smaller than the smallest distance doesn't make sense.\nMake sure to choose one that is within the range of values.\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import numpy as np\nshape = ensemble.asymmetric_distance_matrix.shape\nwithout_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1)\nwithout_diagonal.min(), without_diagonal.mean(), without_diagonal.max()\n\nensemble.recluster(eps=0.1, min_samples=2, min_cores=2)\n\nprint(len(ensemble.get_topics()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Increasing the Size\n\nIf you have some models lying around that were trained on a corpus based on the same dictionary,\nthey are compatible and you can add them to the ensemble.\n\nBy setting num_models of the EnsembleLda constructor to 0, you can also create an ensemble that\nconsists entirely of your existing topic models, using the following method.\n\nAfterwards the number and quality of stable topics might be different depending on your added topics and parameters.\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from gensim.models import LdaMulticore\n\nmodel1 = LdaMulticore(\n    corpus=corpus,\n    id2word=dictionary,\n    num_topics=9,\n    passes=4,\n)\n\nmodel2 = LdaModel(\n    corpus=corpus,\n    id2word=dictionary,\n    num_topics=11,\n    passes=2,\n)\n\n# add_model supports various types of input, check out its docstring\nensemble.add_model(model1)\nensemble.add_model(model2)\n\nensemble.recluster()\n\nprint(len(ensemble.ttda))\nprint(len(ensemble.get_topics()))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.py b/docs/src/auto_examples/tutorials/run_ensemblelda.py new file mode 100644 index 0000000000..b8e8e20887 --- /dev/null +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.py @@ -0,0 +1,155 @@ +r""" +Ensemble LDA +============ + +Introduces Gensim's EnsembleLda model + +""" + +import logging +logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + +############################################################################### +# This tutorial will explain how to use the EnsembleLda model class. +# +# EnsembleLda is a method of finding and generating stable topics from the results of multiple topic models. +# It can therefore be used to remove topics from your results that are noise and are not reproducible. +# + +############################################################################### +# Corpus +# ------ +# We will use the gensim downloader api to get a small corpus for training our ensemble. 
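+# (``text8`` is the first 100 MB of a cleaned English Wikipedia dump, which the
+# downloader yields as 1701 plain-text documents.)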
+# +# The preprocessing is similar to :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py`, +# so it won't be explained again in detail. +# + +import gensim.downloader as api +from gensim.corpora import Dictionary +from nltk.stem.wordnet import WordNetLemmatizer + +lemmatizer = WordNetLemmatizer() +docs = api.load('text8') +docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs] +dictionary = Dictionary(docs) +dictionary.filter_extremes(no_below=20, no_above=0.5) +corpus = [dictionary.doc2bow(doc) for doc in docs] + +############################################################################### +# Training +# -------- +# +# Training the ensemble works very similarly to training a single model. +# +# You can use any model that is based on LdaModel, such as LdaMulticore, to train the Ensemble. +# In experiments, LdaMulticore showed better results. +# + +from gensim.models import LdaModel +topic_model_class = LdaModel + +############################################################################### +# Any number of models can be used, but it should be a multiple of your workers so that the +# load can be distributed properly. In this example, 4 worker processes will train the 8 models, i.e. 2 models each. +# + +ensemble_workers = 4 +num_models = 8 + +############################################################################### +# After training all the models, some distance computations are required which can take quite some +# time as well. You can speed this up by using workers for that as well. +# + +distance_workers = 4 + +############################################################################### +# All other parameters that are unknown to EnsembleLda are forwarded to each LDA model, such as ``num_topics`` and ``passes``: +# +num_topics = 20 +passes = 2 + +############################################################################### +# Now start the training. +# +# Since each of the 8 models is trained with 20 topics, we expect there to be 160 different topics. +# The number of stable topics which are clustered from all those topics is smaller. +# + +from gensim.models import EnsembleLda +ensemble = EnsembleLda( + corpus=corpus, + id2word=dictionary, + num_topics=num_topics, + passes=passes, + num_models=num_models, + topic_model_class=LdaModel, + ensemble_workers=ensemble_workers, + distance_workers=distance_workers +) + +print(len(ensemble.ttda)) +print(len(ensemble.get_topics())) + +############################################################################### +# Tuning +# ------ +# +# Unlike with LdaModel, the number of resulting topics varies greatly depending on the clustering parameters. +# +# You can provide those in the ``recluster()`` function or the ``EnsembleLda`` constructor. Play around +# until you get as many topics as you desire, though forcing a specific number this way may reduce their quality. +# If your ensemble doesn't have enough topics to begin with, make sure the ensemble itself is large enough. +# +# Having an epsilon that is smaller than the smallest distance doesn't make sense. +# Make sure to choose one that is within the range of values.
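+#
+# A rough editorial sketch (not part of the original tutorial): once
+# ``without_diagonal`` has been computed below, a low percentile of the observed
+# distances is a reasonable starting point for the epsilon, for example
+#
+#     eps_guess = float(np.percentile(without_diagonal, 10))
+#
+# which you could then pass to ``recluster(eps=eps_guess, min_samples=2, min_cores=2)``.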
+# + +import numpy as np +shape = ensemble.asymmetric_distance_matrix.shape +without_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1) +without_diagonal.min(), without_diagonal.mean(), without_diagonal.max() + +ensemble.recluster(eps=0.1, min_samples=2, min_cores=2) + +print(len(ensemble.get_topics())) + +############################################################################### +# Increasing the Size +# ------------------- +# +# If you have some models lying around that were trained on a corpus based on the same dictionary, +# they are compatible and you can add them to the ensemble. +# +# By setting num_models of the EnsembleLda constructor to 0, you can also create an ensemble that +# consists entirely of your existing topic models, using the following method. +# +# Afterwards the number and quality of stable topics might be different depending on your added topics and parameters. +# + +from gensim.models import LdaMulticore + +model1 = LdaMulticore( + corpus=corpus, + id2word=dictionary, + num_topics=9, + passes=4, +) + +model2 = LdaModel( + corpus=corpus, + id2word=dictionary, + num_topics=11, + passes=2, +) + +# add_model supports various types of input, check out its docstring +ensemble.add_model(model1) +ensemble.add_model(model2) + +ensemble.recluster() + +print(len(ensemble.ttda)) +print(len(ensemble.get_topics())) diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 b/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 new file mode 100644 index 0000000000..d779810a22 --- /dev/null +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 @@ -0,0 +1 @@ +cebdae557712e17fb52871e2a012e9c8 \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.rst b/docs/src/auto_examples/tutorials/run_ensemblelda.rst new file mode 100644 index 0000000000..a22e937444 --- /dev/null +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.rst @@ -0,0 +1,420 @@ +.. only:: html + + .. note:: + :class: sphx-glr-download-link-note + + Click :ref:`here ` to download the full example code + .. rst-class:: sphx-glr-example-title + + .. _sphx_glr_auto_examples_tutorials_run_ensemblelda.py: + + +Ensemble LDA +============ + +Introduces Gensim's EnsembleLda model + + +.. code-block:: default + + + import logging + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + + + + + + + + +This tutorial will explain how to use the EnsembleLda model class. + +EnsembleLda is a method of finding and generating stable topics from the results of multiple topic models. +It can therefore be used to remove topics from your results that are noise and are not reproducible. + + +Corpus +------ +We will use the gensim downloader api to get a small corpus for training our ensemble. + +The preprocessing is similar to :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py`, +so it won't be explained again in detail. + + + +.. code-block:: default + + + import gensim.downloader as api + from gensim.corpora import Dictionary + from nltk.stem.wordnet import WordNetLemmatizer + + lemmatizer = WordNetLemmatizer() + docs = api.load('text8') + docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs] + dictionary = Dictionary(docs) + dictionary.filter_extremes(no_below=20, no_above=0.5) + corpus = [dictionary.doc2bow(doc) for doc in docs] + + + + + +.. rst-class:: sphx-glr-script-out + + Out: + + ..
code-block:: none + + 2020-10-25 01:05:04,690 : INFO : adding document #0 to Dictionary(0 unique tokens: []) + 2020-10-25 01:05:10,080 : INFO : built Dictionary(238542 unique tokens: ['a', 'abacus', 'ability', 'able', 'abnormal']...) from 1701 documents (total 17005207 corpus positions) + 2020-10-25 01:05:10,262 : INFO : discarding 218466 tokens: [('a', 1701), ('ability', 934), ('able', 1202), ('about', 1687), ('above', 1327), ('abstention', 13), ('accepted', 945), ('according', 1468), ('account', 1113), ('act', 1312)]... + 2020-10-25 01:05:10,263 : INFO : keeping 20076 tokens which were in no less than 20 and no more than 850 (=50.0%) documents + 2020-10-25 01:05:10,338 : INFO : resulting dictionary: Dictionary(20076 unique tokens: ['abacus', 'abnormal', 'abolished', 'abolition', 'absence']...) + + + + +Training +-------- + +Training the ensemble works very similarly to training a single model. + +You can use any model that is based on LdaModel, such as LdaMulticore, to train the Ensemble. +In experiments, LdaMulticore showed better results. + + + +.. code-block:: default + + + from gensim.models import LdaModel + topic_model_class = LdaModel + + + + + + + + +Any number of models can be used, but it should be a multiple of your workers so that the +load can be distributed properly. In this example, 4 worker processes will train the 8 models, i.e. 2 models each. + + + +.. code-block:: default + + + ensemble_workers = 4 + num_models = 8 + + + + + + + + +After training all the models, some distance computations are required which can take quite some +time as well. You can speed this up by using workers for that as well. + + + +.. code-block:: default + + + distance_workers = 4 + + + + + + + + +All other parameters that are unknown to EnsembleLda are forwarded to each LDA model, such as ``num_topics`` and ``passes``: + + + +.. code-block:: default + + num_topics = 20 + passes = 2 + + + + + + + + +Now start the training. + +Since each of the 8 models is trained with 20 topics, we expect there to be 160 different topics. +The number of stable topics which are clustered from all those topics is smaller. + + + +.. code-block:: default + + + from gensim.models import EnsembleLda + ensemble = EnsembleLda( + corpus=corpus, + id2word=dictionary, + num_topics=num_topics, + passes=passes, + num_models=num_models, + topic_model_class=LdaModel, + ensemble_workers=ensemble_workers, + distance_workers=distance_workers + ) + + print(len(ensemble.ttda)) + print(len(ensemble.get_topics())) + + + + + +.. rst-class:: sphx-glr-script-out + + Out: + + .. code-block:: none + + 2020-10-25 01:05:13,988 : INFO : generating 8 topic models... + 2020-10-25 01:09:45,442 : INFO : generating a 160 x 160 asymmetric distance matrix... 
+ 2020-10-25 01:09:46,332 : INFO : fitting the clustering model, using 4 for min_samples + 2020-10-25 01:09:46,348 : INFO : generating stable topics, using 3 for min_cores + 2020-10-25 01:09:46,348 : INFO : found 32 clusters + 2020-10-25 01:09:46,350 : INFO : found 2 stable topics + 2020-10-25 01:09:46,350 : INFO : generating classic gensim model representation based on results from the ensemble + 2020-10-25 01:09:46,496 : INFO : using symmetric alpha at 0.5 + 2020-10-25 01:09:46,496 : INFO : using symmetric eta at 0.5 + 2020-10-25 01:09:46,497 : INFO : using serial LDA version on this node + 2020-10-25 01:09:46,500 : INFO : running online (multi-pass) LDA training, 2 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2020-10-25 01:09:46,500 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 160 + 2 + + + + +Tuning +------ + +Unlike with LdaModel, the number of resulting topics varies greatly depending on the clustering parameters. + +You can provide those in the ``recluster()`` function or the ``EnsembleLda`` constructor. Play around +until you get as many topics as you desire, though forcing a specific number this way may reduce their quality. +If your ensemble doesn't have enough topics to begin with, make sure the ensemble itself is large enough. + +Having an epsilon that is smaller than the smallest distance doesn't make sense. +Make sure to choose one that is within the range of values. + + + +.. code-block:: default + + + import numpy as np + shape = ensemble.asymmetric_distance_matrix.shape + without_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1) + without_diagonal.min(), without_diagonal.mean(), without_diagonal.max() + + ensemble.recluster(eps=0.1, min_samples=2, min_cores=2) + + print(len(ensemble.get_topics())) + + + + + +.. rst-class:: sphx-glr-script-out + + Out: + + .. code-block:: none + + 2020-10-25 01:09:46,802 : INFO : fitting the clustering model + 2020-10-25 01:09:46,816 : INFO : generating stable topics + 2020-10-25 01:09:46,816 : INFO : found 47 clusters + 2020-10-25 01:09:46,819 : INFO : found 2 stable topics + 2020-10-25 01:09:46,819 : INFO : generating classic gensim model representation based on results from the ensemble + 2020-10-25 01:09:46,821 : INFO : using symmetric alpha at 0.5 + 2020-10-25 01:09:46,821 : INFO : using symmetric eta at 0.5 + 2020-10-25 01:09:46,823 : INFO : using serial LDA version on this node + 2020-10-25 01:09:46,825 : INFO : running online (multi-pass) LDA training, 2 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2020-10-25 01:09:46,825 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2 + + + + +Increasing the Size +------------------- + +If you have some models lying around that were trained on a corpus based on the same dictionary, +they are compatible and you can add them to the ensemble. + +By setting num_models of the EnsembleLda constructor to 0, you can also create an ensemble that +consists entirely of your existing topic models, using the following method.
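+
+A minimal sketch of that workflow (an editorial illustration, not part of the
+generated gallery output; ``pretrained_model`` is a placeholder for any existing
+LdaModel-based model trained with the same ``dictionary``, and the exact accepted
+arguments are documented in the class docstring):
+
+.. code-block:: python
+
+    fresh_ensemble = EnsembleLda(corpus=corpus, id2word=dictionary,
+                                 num_topics=num_topics, num_models=0)
+    fresh_ensemble.add_model(pretrained_model)
+    fresh_ensemble.recluster()
+    print(len(fresh_ensemble.get_topics()))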
+ +Afterwards the number and quality of stable topics might be different depending on your added topics and parameters. + + + +.. code-block:: default + + + from gensim.models import LdaMulticore + + model1 = LdaMulticore( + corpus=corpus, + id2word=dictionary, + num_topics=9, + passes=4, + ) + + model2 = LdaModel( + corpus=corpus, + id2word=dictionary, + num_topics=11, + passes=2, + ) + + # add_model supports various types of input, check out its docstring + ensemble.add_model(model1) + ensemble.add_model(model2) + + ensemble.recluster() + + print(len(ensemble.ttda)) + print(len(ensemble.get_topics())) + + + + +.. rst-class:: sphx-glr-script-out + + Out: + + .. code-block:: none + + 2020-10-25 01:09:47,300 : INFO : using symmetric alpha at 0.1111111111111111 + 2020-10-25 01:09:47,301 : INFO : using symmetric eta at 0.1111111111111111 + 2020-10-25 01:09:47,304 : INFO : using serial LDA version on this node + 2020-10-25 01:09:47,314 : INFO : running online LDA training, 9 topics, 4 passes over the supplied corpus of 1701 documents, updating every 22000 documents, evaluating every ~1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2020-10-25 01:09:47,314 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2020-10-25 01:09:47,315 : INFO : training LDA model using 11 processes + 2020-10-25 01:09:47,546 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2020-10-25 01:09:52,514 : INFO : topic #8 (0.111): 0.001*"league" + 0.001*"minister" + 0.001*"jewish" + 0.001*"animal" + 0.001*"km" + 0.001*"soviet" + 0.001*"election" + 0.001*"emperor" + 0.001*"energy" + 0.001*"cell" + 2020-10-25 01:09:52,514 : INFO : topic #2 (0.111): 0.001*"km" + 0.001*"china" + 0.001*"county" + 0.001*"album" + 0.001*"jewish" + 0.001*"california" + 0.001*"actor" + 0.001*"economy" + 0.001*"jew" + 0.001*"band" + 2020-10-25 01:09:52,514 : INFO : topic #6 (0.111): 0.001*"km" + 0.001*"election" + 0.001*"software" + 0.001*"user" + 0.001*"japanese" + 0.001*"female" + 0.001*"minister" + 0.001*"prime" + 0.001*"season" + 0.001*"saint" + 2020-10-25 01:09:52,515 : INFO : topic #0 (0.111): 0.001*"actor" + 0.001*"km" + 0.001*"cell" + 0.001*"minister" + 0.001*"italian" + 0.001*"male" + 0.001*"economy" + 0.001*"energy" + 0.001*"y" + 0.001*"soviet" + 2020-10-25 01:09:52,515 : INFO : topic #5 (0.111): 0.001*"band" + 0.001*"actor" + 0.001*"soviet" + 0.001*"album" + 0.001*"india" + 0.001*"minister" + 0.001*"ship" + 0.001*"km" + 0.001*"energy" + 0.001*"china" + 2020-10-25 01:09:52,515 : INFO : topic diff=1.009308, rho=1.000000 + 2020-10-25 01:10:03,105 : INFO : -9.257 per-word bound, 611.7 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2020-10-25 01:10:03,105 : INFO : PROGRESS: pass 1, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2020-10-25 01:10:07,873 : INFO : topic #4 (0.111): 0.002*"album" + 0.001*"energy" + 0.001*"band" + 0.001*"blue" + 0.001*"cell" + 0.001*"emperor" + 0.001*"bc" + 0.001*"actor" + 0.001*"japanese" + 0.001*"software" + 2020-10-25 01:10:07,873 : INFO : topic #2 (0.111): 0.001*"jewish" + 0.001*"jew" + 0.001*"carbon" + 0.001*"county" + 0.001*"california" + 0.001*"jesus" + 0.001*"china" + 0.001*"km" + 0.001*"cell" + 0.001*"gospel" + 2020-10-25 01:10:07,873 : INFO : topic #8 (0.111): 0.002*"league" + 0.001*"season" + 0.001*"baseball" + 0.001*"jewish" + 0.001*"animal" + 0.001*"cell" + 0.001*"y" + 
0.001*"ball" + 0.001*"china" + 0.001*"minister" + 2020-10-25 01:10:07,874 : INFO : topic #1 (0.111): 0.002*"km" + 0.002*"soviet" + 0.002*"minister" + 0.002*"election" + 0.001*"est" + 0.001*"economy" + 0.001*"male" + 0.001*"chinese" + 0.001*"liberal" + 0.001*"russian" + 2020-10-25 01:10:07,874 : INFO : topic #5 (0.111): 0.002*"band" + 0.001*"album" + 0.001*"window" + 0.001*"actor" + 0.001*"comic" + 0.001*"irish" + 0.001*"india" + 0.001*"user" + 0.001*"microsoft" + 0.001*"minister" + 2020-10-25 01:10:07,874 : INFO : topic diff=0.149585, rho=0.592297 + 2020-10-25 01:10:18,504 : INFO : -9.186 per-word bound, 582.3 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2020-10-25 01:10:18,505 : INFO : PROGRESS: pass 2, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2020-10-25 01:10:23,685 : INFO : topic #2 (0.111): 0.002*"jewish" + 0.001*"jew" + 0.001*"jesus" + 0.001*"carbon" + 0.001*"gospel" + 0.001*"bible" + 0.001*"california" + 0.001*"hebrew" + 0.001*"cell" + 0.001*"translation" + 2020-10-25 01:10:23,685 : INFO : topic #5 (0.111): 0.002*"band" + 0.002*"album" + 0.002*"window" + 0.001*"comic" + 0.001*"microsoft" + 0.001*"irish" + 0.001*"apple" + 0.001*"user" + 0.001*"software" + 0.001*"actor" + 2020-10-25 01:10:23,685 : INFO : topic #4 (0.111): 0.002*"album" + 0.002*"energy" + 0.002*"band" + 0.001*"cell" + 0.001*"blue" + 0.001*"y" + 0.001*"bc" + 0.001*"guitar" + 0.001*"emperor" + 0.001*"universe" + 2020-10-25 01:10:23,686 : INFO : topic #1 (0.111): 0.003*"km" + 0.002*"election" + 0.002*"est" + 0.002*"soviet" + 0.002*"minister" + 0.002*"economy" + 0.002*"male" + 0.001*"territory" + 0.001*"liberal" + 0.001*"elected" + 2020-10-25 01:10:23,686 : INFO : topic #8 (0.111): 0.003*"league" + 0.001*"baseball" + 0.001*"season" + 0.001*"ball" + 0.001*"drug" + 0.001*"football" + 0.001*"cell" + 0.001*"animal" + 0.001*"park" + 0.001*"algorithm" + 2020-10-25 01:10:23,686 : INFO : topic diff=0.178759, rho=0.509614 + 2020-10-25 01:10:34,248 : INFO : -9.123 per-word bound, 557.6 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2020-10-25 01:10:34,249 : INFO : PROGRESS: pass 3, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2020-10-25 01:10:38,910 : INFO : topic #5 (0.111): 0.002*"band" + 0.002*"album" + 0.002*"window" + 0.002*"microsoft" + 0.002*"comic" + 0.002*"irish" + 0.002*"apple" + 0.001*"software" + 0.001*"user" + 0.001*"musical" + 2020-10-25 01:10:38,910 : INFO : topic #2 (0.111): 0.002*"jewish" + 0.002*"jew" + 0.002*"jesus" + 0.001*"carbon" + 0.001*"gospel" + 0.001*"bible" + 0.001*"hebrew" + 0.001*"christ" + 0.001*"translation" + 0.001*"judaism" + 2020-10-25 01:10:38,910 : INFO : topic #3 (0.111): 0.002*"import" + 0.002*"card" + 0.002*"tree" + 0.001*"league" + 0.001*"bit" + 0.001*"ford" + 0.001*"fiction" + 0.001*"season" + 0.001*"fan" + 0.001*"novel" + 2020-10-25 01:10:38,911 : INFO : topic #7 (0.111): 0.003*"actor" + 0.002*"emperor" + 0.002*"alexander" + 0.002*"bc" + 0.002*"chinese" + 0.001*"actress" + 0.001*"jewish" + 0.001*"china" + 0.001*"singer" + 0.001*"japanese" + 2020-10-25 01:10:38,911 : INFO : topic #1 (0.111): 0.003*"km" + 0.003*"election" + 0.003*"est" + 0.003*"soviet" + 0.003*"minister" + 0.003*"economy" + 0.002*"male" + 0.002*"territory" + 0.002*"parliament" + 0.002*"elected" + 2020-10-25 01:10:38,911 : INFO : topic diff=0.154319, rho=0.454053 + 2020-10-25 01:10:49,397 : INFO : -9.086 per-word bound, 543.5 perplexity estimate based on a held-out corpus of 1701 
documents with 4692704 words + 2020-10-25 01:10:49,473 : INFO : using symmetric alpha at 0.09090909090909091 + 2020-10-25 01:10:49,473 : INFO : using symmetric eta at 0.09090909090909091 + 2020-10-25 01:10:49,475 : INFO : using serial LDA version on this node + 2020-10-25 01:10:49,488 : INFO : running online (multi-pass) LDA training, 11 topics, 2 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2020-10-25 01:10:49,488 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2020-10-25 01:11:00,304 : INFO : -10.500 per-word bound, 1448.1 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2020-10-25 01:11:00,304 : INFO : PROGRESS: pass 0, at document #1701/1701 + 2020-10-25 01:11:03,567 : INFO : topic #4 (0.091): 0.001*"moon" + 0.001*"actor" + 0.001*"cell" + 0.001*"election" + 0.001*"minister" + 0.001*"prime" + 0.001*"software" + 0.001*"novel" + 0.001*"emperor" + 0.001*"blue" + 2020-10-25 01:11:03,567 : INFO : topic #10 (0.091): 0.001*"emperor" + 0.001*"actor" + 0.001*"election" + 0.001*"italian" + 0.001*"soviet" + 0.001*"irish" + 0.001*"love" + 0.001*"card" + 0.001*"china" + 0.001*"ball" + 2020-10-25 01:11:03,567 : INFO : topic #8 (0.091): 0.001*"league" + 0.001*"territory" + 0.001*"jewish" + 0.001*"minister" + 0.001*"y" + 0.001*"election" + 0.001*"india" + 0.001*"km" + 0.001*"machine" + 0.001*"map" + 2020-10-25 01:11:03,568 : INFO : topic #3 (0.091): 0.001*"soviet" + 0.001*"km" + 0.001*"software" + 0.001*"actor" + 0.001*"india" + 0.001*"indian" + 0.001*"animal" + 0.001*"male" + 0.001*"prime" + 0.001*"philosophy" + 2020-10-25 01:11:03,568 : INFO : topic #2 (0.091): 0.001*"actor" + 0.001*"league" + 0.001*"season" + 0.001*"cell" + 0.001*"band" + 0.001*"emperor" + 0.001*"album" + 0.001*"station" + 0.001*"female" + 0.001*"km" + 2020-10-25 01:11:03,568 : INFO : topic diff=1.023399, rho=1.000000 + 2020-10-25 01:11:14,271 : INFO : -9.279 per-word bound, 621.1 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2020-10-25 01:11:14,271 : INFO : PROGRESS: pass 1, at document #1701/1701 + 2020-10-25 01:11:17,511 : INFO : topic #1 (0.091): 0.002*"energy" + 0.001*"season" + 0.001*"league" + 0.001*"africa" + 0.001*"chinese" + 0.001*"football" + 0.001*"blue" + 0.001*"ship" + 0.001*"machine" + 0.001*"symbol" + 2020-10-25 01:11:17,511 : INFO : topic #4 (0.091): 0.002*"cell" + 0.002*"moon" + 0.001*"acid" + 0.001*"disease" + 0.001*"plant" + 0.001*"specie" + 0.001*"surface" + 0.001*"scientific" + 0.001*"software" + 0.001*"energy" + 2020-10-25 01:11:17,512 : INFO : topic #8 (0.091): 0.001*"territory" + 0.001*"league" + 0.001*"jewish" + 0.001*"minister" + 0.001*"election" + 0.001*"machine" + 0.001*"india" + 0.001*"irish" + 0.001*"apple" + 0.001*"y" + 2020-10-25 01:11:17,512 : INFO : topic #3 (0.091): 0.001*"import" + 0.001*"horse" + 0.001*"soviet" + 0.001*"km" + 0.001*"india" + 0.001*"software" + 0.001*"animal" + 0.001*"male" + 0.001*"indian" + 0.001*"lake" + 2020-10-25 01:11:17,512 : INFO : topic #7 (0.091): 0.002*"actor" + 0.002*"emperor" + 0.001*"league" + 0.001*"software" + 0.001*"y" + 0.001*"band" + 0.001*"linux" + 0.001*"italian" + 0.001*"singer" + 0.001*"season" + 2020-10-25 01:11:17,513 : INFO : topic diff=0.165568, rho=0.577350 + 2020-10-25 01:11:17,513 : INFO : ensemble contains 9 models and 160 topics now + 
2020-10-25 01:11:17,519 : INFO : ensemble contains 10 models and 169 topics now + 2020-10-25 01:11:17,525 : INFO : asymmetric distance matrix is outdated due to add_model + 2020-10-25 01:11:17,525 : INFO : generating a 180 x 180 asymmetric distance matrix... + 2020-10-25 01:11:18,609 : INFO : fitting the clustering model, using 5 for min_samples + 2020-10-25 01:11:18,629 : INFO : generating stable topics, using 3 for min_cores + 2020-10-25 01:11:18,629 : INFO : found 19 clusters + 2020-10-25 01:11:18,634 : INFO : found 1 stable topics + 2020-10-25 01:11:18,635 : INFO : generating classic gensim model representation based on results from the ensemble + 2020-10-25 01:11:18,635 : INFO : using symmetric alpha at 1.0 + 2020-10-25 01:11:18,635 : INFO : using symmetric eta at 1.0 + 2020-10-25 01:11:18,637 : INFO : using serial LDA version on this node + 2020-10-25 01:11:18,638 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2020-10-25 01:11:18,638 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 180 + 1 + + + + + +.. rst-class:: sphx-glr-timing + + **Total running time of the script:** ( 6 minutes 52.125 seconds) + +**Estimated memory usage:** 1636 MB + + +.. _sphx_glr_download_auto_examples_tutorials_run_ensemblelda.py: + + +.. only :: html + + .. container:: sphx-glr-footer + :class: sphx-glr-footer-example + + + + .. container:: sphx-glr-download sphx-glr-download-python + + :download:`Download Python source code: run_ensemblelda.py ` + + + + .. container:: sphx-glr-download sphx-glr-download-jupyter + + :download:`Download Jupyter notebook: run_ensemblelda.ipynb ` + + +.. only:: html + + .. rst-class:: sphx-glr-signature + + `Gallery generated by Sphinx-Gallery `_ diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda_codeobj.pickle b/docs/src/auto_examples/tutorials/run_ensemblelda_codeobj.pickle new file mode 100644 index 0000000000000000000000000000000000000000..91585af924ab6004df91e11df745eacc51ffe39e GIT binary patch literal 8641 [base85-encoded pickle data omitted]
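An editorial aside on the output above: ``recluster()`` reuses the distance matrix
that the ensemble has already computed (the logs show that only the clustering and
stable-topic steps are redone), so sweeping ``eps`` to reach a desired number of
stable topics is cheap. A sketch using only the calls shown in the tutorial:

.. code-block:: python

    for eps in (0.05, 0.09, 0.1, 0.2):
        ensemble.recluster(eps=eps, min_samples=2, min_cores=2)
        print(eps, len(ensemble.get_topics()))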
From: sezanzeb Date: Sun, 25 Oct 2020 01:21:34 +0200 Subject: [PATCH 121/166] changed eps in example --- docs/src/auto_examples/tutorials/run_ensemblelda.ipynb | 2 +- docs/src/auto_examples/tutorials/run_ensemblelda.py | 2 +- docs/src/gallery/tutorials/run_ensemblelda.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb b/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb index f4aea41496..3952a97d12 100644 --- a/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb @@ -159,7 +159,7 @@ }, "outputs": [], "source": [ - "import numpy as np\nshape = ensemble.asymmetric_distance_matrix.shape\nwithout_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1)\nwithout_diagonal.min(), without_diagonal.mean(), without_diagonal.max()\n\nensemble.recluster(eps=0.1, min_samples=2, min_cores=2)\n\nprint(len(ensemble.get_topics()))" + "import numpy as np\nshape = ensemble.asymmetric_distance_matrix.shape\nwithout_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1)\nwithout_diagonal.min(), without_diagonal.mean(), without_diagonal.max()\n\nensemble.recluster(eps=0.09, min_samples=2, min_cores=2)\n\nprint(len(ensemble.get_topics()))" ] }, { diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.py b/docs/src/auto_examples/tutorials/run_ensemblelda.py index b8e8e20887..bde40f98ea 100644 --- a/docs/src/auto_examples/tutorials/run_ensemblelda.py +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.py @@ -112,7 +112,7 @@ without_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1) without_diagonal.min(), without_diagonal.mean(), without_diagonal.max() -ensemble.recluster(eps=0.1, min_samples=2, min_cores=2) +ensemble.recluster(eps=0.09, min_samples=2, min_cores=2) print(len(ensemble.get_topics())) diff --git a/docs/src/gallery/tutorials/run_ensemblelda.py b/docs/src/gallery/tutorials/run_ensemblelda.py index b8e8e20887..bde40f98ea 100644 --- a/docs/src/gallery/tutorials/run_ensemblelda.py +++ b/docs/src/gallery/tutorials/run_ensemblelda.py @@ -112,7 +112,7 @@ without_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1) without_diagonal.min(), without_diagonal.mean(), without_diagonal.max() -ensemble.recluster(eps=0.1, min_samples=2, min_cores=2) 
+ensemble.recluster(eps=0.09, min_samples=2, min_cores=2) print(len(ensemble.get_topics())) From 0b5aaf298a6b903c4fe477504ed47a4ccd9e043c Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 25 Oct 2020 11:14:52 +0100 Subject: [PATCH 122/166] added to tutorials --- docs/src/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/src/conf.py b/docs/src/conf.py index 13d26e254b..42ede6d7f8 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -258,6 +258,7 @@ def sort_key(source_dir): 'run_lda.py', 'run_wmd.py', 'run_summarization.py', + 'run_ensemblelda.py', ] howto_order = [ From 3bd838e94001f82b0f9256f88982bcfcc2e1389f Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 25 Oct 2020 11:30:56 +0100 Subject: [PATCH 123/166] rebuilt --- docs/src/auto_examples/index.rst | 40 ++-- .../tutorials/run_ensemblelda.py.md5 | 2 +- .../tutorials/run_ensemblelda.rst | 208 +++++++++--------- .../tutorials/sg_execution_times.rst | 4 +- 4 files changed, 127 insertions(+), 127 deletions(-) diff --git a/docs/src/auto_examples/index.rst b/docs/src/auto_examples/index.rst index dfee557a6c..e46809100c 100644 --- a/docs/src/auto_examples/index.rst +++ b/docs/src/auto_examples/index.rst @@ -190,14 +190,14 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png - :alt: Ensemble LDA + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png + :alt: Fast Similarity Queries with Annoy and Word2Vec - :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` + :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` .. raw:: html @@ -207,18 +207,18 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_ensemblelda + /auto_examples/tutorials/run_annoy .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png - :alt: Fast Similarity Queries with Annoy and Word2Vec + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png + :alt: LDA Model - :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` + :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` .. raw:: html @@ -228,18 +228,18 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_annoy + /auto_examples/tutorials/run_lda .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png - :alt: LDA Model + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png + :alt: Word Mover's Distance - :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` + :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` .. raw:: html @@ -249,18 +249,18 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_lda + /auto_examples/tutorials/run_wmd .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png - :alt: Word Mover's Distance + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png + :alt: Ensemble LDA - :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` + :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` .. raw:: html @@ -270,7 +270,7 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_wmd + /auto_examples/tutorials/run_ensemblelda .. raw:: html
diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 b/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 index d779810a22..aff4513799 100644 --- a/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 @@ -1 +1 @@ -cebdae557712e17fb52871e2a012e9c8 \ No newline at end of file +f5b586678dc10d31de5c29251f4f7ad9 \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.rst b/docs/src/auto_examples/tutorials/run_ensemblelda.rst index a22e937444..f4cfc8ad5d 100644 --- a/docs/src/auto_examples/tutorials/run_ensemblelda.rst +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.rst @@ -67,11 +67,11 @@ so it won't be explained again in detail. .. code-block:: none - 2020-10-25 01:05:04,690 : INFO : adding document #0 to Dictionary(0 unique tokens: []) - 2020-10-25 01:05:10,080 : INFO : built Dictionary(238542 unique tokens: ['a', 'abacus', 'ability', 'able', 'abnormal']...) from 1701 documents (total 17005207 corpus positions) - 2020-10-25 01:05:10,262 : INFO : discarding 218466 tokens: [('a', 1701), ('ability', 934), ('able', 1202), ('about', 1687), ('above', 1327), ('abstention', 13), ('accepted', 945), ('according', 1468), ('account', 1113), ('act', 1312)]... - 2020-10-25 01:05:10,263 : INFO : keeping 20076 tokens which were in no less than 20 and no more than 850 (=50.0%) documents - 2020-10-25 01:05:10,338 : INFO : resulting dictionary: Dictionary(20076 unique tokens: ['abacus', 'abnormal', 'abolished', 'abolition', 'absence']...) + 2020-10-25 11:23:34,467 : INFO : adding document #0 to Dictionary(0 unique tokens: []) + 2020-10-25 11:23:39,808 : INFO : built Dictionary(238542 unique tokens: ['a', 'abacus', 'ability', 'able', 'abnormal']...) from 1701 documents (total 17005207 corpus positions) + 2020-10-25 11:23:39,989 : INFO : discarding 218466 tokens: [('a', 1701), ('ability', 934), ('able', 1202), ('about', 1687), ('above', 1327), ('abstention', 13), ('accepted', 945), ('according', 1468), ('account', 1113), ('act', 1312)]... + 2020-10-25 11:23:39,989 : INFO : keeping 20076 tokens which were in no less than 20 and no more than 850 (=50.0%) documents + 2020-10-25 11:23:40,063 : INFO : resulting dictionary: Dictionary(20076 unique tokens: ['abacus', 'abnormal', 'abolished', 'abolition', 'absence']...) @@ -185,18 +185,18 @@ The number of stable topics which are clustered from all those topics is smaller .. code-block:: none - 2020-10-25 01:05:13,988 : INFO : generating 8 topic models... - 2020-10-25 01:09:45,442 : INFO : generating a 160 x 160 asymmetric distance matrix... 
- 2020-10-25 01:09:46,332 : INFO : fitting the clustering model, using 4 for min_samples - 2020-10-25 01:09:46,348 : INFO : generating stable topics, using 3 for min_cores - 2020-10-25 01:09:46,348 : INFO : found 32 clusters - 2020-10-25 01:09:46,350 : INFO : found 2 stable topics - 2020-10-25 01:09:46,350 : INFO : generating classic gensim model representation based on results from the ensemble - 2020-10-25 01:09:46,496 : INFO : using symmetric alpha at 0.5 - 2020-10-25 01:09:46,496 : INFO : using symmetric eta at 0.5 - 2020-10-25 01:09:46,497 : INFO : using serial LDA version on this node - 2020-10-25 01:09:46,500 : INFO : running online (multi-pass) LDA training, 2 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2020-10-25 01:09:46,500 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2020-10-25 11:23:43,277 : INFO : generating 8 topic models... + 2020-10-25 11:28:16,432 : INFO : generating a 160 x 160 asymmetric distance matrix... + 2020-10-25 11:28:17,287 : INFO : fitting the clustering model, using 4 for min_samples + 2020-10-25 11:28:17,315 : INFO : generating stable topics, using 3 for min_cores + 2020-10-25 11:28:17,315 : INFO : found 21 clusters + 2020-10-25 11:28:17,319 : INFO : found 2 stable topics + 2020-10-25 11:28:17,319 : INFO : generating classic gensim model representation based on results from the ensemble + 2020-10-25 11:28:17,487 : INFO : using symmetric alpha at 0.5 + 2020-10-25 11:28:17,487 : INFO : using symmetric eta at 0.5 + 2020-10-25 11:28:17,488 : INFO : using serial LDA version on this node + 2020-10-25 11:28:17,491 : INFO : running online (multi-pass) LDA training, 2 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2020-10-25 11:28:17,491 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy 160 2 @@ -226,7 +226,7 @@ Make sure to chose one that is within the range of values. without_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1) without_diagonal.min(), without_diagonal.mean(), without_diagonal.max() - ensemble.recluster(eps=0.1, min_samples=2, min_cores=2) + ensemble.recluster(eps=0.09, min_samples=2, min_cores=2) print(len(ensemble.get_topics())) @@ -240,17 +240,17 @@ Make sure to chose one that is within the range of values. .. 
code-block:: none - 2020-10-25 01:09:46,802 : INFO : fitting the clustering model - 2020-10-25 01:09:46,816 : INFO : generating stable topics - 2020-10-25 01:09:46,816 : INFO : found 47 clusters - 2020-10-25 01:09:46,819 : INFO : found 2 stable topics - 2020-10-25 01:09:46,819 : INFO : generating classic gensim model representation based on results from the ensemble - 2020-10-25 01:09:46,821 : INFO : using symmetric alpha at 0.5 - 2020-10-25 01:09:46,821 : INFO : using symmetric eta at 0.5 - 2020-10-25 01:09:46,823 : INFO : using serial LDA version on this node - 2020-10-25 01:09:46,825 : INFO : running online (multi-pass) LDA training, 2 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2020-10-25 01:09:46,825 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy - 2 + 2020-10-25 11:28:17,675 : INFO : fitting the clustering model + 2020-10-25 11:28:17,687 : INFO : generating stable topics + 2020-10-25 11:28:17,688 : INFO : found 33 clusters + 2020-10-25 11:28:17,690 : INFO : found 4 stable topics + 2020-10-25 11:28:17,691 : INFO : generating classic gensim model representation based on results from the ensemble + 2020-10-25 11:28:17,692 : INFO : using symmetric alpha at 0.25 + 2020-10-25 11:28:17,692 : INFO : using symmetric eta at 0.25 + 2020-10-25 11:28:17,694 : INFO : using serial LDA version on this node + 2020-10-25 11:28:17,698 : INFO : running online (multi-pass) LDA training, 4 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2020-10-25 11:28:17,698 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 4 @@ -305,79 +305,79 @@ Afterwards the number and quality of stable topics might be different depending .. 
code-block:: none - 2020-10-25 01:09:47,300 : INFO : using symmetric alpha at 0.1111111111111111 - 2020-10-25 01:09:47,301 : INFO : using symmetric eta at 0.1111111111111111 - 2020-10-25 01:09:47,304 : INFO : using serial LDA version on this node - 2020-10-25 01:09:47,314 : INFO : running online LDA training, 9 topics, 4 passes over the supplied corpus of 1701 documents, updating every 22000 documents, evaluating every ~1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2020-10-25 01:09:47,314 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy - 2020-10-25 01:09:47,315 : INFO : training LDA model using 11 processes - 2020-10-25 01:09:47,546 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2020-10-25 01:09:52,514 : INFO : topic #8 (0.111): 0.001*"league" + 0.001*"minister" + 0.001*"jewish" + 0.001*"animal" + 0.001*"km" + 0.001*"soviet" + 0.001*"election" + 0.001*"emperor" + 0.001*"energy" + 0.001*"cell" - 2020-10-25 01:09:52,514 : INFO : topic #2 (0.111): 0.001*"km" + 0.001*"china" + 0.001*"county" + 0.001*"album" + 0.001*"jewish" + 0.001*"california" + 0.001*"actor" + 0.001*"economy" + 0.001*"jew" + 0.001*"band" - 2020-10-25 01:09:52,514 : INFO : topic #6 (0.111): 0.001*"km" + 0.001*"election" + 0.001*"software" + 0.001*"user" + 0.001*"japanese" + 0.001*"female" + 0.001*"minister" + 0.001*"prime" + 0.001*"season" + 0.001*"saint" - 2020-10-25 01:09:52,515 : INFO : topic #0 (0.111): 0.001*"actor" + 0.001*"km" + 0.001*"cell" + 0.001*"minister" + 0.001*"italian" + 0.001*"male" + 0.001*"economy" + 0.001*"energy" + 0.001*"y" + 0.001*"soviet" - 2020-10-25 01:09:52,515 : INFO : topic #5 (0.111): 0.001*"band" + 0.001*"actor" + 0.001*"soviet" + 0.001*"album" + 0.001*"india" + 0.001*"minister" + 0.001*"ship" + 0.001*"km" + 0.001*"energy" + 0.001*"china" - 2020-10-25 01:09:52,515 : INFO : topic diff=1.009308, rho=1.000000 - 2020-10-25 01:10:03,105 : INFO : -9.257 per-word bound, 611.7 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2020-10-25 01:10:03,105 : INFO : PROGRESS: pass 1, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2020-10-25 01:10:07,873 : INFO : topic #4 (0.111): 0.002*"album" + 0.001*"energy" + 0.001*"band" + 0.001*"blue" + 0.001*"cell" + 0.001*"emperor" + 0.001*"bc" + 0.001*"actor" + 0.001*"japanese" + 0.001*"software" - 2020-10-25 01:10:07,873 : INFO : topic #2 (0.111): 0.001*"jewish" + 0.001*"jew" + 0.001*"carbon" + 0.001*"county" + 0.001*"california" + 0.001*"jesus" + 0.001*"china" + 0.001*"km" + 0.001*"cell" + 0.001*"gospel" - 2020-10-25 01:10:07,873 : INFO : topic #8 (0.111): 0.002*"league" + 0.001*"season" + 0.001*"baseball" + 0.001*"jewish" + 0.001*"animal" + 0.001*"cell" + 0.001*"y" + 0.001*"ball" + 0.001*"china" + 0.001*"minister" - 2020-10-25 01:10:07,874 : INFO : topic #1 (0.111): 0.002*"km" + 0.002*"soviet" + 0.002*"minister" + 0.002*"election" + 0.001*"est" + 0.001*"economy" + 0.001*"male" + 0.001*"chinese" + 0.001*"liberal" + 0.001*"russian" - 2020-10-25 01:10:07,874 : INFO : topic #5 (0.111): 0.002*"band" + 0.001*"album" + 0.001*"window" + 0.001*"actor" + 0.001*"comic" + 0.001*"irish" + 0.001*"india" + 0.001*"user" + 0.001*"microsoft" + 0.001*"minister" - 2020-10-25 01:10:07,874 : INFO : topic diff=0.149585, rho=0.592297 - 2020-10-25 01:10:18,504 : INFO : -9.186 per-word bound, 582.3 perplexity estimate based on a held-out corpus of 1701 documents 
with 4692704 words - 2020-10-25 01:10:18,505 : INFO : PROGRESS: pass 2, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2020-10-25 01:10:23,685 : INFO : topic #2 (0.111): 0.002*"jewish" + 0.001*"jew" + 0.001*"jesus" + 0.001*"carbon" + 0.001*"gospel" + 0.001*"bible" + 0.001*"california" + 0.001*"hebrew" + 0.001*"cell" + 0.001*"translation" - 2020-10-25 01:10:23,685 : INFO : topic #5 (0.111): 0.002*"band" + 0.002*"album" + 0.002*"window" + 0.001*"comic" + 0.001*"microsoft" + 0.001*"irish" + 0.001*"apple" + 0.001*"user" + 0.001*"software" + 0.001*"actor" - 2020-10-25 01:10:23,685 : INFO : topic #4 (0.111): 0.002*"album" + 0.002*"energy" + 0.002*"band" + 0.001*"cell" + 0.001*"blue" + 0.001*"y" + 0.001*"bc" + 0.001*"guitar" + 0.001*"emperor" + 0.001*"universe" - 2020-10-25 01:10:23,686 : INFO : topic #1 (0.111): 0.003*"km" + 0.002*"election" + 0.002*"est" + 0.002*"soviet" + 0.002*"minister" + 0.002*"economy" + 0.002*"male" + 0.001*"territory" + 0.001*"liberal" + 0.001*"elected" - 2020-10-25 01:10:23,686 : INFO : topic #8 (0.111): 0.003*"league" + 0.001*"baseball" + 0.001*"season" + 0.001*"ball" + 0.001*"drug" + 0.001*"football" + 0.001*"cell" + 0.001*"animal" + 0.001*"park" + 0.001*"algorithm" - 2020-10-25 01:10:23,686 : INFO : topic diff=0.178759, rho=0.509614 - 2020-10-25 01:10:34,248 : INFO : -9.123 per-word bound, 557.6 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2020-10-25 01:10:34,249 : INFO : PROGRESS: pass 3, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2020-10-25 01:10:38,910 : INFO : topic #5 (0.111): 0.002*"band" + 0.002*"album" + 0.002*"window" + 0.002*"microsoft" + 0.002*"comic" + 0.002*"irish" + 0.002*"apple" + 0.001*"software" + 0.001*"user" + 0.001*"musical" - 2020-10-25 01:10:38,910 : INFO : topic #2 (0.111): 0.002*"jewish" + 0.002*"jew" + 0.002*"jesus" + 0.001*"carbon" + 0.001*"gospel" + 0.001*"bible" + 0.001*"hebrew" + 0.001*"christ" + 0.001*"translation" + 0.001*"judaism" - 2020-10-25 01:10:38,910 : INFO : topic #3 (0.111): 0.002*"import" + 0.002*"card" + 0.002*"tree" + 0.001*"league" + 0.001*"bit" + 0.001*"ford" + 0.001*"fiction" + 0.001*"season" + 0.001*"fan" + 0.001*"novel" - 2020-10-25 01:10:38,911 : INFO : topic #7 (0.111): 0.003*"actor" + 0.002*"emperor" + 0.002*"alexander" + 0.002*"bc" + 0.002*"chinese" + 0.001*"actress" + 0.001*"jewish" + 0.001*"china" + 0.001*"singer" + 0.001*"japanese" - 2020-10-25 01:10:38,911 : INFO : topic #1 (0.111): 0.003*"km" + 0.003*"election" + 0.003*"est" + 0.003*"soviet" + 0.003*"minister" + 0.003*"economy" + 0.002*"male" + 0.002*"territory" + 0.002*"parliament" + 0.002*"elected" - 2020-10-25 01:10:38,911 : INFO : topic diff=0.154319, rho=0.454053 - 2020-10-25 01:10:49,397 : INFO : -9.086 per-word bound, 543.5 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2020-10-25 01:10:49,473 : INFO : using symmetric alpha at 0.09090909090909091 - 2020-10-25 01:10:49,473 : INFO : using symmetric eta at 0.09090909090909091 - 2020-10-25 01:10:49,475 : INFO : using serial LDA version on this node - 2020-10-25 01:10:49,488 : INFO : running online (multi-pass) LDA training, 11 topics, 2 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2020-10-25 01:10:49,488 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to 
improve accuracy - 2020-10-25 01:11:00,304 : INFO : -10.500 per-word bound, 1448.1 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2020-10-25 01:11:00,304 : INFO : PROGRESS: pass 0, at document #1701/1701 - 2020-10-25 01:11:03,567 : INFO : topic #4 (0.091): 0.001*"moon" + 0.001*"actor" + 0.001*"cell" + 0.001*"election" + 0.001*"minister" + 0.001*"prime" + 0.001*"software" + 0.001*"novel" + 0.001*"emperor" + 0.001*"blue" - 2020-10-25 01:11:03,567 : INFO : topic #10 (0.091): 0.001*"emperor" + 0.001*"actor" + 0.001*"election" + 0.001*"italian" + 0.001*"soviet" + 0.001*"irish" + 0.001*"love" + 0.001*"card" + 0.001*"china" + 0.001*"ball" - 2020-10-25 01:11:03,567 : INFO : topic #8 (0.091): 0.001*"league" + 0.001*"territory" + 0.001*"jewish" + 0.001*"minister" + 0.001*"y" + 0.001*"election" + 0.001*"india" + 0.001*"km" + 0.001*"machine" + 0.001*"map" - 2020-10-25 01:11:03,568 : INFO : topic #3 (0.091): 0.001*"soviet" + 0.001*"km" + 0.001*"software" + 0.001*"actor" + 0.001*"india" + 0.001*"indian" + 0.001*"animal" + 0.001*"male" + 0.001*"prime" + 0.001*"philosophy" - 2020-10-25 01:11:03,568 : INFO : topic #2 (0.091): 0.001*"actor" + 0.001*"league" + 0.001*"season" + 0.001*"cell" + 0.001*"band" + 0.001*"emperor" + 0.001*"album" + 0.001*"station" + 0.001*"female" + 0.001*"km" - 2020-10-25 01:11:03,568 : INFO : topic diff=1.023399, rho=1.000000 - 2020-10-25 01:11:14,271 : INFO : -9.279 per-word bound, 621.1 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2020-10-25 01:11:14,271 : INFO : PROGRESS: pass 1, at document #1701/1701 - 2020-10-25 01:11:17,511 : INFO : topic #1 (0.091): 0.002*"energy" + 0.001*"season" + 0.001*"league" + 0.001*"africa" + 0.001*"chinese" + 0.001*"football" + 0.001*"blue" + 0.001*"ship" + 0.001*"machine" + 0.001*"symbol" - 2020-10-25 01:11:17,511 : INFO : topic #4 (0.091): 0.002*"cell" + 0.002*"moon" + 0.001*"acid" + 0.001*"disease" + 0.001*"plant" + 0.001*"specie" + 0.001*"surface" + 0.001*"scientific" + 0.001*"software" + 0.001*"energy" - 2020-10-25 01:11:17,512 : INFO : topic #8 (0.091): 0.001*"territory" + 0.001*"league" + 0.001*"jewish" + 0.001*"minister" + 0.001*"election" + 0.001*"machine" + 0.001*"india" + 0.001*"irish" + 0.001*"apple" + 0.001*"y" - 2020-10-25 01:11:17,512 : INFO : topic #3 (0.091): 0.001*"import" + 0.001*"horse" + 0.001*"soviet" + 0.001*"km" + 0.001*"india" + 0.001*"software" + 0.001*"animal" + 0.001*"male" + 0.001*"indian" + 0.001*"lake" - 2020-10-25 01:11:17,512 : INFO : topic #7 (0.091): 0.002*"actor" + 0.002*"emperor" + 0.001*"league" + 0.001*"software" + 0.001*"y" + 0.001*"band" + 0.001*"linux" + 0.001*"italian" + 0.001*"singer" + 0.001*"season" - 2020-10-25 01:11:17,513 : INFO : topic diff=0.165568, rho=0.577350 - 2020-10-25 01:11:17,513 : INFO : ensemble contains 9 models and 160 topics now - 2020-10-25 01:11:17,519 : INFO : ensemble contains 10 models and 169 topics now - 2020-10-25 01:11:17,525 : INFO : asymmetric distance matrix is outdated due to add_model - 2020-10-25 01:11:17,525 : INFO : generating a 180 x 180 asymmetric distance matrix... 
- 2020-10-25 01:11:18,609 : INFO : fitting the clustering model, using 5 for min_samples - 2020-10-25 01:11:18,629 : INFO : generating stable topics, using 3 for min_cores - 2020-10-25 01:11:18,629 : INFO : found 19 clusters - 2020-10-25 01:11:18,634 : INFO : found 1 stable topics - 2020-10-25 01:11:18,635 : INFO : generating classic gensim model representation based on results from the ensemble - 2020-10-25 01:11:18,635 : INFO : using symmetric alpha at 1.0 - 2020-10-25 01:11:18,635 : INFO : using symmetric eta at 1.0 - 2020-10-25 01:11:18,637 : INFO : using serial LDA version on this node - 2020-10-25 01:11:18,638 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2020-10-25 01:11:18,638 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2020-10-25 11:28:17,999 : INFO : using symmetric alpha at 0.1111111111111111 + 2020-10-25 11:28:18,000 : INFO : using symmetric eta at 0.1111111111111111 + 2020-10-25 11:28:18,002 : INFO : using serial LDA version on this node + 2020-10-25 11:28:18,013 : INFO : running online LDA training, 9 topics, 4 passes over the supplied corpus of 1701 documents, updating every 22000 documents, evaluating every ~1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2020-10-25 11:28:18,013 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2020-10-25 11:28:18,013 : INFO : training LDA model using 11 processes + 2020-10-25 11:28:18,129 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2020-10-25 11:28:22,666 : INFO : topic #0 (0.111): 0.001*"km" + 0.001*"band" + 0.001*"league" + 0.001*"album" + 0.001*"season" + 0.001*"emperor" + 0.001*"y" + 0.001*"actor" + 0.001*"ball" + 0.001*"machine" + 2020-10-25 11:28:22,666 : INFO : topic #4 (0.111): 0.001*"league" + 0.001*"jewish" + 0.001*"band" + 0.001*"software" + 0.001*"emperor" + 0.001*"season" + 0.001*"album" + 0.001*"minister" + 0.001*"israel" + 0.001*"australia" + 2020-10-25 11:28:22,666 : INFO : topic #3 (0.111): 0.001*"jewish" + 0.001*"actor" + 0.001*"y" + 0.001*"animal" + 0.001*"energy" + 0.001*"election" + 0.001*"km" + 0.001*"lake" + 0.001*"emperor" + 0.001*"cell" + 2020-10-25 11:28:22,667 : INFO : topic #7 (0.111): 0.001*"minister" + 0.001*"km" + 0.001*"election" + 0.001*"cell" + 0.001*"soviet" + 0.001*"energy" + 0.001*"actor" + 0.001*"russian" + 0.001*"emperor" + 0.001*"band" + 2020-10-25 11:28:22,667 : INFO : topic #6 (0.111): 0.001*"soviet" + 0.001*"blue" + 0.001*"actor" + 0.001*"italian" + 0.001*"economy" + 0.001*"car" + 0.001*"animal" + 0.001*"km" + 0.001*"channel" + 0.001*"energy" + 2020-10-25 11:28:22,667 : INFO : topic diff=1.009925, rho=1.000000 + 2020-10-25 11:28:33,290 : INFO : -9.255 per-word bound, 611.1 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2020-10-25 11:28:33,290 : INFO : PROGRESS: pass 1, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2020-10-25 11:28:37,864 : INFO : topic #1 (0.111): 0.002*"minister" + 0.002*"km" + 0.002*"election" + 0.002*"est" + 0.002*"economy" + 0.001*"actor" + 0.001*"male" + 0.001*"india" + 0.001*"china" + 0.001*"elected" + 2020-10-25 11:28:37,865 : INFO : topic #8 
(0.111): 0.001*"software" + 0.001*"user" + 0.001*"apollo" + 0.001*"metal" + 0.001*"cell" + 0.001*"machine" + 0.001*"actor" + 0.001*"soviet" + 0.001*"lincoln" + 0.001*"disk" + 2020-10-25 11:28:37,865 : INFO : topic #3 (0.111): 0.001*"y" + 0.001*"animal" + 0.001*"lake" + 0.001*"energy" + 0.001*"jewish" + 0.001*"window" + 0.001*"aircraft" + 0.001*"cell" + 0.001*"memory" + 0.001*"machine" + 2020-10-25 11:28:37,865 : INFO : topic #7 (0.111): 0.002*"minister" + 0.001*"km" + 0.001*"election" + 0.001*"soviet" + 0.001*"cell" + 0.001*"est" + 0.001*"energy" + 0.001*"economy" + 0.001*"parliament" + 0.001*"russian" + 2020-10-25 11:28:37,865 : INFO : topic #4 (0.111): 0.002*"league" + 0.001*"season" + 0.001*"jewish" + 0.001*"israel" + 0.001*"emperor" + 0.001*"baseball" + 0.001*"band" + 0.001*"software" + 0.001*"album" + 0.001*"irish" + 2020-10-25 11:28:37,865 : INFO : topic diff=0.150801, rho=0.592297 + 2020-10-25 11:28:48,377 : INFO : -9.186 per-word bound, 582.5 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2020-10-25 11:28:48,378 : INFO : PROGRESS: pass 2, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2020-10-25 11:28:52,718 : INFO : topic #4 (0.111): 0.003*"league" + 0.002*"season" + 0.002*"jewish" + 0.001*"baseball" + 0.001*"israel" + 0.001*"emperor" + 0.001*"band" + 0.001*"irish" + 0.001*"jew" + 0.001*"hebrew" + 2020-10-25 11:28:52,718 : INFO : topic #2 (0.111): 0.002*"album" + 0.002*"band" + 0.002*"chinese" + 0.001*"love" + 0.001*"jew" + 0.001*"japanese" + 0.001*"china" + 0.001*"artist" + 0.001*"jewish" + 0.001*"actor" + 2020-10-25 11:28:52,718 : INFO : topic #8 (0.111): 0.002*"software" + 0.002*"apollo" + 0.001*"user" + 0.001*"disk" + 0.001*"cell" + 0.001*"metal" + 0.001*"lincoln" + 0.001*"machine" + 0.001*"alexander" + 0.001*"bit" + 2020-10-25 11:28:52,718 : INFO : topic #3 (0.111): 0.002*"y" + 0.001*"window" + 0.001*"aircraft" + 0.001*"lake" + 0.001*"machine" + 0.001*"energy" + 0.001*"animal" + 0.001*"bit" + 0.001*"memory" + 0.001*"user" + 2020-10-25 11:28:52,719 : INFO : topic #5 (0.111): 0.001*"cell" + 0.001*"japanese" + 0.001*"japan" + 0.001*"energy" + 0.001*"software" + 0.001*"emperor" + 0.001*"blood" + 0.001*"bc" + 0.001*"actor" + 0.001*"brain" + 2020-10-25 11:28:52,719 : INFO : topic diff=0.173393, rho=0.509614 + 2020-10-25 11:29:03,210 : INFO : -9.128 per-word bound, 559.6 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2020-10-25 11:29:03,210 : INFO : PROGRESS: pass 3, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2020-10-25 11:29:07,566 : INFO : topic #3 (0.111): 0.002*"y" + 0.001*"aircraft" + 0.001*"window" + 0.001*"user" + 0.001*"bit" + 0.001*"machine" + 0.001*"memory" + 0.001*"internet" + 0.001*"energy" + 0.001*"lake" + 2020-10-25 11:29:07,566 : INFO : topic #7 (0.111): 0.002*"soviet" + 0.002*"minister" + 0.002*"israel" + 0.002*"election" + 0.002*"parliament" + 0.002*"emperor" + 0.001*"israeli" + 0.001*"cell" + 0.001*"russian" + 0.001*"lebanon" + 2020-10-25 11:29:07,566 : INFO : topic #0 (0.111): 0.003*"ball" + 0.002*"import" + 0.002*"brown" + 0.001*"band" + 0.001*"cat" + 0.001*"iraq" + 0.001*"km" + 0.001*"album" + 0.001*"california" + 0.001*"football" + 2020-10-25 11:29:07,566 : INFO : topic #1 (0.111): 0.003*"est" + 0.003*"km" + 0.003*"minister" + 0.003*"election" + 0.002*"economy" + 0.002*"actor" + 0.002*"india" + 0.002*"male" + 0.002*"china" + 0.002*"constitution" + 2020-10-25 11:29:07,567 : INFO : topic #6 (0.111): 0.002*"blue" + 
0.002*"car" + 0.002*"engine" + 0.001*"soviet" + 0.001*"channel" + 0.001*"flag" + 0.001*"microsoft" + 0.001*"county" + 0.001*"director" + 0.001*"communist" + 2020-10-25 11:29:07,567 : INFO : topic diff=0.148814, rho=0.454053 + 2020-10-25 11:29:18,053 : INFO : -9.094 per-word bound, 546.5 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2020-10-25 11:29:18,119 : INFO : using symmetric alpha at 0.09090909090909091 + 2020-10-25 11:29:18,119 : INFO : using symmetric eta at 0.09090909090909091 + 2020-10-25 11:29:18,121 : INFO : using serial LDA version on this node + 2020-10-25 11:29:18,134 : INFO : running online (multi-pass) LDA training, 11 topics, 2 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2020-10-25 11:29:18,134 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2020-10-25 11:29:28,764 : INFO : -10.500 per-word bound, 1448.0 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2020-10-25 11:29:28,765 : INFO : PROGRESS: pass 0, at document #1701/1701 + 2020-10-25 11:29:31,978 : INFO : topic #4 (0.091): 0.002*"software" + 0.001*"japanese" + 0.001*"km" + 0.001*"china" + 0.001*"japan" + 0.001*"love" + 0.001*"chinese" + 0.001*"y" + 0.001*"soviet" + 0.001*"emperor" + 2020-10-25 11:29:31,979 : INFO : topic #8 (0.091): 0.001*"actor" + 0.001*"minister" + 0.001*"card" + 0.001*"band" + 0.001*"map" + 0.001*"car" + 0.001*"muslim" + 0.001*"california" + 0.001*"soviet" + 0.001*"lord" + 2020-10-25 11:29:31,979 : INFO : topic #1 (0.091): 0.001*"km" + 0.001*"soviet" + 0.001*"album" + 0.001*"minister" + 0.001*"cell" + 0.001*"energy" + 0.001*"jewish" + 0.001*"band" + 0.001*"jew" + 0.001*"actor" + 2020-10-25 11:29:31,979 : INFO : topic #2 (0.091): 0.001*"election" + 0.001*"km" + 0.001*"band" + 0.001*"energy" + 0.001*"cell" + 0.001*"soviet" + 0.001*"minister" + 0.001*"china" + 0.001*"map" + 0.001*"australia" + 2020-10-25 11:29:31,980 : INFO : topic #10 (0.091): 0.001*"actor" + 0.001*"league" + 0.001*"jewish" + 0.001*"km" + 0.001*"israel" + 0.001*"bc" + 0.001*"y" + 0.001*"ship" + 0.001*"energy" + 0.001*"minister" + 2020-10-25 11:29:31,980 : INFO : topic diff=1.023741, rho=1.000000 + 2020-10-25 11:29:42,609 : INFO : -9.277 per-word bound, 620.6 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2020-10-25 11:29:42,609 : INFO : PROGRESS: pass 1, at document #1701/1701 + 2020-10-25 11:29:45,801 : INFO : topic #9 (0.091): 0.002*"km" + 0.001*"election" + 0.001*"economy" + 0.001*"minister" + 0.001*"est" + 0.001*"oil" + 0.001*"spanish" + 0.001*"male" + 0.001*"liberal" + 0.001*"flag" + 2020-10-25 11:29:45,802 : INFO : topic #2 (0.091): 0.001*"energy" + 0.001*"cell" + 0.001*"election" + 0.001*"band" + 0.001*"map" + 0.001*"km" + 0.001*"specie" + 0.001*"australia" + 0.001*"metal" + 0.001*"drug" + 2020-10-25 11:29:45,802 : INFO : topic #6 (0.091): 0.001*"actor" + 0.001*"soviet" + 0.001*"wikipedia" + 0.001*"cell" + 0.001*"jewish" + 0.001*"italian" + 0.001*"football" + 0.001*"band" + 0.001*"acid" + 0.001*"russian" + 2020-10-25 11:29:45,802 : INFO : topic #4 (0.091): 0.002*"software" + 0.001*"japanese" + 0.001*"japan" + 0.001*"y" + 0.001*"apple" + 0.001*"love" + 0.001*"china" + 0.001*"chinese" + 0.001*"philosophy" + 0.001*"classical" + 2020-10-25 11:29:45,803 : INFO : topic #7 (0.091): 
0.002*"emperor" + 0.002*"minister" + 0.001*"male" + 0.001*"saint" + 0.001*"constitution" + 0.001*"jewish" + 0.001*"election" + 0.001*"female" + 0.001*"actor" + 0.001*"jesus" + 2020-10-25 11:29:45,803 : INFO : topic diff=0.168078, rho=0.577350 + 2020-10-25 11:29:45,803 : INFO : ensemble contains 9 models and 160 topics now + 2020-10-25 11:29:45,809 : INFO : ensemble contains 10 models and 169 topics now + 2020-10-25 11:29:45,815 : INFO : asymmetric distance matrix is outdated due to add_model + 2020-10-25 11:29:45,815 : INFO : generating a 180 x 180 asymmetric distance matrix... + 2020-10-25 11:29:46,856 : INFO : fitting the clustering model, using 5 for min_samples + 2020-10-25 11:29:46,890 : INFO : generating stable topics, using 3 for min_cores + 2020-10-25 11:29:46,890 : INFO : found 18 clusters + 2020-10-25 11:29:46,899 : INFO : found 1 stable topics + 2020-10-25 11:29:46,899 : INFO : generating classic gensim model representation based on results from the ensemble + 2020-10-25 11:29:46,900 : INFO : using symmetric alpha at 1.0 + 2020-10-25 11:29:46,900 : INFO : using symmetric eta at 1.0 + 2020-10-25 11:29:46,903 : INFO : using serial LDA version on this node + 2020-10-25 11:29:46,906 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2020-10-25 11:29:46,906 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy 180 1 @@ -387,9 +387,9 @@ Afterwards the number and quality of stable topics might be different depending .. rst-class:: sphx-glr-timing - **Total running time of the script:** ( 6 minutes 52.125 seconds) + **Total running time of the script:** ( 6 minutes 53.294 seconds) -**Estimated memory usage:** 1636 MB +**Estimated memory usage:** 1629 MB .. 
_sphx_glr_download_auto_examples_tutorials_run_ensemblelda.py: diff --git a/docs/src/auto_examples/tutorials/sg_execution_times.rst b/docs/src/auto_examples/tutorials/sg_execution_times.rst index 5b1c444a94..b4845ff578 100644 --- a/docs/src/auto_examples/tutorials/sg_execution_times.rst +++ b/docs/src/auto_examples/tutorials/sg_execution_times.rst @@ -5,10 +5,10 @@ Computation times ================= -**06:52.125** total execution time for **auto_examples_tutorials** files: +**06:53.294** total execution time for **auto_examples_tutorials** files: +-------------------------------------------------------------------------------------+-----------+-----------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` (``run_ensemblelda.py``) | 06:52.125 | 1636.4 MB | +| :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` (``run_ensemblelda.py``) | 06:53.294 | 1628.6 MB | +-------------------------------------------------------------------------------------+-----------+-----------+ | :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` (``run_annoy.py``) | 00:00.000 | 0.0 MB | +-------------------------------------------------------------------------------------+-----------+-----------+ From 3057927ed54f88011d74f6151c6f132d9199d896 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sat, 23 Jan 2021 12:18:55 +0100 Subject: [PATCH 124/166] update docs --- .../tutorials/run_ensemblelda.ipynb | 8 +- .../tutorials/run_ensemblelda.py | 10 +- .../tutorials/run_ensemblelda.py.md5 | 2 +- .../tutorials/run_ensemblelda.rst | 259 ++++++++++-------- .../tutorials/sg_execution_times.rst | 4 +- docs/src/gallery/tutorials/run_ensemblelda.py | 10 +- gensim/downloader.py | 5 + 7 files changed, 170 insertions(+), 128 deletions(-) diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb b/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb index 3952a97d12..d9573e5e22 100644 --- a/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb @@ -33,7 +33,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This tutorial will explain how to use the EnsembleLDA model class.\n\nEnsembleLda is a method of finding and generating stable topics from the results of multiple topic models,\nit can therefore be used to remove topics from your results that are noise and are not reproducible.\n\n\n" + "This tutorial will explain how to use the EnsembleLDA model class.\n\nEnsembleLda is a method of finding and generating stable topics from the results of multiple topic models,\nit can be used to remove topics from your results that are noise and are not reproducible.\n\n\n" ] }, { @@ -148,7 +148,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Tuning\n\nDifferent from LdaModel, the number of resulting topics varies greatly depending on the clustering parameters.\n\nYou can provide those in the ``recluster()`` function or the ``EnsembleLda`` constructor. 
Play around\n\nuntil you get as many topics as you desire, which however may reduce their quality.\nIf your ensemble doesn't have enough topics to begin with, you should make sure to make it large enough.\n\nHaving an epsilon that is smaller than the smallest distance doesn't make sense.\nMake sure to chose one that is within the range of values.\n\n\n" + "## Tuning\n\nDifferent from LdaModel, the number of resulting topics varies greatly depending on the clustering parameters.\n\nYou can provide those in the ``recluster()`` function or the ``EnsembleLda`` constructor.\n\nPlay around until you get as many topics as you desire, which however may reduce their quality.\nIf your ensemble doesn't have enough topics to begin with, you should make sure to make it large enough.\n\nHaving an epsilon that is smaller than the smallest distance doesn't make sense.\nMake sure to chose one that is within the range of values in ``asymmetric_distance_matrix``.\n\n\n" ] }, { @@ -159,7 +159,7 @@ }, "outputs": [], "source": [ - "import numpy as np\nshape = ensemble.asymmetric_distance_matrix.shape\nwithout_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1)\nwithout_diagonal.min(), without_diagonal.mean(), without_diagonal.max()\n\nensemble.recluster(eps=0.09, min_samples=2, min_cores=2)\n\nprint(len(ensemble.get_topics()))" + "import numpy as np\nshape = ensemble.asymmetric_distance_matrix.shape\nwithout_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1)\nprint(without_diagonal.min(), without_diagonal.mean(), without_diagonal.max())\n\nensemble.recluster(eps=0.09, min_samples=2, min_cores=2)\n\nprint(len(ensemble.get_topics()))" ] }, { @@ -197,7 +197,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.3" + "version": "3.8.5" } }, "nbformat": 4, diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.py b/docs/src/auto_examples/tutorials/run_ensemblelda.py index bde40f98ea..086f65fc44 100644 --- a/docs/src/auto_examples/tutorials/run_ensemblelda.py +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.py @@ -13,7 +13,7 @@ # This tutorial will explain how to use the EnsembleLDA model class. # # EnsembleLda is a method of finding and generating stable topics from the results of multiple topic models, -# it can therefore be used to remove topics from your results that are noise and are not reproducible. +# it can be used to remove topics from your results that are noise and are not reproducible. # ############################################################################### @@ -98,19 +98,19 @@ # # Different from LdaModel, the number of resulting topics varies greatly depending on the clustering parameters. # -# You can provide those in the ``recluster()`` function or the ``EnsembleLda`` constructor. Play around +# You can provide those in the ``recluster()`` function or the ``EnsembleLda`` constructor. # -# until you get as many topics as you desire, which however may reduce their quality. +# Play around until you get as many topics as you desire, which however may reduce their quality. # If your ensemble doesn't have enough topics to begin with, you should make sure to make it large enough. # # Having an epsilon that is smaller than the smallest distance doesn't make sense. -# Make sure to chose one that is within the range of values. +# Make sure to chose one that is within the range of values in ``asymmetric_distance_matrix``. 
# import numpy as np shape = ensemble.asymmetric_distance_matrix.shape without_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1) -without_diagonal.min(), without_diagonal.mean(), without_diagonal.max() +print(without_diagonal.min(), without_diagonal.mean(), without_diagonal.max()) ensemble.recluster(eps=0.09, min_samples=2, min_cores=2) diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 b/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 index aff4513799..0b31c8851e 100644 --- a/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 @@ -1 +1 @@ -f5b586678dc10d31de5c29251f4f7ad9 \ No newline at end of file +6b332fb6a9968b6e82fa724edbe332c2 \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.rst b/docs/src/auto_examples/tutorials/run_ensemblelda.rst index f4cfc8ad5d..4ff2735a69 100644 --- a/docs/src/auto_examples/tutorials/run_ensemblelda.rst +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.rst @@ -1,12 +1,21 @@ + +.. DO NOT EDIT. +.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. +.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: +.. "auto_examples/tutorials/run_ensemblelda.py" +.. LINE NUMBERS ARE GIVEN BELOW. + .. only:: html .. note:: :class: sphx-glr-download-link-note - Click :ref:`here ` to download the full example code - .. rst-class:: sphx-glr-example-title + Click :ref:`here ` + to download the full example code - .. _sphx_glr_auto_examples_tutorials_run_ensemblelda.py: +.. rst-class:: sphx-glr-example-title + +.. _sphx_glr_auto_examples_tutorials_run_ensemblelda.py: Ensemble LDA @@ -14,6 +23,7 @@ Ensemble LDA Introduces Gensim's EnsembleLda model +.. GENERATED FROM PYTHON SOURCE LINES 8-12 .. code-block:: default @@ -28,12 +38,16 @@ Introduces Gensim's EnsembleLda model +.. GENERATED FROM PYTHON SOURCE LINES 13-18 + This tutorial will explain how to use the EnsembleLDA model class. EnsembleLda is a method of finding and generating stable topics from the results of multiple topic models, -it can therefore be used to remove topics from your results that are noise and are not reproducible. +it can be used to remove topics from your results that are noise and are not reproducible. +.. GENERATED FROM PYTHON SOURCE LINES 20-27 + Corpus ------ We will use the gensim downloader api to get a small corpus for training our ensemble. @@ -42,6 +56,7 @@ The preprocessing is similar to :ref:`sphx_glr_auto_examples_tutorials_run_word2 so it won't be explained again in detail. +.. GENERATED FROM PYTHON SOURCE LINES 27-39 .. code-block:: default @@ -67,14 +82,16 @@ so it won't be explained again in detail. .. code-block:: none - 2020-10-25 11:23:34,467 : INFO : adding document #0 to Dictionary(0 unique tokens: []) - 2020-10-25 11:23:39,808 : INFO : built Dictionary(238542 unique tokens: ['a', 'abacus', 'ability', 'able', 'abnormal']...) from 1701 documents (total 17005207 corpus positions) - 2020-10-25 11:23:39,989 : INFO : discarding 218466 tokens: [('a', 1701), ('ability', 934), ('able', 1202), ('about', 1687), ('above', 1327), ('abstention', 13), ('accepted', 945), ('according', 1468), ('account', 1113), ('act', 1312)]... - 2020-10-25 11:23:39,989 : INFO : keeping 20076 tokens which were in no less than 20 and no more than 850 (=50.0%) documents - 2020-10-25 11:23:40,063 : INFO : resulting dictionary: Dictionary(20076 unique tokens: ['abacus', 'abnormal', 'abolished', 'abolition', 'absence']...) 
+ 2021-01-23 12:02:54,189 : INFO : adding document #0 to Dictionary(0 unique tokens: []) + 2021-01-23 12:03:00,997 : INFO : built Dictionary(238542 unique tokens: ['a', 'abacus', 'ability', 'able', 'abnormal']...) from 1701 documents (total 17005207 corpus positions) + 2021-01-23 12:03:01,199 : INFO : discarding 218466 tokens: [('a', 1701), ('ability', 934), ('able', 1202), ('about', 1687), ('above', 1327), ('abstention', 13), ('accepted', 945), ('according', 1468), ('account', 1113), ('act', 1312)]... + 2021-01-23 12:03:01,199 : INFO : keeping 20076 tokens which were in no less than 20 and no more than 850 (=50.0%) documents + 2021-01-23 12:03:01,288 : INFO : resulting dictionary: Dictionary(20076 unique tokens: ['abacus', 'abnormal', 'abolished', 'abolition', 'absence']...) + +.. GENERATED FROM PYTHON SOURCE LINES 40-48 Training -------- @@ -85,6 +102,7 @@ You can use any model that is based on LdaModel, such as LdaMulticore, to train In experiments, LdaMulticore showed better results. +.. GENERATED FROM PYTHON SOURCE LINES 48-52 .. code-block:: default @@ -99,10 +117,13 @@ In experiments, LdaMulticore showed better results. +.. GENERATED FROM PYTHON SOURCE LINES 53-56 + Any arbitrary number of models can be used, but it should be a multiple of your workers so that the load can be distributed properly. In this example, 4 processes will train 8 models each. +.. GENERATED FROM PYTHON SOURCE LINES 56-60 .. code-block:: default @@ -117,10 +138,13 @@ load can be distributed properly. In this example, 4 processes will train 8 mode +.. GENERATED FROM PYTHON SOURCE LINES 61-64 + After training all the models, some distance computations are required which can take quite some time as well. You can speed this up by using workers for that as well. +.. GENERATED FROM PYTHON SOURCE LINES 64-67 .. code-block:: default @@ -134,9 +158,12 @@ time as well. You can speed this up by using workers for that as well. +.. GENERATED FROM PYTHON SOURCE LINES 68-70 + All other parameters that are unknown to EnsembleLda are forwarded to each LDA Model, such as +.. GENERATED FROM PYTHON SOURCE LINES 70-73 .. code-block:: default @@ -150,12 +177,15 @@ All other parameters that are unknown to EnsembleLda are forwarded to each LDA M +.. GENERATED FROM PYTHON SOURCE LINES 74-79 + Now start the training Since 20 topics were trained on each of the 8 models, we expect there to be 160 different topics. The number of stable topics which are clustered from all those topics is smaller. +.. GENERATED FROM PYTHON SOURCE LINES 79-95 .. code-block:: default @@ -185,38 +215,41 @@ The number of stable topics which are clustered from all those topics is smaller .. code-block:: none - 2020-10-25 11:23:43,277 : INFO : generating 8 topic models... - 2020-10-25 11:28:16,432 : INFO : generating a 160 x 160 asymmetric distance matrix... 
- 2020-10-25 11:28:17,287 : INFO : fitting the clustering model, using 4 for min_samples - 2020-10-25 11:28:17,315 : INFO : generating stable topics, using 3 for min_cores - 2020-10-25 11:28:17,315 : INFO : found 21 clusters - 2020-10-25 11:28:17,319 : INFO : found 2 stable topics - 2020-10-25 11:28:17,319 : INFO : generating classic gensim model representation based on results from the ensemble - 2020-10-25 11:28:17,487 : INFO : using symmetric alpha at 0.5 - 2020-10-25 11:28:17,487 : INFO : using symmetric eta at 0.5 - 2020-10-25 11:28:17,488 : INFO : using serial LDA version on this node - 2020-10-25 11:28:17,491 : INFO : running online (multi-pass) LDA training, 2 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2020-10-25 11:28:17,491 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2021-01-23 12:03:05,136 : INFO : generating 8 topic models... + 2021-01-23 12:11:47,694 : INFO : generating a 160 x 160 asymmetric distance matrix... + 2021-01-23 12:11:49,175 : INFO : fitting the clustering model, using 4 for min_samples + 2021-01-23 12:11:49,206 : INFO : generating stable topics, using 3 for min_cores + 2021-01-23 12:11:49,206 : INFO : found 26 clusters + 2021-01-23 12:11:49,209 : INFO : found 1 stable topics + 2021-01-23 12:11:49,209 : INFO : generating classic gensim model representation based on results from the ensemble + 2021-01-23 12:11:49,398 : INFO : using symmetric alpha at 1.0 + 2021-01-23 12:11:49,398 : INFO : using symmetric eta at 1.0 + 2021-01-23 12:11:49,400 : INFO : using serial LDA version on this node + 2021-01-23 12:11:49,402 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-23 12:11:49,402 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy 160 - 2 + 1 + +.. GENERATED FROM PYTHON SOURCE LINES 96-109 Tuning ------ Different from LdaModel, the number of resulting topics varies greatly depending on the clustering parameters. -You can provide those in the ``recluster()`` function or the ``EnsembleLda`` constructor. Play around +You can provide those in the ``recluster()`` function or the ``EnsembleLda`` constructor. -until you get as many topics as you desire, which however may reduce their quality. +Play around until you get as many topics as you desire, which however may reduce their quality. If your ensemble doesn't have enough topics to begin with, you should make sure to make it large enough. Having an epsilon that is smaller than the smallest distance doesn't make sense. -Make sure to chose one that is within the range of values. +Make sure to chose one that is within the range of values in ``asymmetric_distance_matrix``. +.. GENERATED FROM PYTHON SOURCE LINES 109-119 .. code-block:: default @@ -224,7 +257,7 @@ Make sure to chose one that is within the range of values. 
import numpy as np shape = ensemble.asymmetric_distance_matrix.shape without_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1) - without_diagonal.min(), without_diagonal.mean(), without_diagonal.max() + print(without_diagonal.min(), without_diagonal.mean(), without_diagonal.max()) ensemble.recluster(eps=0.09, min_samples=2, min_cores=2) @@ -240,21 +273,24 @@ Make sure to chose one that is within the range of values. .. code-block:: none - 2020-10-25 11:28:17,675 : INFO : fitting the clustering model - 2020-10-25 11:28:17,687 : INFO : generating stable topics - 2020-10-25 11:28:17,688 : INFO : found 33 clusters - 2020-10-25 11:28:17,690 : INFO : found 4 stable topics - 2020-10-25 11:28:17,691 : INFO : generating classic gensim model representation based on results from the ensemble - 2020-10-25 11:28:17,692 : INFO : using symmetric alpha at 0.25 - 2020-10-25 11:28:17,692 : INFO : using symmetric eta at 0.25 - 2020-10-25 11:28:17,694 : INFO : using serial LDA version on this node - 2020-10-25 11:28:17,698 : INFO : running online (multi-pass) LDA training, 4 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2020-10-25 11:28:17,698 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 0.07280884735152338 0.12813543590172502 0.2606207782381612 + 2021-01-23 12:11:49,575 : INFO : fitting the clustering model + 2021-01-23 12:11:49,589 : INFO : generating stable topics + 2021-01-23 12:11:49,589 : INFO : found 35 clusters + 2021-01-23 12:11:49,593 : INFO : found 4 stable topics + 2021-01-23 12:11:49,593 : INFO : generating classic gensim model representation based on results from the ensemble + 2021-01-23 12:11:49,594 : INFO : using symmetric alpha at 0.25 + 2021-01-23 12:11:49,594 : INFO : using symmetric eta at 0.25 + 2021-01-23 12:11:49,596 : INFO : using serial LDA version on this node + 2021-01-23 12:11:49,602 : INFO : running online (multi-pass) LDA training, 4 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-23 12:11:49,603 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy 4 +.. GENERATED FROM PYTHON SOURCE LINES 120-131 + Increasing the Size ------------------- @@ -267,6 +303,7 @@ entirely made out of your existing topic models with the following method. Afterwards the number and quality of stable topics might be different depending on your added topics and parameters. +.. GENERATED FROM PYTHON SOURCE LINES 131-156 .. code-block:: default @@ -305,79 +342,79 @@ Afterwards the number and quality of stable topics might be different depending .. 
code-block:: none - 2020-10-25 11:28:17,999 : INFO : using symmetric alpha at 0.1111111111111111 - 2020-10-25 11:28:18,000 : INFO : using symmetric eta at 0.1111111111111111 - 2020-10-25 11:28:18,002 : INFO : using serial LDA version on this node - 2020-10-25 11:28:18,013 : INFO : running online LDA training, 9 topics, 4 passes over the supplied corpus of 1701 documents, updating every 22000 documents, evaluating every ~1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2020-10-25 11:28:18,013 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy - 2020-10-25 11:28:18,013 : INFO : training LDA model using 11 processes - 2020-10-25 11:28:18,129 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2020-10-25 11:28:22,666 : INFO : topic #0 (0.111): 0.001*"km" + 0.001*"band" + 0.001*"league" + 0.001*"album" + 0.001*"season" + 0.001*"emperor" + 0.001*"y" + 0.001*"actor" + 0.001*"ball" + 0.001*"machine" - 2020-10-25 11:28:22,666 : INFO : topic #4 (0.111): 0.001*"league" + 0.001*"jewish" + 0.001*"band" + 0.001*"software" + 0.001*"emperor" + 0.001*"season" + 0.001*"album" + 0.001*"minister" + 0.001*"israel" + 0.001*"australia" - 2020-10-25 11:28:22,666 : INFO : topic #3 (0.111): 0.001*"jewish" + 0.001*"actor" + 0.001*"y" + 0.001*"animal" + 0.001*"energy" + 0.001*"election" + 0.001*"km" + 0.001*"lake" + 0.001*"emperor" + 0.001*"cell" - 2020-10-25 11:28:22,667 : INFO : topic #7 (0.111): 0.001*"minister" + 0.001*"km" + 0.001*"election" + 0.001*"cell" + 0.001*"soviet" + 0.001*"energy" + 0.001*"actor" + 0.001*"russian" + 0.001*"emperor" + 0.001*"band" - 2020-10-25 11:28:22,667 : INFO : topic #6 (0.111): 0.001*"soviet" + 0.001*"blue" + 0.001*"actor" + 0.001*"italian" + 0.001*"economy" + 0.001*"car" + 0.001*"animal" + 0.001*"km" + 0.001*"channel" + 0.001*"energy" - 2020-10-25 11:28:22,667 : INFO : topic diff=1.009925, rho=1.000000 - 2020-10-25 11:28:33,290 : INFO : -9.255 per-word bound, 611.1 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2020-10-25 11:28:33,290 : INFO : PROGRESS: pass 1, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2020-10-25 11:28:37,864 : INFO : topic #1 (0.111): 0.002*"minister" + 0.002*"km" + 0.002*"election" + 0.002*"est" + 0.002*"economy" + 0.001*"actor" + 0.001*"male" + 0.001*"india" + 0.001*"china" + 0.001*"elected" - 2020-10-25 11:28:37,865 : INFO : topic #8 (0.111): 0.001*"software" + 0.001*"user" + 0.001*"apollo" + 0.001*"metal" + 0.001*"cell" + 0.001*"machine" + 0.001*"actor" + 0.001*"soviet" + 0.001*"lincoln" + 0.001*"disk" - 2020-10-25 11:28:37,865 : INFO : topic #3 (0.111): 0.001*"y" + 0.001*"animal" + 0.001*"lake" + 0.001*"energy" + 0.001*"jewish" + 0.001*"window" + 0.001*"aircraft" + 0.001*"cell" + 0.001*"memory" + 0.001*"machine" - 2020-10-25 11:28:37,865 : INFO : topic #7 (0.111): 0.002*"minister" + 0.001*"km" + 0.001*"election" + 0.001*"soviet" + 0.001*"cell" + 0.001*"est" + 0.001*"energy" + 0.001*"economy" + 0.001*"parliament" + 0.001*"russian" - 2020-10-25 11:28:37,865 : INFO : topic #4 (0.111): 0.002*"league" + 0.001*"season" + 0.001*"jewish" + 0.001*"israel" + 0.001*"emperor" + 0.001*"baseball" + 0.001*"band" + 0.001*"software" + 0.001*"album" + 0.001*"irish" - 2020-10-25 11:28:37,865 : INFO : topic diff=0.150801, rho=0.592297 - 2020-10-25 11:28:48,377 : INFO : -9.186 per-word bound, 582.5 perplexity estimate based on a held-out corpus of 1701 
documents with 4692704 words - 2020-10-25 11:28:48,378 : INFO : PROGRESS: pass 2, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2020-10-25 11:28:52,718 : INFO : topic #4 (0.111): 0.003*"league" + 0.002*"season" + 0.002*"jewish" + 0.001*"baseball" + 0.001*"israel" + 0.001*"emperor" + 0.001*"band" + 0.001*"irish" + 0.001*"jew" + 0.001*"hebrew" - 2020-10-25 11:28:52,718 : INFO : topic #2 (0.111): 0.002*"album" + 0.002*"band" + 0.002*"chinese" + 0.001*"love" + 0.001*"jew" + 0.001*"japanese" + 0.001*"china" + 0.001*"artist" + 0.001*"jewish" + 0.001*"actor" - 2020-10-25 11:28:52,718 : INFO : topic #8 (0.111): 0.002*"software" + 0.002*"apollo" + 0.001*"user" + 0.001*"disk" + 0.001*"cell" + 0.001*"metal" + 0.001*"lincoln" + 0.001*"machine" + 0.001*"alexander" + 0.001*"bit" - 2020-10-25 11:28:52,718 : INFO : topic #3 (0.111): 0.002*"y" + 0.001*"window" + 0.001*"aircraft" + 0.001*"lake" + 0.001*"machine" + 0.001*"energy" + 0.001*"animal" + 0.001*"bit" + 0.001*"memory" + 0.001*"user" - 2020-10-25 11:28:52,719 : INFO : topic #5 (0.111): 0.001*"cell" + 0.001*"japanese" + 0.001*"japan" + 0.001*"energy" + 0.001*"software" + 0.001*"emperor" + 0.001*"blood" + 0.001*"bc" + 0.001*"actor" + 0.001*"brain" - 2020-10-25 11:28:52,719 : INFO : topic diff=0.173393, rho=0.509614 - 2020-10-25 11:29:03,210 : INFO : -9.128 per-word bound, 559.6 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2020-10-25 11:29:03,210 : INFO : PROGRESS: pass 3, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2020-10-25 11:29:07,566 : INFO : topic #3 (0.111): 0.002*"y" + 0.001*"aircraft" + 0.001*"window" + 0.001*"user" + 0.001*"bit" + 0.001*"machine" + 0.001*"memory" + 0.001*"internet" + 0.001*"energy" + 0.001*"lake" - 2020-10-25 11:29:07,566 : INFO : topic #7 (0.111): 0.002*"soviet" + 0.002*"minister" + 0.002*"israel" + 0.002*"election" + 0.002*"parliament" + 0.002*"emperor" + 0.001*"israeli" + 0.001*"cell" + 0.001*"russian" + 0.001*"lebanon" - 2020-10-25 11:29:07,566 : INFO : topic #0 (0.111): 0.003*"ball" + 0.002*"import" + 0.002*"brown" + 0.001*"band" + 0.001*"cat" + 0.001*"iraq" + 0.001*"km" + 0.001*"album" + 0.001*"california" + 0.001*"football" - 2020-10-25 11:29:07,566 : INFO : topic #1 (0.111): 0.003*"est" + 0.003*"km" + 0.003*"minister" + 0.003*"election" + 0.002*"economy" + 0.002*"actor" + 0.002*"india" + 0.002*"male" + 0.002*"china" + 0.002*"constitution" - 2020-10-25 11:29:07,567 : INFO : topic #6 (0.111): 0.002*"blue" + 0.002*"car" + 0.002*"engine" + 0.001*"soviet" + 0.001*"channel" + 0.001*"flag" + 0.001*"microsoft" + 0.001*"county" + 0.001*"director" + 0.001*"communist" - 2020-10-25 11:29:07,567 : INFO : topic diff=0.148814, rho=0.454053 - 2020-10-25 11:29:18,053 : INFO : -9.094 per-word bound, 546.5 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2020-10-25 11:29:18,119 : INFO : using symmetric alpha at 0.09090909090909091 - 2020-10-25 11:29:18,119 : INFO : using symmetric eta at 0.09090909090909091 - 2020-10-25 11:29:18,121 : INFO : using serial LDA version on this node - 2020-10-25 11:29:18,134 : INFO : running online (multi-pass) LDA training, 11 topics, 2 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2020-10-25 11:29:18,134 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to 
improve accuracy - 2020-10-25 11:29:28,764 : INFO : -10.500 per-word bound, 1448.0 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2020-10-25 11:29:28,765 : INFO : PROGRESS: pass 0, at document #1701/1701 - 2020-10-25 11:29:31,978 : INFO : topic #4 (0.091): 0.002*"software" + 0.001*"japanese" + 0.001*"km" + 0.001*"china" + 0.001*"japan" + 0.001*"love" + 0.001*"chinese" + 0.001*"y" + 0.001*"soviet" + 0.001*"emperor" - 2020-10-25 11:29:31,979 : INFO : topic #8 (0.091): 0.001*"actor" + 0.001*"minister" + 0.001*"card" + 0.001*"band" + 0.001*"map" + 0.001*"car" + 0.001*"muslim" + 0.001*"california" + 0.001*"soviet" + 0.001*"lord" - 2020-10-25 11:29:31,979 : INFO : topic #1 (0.091): 0.001*"km" + 0.001*"soviet" + 0.001*"album" + 0.001*"minister" + 0.001*"cell" + 0.001*"energy" + 0.001*"jewish" + 0.001*"band" + 0.001*"jew" + 0.001*"actor" - 2020-10-25 11:29:31,979 : INFO : topic #2 (0.091): 0.001*"election" + 0.001*"km" + 0.001*"band" + 0.001*"energy" + 0.001*"cell" + 0.001*"soviet" + 0.001*"minister" + 0.001*"china" + 0.001*"map" + 0.001*"australia" - 2020-10-25 11:29:31,980 : INFO : topic #10 (0.091): 0.001*"actor" + 0.001*"league" + 0.001*"jewish" + 0.001*"km" + 0.001*"israel" + 0.001*"bc" + 0.001*"y" + 0.001*"ship" + 0.001*"energy" + 0.001*"minister" - 2020-10-25 11:29:31,980 : INFO : topic diff=1.023741, rho=1.000000 - 2020-10-25 11:29:42,609 : INFO : -9.277 per-word bound, 620.6 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2020-10-25 11:29:42,609 : INFO : PROGRESS: pass 1, at document #1701/1701 - 2020-10-25 11:29:45,801 : INFO : topic #9 (0.091): 0.002*"km" + 0.001*"election" + 0.001*"economy" + 0.001*"minister" + 0.001*"est" + 0.001*"oil" + 0.001*"spanish" + 0.001*"male" + 0.001*"liberal" + 0.001*"flag" - 2020-10-25 11:29:45,802 : INFO : topic #2 (0.091): 0.001*"energy" + 0.001*"cell" + 0.001*"election" + 0.001*"band" + 0.001*"map" + 0.001*"km" + 0.001*"specie" + 0.001*"australia" + 0.001*"metal" + 0.001*"drug" - 2020-10-25 11:29:45,802 : INFO : topic #6 (0.091): 0.001*"actor" + 0.001*"soviet" + 0.001*"wikipedia" + 0.001*"cell" + 0.001*"jewish" + 0.001*"italian" + 0.001*"football" + 0.001*"band" + 0.001*"acid" + 0.001*"russian" - 2020-10-25 11:29:45,802 : INFO : topic #4 (0.091): 0.002*"software" + 0.001*"japanese" + 0.001*"japan" + 0.001*"y" + 0.001*"apple" + 0.001*"love" + 0.001*"china" + 0.001*"chinese" + 0.001*"philosophy" + 0.001*"classical" - 2020-10-25 11:29:45,803 : INFO : topic #7 (0.091): 0.002*"emperor" + 0.002*"minister" + 0.001*"male" + 0.001*"saint" + 0.001*"constitution" + 0.001*"jewish" + 0.001*"election" + 0.001*"female" + 0.001*"actor" + 0.001*"jesus" - 2020-10-25 11:29:45,803 : INFO : topic diff=0.168078, rho=0.577350 - 2020-10-25 11:29:45,803 : INFO : ensemble contains 9 models and 160 topics now - 2020-10-25 11:29:45,809 : INFO : ensemble contains 10 models and 169 topics now - 2020-10-25 11:29:45,815 : INFO : asymmetric distance matrix is outdated due to add_model - 2020-10-25 11:29:45,815 : INFO : generating a 180 x 180 asymmetric distance matrix... 
- 2020-10-25 11:29:46,856 : INFO : fitting the clustering model, using 5 for min_samples - 2020-10-25 11:29:46,890 : INFO : generating stable topics, using 3 for min_cores - 2020-10-25 11:29:46,890 : INFO : found 18 clusters - 2020-10-25 11:29:46,899 : INFO : found 1 stable topics - 2020-10-25 11:29:46,899 : INFO : generating classic gensim model representation based on results from the ensemble - 2020-10-25 11:29:46,900 : INFO : using symmetric alpha at 1.0 - 2020-10-25 11:29:46,900 : INFO : using symmetric eta at 1.0 - 2020-10-25 11:29:46,903 : INFO : using serial LDA version on this node - 2020-10-25 11:29:46,906 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2020-10-25 11:29:46,906 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2021-01-23 12:11:49,879 : INFO : using symmetric alpha at 0.1111111111111111 + 2021-01-23 12:11:49,879 : INFO : using symmetric eta at 0.1111111111111111 + 2021-01-23 12:11:49,881 : INFO : using serial LDA version on this node + 2021-01-23 12:11:49,895 : INFO : running online LDA training, 9 topics, 4 passes over the supplied corpus of 1701 documents, updating every 14000 documents, evaluating every ~1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-23 12:11:49,895 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2021-01-23 12:11:49,896 : INFO : training LDA model using 7 processes + 2021-01-23 12:11:49,969 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2021-01-23 12:11:57,803 : INFO : topic #6 (0.111): 0.001*"emperor" + 0.001*"cell" + 0.001*"minister" + 0.001*"energy" + 0.001*"chinese" + 0.001*"female" + 0.001*"animal" + 0.001*"award" + 0.001*"spanish" + 0.001*"season" + 2021-01-23 12:11:57,803 : INFO : topic #1 (0.111): 0.001*"band" + 0.001*"league" + 0.001*"km" + 0.001*"car" + 0.001*"india" + 0.001*"bc" + 0.001*"actor" + 0.001*"jewish" + 0.001*"album" + 0.001*"minister" + 2021-01-23 12:11:57,804 : INFO : topic #3 (0.111): 0.001*"actor" + 0.001*"band" + 0.001*"novel" + 0.001*"election" + 0.001*"album" + 0.001*"km" + 0.001*"chinese" + 0.001*"energy" + 0.001*"russian" + 0.001*"male" + 2021-01-23 12:11:57,804 : INFO : topic #5 (0.111): 0.001*"soviet" + 0.001*"minister" + 0.001*"km" + 0.001*"election" + 0.001*"lord" + 0.001*"software" + 0.001*"plant" + 0.001*"african" + 0.001*"territory" + 0.001*"est" + 2021-01-23 12:11:57,805 : INFO : topic #8 (0.111): 0.001*"km" + 0.001*"soviet" + 0.001*"africa" + 0.001*"japanese" + 0.001*"chinese" + 0.001*"energy" + 0.001*"minister" + 0.001*"band" + 0.001*"user" + 0.001*"china" + 2021-01-23 12:11:57,805 : INFO : topic diff=1.009632, rho=1.000000 + 2021-01-23 12:12:18,557 : INFO : -9.256 per-word bound, 611.4 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-23 12:12:18,557 : INFO : PROGRESS: pass 1, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2021-01-23 12:12:26,238 : INFO : topic #4 (0.111): 0.001*"league" + 0.001*"ball" + 0.001*"minister" + 0.001*"cell" + 0.001*"engine" + 0.001*"bc" + 0.001*"soviet" + 0.001*"software" + 0.001*"election" + 0.001*"album" + 2021-01-23 12:12:26,239 : INFO : 
topic #1 (0.111): 0.001*"league" + 0.001*"band" + 0.001*"car" + 0.001*"window" + 0.001*"album" + 0.001*"microsoft" + 0.001*"km" + 0.001*"india" + 0.001*"ball" + 0.001*"season" + 2021-01-23 12:12:26,240 : INFO : topic #6 (0.111): 0.002*"emperor" + 0.001*"cell" + 0.001*"jewish" + 0.001*"chinese" + 0.001*"animal" + 0.001*"energy" + 0.001*"award" + 0.001*"fiction" + 0.001*"orthodox" + 0.001*"japanese" + 2021-01-23 12:12:26,241 : INFO : topic #5 (0.111): 0.002*"soviet" + 0.001*"election" + 0.001*"minister" + 0.001*"km" + 0.001*"est" + 0.001*"territory" + 0.001*"lord" + 0.001*"economy" + 0.001*"african" + 0.001*"plant" + 2021-01-23 12:12:26,241 : INFO : topic #8 (0.111): 0.002*"km" + 0.001*"est" + 0.001*"africa" + 0.001*"chinese" + 0.001*"japanese" + 0.001*"energy" + 0.001*"election" + 0.001*"minister" + 0.001*"economy" + 0.001*"y" + 2021-01-23 12:12:26,242 : INFO : topic diff=0.148485, rho=0.592297 + 2021-01-23 12:12:46,644 : INFO : -9.193 per-word bound, 585.4 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-23 12:12:46,645 : INFO : PROGRESS: pass 2, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2021-01-23 12:12:54,251 : INFO : topic #6 (0.111): 0.002*"emperor" + 0.002*"jewish" + 0.001*"cell" + 0.001*"jesus" + 0.001*"orthodox" + 0.001*"christ" + 0.001*"animal" + 0.001*"jew" + 0.001*"fiction" + 0.001*"chinese" + 2021-01-23 12:12:54,252 : INFO : topic #5 (0.111): 0.003*"soviet" + 0.002*"election" + 0.002*"minister" + 0.001*"km" + 0.001*"economy" + 0.001*"territory" + 0.001*"parliament" + 0.001*"est" + 0.001*"lord" + 0.001*"elected" + 2021-01-23 12:12:54,253 : INFO : topic #2 (0.111): 0.002*"actor" + 0.001*"love" + 0.001*"lincoln" + 0.001*"mary" + 0.001*"emperor" + 0.001*"bc" + 0.001*"album" + 0.001*"energy" + 0.001*"band" + 0.001*"ford" + 2021-01-23 12:12:54,253 : INFO : topic #3 (0.111): 0.002*"band" + 0.002*"album" + 0.001*"bass" + 0.001*"instrument" + 0.001*"novel" + 0.001*"actor" + 0.001*"blue" + 0.001*"card" + 0.001*"kong" + 0.001*"chinese" + 2021-01-23 12:12:54,254 : INFO : topic #7 (0.111): 0.002*"software" + 0.001*"horse" + 0.001*"y" + 0.001*"blue" + 0.001*"file" + 0.001*"actor" + 0.001*"moon" + 0.001*"apollo" + 0.001*"video" + 0.001*"season" + 2021-01-23 12:12:54,254 : INFO : topic diff=0.174918, rho=0.509614 + 2021-01-23 12:13:15,179 : INFO : -9.137 per-word bound, 563.0 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-23 12:13:15,179 : INFO : PROGRESS: pass 3, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2021-01-23 12:13:22,431 : INFO : topic #2 (0.111): 0.002*"actor" + 0.002*"love" + 0.001*"lincoln" + 0.001*"mary" + 0.001*"henry" + 0.001*"emperor" + 0.001*"bc" + 0.001*"ford" + 0.001*"louis" + 0.001*"singer" + 2021-01-23 12:13:22,431 : INFO : topic #7 (0.111): 0.002*"software" + 0.002*"y" + 0.001*"horse" + 0.001*"file" + 0.001*"moon" + 0.001*"blue" + 0.001*"apollo" + 0.001*"video" + 0.001*"user" + 0.001*"color" + 2021-01-23 12:13:22,432 : INFO : topic #0 (0.111): 0.002*"india" + 0.002*"actor" + 0.002*"aircraft" + 0.002*"import" + 0.002*"km" + 0.002*"ship" + 0.002*"minister" + 0.002*"italian" + 0.002*"irish" + 0.001*"indian" + 2021-01-23 12:13:22,433 : INFO : topic #5 (0.111): 0.003*"soviet" + 0.002*"election" + 0.002*"minister" + 0.002*"economy" + 0.002*"parliament" + 0.002*"km" + 0.001*"territory" + 0.001*"liberal" + 0.001*"vote" + 0.001*"elected" + 2021-01-23 12:13:22,433 : INFO : topic #8 (0.111): 0.003*"km" + 0.002*"est" + 
0.002*"africa" + 0.002*"chinese" + 0.002*"economy" + 0.002*"election" + 0.002*"energy" + 0.001*"lake" + 0.001*"male" + 0.001*"african" + 2021-01-23 12:13:22,434 : INFO : topic diff=0.152037, rho=0.454053 + 2021-01-23 12:13:42,802 : INFO : -9.102 per-word bound, 549.3 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-23 12:13:42,889 : INFO : using symmetric alpha at 0.09090909090909091 + 2021-01-23 12:13:42,890 : INFO : using symmetric eta at 0.09090909090909091 + 2021-01-23 12:13:42,895 : INFO : using serial LDA version on this node + 2021-01-23 12:13:42,928 : INFO : running online (multi-pass) LDA training, 11 topics, 2 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-23 12:13:42,929 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2021-01-23 12:14:03,840 : INFO : -10.500 per-word bound, 1447.8 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-23 12:14:03,840 : INFO : PROGRESS: pass 0, at document #1701/1701 + 2021-01-23 12:14:09,469 : INFO : topic #3 (0.091): 0.001*"soviet" + 0.001*"emperor" + 0.001*"band" + 0.001*"india" + 0.001*"energy" + 0.001*"actor" + 0.001*"indian" + 0.001*"canadian" + 0.001*"minister" + 0.001*"italian" + 2021-01-23 12:14:09,470 : INFO : topic #4 (0.091): 0.001*"import" + 0.001*"minister" + 0.001*"bc" + 0.001*"lord" + 0.001*"ball" + 0.001*"km" + 0.001*"chinese" + 0.001*"season" + 0.001*"internet" + 0.001*"japanese" + 2021-01-23 12:14:09,470 : INFO : topic #5 (0.091): 0.001*"km" + 0.001*"chinese" + 0.001*"cell" + 0.001*"actor" + 0.001*"israel" + 0.001*"election" + 0.001*"minister" + 0.001*"china" + 0.001*"economy" + 0.001*"energy" + 2021-01-23 12:14:09,471 : INFO : topic #1 (0.091): 0.001*"lake" + 0.001*"y" + 0.001*"soviet" + 0.001*"irish" + 0.001*"km" + 0.001*"actor" + 0.001*"league" + 0.001*"band" + 0.001*"male" + 0.001*"jewish" + 2021-01-23 12:14:09,471 : INFO : topic #6 (0.091): 0.001*"cell" + 0.001*"actor" + 0.001*"emperor" + 0.001*"election" + 0.001*"album" + 0.001*"band" + 0.001*"lake" + 0.001*"china" + 0.001*"minister" + 0.001*"la" + 2021-01-23 12:14:09,472 : INFO : topic diff=1.023212, rho=1.000000 + 2021-01-23 12:14:30,022 : INFO : -9.278 per-word bound, 620.9 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-23 12:14:30,023 : INFO : PROGRESS: pass 1, at document #1701/1701 + 2021-01-23 12:14:35,544 : INFO : topic #7 (0.091): 0.001*"emperor" + 0.001*"blue" + 0.001*"bear" + 0.001*"soviet" + 0.001*"actor" + 0.001*"alexander" + 0.001*"constitution" + 0.001*"bc" + 0.001*"mary" + 0.001*"county" + 2021-01-23 12:14:35,545 : INFO : topic #4 (0.091): 0.002*"import" + 0.001*"lord" + 0.001*"ball" + 0.001*"bc" + 0.001*"internet" + 0.001*"chinese" + 0.001*"season" + 0.001*"japanese" + 0.001*"jewish" + 0.001*"carbon" + 2021-01-23 12:14:35,545 : INFO : topic #3 (0.091): 0.002*"band" + 0.001*"india" + 0.001*"instrument" + 0.001*"soviet" + 0.001*"emperor" + 0.001*"actor" + 0.001*"bass" + 0.001*"indian" + 0.001*"album" + 0.001*"canadian" + 2021-01-23 12:14:35,546 : INFO : topic #8 (0.091): 0.002*"jewish" + 0.002*"ball" + 0.002*"jew" + 0.002*"actor" + 0.001*"soviet" + 0.001*"minister" + 0.001*"football" + 0.001*"russian" + 0.001*"australian" + 0.001*"japanese" + 2021-01-23 12:14:35,547 : INFO : topic #5 
(0.091): 0.001*"km" + 0.001*"chinese" + 0.001*"israel" + 0.001*"energy" + 0.001*"flag" + 0.001*"moon" + 0.001*"cell" + 0.001*"speed" + 0.001*"china" + 0.001*"software" + 2021-01-23 12:14:35,547 : INFO : topic diff=0.163866, rho=0.577350 + 2021-01-23 12:14:35,548 : INFO : ensemble contains 9 models and 160 topics now + 2021-01-23 12:14:35,557 : INFO : ensemble contains 10 models and 169 topics now + 2021-01-23 12:14:35,564 : INFO : asymmetric distance matrix is outdated due to add_model + 2021-01-23 12:14:35,565 : INFO : generating a 180 x 180 asymmetric distance matrix... + 2021-01-23 12:14:37,408 : INFO : fitting the clustering model, using 5 for min_samples + 2021-01-23 12:14:37,438 : INFO : generating stable topics, using 3 for min_cores + 2021-01-23 12:14:37,438 : INFO : found 21 clusters + 2021-01-23 12:14:37,442 : INFO : found 1 stable topics + 2021-01-23 12:14:37,442 : INFO : generating classic gensim model representation based on results from the ensemble + 2021-01-23 12:14:37,444 : INFO : using symmetric alpha at 1.0 + 2021-01-23 12:14:37,444 : INFO : using symmetric eta at 1.0 + 2021-01-23 12:14:37,446 : INFO : using serial LDA version on this node + 2021-01-23 12:14:37,448 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-23 12:14:37,448 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy 180 1 @@ -387,9 +424,9 @@ Afterwards the number and quality of stable topics might be different depending .. rst-class:: sphx-glr-timing - **Total running time of the script:** ( 6 minutes 53.294 seconds) + **Total running time of the script:** ( 12 minutes 22.200 seconds) -**Estimated memory usage:** 1629 MB +**Estimated memory usage:** 1637 MB .. _sphx_glr_download_auto_examples_tutorials_run_ensemblelda.py: diff --git a/docs/src/auto_examples/tutorials/sg_execution_times.rst b/docs/src/auto_examples/tutorials/sg_execution_times.rst index b4845ff578..9ac6299492 100644 --- a/docs/src/auto_examples/tutorials/sg_execution_times.rst +++ b/docs/src/auto_examples/tutorials/sg_execution_times.rst @@ -5,10 +5,10 @@ Computation times ================= -**06:53.294** total execution time for **auto_examples_tutorials** files: +**12:22.200** total execution time for **auto_examples_tutorials** files: +-------------------------------------------------------------------------------------+-----------+-----------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` (``run_ensemblelda.py``) | 06:53.294 | 1628.6 MB | +| :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` (``run_ensemblelda.py``) | 12:22.200 | 1637.2 MB | +-------------------------------------------------------------------------------------+-----------+-----------+ | :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` (``run_annoy.py``) | 00:00.000 | 0.0 MB | +-------------------------------------------------------------------------------------+-----------+-----------+ diff --git a/docs/src/gallery/tutorials/run_ensemblelda.py b/docs/src/gallery/tutorials/run_ensemblelda.py index bde40f98ea..086f65fc44 100644 --- a/docs/src/gallery/tutorials/run_ensemblelda.py +++ b/docs/src/gallery/tutorials/run_ensemblelda.py @@ -13,7 +13,7 @@ # This tutorial will explain how to use the EnsembleLDA model class. 
diff --git a/docs/src/gallery/tutorials/run_ensemblelda.py b/docs/src/gallery/tutorials/run_ensemblelda.py
index bde40f98ea..086f65fc44 100644
--- a/docs/src/gallery/tutorials/run_ensemblelda.py
+++ b/docs/src/gallery/tutorials/run_ensemblelda.py
@@ -13,7 +13,7 @@
 # This tutorial will explain how to use the EnsembleLDA model class.
 #
 # EnsembleLda is a method of finding and generating stable topics from the results of multiple topic models,
-# it can therefore be used to remove topics from your results that are noise and are not reproducible.
+# it can be used to remove topics from your results that are noise and are not reproducible.
 #

 ###############################################################################
@@ -98,19 +98,19 @@
 #
 # Different from LdaModel, the number of resulting topics varies greatly depending on the clustering parameters.
 #
-# You can provide those in the ``recluster()`` function or the ``EnsembleLda`` constructor. Play around
+# You can provide those in the ``recluster()`` function or the ``EnsembleLda`` constructor.
 #
-# until you get as many topics as you desire, which however may reduce their quality.
+# Play around until you get as many topics as you desire, which, however, may reduce their quality.
 # If your ensemble doesn't have enough topics to begin with, you should make sure to make it large enough.
 #
 # Having an epsilon that is smaller than the smallest distance doesn't make sense.
-# Make sure to chose one that is within the range of values.
+# Make sure to choose one that is within the range of values in ``asymmetric_distance_matrix``.
 #

 import numpy as np
 shape = ensemble.asymmetric_distance_matrix.shape
 without_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1)
-without_diagonal.min(), without_diagonal.mean(), without_diagonal.max()
+print(without_diagonal.min(), without_diagonal.mean(), without_diagonal.max())

 ensemble.recluster(eps=0.09, min_samples=2, min_cores=2)

diff --git a/gensim/downloader.py b/gensim/downloader.py
index 8a4395440e..47061ec793 100644
--- a/gensim/downloader.py
+++ b/gensim/downloader.py
@@ -357,6 +357,7 @@ def _download(name):
         If md5sum on client and in repo are different.

     """
+    print('_download')
     url_load_file = "{base}/{fname}/__init__.py".format(base=DOWNLOAD_BASE_URL, fname=name)
     data_folder_dir = os.path.join(BASE_DIR, name)
     data_folder_dir_tmp = data_folder_dir + '_tmp'
@@ -372,6 +373,8 @@ def _download(name):
             fname = "{f}.gz_0{p}".format(f=name, p=part)
             dst_path = os.path.join(tmp_dir, fname)
+            print('downloading', url_data)
+            print()
             urllib.urlretrieve(
                 url_data, dst_path,
                 reporthook=partial(_progress, part=part, total_parts=total_parts)
@@ -393,6 +396,8 @@ def _download(name):
         url_data = "{base}/{fname}/{fname}.gz".format(base=DOWNLOAD_BASE_URL, fname=name)
         fname = "{fname}.gz".format(fname=name)
         dst_path = os.path.join(tmp_dir, fname)
+        print('downloading', url_data, 'to', dst_path, data_folder_dir)
+        print()
        urllib.urlretrieve(url_data, dst_path, reporthook=_progress)
        if _calculate_md5_checksum(dst_path) == _get_checksum(name):
            sys.stdout.write("\n")

From 847874a92e2055f65c5b9482003b99b5f9d77842 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sat, 23 Jan 2021 15:29:02 +0100
Subject: [PATCH 125/166] attempt at fixing that pickle problem

---
 .../tutorials/run_ensemblelda.rst             | 208 +++++++++---------
 .../tutorials/sg_execution_times.rst          |   4 +-
 2 files changed, 106 insertions(+), 106 deletions(-)

diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.rst b/docs/src/auto_examples/tutorials/run_ensemblelda.rst
index 4ff2735a69..9b959482d9 100644
--- a/docs/src/auto_examples/tutorials/run_ensemblelda.rst
+++ b/docs/src/auto_examples/tutorials/run_ensemblelda.rst
@@ -82,11 +82,11 @@ so it won't be explained again in detail.
 .. code-block:: none

-    2021-01-23 12:02:54,189 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
-    2021-01-23 12:03:00,997 : INFO : built Dictionary(238542 unique tokens: ['a', 'abacus', 'ability', 'able', 'abnormal']...) from 1701 documents (total 17005207 corpus positions)
-    2021-01-23 12:03:01,199 : INFO : discarding 218466 tokens: [('a', 1701), ('ability', 934), ('able', 1202), ('about', 1687), ('above', 1327), ('abstention', 13), ('accepted', 945), ('according', 1468), ('account', 1113), ('act', 1312)]...
-    2021-01-23 12:03:01,199 : INFO : keeping 20076 tokens which were in no less than 20 and no more than 850 (=50.0%) documents
-    2021-01-23 12:03:01,288 : INFO : resulting dictionary: Dictionary(20076 unique tokens: ['abacus', 'abnormal', 'abolished', 'abolition', 'absence']...)
+    2021-01-23 15:15:53,986 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
+    2021-01-23 15:16:00,703 : INFO : built Dictionary(238542 unique tokens: ['a', 'abacus', 'ability', 'able', 'abnormal']...) from 1701 documents (total 17005207 corpus positions)
+    2021-01-23 15:16:00,901 : INFO : discarding 218466 tokens: [('a', 1701), ('ability', 934), ('able', 1202), ('about', 1687), ('above', 1327), ('abstention', 13), ('accepted', 945), ('according', 1468), ('account', 1113), ('act', 1312)]...
+    2021-01-23 15:16:00,901 : INFO : keeping 20076 tokens which were in no less than 20 and no more than 850 (=50.0%) documents
+    2021-01-23 15:16:00,984 : INFO : resulting dictionary: Dictionary(20076 unique tokens: ['abacus', 'abnormal', 'abolished', 'abolition', 'absence']...)

@@ -215,18 +215,18 @@ The number of stable topics which are clustered from all those topics is smaller

 .. code-block:: none

-    2021-01-23 12:03:05,136 : INFO : generating 8 topic models...
-    2021-01-23 12:11:47,694 : INFO : generating a 160 x 160 asymmetric distance matrix...
-    2021-01-23 12:11:49,175 : INFO : fitting the clustering model, using 4 for min_samples
-    2021-01-23 12:11:49,206 : INFO : generating stable topics, using 3 for min_cores
-    2021-01-23 12:11:49,206 : INFO : found 26 clusters
-    2021-01-23 12:11:49,209 : INFO : found 1 stable topics
-    2021-01-23 12:11:49,209 : INFO : generating classic gensim model representation based on results from the ensemble
-    2021-01-23 12:11:49,398 : INFO : using symmetric alpha at 1.0
-    2021-01-23 12:11:49,398 : INFO : using symmetric eta at 1.0
-    2021-01-23 12:11:49,400 : INFO : using serial LDA version on this node
-    2021-01-23 12:11:49,402 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000
-    2021-01-23 12:11:49,402 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy
+    2021-01-23 15:16:04,680 : INFO : generating 8 topic models...
+    2021-01-23 15:24:25,880 : INFO : generating a 160 x 160 asymmetric distance matrix...
+    2021-01-23 15:24:27,354 : INFO : fitting the clustering model, using 4 for min_samples
+    2021-01-23 15:24:27,387 : INFO : generating stable topics, using 3 for min_cores
+    2021-01-23 15:24:27,387 : INFO : found 25 clusters
+    2021-01-23 15:24:27,391 : INFO : found 1 stable topics
+    2021-01-23 15:24:27,391 : INFO : generating classic gensim model representation based on results from the ensemble
+    2021-01-23 15:24:27,592 : INFO : using symmetric alpha at 1.0
+    2021-01-23 15:24:27,592 : INFO : using symmetric eta at 1.0
+    2021-01-23 15:24:27,594 : INFO : using serial LDA version on this node
+    2021-01-23 15:24:27,596 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000
+    2021-01-23 15:24:27,596 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy
     160
     1

@@ -273,18 +273,18 @@ Make sure to choose one that is within the range of values in ``asymmetric_distan

 .. code-block:: none

-    0.07280884735152338 0.12813543590172502 0.2606207782381612
-    2021-01-23 12:11:49,575 : INFO : fitting the clustering model
-    2021-01-23 12:11:49,589 : INFO : generating stable topics
-    2021-01-23 12:11:49,589 : INFO : found 35 clusters
-    2021-01-23 12:11:49,593 : INFO : found 4 stable topics
-    2021-01-23 12:11:49,593 : INFO : generating classic gensim model representation based on results from the ensemble
-    2021-01-23 12:11:49,594 : INFO : using symmetric alpha at 0.25
-    2021-01-23 12:11:49,594 : INFO : using symmetric eta at 0.25
-    2021-01-23 12:11:49,596 : INFO : using serial LDA version on this node
-    2021-01-23 12:11:49,602 : INFO : running online (multi-pass) LDA training, 4 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000
-    2021-01-23 12:11:49,603 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy
-    4
+    0.0664596363067147 0.12630570073794478 0.2773686345664633
+    2021-01-23 15:24:27,763 : INFO : fitting the clustering model
+    2021-01-23 15:24:27,777 : INFO : generating stable topics
+    2021-01-23 15:24:27,777 : INFO : found 22 clusters
+    2021-01-23 15:24:27,782 : INFO : found 3 stable topics
+    2021-01-23 15:24:27,782 : INFO : generating classic gensim model representation based on results from the ensemble
+    2021-01-23 15:24:27,784 : INFO : using symmetric alpha at 0.3333333333333333
+    2021-01-23 15:24:27,784 : INFO : using symmetric eta at 0.3333333333333333
+    2021-01-23 15:24:27,785 : INFO : using serial LDA version on this node
+    2021-01-23 15:24:27,790 : INFO : running online (multi-pass) LDA training, 3 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000
+    2021-01-23 15:24:27,790 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy
+    3

@@ -342,79 +342,79 @@ Afterwards the number and quality of stable topics might be different depending

 ..
code-block:: none - 2021-01-23 12:11:49,879 : INFO : using symmetric alpha at 0.1111111111111111 - 2021-01-23 12:11:49,879 : INFO : using symmetric eta at 0.1111111111111111 - 2021-01-23 12:11:49,881 : INFO : using serial LDA version on this node - 2021-01-23 12:11:49,895 : INFO : running online LDA training, 9 topics, 4 passes over the supplied corpus of 1701 documents, updating every 14000 documents, evaluating every ~1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2021-01-23 12:11:49,895 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy - 2021-01-23 12:11:49,896 : INFO : training LDA model using 7 processes - 2021-01-23 12:11:49,969 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2021-01-23 12:11:57,803 : INFO : topic #6 (0.111): 0.001*"emperor" + 0.001*"cell" + 0.001*"minister" + 0.001*"energy" + 0.001*"chinese" + 0.001*"female" + 0.001*"animal" + 0.001*"award" + 0.001*"spanish" + 0.001*"season" - 2021-01-23 12:11:57,803 : INFO : topic #1 (0.111): 0.001*"band" + 0.001*"league" + 0.001*"km" + 0.001*"car" + 0.001*"india" + 0.001*"bc" + 0.001*"actor" + 0.001*"jewish" + 0.001*"album" + 0.001*"minister" - 2021-01-23 12:11:57,804 : INFO : topic #3 (0.111): 0.001*"actor" + 0.001*"band" + 0.001*"novel" + 0.001*"election" + 0.001*"album" + 0.001*"km" + 0.001*"chinese" + 0.001*"energy" + 0.001*"russian" + 0.001*"male" - 2021-01-23 12:11:57,804 : INFO : topic #5 (0.111): 0.001*"soviet" + 0.001*"minister" + 0.001*"km" + 0.001*"election" + 0.001*"lord" + 0.001*"software" + 0.001*"plant" + 0.001*"african" + 0.001*"territory" + 0.001*"est" - 2021-01-23 12:11:57,805 : INFO : topic #8 (0.111): 0.001*"km" + 0.001*"soviet" + 0.001*"africa" + 0.001*"japanese" + 0.001*"chinese" + 0.001*"energy" + 0.001*"minister" + 0.001*"band" + 0.001*"user" + 0.001*"china" - 2021-01-23 12:11:57,805 : INFO : topic diff=1.009632, rho=1.000000 - 2021-01-23 12:12:18,557 : INFO : -9.256 per-word bound, 611.4 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2021-01-23 12:12:18,557 : INFO : PROGRESS: pass 1, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2021-01-23 12:12:26,238 : INFO : topic #4 (0.111): 0.001*"league" + 0.001*"ball" + 0.001*"minister" + 0.001*"cell" + 0.001*"engine" + 0.001*"bc" + 0.001*"soviet" + 0.001*"software" + 0.001*"election" + 0.001*"album" - 2021-01-23 12:12:26,239 : INFO : topic #1 (0.111): 0.001*"league" + 0.001*"band" + 0.001*"car" + 0.001*"window" + 0.001*"album" + 0.001*"microsoft" + 0.001*"km" + 0.001*"india" + 0.001*"ball" + 0.001*"season" - 2021-01-23 12:12:26,240 : INFO : topic #6 (0.111): 0.002*"emperor" + 0.001*"cell" + 0.001*"jewish" + 0.001*"chinese" + 0.001*"animal" + 0.001*"energy" + 0.001*"award" + 0.001*"fiction" + 0.001*"orthodox" + 0.001*"japanese" - 2021-01-23 12:12:26,241 : INFO : topic #5 (0.111): 0.002*"soviet" + 0.001*"election" + 0.001*"minister" + 0.001*"km" + 0.001*"est" + 0.001*"territory" + 0.001*"lord" + 0.001*"economy" + 0.001*"african" + 0.001*"plant" - 2021-01-23 12:12:26,241 : INFO : topic #8 (0.111): 0.002*"km" + 0.001*"est" + 0.001*"africa" + 0.001*"chinese" + 0.001*"japanese" + 0.001*"energy" + 0.001*"election" + 0.001*"minister" + 0.001*"economy" + 0.001*"y" - 2021-01-23 12:12:26,242 : INFO : topic diff=0.148485, rho=0.592297 - 2021-01-23 12:12:46,644 : INFO : -9.193 per-word bound, 585.4 perplexity estimate based on a held-out corpus of 
1701 documents with 4692704 words - 2021-01-23 12:12:46,645 : INFO : PROGRESS: pass 2, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2021-01-23 12:12:54,251 : INFO : topic #6 (0.111): 0.002*"emperor" + 0.002*"jewish" + 0.001*"cell" + 0.001*"jesus" + 0.001*"orthodox" + 0.001*"christ" + 0.001*"animal" + 0.001*"jew" + 0.001*"fiction" + 0.001*"chinese" - 2021-01-23 12:12:54,252 : INFO : topic #5 (0.111): 0.003*"soviet" + 0.002*"election" + 0.002*"minister" + 0.001*"km" + 0.001*"economy" + 0.001*"territory" + 0.001*"parliament" + 0.001*"est" + 0.001*"lord" + 0.001*"elected" - 2021-01-23 12:12:54,253 : INFO : topic #2 (0.111): 0.002*"actor" + 0.001*"love" + 0.001*"lincoln" + 0.001*"mary" + 0.001*"emperor" + 0.001*"bc" + 0.001*"album" + 0.001*"energy" + 0.001*"band" + 0.001*"ford" - 2021-01-23 12:12:54,253 : INFO : topic #3 (0.111): 0.002*"band" + 0.002*"album" + 0.001*"bass" + 0.001*"instrument" + 0.001*"novel" + 0.001*"actor" + 0.001*"blue" + 0.001*"card" + 0.001*"kong" + 0.001*"chinese" - 2021-01-23 12:12:54,254 : INFO : topic #7 (0.111): 0.002*"software" + 0.001*"horse" + 0.001*"y" + 0.001*"blue" + 0.001*"file" + 0.001*"actor" + 0.001*"moon" + 0.001*"apollo" + 0.001*"video" + 0.001*"season" - 2021-01-23 12:12:54,254 : INFO : topic diff=0.174918, rho=0.509614 - 2021-01-23 12:13:15,179 : INFO : -9.137 per-word bound, 563.0 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2021-01-23 12:13:15,179 : INFO : PROGRESS: pass 3, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2021-01-23 12:13:22,431 : INFO : topic #2 (0.111): 0.002*"actor" + 0.002*"love" + 0.001*"lincoln" + 0.001*"mary" + 0.001*"henry" + 0.001*"emperor" + 0.001*"bc" + 0.001*"ford" + 0.001*"louis" + 0.001*"singer" - 2021-01-23 12:13:22,431 : INFO : topic #7 (0.111): 0.002*"software" + 0.002*"y" + 0.001*"horse" + 0.001*"file" + 0.001*"moon" + 0.001*"blue" + 0.001*"apollo" + 0.001*"video" + 0.001*"user" + 0.001*"color" - 2021-01-23 12:13:22,432 : INFO : topic #0 (0.111): 0.002*"india" + 0.002*"actor" + 0.002*"aircraft" + 0.002*"import" + 0.002*"km" + 0.002*"ship" + 0.002*"minister" + 0.002*"italian" + 0.002*"irish" + 0.001*"indian" - 2021-01-23 12:13:22,433 : INFO : topic #5 (0.111): 0.003*"soviet" + 0.002*"election" + 0.002*"minister" + 0.002*"economy" + 0.002*"parliament" + 0.002*"km" + 0.001*"territory" + 0.001*"liberal" + 0.001*"vote" + 0.001*"elected" - 2021-01-23 12:13:22,433 : INFO : topic #8 (0.111): 0.003*"km" + 0.002*"est" + 0.002*"africa" + 0.002*"chinese" + 0.002*"economy" + 0.002*"election" + 0.002*"energy" + 0.001*"lake" + 0.001*"male" + 0.001*"african" - 2021-01-23 12:13:22,434 : INFO : topic diff=0.152037, rho=0.454053 - 2021-01-23 12:13:42,802 : INFO : -9.102 per-word bound, 549.3 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2021-01-23 12:13:42,889 : INFO : using symmetric alpha at 0.09090909090909091 - 2021-01-23 12:13:42,890 : INFO : using symmetric eta at 0.09090909090909091 - 2021-01-23 12:13:42,895 : INFO : using serial LDA version on this node - 2021-01-23 12:13:42,928 : INFO : running online (multi-pass) LDA training, 11 topics, 2 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2021-01-23 12:13:42,929 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy - 
2021-01-23 12:14:03,840 : INFO : -10.500 per-word bound, 1447.8 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2021-01-23 12:14:03,840 : INFO : PROGRESS: pass 0, at document #1701/1701 - 2021-01-23 12:14:09,469 : INFO : topic #3 (0.091): 0.001*"soviet" + 0.001*"emperor" + 0.001*"band" + 0.001*"india" + 0.001*"energy" + 0.001*"actor" + 0.001*"indian" + 0.001*"canadian" + 0.001*"minister" + 0.001*"italian" - 2021-01-23 12:14:09,470 : INFO : topic #4 (0.091): 0.001*"import" + 0.001*"minister" + 0.001*"bc" + 0.001*"lord" + 0.001*"ball" + 0.001*"km" + 0.001*"chinese" + 0.001*"season" + 0.001*"internet" + 0.001*"japanese" - 2021-01-23 12:14:09,470 : INFO : topic #5 (0.091): 0.001*"km" + 0.001*"chinese" + 0.001*"cell" + 0.001*"actor" + 0.001*"israel" + 0.001*"election" + 0.001*"minister" + 0.001*"china" + 0.001*"economy" + 0.001*"energy" - 2021-01-23 12:14:09,471 : INFO : topic #1 (0.091): 0.001*"lake" + 0.001*"y" + 0.001*"soviet" + 0.001*"irish" + 0.001*"km" + 0.001*"actor" + 0.001*"league" + 0.001*"band" + 0.001*"male" + 0.001*"jewish" - 2021-01-23 12:14:09,471 : INFO : topic #6 (0.091): 0.001*"cell" + 0.001*"actor" + 0.001*"emperor" + 0.001*"election" + 0.001*"album" + 0.001*"band" + 0.001*"lake" + 0.001*"china" + 0.001*"minister" + 0.001*"la" - 2021-01-23 12:14:09,472 : INFO : topic diff=1.023212, rho=1.000000 - 2021-01-23 12:14:30,022 : INFO : -9.278 per-word bound, 620.9 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2021-01-23 12:14:30,023 : INFO : PROGRESS: pass 1, at document #1701/1701 - 2021-01-23 12:14:35,544 : INFO : topic #7 (0.091): 0.001*"emperor" + 0.001*"blue" + 0.001*"bear" + 0.001*"soviet" + 0.001*"actor" + 0.001*"alexander" + 0.001*"constitution" + 0.001*"bc" + 0.001*"mary" + 0.001*"county" - 2021-01-23 12:14:35,545 : INFO : topic #4 (0.091): 0.002*"import" + 0.001*"lord" + 0.001*"ball" + 0.001*"bc" + 0.001*"internet" + 0.001*"chinese" + 0.001*"season" + 0.001*"japanese" + 0.001*"jewish" + 0.001*"carbon" - 2021-01-23 12:14:35,545 : INFO : topic #3 (0.091): 0.002*"band" + 0.001*"india" + 0.001*"instrument" + 0.001*"soviet" + 0.001*"emperor" + 0.001*"actor" + 0.001*"bass" + 0.001*"indian" + 0.001*"album" + 0.001*"canadian" - 2021-01-23 12:14:35,546 : INFO : topic #8 (0.091): 0.002*"jewish" + 0.002*"ball" + 0.002*"jew" + 0.002*"actor" + 0.001*"soviet" + 0.001*"minister" + 0.001*"football" + 0.001*"russian" + 0.001*"australian" + 0.001*"japanese" - 2021-01-23 12:14:35,547 : INFO : topic #5 (0.091): 0.001*"km" + 0.001*"chinese" + 0.001*"israel" + 0.001*"energy" + 0.001*"flag" + 0.001*"moon" + 0.001*"cell" + 0.001*"speed" + 0.001*"china" + 0.001*"software" - 2021-01-23 12:14:35,547 : INFO : topic diff=0.163866, rho=0.577350 - 2021-01-23 12:14:35,548 : INFO : ensemble contains 9 models and 160 topics now - 2021-01-23 12:14:35,557 : INFO : ensemble contains 10 models and 169 topics now - 2021-01-23 12:14:35,564 : INFO : asymmetric distance matrix is outdated due to add_model - 2021-01-23 12:14:35,565 : INFO : generating a 180 x 180 asymmetric distance matrix... 
- 2021-01-23 12:14:37,408 : INFO : fitting the clustering model, using 5 for min_samples - 2021-01-23 12:14:37,438 : INFO : generating stable topics, using 3 for min_cores - 2021-01-23 12:14:37,438 : INFO : found 21 clusters - 2021-01-23 12:14:37,442 : INFO : found 1 stable topics - 2021-01-23 12:14:37,442 : INFO : generating classic gensim model representation based on results from the ensemble - 2021-01-23 12:14:37,444 : INFO : using symmetric alpha at 1.0 - 2021-01-23 12:14:37,444 : INFO : using symmetric eta at 1.0 - 2021-01-23 12:14:37,446 : INFO : using serial LDA version on this node - 2021-01-23 12:14:37,448 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2021-01-23 12:14:37,448 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2021-01-23 15:24:28,057 : INFO : using symmetric alpha at 0.1111111111111111 + 2021-01-23 15:24:28,058 : INFO : using symmetric eta at 0.1111111111111111 + 2021-01-23 15:24:28,059 : INFO : using serial LDA version on this node + 2021-01-23 15:24:28,074 : INFO : running online LDA training, 9 topics, 4 passes over the supplied corpus of 1701 documents, updating every 14000 documents, evaluating every ~1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-23 15:24:28,074 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2021-01-23 15:24:28,074 : INFO : training LDA model using 7 processes + 2021-01-23 15:24:28,150 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2021-01-23 15:24:36,133 : INFO : topic #3 (0.111): 0.001*"minister" + 0.001*"actor" + 0.001*"china" + 0.001*"band" + 0.001*"km" + 0.001*"blue" + 0.001*"album" + 0.001*"economy" + 0.001*"soviet" + 0.001*"bc" + 2021-01-23 15:24:36,133 : INFO : topic #1 (0.111): 0.001*"software" + 0.001*"album" + 0.001*"minister" + 0.001*"election" + 0.001*"emperor" + 0.001*"male" + 0.001*"league" + 0.001*"lord" + 0.001*"band" + 0.001*"jewish" + 2021-01-23 15:24:36,134 : INFO : topic #2 (0.111): 0.001*"soviet" + 0.001*"china" + 0.001*"league" + 0.001*"actor" + 0.001*"km" + 0.001*"prime" + 0.001*"band" + 0.001*"energy" + 0.001*"economy" + 0.001*"season" + 2021-01-23 15:24:36,134 : INFO : topic #4 (0.111): 0.001*"election" + 0.001*"league" + 0.001*"band" + 0.001*"km" + 0.001*"cell" + 0.001*"soviet" + 0.001*"philosophy" + 0.001*"lake" + 0.001*"software" + 0.001*"economy" + 2021-01-23 15:24:36,135 : INFO : topic #8 (0.111): 0.001*"soviet" + 0.001*"jewish" + 0.001*"energy" + 0.001*"jew" + 0.001*"minister" + 0.001*"band" + 0.001*"russian" + 0.001*"album" + 0.001*"emperor" + 0.001*"lord" + 2021-01-23 15:24:36,135 : INFO : topic diff=1.009252, rho=1.000000 + 2021-01-23 15:24:55,469 : INFO : -9.258 per-word bound, 612.2 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-23 15:24:55,469 : INFO : PROGRESS: pass 1, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2021-01-23 15:25:02,694 : INFO : topic #4 (0.111): 0.001*"election" + 0.001*"lake" + 0.001*"apple" + 0.001*"philosophy" + 0.001*"software" + 0.001*"league" + 0.001*"km" + 0.001*"user" + 0.001*"band" + 0.001*"window" + 2021-01-23 15:25:02,695 : INFO : 
topic #1 (0.111): 0.002*"album" + 0.001*"software" + 0.001*"lord" + 0.001*"emperor" + 0.001*"band" + 0.001*"league" + 0.001*"minister" + 0.001*"saint" + 0.001*"novel" + 0.001*"season" + 2021-01-23 15:25:02,695 : INFO : topic #2 (0.111): 0.002*"soviet" + 0.002*"league" + 0.001*"china" + 0.001*"season" + 0.001*"actor" + 0.001*"km" + 0.001*"ball" + 0.001*"prime" + 0.001*"chinese" + 0.001*"economy" + 2021-01-23 15:25:02,696 : INFO : topic #6 (0.111): 0.001*"israel" + 0.001*"km" + 0.001*"cell" + 0.001*"user" + 0.001*"minister" + 0.001*"video" + 0.001*"jewish" + 0.001*"machine" + 0.001*"import" + 0.001*"internet" + 2021-01-23 15:25:02,697 : INFO : topic #0 (0.111): 0.001*"energy" + 0.001*"actor" + 0.001*"soviet" + 0.001*"blue" + 0.001*"tree" + 0.001*"ball" + 0.001*"russian" + 0.001*"software" + 0.001*"fiction" + 0.001*"file" + 2021-01-23 15:25:02,697 : INFO : topic diff=0.143090, rho=0.592297 + 2021-01-23 15:25:21,875 : INFO : -9.195 per-word bound, 586.3 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-23 15:25:21,876 : INFO : PROGRESS: pass 2, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2021-01-23 15:25:29,503 : INFO : topic #7 (0.111): 0.001*"economy" + 0.001*"africa" + 0.001*"bc" + 0.001*"diamond" + 0.001*"horse" + 0.001*"irish" + 0.001*"county" + 0.001*"election" + 0.001*"file" + 0.001*"car" + 2021-01-23 15:25:29,503 : INFO : topic #5 (0.111): 0.003*"km" + 0.002*"est" + 0.002*"actor" + 0.002*"election" + 0.002*"minister" + 0.002*"india" + 0.001*"italian" + 0.001*"male" + 0.001*"female" + 0.001*"constitution" + 2021-01-23 15:25:29,504 : INFO : topic #2 (0.111): 0.002*"league" + 0.002*"season" + 0.002*"soviet" + 0.002*"china" + 0.001*"ball" + 0.001*"chinese" + 0.001*"football" + 0.001*"baseball" + 0.001*"canadian" + 0.001*"nfl" + 2021-01-23 15:25:29,505 : INFO : topic #0 (0.111): 0.001*"energy" + 0.001*"ball" + 0.001*"tree" + 0.001*"engine" + 0.001*"soviet" + 0.001*"blue" + 0.001*"actor" + 0.001*"fiction" + 0.001*"file" + 0.001*"russian" + 2021-01-23 15:25:29,505 : INFO : topic #8 (0.111): 0.002*"jewish" + 0.002*"jew" + 0.002*"emperor" + 0.002*"soviet" + 0.001*"orthodox" + 0.001*"christ" + 0.001*"jesus" + 0.001*"russian" + 0.001*"cell" + 0.001*"holy" + 2021-01-23 15:25:29,506 : INFO : topic diff=0.176887, rho=0.509614 + 2021-01-23 15:25:48,495 : INFO : -9.132 per-word bound, 561.0 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-23 15:25:48,495 : INFO : PROGRESS: pass 3, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2021-01-23 15:25:56,501 : INFO : topic #2 (0.111): 0.003*"league" + 0.003*"season" + 0.002*"soviet" + 0.002*"ball" + 0.002*"football" + 0.002*"chinese" + 0.002*"baseball" + 0.002*"china" + 0.002*"nfl" + 0.001*"finland" + 2021-01-23 15:25:56,502 : INFO : topic #3 (0.111): 0.002*"card" + 0.002*"aircraft" + 0.002*"blue" + 0.001*"band" + 0.001*"comic" + 0.001*"kong" + 0.001*"album" + 0.001*"bridge" + 0.001*"hong" + 0.001*"love" + 2021-01-23 15:25:56,503 : INFO : topic #6 (0.111): 0.002*"cell" + 0.001*"user" + 0.001*"import" + 0.001*"gun" + 0.001*"disease" + 0.001*"specie" + 0.001*"video" + 0.001*"device" + 0.001*"animal" + 0.001*"weapon" + 2021-01-23 15:25:56,503 : INFO : topic #7 (0.111): 0.001*"irish" + 0.001*"horse" + 0.001*"bc" + 0.001*"diamond" + 0.001*"economy" + 0.001*"liberal" + 0.001*"car" + 0.001*"africa" + 0.001*"county" + 0.001*"file" + 2021-01-23 15:25:56,504 : INFO : topic #0 (0.111): 0.002*"energy" + 0.001*"engine" + 
0.001*"tree" + 0.001*"ball" + 0.001*"acid" + 0.001*"soviet" + 0.001*"file" + 0.001*"fiction" + 0.001*"blue" + 0.001*"classical" + 2021-01-23 15:25:56,505 : INFO : topic diff=0.154924, rho=0.454053 + 2021-01-23 15:26:15,743 : INFO : -9.092 per-word bound, 545.7 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-23 15:26:15,821 : INFO : using symmetric alpha at 0.09090909090909091 + 2021-01-23 15:26:15,822 : INFO : using symmetric eta at 0.09090909090909091 + 2021-01-23 15:26:15,824 : INFO : using serial LDA version on this node + 2021-01-23 15:26:15,855 : INFO : running online (multi-pass) LDA training, 11 topics, 2 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-23 15:26:15,855 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2021-01-23 15:26:35,321 : INFO : -10.499 per-word bound, 1447.2 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-23 15:26:35,321 : INFO : PROGRESS: pass 0, at document #1701/1701 + 2021-01-23 15:26:40,838 : INFO : topic #4 (0.091): 0.001*"software" + 0.001*"election" + 0.001*"jewish" + 0.001*"actor" + 0.001*"minister" + 0.001*"territory" + 0.001*"league" + 0.001*"channel" + 0.001*"user" + 0.001*"japan" + 2021-01-23 15:26:40,839 : INFO : topic #3 (0.091): 0.001*"soviet" + 0.001*"cell" + 0.001*"energy" + 0.001*"software" + 0.001*"km" + 0.001*"economy" + 0.001*"minister" + 0.001*"bc" + 0.001*"china" + 0.001*"league" + 2021-01-23 15:26:40,839 : INFO : topic #7 (0.091): 0.001*"album" + 0.001*"irish" + 0.001*"ball" + 0.001*"y" + 0.001*"minister" + 0.001*"actor" + 0.001*"chinese" + 0.001*"election" + 0.001*"band" + 0.001*"la" + 2021-01-23 15:26:40,840 : INFO : topic #8 (0.091): 0.001*"soviet" + 0.001*"album" + 0.001*"league" + 0.001*"minister" + 0.001*"actor" + 0.001*"emperor" + 0.001*"software" + 0.001*"male" + 0.001*"energy" + 0.001*"japanese" + 2021-01-23 15:26:40,841 : INFO : topic #1 (0.091): 0.001*"band" + 0.001*"km" + 0.001*"cell" + 0.001*"actor" + 0.001*"economy" + 0.001*"lord" + 0.001*"energy" + 0.001*"car" + 0.001*"county" + 0.001*"software" + 2021-01-23 15:26:40,841 : INFO : topic diff=1.022883, rho=1.000000 + 2021-01-23 15:27:00,369 : INFO : -9.279 per-word bound, 621.1 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-23 15:27:00,369 : INFO : PROGRESS: pass 1, at document #1701/1701 + 2021-01-23 15:27:05,770 : INFO : topic #2 (0.091): 0.002*"km" + 0.002*"female" + 0.002*"est" + 0.001*"soviet" + 0.001*"election" + 0.001*"male" + 0.001*"minister" + 0.001*"economy" + 0.001*"russian" + 0.001*"russia" + 2021-01-23 15:27:05,771 : INFO : topic #8 (0.091): 0.001*"league" + 0.001*"album" + 0.001*"color" + 0.001*"energy" + 0.001*"love" + 0.001*"actor" + 0.001*"soviet" + 0.001*"frac" + 0.001*"lake" + 0.001*"blue" + 2021-01-23 15:27:05,771 : INFO : topic #6 (0.091): 0.002*"actor" + 0.001*"software" + 0.001*"user" + 0.001*"blue" + 0.001*"emperor" + 0.001*"album" + 0.001*"band" + 0.001*"election" + 0.001*"soviet" + 0.001*"minister" + 2021-01-23 15:27:05,772 : INFO : topic #3 (0.091): 0.001*"soviet" + 0.001*"cell" + 0.001*"novel" + 0.001*"album" + 0.001*"energy" + 0.001*"bc" + 0.001*"philosophy" + 0.001*"knowledge" + 0.001*"band" + 0.001*"japanese" + 2021-01-23 15:27:05,772 : INFO : topic #10 (0.091): 
0.002*"economy" + 0.002*"km" + 0.001*"minister" + 0.001*"est" + 0.001*"election" + 0.001*"import" + 0.001*"russian" + 0.001*"prime" + 0.001*"soviet" + 0.001*"chinese" + 2021-01-23 15:27:05,773 : INFO : topic diff=0.163523, rho=0.577350 + 2021-01-23 15:27:05,774 : INFO : ensemble contains 9 models and 160 topics now + 2021-01-23 15:27:05,782 : INFO : ensemble contains 10 models and 169 topics now + 2021-01-23 15:27:05,789 : INFO : asymmetric distance matrix is outdated due to add_model + 2021-01-23 15:27:05,789 : INFO : generating a 180 x 180 asymmetric distance matrix... + 2021-01-23 15:27:07,632 : INFO : fitting the clustering model, using 5 for min_samples + 2021-01-23 15:27:07,670 : INFO : generating stable topics, using 3 for min_cores + 2021-01-23 15:27:07,670 : INFO : found 23 clusters + 2021-01-23 15:27:07,676 : INFO : found 1 stable topics + 2021-01-23 15:27:07,676 : INFO : generating classic gensim model representation based on results from the ensemble + 2021-01-23 15:27:07,677 : INFO : using symmetric alpha at 1.0 + 2021-01-23 15:27:07,677 : INFO : using symmetric eta at 1.0 + 2021-01-23 15:27:07,680 : INFO : using serial LDA version on this node + 2021-01-23 15:27:07,682 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-23 15:27:07,682 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy 180 1 @@ -424,9 +424,9 @@ Afterwards the number and quality of stable topics might be different depending .. rst-class:: sphx-glr-timing - **Total running time of the script:** ( 12 minutes 22.200 seconds) + **Total running time of the script:** ( 11 minutes 53.863 seconds) -**Estimated memory usage:** 1637 MB +**Estimated memory usage:** 1625 MB .. 
.. _sphx_glr_download_auto_examples_tutorials_run_ensemblelda.py:

diff --git a/docs/src/auto_examples/tutorials/sg_execution_times.rst b/docs/src/auto_examples/tutorials/sg_execution_times.rst
index 9ac6299492..56d9c96eb0 100644
--- a/docs/src/auto_examples/tutorials/sg_execution_times.rst
+++ b/docs/src/auto_examples/tutorials/sg_execution_times.rst
@@ -5,10 +5,10 @@
 Computation times
 =================
-**12:22.200** total execution time for **auto_examples_tutorials** files:
+**11:53.863** total execution time for **auto_examples_tutorials** files:

 +-------------------------------------------------------------------------------------+-----------+-----------+
-| :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` (``run_ensemblelda.py``) | 12:22.200 | 1637.2 MB |
+| :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` (``run_ensemblelda.py``) | 11:53.863 | 1625.0 MB |
 +-------------------------------------------------------------------------------------+-----------+-----------+
 | :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` (``run_annoy.py``)             | 00:00.000 | 0.0 MB    |
 +-------------------------------------------------------------------------------------+-----------+-----------+

From f8634ef1284e9e341a1198578e5d043ce918accf Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sun, 24 Jan 2021 11:23:58 +0100
Subject: [PATCH 126/166] merge

---
 docs/src/auto_examples/howtos/run_doc.ipynb   |  10 +-
 docs/src/auto_examples/howtos/run_doc.py      |   2 +-
 docs/src/auto_examples/howtos/run_doc.py.md5  |   2 +-
 docs/src/auto_examples/howtos/run_doc.rst     |  32 ++-
 .../howtos/sg_execution_times.rst             |  20 +-
 docs/src/auto_examples/index.rst              |  40 ++--
 .../tutorials/run_ensemblelda.ipynb           |   2 +-
 .../tutorials/run_ensemblelda.rst             | 210 +++++++++---------
 .../tutorials/sg_execution_times.rst          |   6 +-
 9 files changed, 173 insertions(+), 151 deletions(-)

diff --git a/docs/src/auto_examples/howtos/run_doc.ipynb b/docs/src/auto_examples/howtos/run_doc.ipynb
index 48820b593a..7ce62d39c5 100644
--- a/docs/src/auto_examples/howtos/run_doc.ipynb
+++ b/docs/src/auto_examples/howtos/run_doc.ipynb
@@ -15,14 +15,14 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "\nHow to Author Gensim Documentation\n==================================\n\nHow to author documentation for Gensim.\n\n"
+    "\n# How to Author Gensim Documentation\n\nHow to author documentation for Gensim.\n"
   ]
  },
 {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Background\n----------\n\nGensim is a large project with a wide range of functionality.\nUnfortunately, not all of this functionality is documented **well**, and some of it is not documented at all.\nWithout good documentation, users are unable to unlock Gensim's full potential.\nTherefore, authoring new documentation and improving existing documentation is of great value to the Gensim project.\n\nIf you implement new functionality in Gensim, please include **helpful** documentation.\nBy \"helpful\", we mean that your documentation answers questions that Gensim users may have.\nFor example:\n\n- What is this new functionality?\n- **Why** is it important?\n- **How** is it relevant to Gensim?\n- **What** can I do with it? What are some real-world applications?\n- **How** do I use it to achieve those things?\n- ...
and others (if you can think of them, please add them here)\n\nBefore you author documentation, I suggest reading\n`\"What nobody tells you about documentation\" `__\nor watching its `accompanying video `__\n(or even both, if you're really keen).\n\nThe summary of the above presentation is: there are four distinct kinds of documentation, and you really need them all:\n\n1. Tutorials\n2. Howto guides\n3. Explanations\n4. References\n\nEach kind has its own intended audience, purpose, and writing style.\nWhen you make a PR with new functionality, please consider authoring each kind of documentation.\nAt the very least, you will (indirectly) author reference documentation through module, class and function docstrings.\n\nMechanisms\n----------\n\nWe keep our documentation as individual Python scripts.\nThese scripts live under :file:`docs/src/gallery` in one of several subdirectories:\n\n- core: core tutorials. We try to keep this part small, avoid putting stuff here.\n- tutorials: tutorials.\n- howtos: howto guides.\n\nPick a subdirectory and save your script under it.\nPrefix the name of the script with ``run_``: this way, the the documentation builder will run your script each time it builds our docs.\n\nThe contents of the script are straightforward.\nAt the very top, you need a docstring describing what your script does.\n\n" + "## Background\n\nGensim is a large project with a wide range of functionality.\nUnfortunately, not all of this functionality is documented **well**, and some of it is not documented at all.\nWithout good documentation, users are unable to unlock Gensim's full potential.\nTherefore, authoring new documentation and improving existing documentation is of great value to the Gensim project.\n\nIf you implement new functionality in Gensim, please include **helpful** documentation.\nBy \"helpful\", we mean that your documentation answers questions that Gensim users may have.\nFor example:\n\n- What is this new functionality?\n- **Why** is it important?\n- **How** is it relevant to Gensim?\n- **What** can I do with it? What are some real-world applications?\n- **How** do I use it to achieve those things?\n- ... and others (if you can think of them, please add them here)\n\nBefore you author documentation, I suggest reading\n`\"What nobody tells you about documentation\" `__\nor watching its `accompanying video `__\n(or even both, if you're really keen).\n\nThe summary of the above presentation is: there are four distinct kinds of documentation, and you really need them all:\n\n1. Tutorials\n2. Howto guides\n3. Explanations\n4. References\n\nEach kind has its own intended audience, purpose, and writing style.\nWhen you make a PR with new functionality, please consider authoring each kind of documentation.\nAt the very least, you will (indirectly) author reference documentation through module, class and function docstrings.\n\n## Mechanisms\n\nWe keep our documentation as individual Python scripts.\nThese scripts live under :file:`docs/src/gallery` in one of several subdirectories:\n\n- core: core tutorials. 
We try to keep this part small, avoid putting stuff here.\n- tutorials: tutorials.\n- howtos: howto guides.\n\nPick a subdirectory and save your script under it.\nPrefix the name of the script with ``run_``: this way, the documentation builder will run your script each time it builds our docs.\n\nThe contents of the script are straightforward.\nAt the very top, you need a docstring describing what your script does.\n\n"
   ]
  },
 {
@@ -54,14 +54,14 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Authoring Workflow\n------------------\n\nThere are several ways to author documentation.\nThe simplest and most straightforward is to author your ``script.py`` from scratch.\nYou'll have the following cycle:\n\n1. Make changes\n2. Run ``python script.py``\n3. Check standard output, standard error and return code\n4. If everything works well, stop.\n5.
Otherwise, go back to step 1).\n\nIf the above is not your cup of tea, you can also author your documentation as a Jupyter notebook.\nThis is a more flexible approach that enables you to tweak parts of the documentation and re-run them as necessary.\n\nOnce you're happy with the notebook, convert it to a script.py.\nThere's a helpful `script `__ that will do it for you.\nTo use it::\n\n python to_python.py < notebook.ipynb > script.py\n\nYou may have to touch up the resulting ``script.py``.\nMore specifically:\n\n- Update the title\n- Update the description\n- Fix any issues that the markdown-to-RST converter could not deal with\n\nOnce your script.py works, put it in a suitable subdirectory.\nPlease don't include your original Jupyter notebook in the repository - we won't be using it.\n\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Correctness\n-----------\n\nIncorrect documentation can be worse than no documentation at all.\nTake the following steps to ensure correctness:\n\n- Run Python's doctest module on your docstrings\n- Run your documentation scripts from scratch, removing any temporary files/results\n\nUsing data in your documentation\n--------------------------------\n\nSome parts of the documentation require real-world data to be useful.\nFor example, you may need more than just a toy example to demonstrate the benefits of one model over another.\nThis subsection provides some tips for including data in your documentation.\n\nIf possible, use data available via Gensim's\n`downloader API `__.\nThis will reduce the risk of your documentation becoming obsolete because required data is no longer available.\n\nUse the smallest possible dataset: avoid making people unnecessarily load large datasets and models.\nThis will make your documentation faster to run and easier for people to use (they can modify your examples and re-run them quickly).\n\nFinalizing your contribution\n----------------------------\n\nFirst, get Sphinx Gallery to build your documentation::\n\n make -C docs/src html\n\nThis can take a while if your documentation uses a large dataset, or if you've changed many other tutorials or guides.\nOnce this completes successfully, open ``docs/auto_examples/index.html`` in your browser.\nYou should see your new tutorial or guide in the gallery.\n\nOnce your documentation script is working correctly, it's time to add it to the git repository::\n\n git add docs/src/gallery/tutorials/run_example.py\n git add docs/src/auto_examples/tutorials/run_example.{py,py.md5,rst,ipynb}\n git add docs/src/auto_examples/howtos/sg_execution_times.rst\n git commit -m \"enter a helpful commit message here\"\n git push origin branchname\n\n.. 
Note::\n You may be wondering what all those other files are.\n Sphinx Gallery puts a copy of your Python script in ``auto_examples/tutorials``.\n The .md5 contains MD5 hash of the script to enable easy detection of modifications.\n Gallery also generates .rst (RST for Sphinx) and .ipynb (Jupyter notebook) files from the script.\n Finally, ``sg_execution_times.rst`` contains the time taken to run each example.\n\nFinally, make a PR on `github `__.\nOne of our friendly maintainers will review it, make suggestions, and eventually merge it.\nYour documentation will then appear in the gallery alongside the rest of the example.\nAt that stage, give yourself a pat on the back: you're done!\n\n" + "## Correctness\n\nIncorrect documentation can be worse than no documentation at all.\nTake the following steps to ensure correctness:\n\n- Run Python's doctest module on your docstrings\n- Run your documentation scripts from scratch, removing any temporary files/results\n\n## Using data in your documentation\n\nSome parts of the documentation require real-world data to be useful.\nFor example, you may need more than just a toy example to demonstrate the benefits of one model over another.\nThis subsection provides some tips for including data in your documentation.\n\nIf possible, use data available via Gensim's\n`downloader API `__.\nThis will reduce the risk of your documentation becoming obsolete because required data is no longer available.\n\nUse the smallest possible dataset: avoid making people unnecessarily load large datasets and models.\nThis will make your documentation faster to run and easier for people to use (they can modify your examples and re-run them quickly).\n\n## Finalizing your contribution\n\nFirst, get Sphinx Gallery to build your documentation::\n\n make -C docs/src html\n\nThis can take a while if your documentation uses a large dataset, or if you've changed many other tutorials or guides.\nOnce this completes successfully, open ``docs/auto_examples/index.html`` in your browser.\nYou should see your new tutorial or guide in the gallery.\n\nOnce your documentation script is working correctly, it's time to add it to the git repository::\n\n git add docs/src/gallery/tutorials/run_example.py\n git add docs/src/auto_examples/tutorials/run_example.{py,py.md5,rst,ipynb}\n git add docs/src/auto_examples/howtos/sg_execution_times.rst\n git commit -m \"enter a helpful commit message here\"\n git push origin branchname\n\n.. 
Note::\n You may be wondering what all those other files are.\n Sphinx Gallery puts a copy of your Python script in ``auto_examples/tutorials``.\n The .md5 contains MD5 hash of the script to enable easy detection of modifications.\n Gallery also generates .rst (RST for Sphinx) and .ipynb (Jupyter notebook) files from the script.\n Finally, ``sg_execution_times.rst`` contains the time taken to run each example.\n\nFinally, make a PR on `github `__.\nOne of our friendly maintainers will review it, make suggestions, and eventually merge it.\nYour documentation will then appear in the gallery alongside the rest of the example.\nAt that stage, give yourself a pat on the back: you're done!\n\n" ] } ], @@ -81,7 +81,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/src/auto_examples/howtos/run_doc.py b/docs/src/auto_examples/howtos/run_doc.py index 7e07ab74bd..15e870f1be 100644 --- a/docs/src/auto_examples/howtos/run_doc.py +++ b/docs/src/auto_examples/howtos/run_doc.py @@ -111,7 +111,7 @@ # This is a more flexible approach that enables you to tweak parts of the documentation and re-run them as necessary. # # Once you're happy with the notebook, convert it to a script.py. -# There's a helpful `script `__ that will do it for you. +# There's a helpful `script `__ that will do it for you. # To use it:: # # python to_python.py < notebook.ipynb > script.py diff --git a/docs/src/auto_examples/howtos/run_doc.py.md5 b/docs/src/auto_examples/howtos/run_doc.py.md5 index d81fe9d241..979aa0eb5e 100644 --- a/docs/src/auto_examples/howtos/run_doc.py.md5 +++ b/docs/src/auto_examples/howtos/run_doc.py.md5 @@ -1 +1 @@ -b3db0b66859316de13e1a36fa6181657 \ No newline at end of file +512a76ce743dd12482d21784a76b60fe \ No newline at end of file diff --git a/docs/src/auto_examples/howtos/run_doc.rst b/docs/src/auto_examples/howtos/run_doc.rst index 88d1904295..aa5257ed87 100644 --- a/docs/src/auto_examples/howtos/run_doc.rst +++ b/docs/src/auto_examples/howtos/run_doc.rst @@ -1,12 +1,21 @@ + +.. DO NOT EDIT. +.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. +.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: +.. "auto_examples/howtos/run_doc.py" +.. LINE NUMBERS ARE GIVEN BELOW. + .. only:: html .. note:: :class: sphx-glr-download-link-note - Click :ref:`here ` to download the full example code - .. rst-class:: sphx-glr-example-title + Click :ref:`here ` + to download the full example code - .. _sphx_glr_auto_examples_howtos_run_doc.py: +.. rst-class:: sphx-glr-example-title + +.. _sphx_glr_auto_examples_howtos_run_doc.py: How to Author Gensim Documentation @@ -14,6 +23,8 @@ How to Author Gensim Documentation How to author documentation for Gensim. +.. GENERATED FROM PYTHON SOURCE LINES 9-59 + Background ---------- @@ -65,6 +76,7 @@ Prefix the name of the script with ``run_``: this way, the the documentation bui The contents of the script are straightforward. At the very top, you need a docstring describing what your script does. +.. GENERATED FROM PYTHON SOURCE LINES 59-67 .. code-block:: default @@ -91,6 +103,8 @@ At the very top, you need a docstring describing what your script does. +.. GENERATED FROM PYTHON SOURCE LINES 68-75 + The title is what will show up in the gallery. Keep this short and descriptive. @@ -99,6 +113,8 @@ When people mouse-over the title, they will see the description. Keep this short too. +.. 
GENERATED FROM PYTHON SOURCE LINES 77-95 + The rest of the script is Python, formatted in a special way so that Sphinx Gallery can parse it. The most important properties of this format are: @@ -118,6 +134,8 @@ You should be able to run your script directly from the command line:: and it should run to completion without error, occasionally printing stuff to standard output. +.. GENERATED FROM PYTHON SOURCE LINES 97-128 + Authoring Workflow ------------------ @@ -135,7 +153,7 @@ If the above is not your cup of tea, you can also author your documentation as a This is a more flexible approach that enables you to tweak parts of the documentation and re-run them as necessary. Once you're happy with the notebook, convert it to a script.py. -There's a helpful `script `__ that will do it for you. +There's a helpful `script `__ that will do it for you. To use it:: python to_python.py < notebook.ipynb > script.py @@ -150,6 +168,8 @@ More specifically: Once your script.py works, put it in a suitable subdirectory. Please don't include your original Jupyter notebook in the repository - we won't be using it. +.. GENERATED FROM PYTHON SOURCE LINES 130-183 + Correctness ----------- @@ -207,9 +227,9 @@ At that stage, give yourself a pat on the back: you're done! .. rst-class:: sphx-glr-timing - **Total running time of the script:** ( 0 minutes 0.112 seconds) + **Total running time of the script:** ( 0 minutes 0.325 seconds) -**Estimated memory usage:** 6 MB +**Estimated memory usage:** 9 MB .. _sphx_glr_download_auto_examples_howtos_run_doc.py: diff --git a/docs/src/auto_examples/howtos/sg_execution_times.rst b/docs/src/auto_examples/howtos/sg_execution_times.rst index 628fe13c1f..03a66b6ece 100644 --- a/docs/src/auto_examples/howtos/sg_execution_times.rst +++ b/docs/src/auto_examples/howtos/sg_execution_times.rst @@ -5,14 +5,14 @@ Computation times ================= -**52:12.903** total execution time for **auto_examples_howtos** files: +**00:00.325** total execution time for **auto_examples_howtos** files: -+----------------------------------------------------------------------------------------+-----------+-----------+ -| :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py` (``run_doc2vec_imdb.py``) | 52:12.903 | 3494.0 MB | -+----------------------------------------------------------------------------------------+-----------+-----------+ -| :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py` (``run_compare_lda.py``) | 00:00.000 | 0.0 MB | -+----------------------------------------------------------------------------------------+-----------+-----------+ -| :ref:`sphx_glr_auto_examples_howtos_run_doc.py` (``run_doc.py``) | 00:00.000 | 0.0 MB | -+----------------------------------------------------------------------------------------+-----------+-----------+ -| :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py` (``run_downloader_api.py``) | 00:00.000 | 0.0 MB | -+----------------------------------------------------------------------------------------+-----------+-----------+ ++----------------------------------------------------------------------------------------+-----------+--------+ +| :ref:`sphx_glr_auto_examples_howtos_run_doc.py` (``run_doc.py``) | 00:00.325 | 9.4 MB | ++----------------------------------------------------------------------------------------+-----------+--------+ +| :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py` (``run_compare_lda.py``) | 00:00.000 | 0.0 MB | 
++----------------------------------------------------------------------------------------+-----------+--------+ +| :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py` (``run_doc2vec_imdb.py``) | 00:00.000 | 0.0 MB | ++----------------------------------------------------------------------------------------+-----------+--------+ +| :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py` (``run_downloader_api.py``) | 00:00.000 | 0.0 MB | ++----------------------------------------------------------------------------------------+-----------+--------+ diff --git a/docs/src/auto_examples/index.rst b/docs/src/auto_examples/index.rst index e46809100c..dfee557a6c 100644 --- a/docs/src/auto_examples/index.rst +++ b/docs/src/auto_examples/index.rst @@ -190,14 +190,14 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png - :alt: Fast Similarity Queries with Annoy and Word2Vec + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png + :alt: Ensemble LDA - :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` + :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` .. raw:: html @@ -207,18 +207,18 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_annoy + /auto_examples/tutorials/run_ensemblelda .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png - :alt: LDA Model + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png + :alt: Fast Similarity Queries with Annoy and Word2Vec - :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` + :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` .. raw:: html @@ -228,18 +228,18 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_lda + /auto_examples/tutorials/run_annoy .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png - :alt: Word Mover's Distance + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png + :alt: LDA Model - :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` + :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` .. raw:: html @@ -249,18 +249,18 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_wmd + /auto_examples/tutorials/run_lda .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png - :alt: Ensemble LDA + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png + :alt: Word Mover's Distance - :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` + :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` .. raw:: html @@ -270,7 +270,7 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_ensemblelda + /auto_examples/tutorials/run_wmd .. raw:: html
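The ``index.rst`` hunks above only reorder the gallery so that the new Ensemble LDA tutorial thumbnail appears first among the tutorials. For readers following the authoring how-to earlier in this patch, a minimal sketch of the kind of ``run_``-prefixed gallery script these index entries point at could look like the following; the description text and the logging setup are illustrative placeholders, not code taken from this patch::

    """
    Ensemble LDA
    ============

    Introduces Gensim's EnsembleLda model for extracting reliable topics.
    """

    # Sphinx Gallery reads the first docstring line as the gallery title and the
    # paragraph after the underline as the mouse-over description of the thumbnail.
    import logging

    # Print progress to standard output while the documentation builder runs the script.
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Everything below the docstring is ordinary Python that the documentation builder executes at build time, which is what produces the generated ``.rst``, ``.ipynb``, ``.py.md5`` and timing files seen in the surrounding diffs.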
diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb b/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb index d9573e5e22..b4f0ef8355 100644 --- a/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.ipynb @@ -197,7 +197,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.rst b/docs/src/auto_examples/tutorials/run_ensemblelda.rst index 9b959482d9..d55714a177 100644 --- a/docs/src/auto_examples/tutorials/run_ensemblelda.rst +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.rst @@ -82,11 +82,11 @@ so it won't be explained again in detail. .. code-block:: none - 2021-01-23 15:15:53,986 : INFO : adding document #0 to Dictionary(0 unique tokens: []) - 2021-01-23 15:16:00,703 : INFO : built Dictionary(238542 unique tokens: ['a', 'abacus', 'ability', 'able', 'abnormal']...) from 1701 documents (total 17005207 corpus positions) - 2021-01-23 15:16:00,901 : INFO : discarding 218466 tokens: [('a', 1701), ('ability', 934), ('able', 1202), ('about', 1687), ('above', 1327), ('abstention', 13), ('accepted', 945), ('according', 1468), ('account', 1113), ('act', 1312)]... - 2021-01-23 15:16:00,901 : INFO : keeping 20076 tokens which were in no less than 20 and no more than 850 (=50.0%) documents - 2021-01-23 15:16:00,984 : INFO : resulting dictionary: Dictionary(20076 unique tokens: ['abacus', 'abnormal', 'abolished', 'abolition', 'absence']...) + 2021-01-24 11:02:47,097 : INFO : adding document #0 to Dictionary(0 unique tokens: []) + 2021-01-24 11:02:52,618 : INFO : built Dictionary(238542 unique tokens: ['a', 'abacus', 'ability', 'able', 'abnormal']...) from 1701 documents (total 17005207 corpus positions) + 2021-01-24 11:02:52,798 : INFO : discarding 218466 tokens: [('a', 1701), ('ability', 934), ('able', 1202), ('about', 1687), ('above', 1327), ('abstention', 13), ('accepted', 945), ('according', 1468), ('account', 1113), ('act', 1312)]... + 2021-01-24 11:02:52,798 : INFO : keeping 20076 tokens which were in no less than 20 and no more than 850 (=50.0%) documents + 2021-01-24 11:02:52,873 : INFO : resulting dictionary: Dictionary(20076 unique tokens: ['abacus', 'abnormal', 'abolished', 'abolition', 'absence']...) @@ -215,20 +215,20 @@ The number of stable topics which are clustered from all those topics is smaller .. code-block:: none - 2021-01-23 15:16:04,680 : INFO : generating 8 topic models... - 2021-01-23 15:24:25,880 : INFO : generating a 160 x 160 asymmetric distance matrix... 
- 2021-01-23 15:24:27,354 : INFO : fitting the clustering model, using 4 for min_samples - 2021-01-23 15:24:27,387 : INFO : generating stable topics, using 3 for min_cores - 2021-01-23 15:24:27,387 : INFO : found 25 clusters - 2021-01-23 15:24:27,391 : INFO : found 1 stable topics - 2021-01-23 15:24:27,391 : INFO : generating classic gensim model representation based on results from the ensemble - 2021-01-23 15:24:27,592 : INFO : using symmetric alpha at 1.0 - 2021-01-23 15:24:27,592 : INFO : using symmetric eta at 1.0 - 2021-01-23 15:24:27,594 : INFO : using serial LDA version on this node - 2021-01-23 15:24:27,596 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2021-01-23 15:24:27,596 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2021-01-24 11:02:56,264 : INFO : generating 8 topic models... + 2021-01-24 11:07:21,883 : INFO : generating a 160 x 160 asymmetric distance matrix... + 2021-01-24 11:07:22,745 : INFO : fitting the clustering model, using 4 for min_samples + 2021-01-24 11:07:22,773 : INFO : generating stable topics, using 3 for min_cores + 2021-01-24 11:07:22,773 : INFO : found 30 clusters + 2021-01-24 11:07:22,776 : INFO : found 2 stable topics + 2021-01-24 11:07:22,776 : INFO : generating classic gensim model representation based on results from the ensemble + 2021-01-24 11:07:22,923 : INFO : using symmetric alpha at 0.5 + 2021-01-24 11:07:22,923 : INFO : using symmetric eta at 0.5 + 2021-01-24 11:07:22,925 : INFO : using serial LDA version on this node + 2021-01-24 11:07:22,927 : INFO : running online (multi-pass) LDA training, 2 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-24 11:07:22,927 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy 160 - 1 + 2 @@ -273,18 +273,18 @@ Make sure to chose one that is within the range of values in ``asymmetric_distan .. 
code-block:: none - 0.0664596363067147 0.12630570073794478 0.2773686345664633 - 2021-01-23 15:24:27,763 : INFO : fitting the clustering model - 2021-01-23 15:24:27,777 : INFO : generating stable topics - 2021-01-23 15:24:27,777 : INFO : found 22 clusters - 2021-01-23 15:24:27,782 : INFO : found 3 stable topics - 2021-01-23 15:24:27,782 : INFO : generating classic gensim model representation based on results from the ensemble - 2021-01-23 15:24:27,784 : INFO : using symmetric alpha at 0.3333333333333333 - 2021-01-23 15:24:27,784 : INFO : using symmetric eta at 0.3333333333333333 - 2021-01-23 15:24:27,785 : INFO : using serial LDA version on this node - 2021-01-23 15:24:27,790 : INFO : running online (multi-pass) LDA training, 3 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2021-01-23 15:24:27,790 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy - 3 + 0.06984649987340175 0.12851575757379297 0.31934422319450084 + 2021-01-24 11:07:23,142 : INFO : fitting the clustering model + 2021-01-24 11:07:23,154 : INFO : generating stable topics + 2021-01-24 11:07:23,154 : INFO : found 22 clusters + 2021-01-24 11:07:23,157 : INFO : found 2 stable topics + 2021-01-24 11:07:23,157 : INFO : generating classic gensim model representation based on results from the ensemble + 2021-01-24 11:07:23,159 : INFO : using symmetric alpha at 0.5 + 2021-01-24 11:07:23,159 : INFO : using symmetric eta at 0.5 + 2021-01-24 11:07:23,161 : INFO : using serial LDA version on this node + 2021-01-24 11:07:23,163 : INFO : running online (multi-pass) LDA training, 2 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-24 11:07:23,163 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2 @@ -342,79 +342,79 @@ Afterwards the number and quality of stable topics might be different depending .. 
code-block:: none - 2021-01-23 15:24:28,057 : INFO : using symmetric alpha at 0.1111111111111111 - 2021-01-23 15:24:28,058 : INFO : using symmetric eta at 0.1111111111111111 - 2021-01-23 15:24:28,059 : INFO : using serial LDA version on this node - 2021-01-23 15:24:28,074 : INFO : running online LDA training, 9 topics, 4 passes over the supplied corpus of 1701 documents, updating every 14000 documents, evaluating every ~1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2021-01-23 15:24:28,074 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy - 2021-01-23 15:24:28,074 : INFO : training LDA model using 7 processes - 2021-01-23 15:24:28,150 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2021-01-23 15:24:36,133 : INFO : topic #3 (0.111): 0.001*"minister" + 0.001*"actor" + 0.001*"china" + 0.001*"band" + 0.001*"km" + 0.001*"blue" + 0.001*"album" + 0.001*"economy" + 0.001*"soviet" + 0.001*"bc" - 2021-01-23 15:24:36,133 : INFO : topic #1 (0.111): 0.001*"software" + 0.001*"album" + 0.001*"minister" + 0.001*"election" + 0.001*"emperor" + 0.001*"male" + 0.001*"league" + 0.001*"lord" + 0.001*"band" + 0.001*"jewish" - 2021-01-23 15:24:36,134 : INFO : topic #2 (0.111): 0.001*"soviet" + 0.001*"china" + 0.001*"league" + 0.001*"actor" + 0.001*"km" + 0.001*"prime" + 0.001*"band" + 0.001*"energy" + 0.001*"economy" + 0.001*"season" - 2021-01-23 15:24:36,134 : INFO : topic #4 (0.111): 0.001*"election" + 0.001*"league" + 0.001*"band" + 0.001*"km" + 0.001*"cell" + 0.001*"soviet" + 0.001*"philosophy" + 0.001*"lake" + 0.001*"software" + 0.001*"economy" - 2021-01-23 15:24:36,135 : INFO : topic #8 (0.111): 0.001*"soviet" + 0.001*"jewish" + 0.001*"energy" + 0.001*"jew" + 0.001*"minister" + 0.001*"band" + 0.001*"russian" + 0.001*"album" + 0.001*"emperor" + 0.001*"lord" - 2021-01-23 15:24:36,135 : INFO : topic diff=1.009252, rho=1.000000 - 2021-01-23 15:24:55,469 : INFO : -9.258 per-word bound, 612.2 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2021-01-23 15:24:55,469 : INFO : PROGRESS: pass 1, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2021-01-23 15:25:02,694 : INFO : topic #4 (0.111): 0.001*"election" + 0.001*"lake" + 0.001*"apple" + 0.001*"philosophy" + 0.001*"software" + 0.001*"league" + 0.001*"km" + 0.001*"user" + 0.001*"band" + 0.001*"window" - 2021-01-23 15:25:02,695 : INFO : topic #1 (0.111): 0.002*"album" + 0.001*"software" + 0.001*"lord" + 0.001*"emperor" + 0.001*"band" + 0.001*"league" + 0.001*"minister" + 0.001*"saint" + 0.001*"novel" + 0.001*"season" - 2021-01-23 15:25:02,695 : INFO : topic #2 (0.111): 0.002*"soviet" + 0.002*"league" + 0.001*"china" + 0.001*"season" + 0.001*"actor" + 0.001*"km" + 0.001*"ball" + 0.001*"prime" + 0.001*"chinese" + 0.001*"economy" - 2021-01-23 15:25:02,696 : INFO : topic #6 (0.111): 0.001*"israel" + 0.001*"km" + 0.001*"cell" + 0.001*"user" + 0.001*"minister" + 0.001*"video" + 0.001*"jewish" + 0.001*"machine" + 0.001*"import" + 0.001*"internet" - 2021-01-23 15:25:02,697 : INFO : topic #0 (0.111): 0.001*"energy" + 0.001*"actor" + 0.001*"soviet" + 0.001*"blue" + 0.001*"tree" + 0.001*"ball" + 0.001*"russian" + 0.001*"software" + 0.001*"fiction" + 0.001*"file" - 2021-01-23 15:25:02,697 : INFO : topic diff=0.143090, rho=0.592297 - 2021-01-23 15:25:21,875 : INFO : -9.195 per-word bound, 586.3 perplexity estimate based on a held-out corpus of 1701 
documents with 4692704 words - 2021-01-23 15:25:21,876 : INFO : PROGRESS: pass 2, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2021-01-23 15:25:29,503 : INFO : topic #7 (0.111): 0.001*"economy" + 0.001*"africa" + 0.001*"bc" + 0.001*"diamond" + 0.001*"horse" + 0.001*"irish" + 0.001*"county" + 0.001*"election" + 0.001*"file" + 0.001*"car" - 2021-01-23 15:25:29,503 : INFO : topic #5 (0.111): 0.003*"km" + 0.002*"est" + 0.002*"actor" + 0.002*"election" + 0.002*"minister" + 0.002*"india" + 0.001*"italian" + 0.001*"male" + 0.001*"female" + 0.001*"constitution" - 2021-01-23 15:25:29,504 : INFO : topic #2 (0.111): 0.002*"league" + 0.002*"season" + 0.002*"soviet" + 0.002*"china" + 0.001*"ball" + 0.001*"chinese" + 0.001*"football" + 0.001*"baseball" + 0.001*"canadian" + 0.001*"nfl" - 2021-01-23 15:25:29,505 : INFO : topic #0 (0.111): 0.001*"energy" + 0.001*"ball" + 0.001*"tree" + 0.001*"engine" + 0.001*"soviet" + 0.001*"blue" + 0.001*"actor" + 0.001*"fiction" + 0.001*"file" + 0.001*"russian" - 2021-01-23 15:25:29,505 : INFO : topic #8 (0.111): 0.002*"jewish" + 0.002*"jew" + 0.002*"emperor" + 0.002*"soviet" + 0.001*"orthodox" + 0.001*"christ" + 0.001*"jesus" + 0.001*"russian" + 0.001*"cell" + 0.001*"holy" - 2021-01-23 15:25:29,506 : INFO : topic diff=0.176887, rho=0.509614 - 2021-01-23 15:25:48,495 : INFO : -9.132 per-word bound, 561.0 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2021-01-23 15:25:48,495 : INFO : PROGRESS: pass 3, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 - 2021-01-23 15:25:56,501 : INFO : topic #2 (0.111): 0.003*"league" + 0.003*"season" + 0.002*"soviet" + 0.002*"ball" + 0.002*"football" + 0.002*"chinese" + 0.002*"baseball" + 0.002*"china" + 0.002*"nfl" + 0.001*"finland" - 2021-01-23 15:25:56,502 : INFO : topic #3 (0.111): 0.002*"card" + 0.002*"aircraft" + 0.002*"blue" + 0.001*"band" + 0.001*"comic" + 0.001*"kong" + 0.001*"album" + 0.001*"bridge" + 0.001*"hong" + 0.001*"love" - 2021-01-23 15:25:56,503 : INFO : topic #6 (0.111): 0.002*"cell" + 0.001*"user" + 0.001*"import" + 0.001*"gun" + 0.001*"disease" + 0.001*"specie" + 0.001*"video" + 0.001*"device" + 0.001*"animal" + 0.001*"weapon" - 2021-01-23 15:25:56,503 : INFO : topic #7 (0.111): 0.001*"irish" + 0.001*"horse" + 0.001*"bc" + 0.001*"diamond" + 0.001*"economy" + 0.001*"liberal" + 0.001*"car" + 0.001*"africa" + 0.001*"county" + 0.001*"file" - 2021-01-23 15:25:56,504 : INFO : topic #0 (0.111): 0.002*"energy" + 0.001*"engine" + 0.001*"tree" + 0.001*"ball" + 0.001*"acid" + 0.001*"soviet" + 0.001*"file" + 0.001*"fiction" + 0.001*"blue" + 0.001*"classical" - 2021-01-23 15:25:56,505 : INFO : topic diff=0.154924, rho=0.454053 - 2021-01-23 15:26:15,743 : INFO : -9.092 per-word bound, 545.7 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2021-01-23 15:26:15,821 : INFO : using symmetric alpha at 0.09090909090909091 - 2021-01-23 15:26:15,822 : INFO : using symmetric eta at 0.09090909090909091 - 2021-01-23 15:26:15,824 : INFO : using serial LDA version on this node - 2021-01-23 15:26:15,855 : INFO : running online (multi-pass) LDA training, 11 topics, 2 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2021-01-23 15:26:15,855 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy - 
2021-01-23 15:26:35,321 : INFO : -10.499 per-word bound, 1447.2 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2021-01-23 15:26:35,321 : INFO : PROGRESS: pass 0, at document #1701/1701 - 2021-01-23 15:26:40,838 : INFO : topic #4 (0.091): 0.001*"software" + 0.001*"election" + 0.001*"jewish" + 0.001*"actor" + 0.001*"minister" + 0.001*"territory" + 0.001*"league" + 0.001*"channel" + 0.001*"user" + 0.001*"japan" - 2021-01-23 15:26:40,839 : INFO : topic #3 (0.091): 0.001*"soviet" + 0.001*"cell" + 0.001*"energy" + 0.001*"software" + 0.001*"km" + 0.001*"economy" + 0.001*"minister" + 0.001*"bc" + 0.001*"china" + 0.001*"league" - 2021-01-23 15:26:40,839 : INFO : topic #7 (0.091): 0.001*"album" + 0.001*"irish" + 0.001*"ball" + 0.001*"y" + 0.001*"minister" + 0.001*"actor" + 0.001*"chinese" + 0.001*"election" + 0.001*"band" + 0.001*"la" - 2021-01-23 15:26:40,840 : INFO : topic #8 (0.091): 0.001*"soviet" + 0.001*"album" + 0.001*"league" + 0.001*"minister" + 0.001*"actor" + 0.001*"emperor" + 0.001*"software" + 0.001*"male" + 0.001*"energy" + 0.001*"japanese" - 2021-01-23 15:26:40,841 : INFO : topic #1 (0.091): 0.001*"band" + 0.001*"km" + 0.001*"cell" + 0.001*"actor" + 0.001*"economy" + 0.001*"lord" + 0.001*"energy" + 0.001*"car" + 0.001*"county" + 0.001*"software" - 2021-01-23 15:26:40,841 : INFO : topic diff=1.022883, rho=1.000000 - 2021-01-23 15:27:00,369 : INFO : -9.279 per-word bound, 621.1 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words - 2021-01-23 15:27:00,369 : INFO : PROGRESS: pass 1, at document #1701/1701 - 2021-01-23 15:27:05,770 : INFO : topic #2 (0.091): 0.002*"km" + 0.002*"female" + 0.002*"est" + 0.001*"soviet" + 0.001*"election" + 0.001*"male" + 0.001*"minister" + 0.001*"economy" + 0.001*"russian" + 0.001*"russia" - 2021-01-23 15:27:05,771 : INFO : topic #8 (0.091): 0.001*"league" + 0.001*"album" + 0.001*"color" + 0.001*"energy" + 0.001*"love" + 0.001*"actor" + 0.001*"soviet" + 0.001*"frac" + 0.001*"lake" + 0.001*"blue" - 2021-01-23 15:27:05,771 : INFO : topic #6 (0.091): 0.002*"actor" + 0.001*"software" + 0.001*"user" + 0.001*"blue" + 0.001*"emperor" + 0.001*"album" + 0.001*"band" + 0.001*"election" + 0.001*"soviet" + 0.001*"minister" - 2021-01-23 15:27:05,772 : INFO : topic #3 (0.091): 0.001*"soviet" + 0.001*"cell" + 0.001*"novel" + 0.001*"album" + 0.001*"energy" + 0.001*"bc" + 0.001*"philosophy" + 0.001*"knowledge" + 0.001*"band" + 0.001*"japanese" - 2021-01-23 15:27:05,772 : INFO : topic #10 (0.091): 0.002*"economy" + 0.002*"km" + 0.001*"minister" + 0.001*"est" + 0.001*"election" + 0.001*"import" + 0.001*"russian" + 0.001*"prime" + 0.001*"soviet" + 0.001*"chinese" - 2021-01-23 15:27:05,773 : INFO : topic diff=0.163523, rho=0.577350 - 2021-01-23 15:27:05,774 : INFO : ensemble contains 9 models and 160 topics now - 2021-01-23 15:27:05,782 : INFO : ensemble contains 10 models and 169 topics now - 2021-01-23 15:27:05,789 : INFO : asymmetric distance matrix is outdated due to add_model - 2021-01-23 15:27:05,789 : INFO : generating a 180 x 180 asymmetric distance matrix... 
- 2021-01-23 15:27:07,632 : INFO : fitting the clustering model, using 5 for min_samples - 2021-01-23 15:27:07,670 : INFO : generating stable topics, using 3 for min_cores - 2021-01-23 15:27:07,670 : INFO : found 23 clusters - 2021-01-23 15:27:07,676 : INFO : found 1 stable topics - 2021-01-23 15:27:07,676 : INFO : generating classic gensim model representation based on results from the ensemble - 2021-01-23 15:27:07,677 : INFO : using symmetric alpha at 1.0 - 2021-01-23 15:27:07,677 : INFO : using symmetric eta at 1.0 - 2021-01-23 15:27:07,680 : INFO : using serial LDA version on this node - 2021-01-23 15:27:07,682 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 - 2021-01-23 15:27:07,682 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2021-01-24 11:07:23,511 : INFO : using symmetric alpha at 0.1111111111111111 + 2021-01-24 11:07:23,512 : INFO : using symmetric eta at 0.1111111111111111 + 2021-01-24 11:07:23,515 : INFO : using serial LDA version on this node + 2021-01-24 11:07:23,526 : INFO : running online LDA training, 9 topics, 4 passes over the supplied corpus of 1701 documents, updating every 22000 documents, evaluating every ~1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-24 11:07:23,526 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2021-01-24 11:07:23,572 : INFO : training LDA model using 11 processes + 2021-01-24 11:07:23,721 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2021-01-24 11:07:28,538 : INFO : topic #1 (0.111): 0.001*"minister" + 0.001*"soviet" + 0.001*"election" + 0.001*"israel" + 0.001*"jewish" + 0.001*"jew" + 0.001*"chinese" + 0.001*"emperor" + 0.001*"actor" + 0.001*"km" + 2021-01-24 11:07:28,538 : INFO : topic #4 (0.111): 0.001*"economy" + 0.001*"female" + 0.001*"minister" + 0.001*"km" + 0.001*"election" + 0.001*"est" + 0.001*"jewish" + 0.001*"japanese" + 0.001*"animal" + 0.001*"band" + 2021-01-24 11:07:28,539 : INFO : topic #7 (0.111): 0.001*"election" + 0.001*"minister" + 0.001*"km" + 0.001*"cell" + 0.001*"software" + 0.001*"energy" + 0.001*"actor" + 0.001*"band" + 0.001*"bc" + 0.001*"japanese" + 2021-01-24 11:07:28,539 : INFO : topic #0 (0.111): 0.002*"soviet" + 0.001*"league" + 0.001*"km" + 0.001*"actor" + 0.001*"territory" + 0.001*"album" + 0.001*"emperor" + 0.001*"season" + 0.001*"jewish" + 0.001*"female" + 2021-01-24 11:07:28,539 : INFO : topic #6 (0.111): 0.001*"actor" + 0.001*"india" + 0.001*"software" + 0.001*"band" + 0.001*"bc" + 0.001*"novel" + 0.001*"km" + 0.001*"album" + 0.001*"emperor" + 0.001*"y" + 2021-01-24 11:07:28,539 : INFO : topic diff=1.009234, rho=1.000000 + 2021-01-24 11:07:38,853 : INFO : -9.257 per-word bound, 612.0 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-24 11:07:38,853 : INFO : PROGRESS: pass 1, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2021-01-24 11:07:43,986 : INFO : topic #5 (0.111): 0.001*"chinese" + 0.001*"y" + 0.001*"blue" + 0.001*"spanish" + 0.001*"user" + 0.001*"china" + 0.001*"actor" + 0.001*"canadian" + 0.001*"color" + 0.001*"emperor" + 2021-01-24 11:07:43,986 : INFO : topic 
#4 (0.111): 0.002*"est" + 0.002*"economy" + 0.002*"km" + 0.002*"election" + 0.001*"minister" + 0.001*"female" + 0.001*"male" + 0.001*"prime" + 0.001*"israel" + 0.001*"bank" + 2021-01-24 11:07:43,986 : INFO : topic #7 (0.111): 0.001*"cell" + 0.001*"energy" + 0.001*"color" + 0.001*"map" + 0.001*"horse" + 0.001*"ship" + 0.001*"bc" + 0.001*"election" + 0.001*"card" + 0.001*"specie" + 2021-01-24 11:07:43,987 : INFO : topic #1 (0.111): 0.001*"jewish" + 0.001*"jew" + 0.001*"israel" + 0.001*"aircraft" + 0.001*"minister" + 0.001*"comic" + 0.001*"soviet" + 0.001*"actor" + 0.001*"irish" + 0.001*"election" + 2021-01-24 11:07:43,987 : INFO : topic #6 (0.111): 0.001*"actor" + 0.001*"software" + 0.001*"india" + 0.001*"band" + 0.001*"bc" + 0.001*"novel" + 0.001*"y" + 0.001*"jewish" + 0.001*"album" + 0.001*"emperor" + 2021-01-24 11:07:43,987 : INFO : topic diff=0.143780, rho=0.592297 + 2021-01-24 11:07:54,251 : INFO : -9.190 per-word bound, 583.9 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-24 11:07:54,252 : INFO : PROGRESS: pass 2, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2021-01-24 11:07:58,980 : INFO : topic #6 (0.111): 0.002*"actor" + 0.001*"software" + 0.001*"india" + 0.001*"novel" + 0.001*"bc" + 0.001*"band" + 0.001*"hebrew" + 0.001*"jewish" + 0.001*"y" + 0.001*"japanese" + 2021-01-24 11:07:58,980 : INFO : topic #5 (0.111): 0.002*"y" + 0.001*"chinese" + 0.001*"blue" + 0.001*"user" + 0.001*"china" + 0.001*"z" + 0.001*"color" + 0.001*"server" + 0.001*"canadian" + 0.001*"spanish" + 2021-01-24 11:07:58,980 : INFO : topic #4 (0.111): 0.003*"est" + 0.003*"km" + 0.003*"economy" + 0.002*"election" + 0.002*"minister" + 0.002*"male" + 0.002*"territory" + 0.002*"israel" + 0.002*"female" + 0.002*"prime" + 2021-01-24 11:07:58,980 : INFO : topic #2 (0.111): 0.002*"cell" + 0.001*"import" + 0.001*"machine" + 0.001*"software" + 0.001*"user" + 0.001*"league" + 0.001*"memory" + 0.001*"bit" + 0.001*"apple" + 0.001*"aircraft" + 2021-01-24 11:07:58,981 : INFO : topic #8 (0.111): 0.002*"emperor" + 0.001*"chinese" + 0.001*"copyright" + 0.001*"minister" + 0.001*"mary" + 0.001*"software" + 0.001*"y" + 0.001*"animal" + 0.001*"actor" + 0.001*"japanese" + 2021-01-24 11:07:58,981 : INFO : topic diff=0.178240, rho=0.509614 + 2021-01-24 11:08:09,249 : INFO : -9.125 per-word bound, 558.3 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-24 11:08:09,249 : INFO : PROGRESS: pass 3, dispatched chunk #0 = documents up to #1701/1701, outstanding queue size 1 + 2021-01-24 11:08:13,965 : INFO : topic #6 (0.111): 0.002*"actor" + 0.001*"hebrew" + 0.001*"novel" + 0.001*"jewish" + 0.001*"software" + 0.001*"bc" + 0.001*"singer" + 0.001*"india" + 0.001*"italian" + 0.001*"la" + 2021-01-24 11:08:13,965 : INFO : topic #7 (0.111): 0.002*"energy" + 0.002*"cell" + 0.001*"horse" + 0.001*"atom" + 0.001*"color" + 0.001*"specie" + 0.001*"chemical" + 0.001*"acid" + 0.001*"animal" + 0.001*"map" + 2021-01-24 11:08:13,965 : INFO : topic #8 (0.111): 0.003*"emperor" + 0.002*"chinese" + 0.002*"copyright" + 0.001*"mary" + 0.001*"software" + 0.001*"japanese" + 0.001*"minister" + 0.001*"y" + 0.001*"bc" + 0.001*"animal" + 2021-01-24 11:08:13,965 : INFO : topic #4 (0.111): 0.003*"est" + 0.003*"km" + 0.003*"election" + 0.003*"economy" + 0.003*"minister" + 0.002*"territory" + 0.002*"male" + 0.002*"israel" + 0.002*"prime" + 0.002*"female" + 2021-01-24 11:08:13,966 : INFO : topic #3 (0.111): 0.003*"album" + 0.003*"band" + 0.001*"bass" + 
0.001*"alexander" + 0.001*"love" + 0.001*"ball" + 0.001*"fan" + 0.001*"philosophy" + 0.001*"artist" + 0.001*"me" + 2021-01-24 11:08:13,966 : INFO : topic diff=0.153747, rho=0.454053 + 2021-01-24 11:08:24,231 : INFO : -9.087 per-word bound, 543.7 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-24 11:08:24,310 : INFO : using symmetric alpha at 0.09090909090909091 + 2021-01-24 11:08:24,310 : INFO : using symmetric eta at 0.09090909090909091 + 2021-01-24 11:08:24,312 : INFO : using serial LDA version on this node + 2021-01-24 11:08:24,325 : INFO : running online (multi-pass) LDA training, 11 topics, 2 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-24 11:08:24,325 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy + 2021-01-24 11:08:36,120 : INFO : -10.499 per-word bound, 1447.4 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-24 11:08:36,120 : INFO : PROGRESS: pass 0, at document #1701/1701 + 2021-01-24 11:08:40,292 : INFO : topic #7 (0.091): 0.001*"band" + 0.001*"minister" + 0.001*"album" + 0.001*"km" + 0.001*"cell" + 0.001*"female" + 0.001*"lake" + 0.001*"election" + 0.001*"africa" + 0.001*"chinese" + 2021-01-24 11:08:40,292 : INFO : topic #5 (0.091): 0.001*"election" + 0.001*"japanese" + 0.001*"cell" + 0.001*"energy" + 0.001*"km" + 0.001*"economy" + 0.001*"actor" + 0.001*"ship" + 0.001*"county" + 0.001*"emperor" + 2021-01-24 11:08:40,293 : INFO : topic #6 (0.091): 0.001*"jewish" + 0.001*"actor" + 0.001*"energy" + 0.001*"cell" + 0.001*"la" + 0.001*"y" + 0.001*"russian" + 0.001*"china" + 0.001*"league" + 0.001*"soviet" + 2021-01-24 11:08:40,293 : INFO : topic #4 (0.091): 0.001*"chinese" + 0.001*"season" + 0.001*"emperor" + 0.001*"ball" + 0.001*"novel" + 0.001*"color" + 0.001*"league" + 0.001*"spanish" + 0.001*"lake" + 0.001*"energy" + 2021-01-24 11:08:40,293 : INFO : topic #8 (0.091): 0.001*"emperor" + 0.001*"km" + 0.001*"band" + 0.001*"software" + 0.001*"actor" + 0.001*"album" + 0.001*"russian" + 0.001*"economy" + 0.001*"male" + 0.001*"minister" + 2021-01-24 11:08:40,294 : INFO : topic diff=1.023860, rho=1.000000 + 2021-01-24 11:08:53,644 : INFO : -9.277 per-word bound, 620.5 perplexity estimate based on a held-out corpus of 1701 documents with 4692704 words + 2021-01-24 11:08:53,644 : INFO : PROGRESS: pass 1, at document #1701/1701 + 2021-01-24 11:08:57,030 : INFO : topic #3 (0.091): 0.002*"actor" + 0.001*"album" + 0.001*"energy" + 0.001*"india" + 0.001*"software" + 0.001*"love" + 0.001*"singer" + 0.001*"actress" + 0.001*"y" + 0.001*"canadian" + 2021-01-24 11:08:57,030 : INFO : topic #5 (0.091): 0.001*"aircraft" + 0.001*"emperor" + 0.001*"plant" + 0.001*"ireland" + 0.001*"irish" + 0.001*"county" + 0.001*"actor" + 0.001*"henry" + 0.001*"cell" + 0.001*"japanese" + 2021-01-24 11:08:57,030 : INFO : topic #4 (0.091): 0.002*"ball" + 0.002*"season" + 0.001*"chinese" + 0.001*"league" + 0.001*"color" + 0.001*"emperor" + 0.001*"novel" + 0.001*"nfl" + 0.001*"lake" + 0.001*"mary" + 2021-01-24 11:08:57,031 : INFO : topic #0 (0.091): 0.002*"soviet" + 0.002*"election" + 0.001*"japanese" + 0.001*"chinese" + 0.001*"league" + 0.001*"cell" + 0.001*"km" + 0.001*"japan" + 0.001*"card" + 0.001*"minister" + 2021-01-24 11:08:57,031 : INFO : topic #7 (0.091): 0.002*"band" + 0.001*"album" + 
0.001*"lake" + 0.001*"bass" + 0.001*"kong" + 0.001*"minister" + 0.001*"cell" + 0.001*"chinese" + 0.001*"africa" + 0.001*"liberal" + 2021-01-24 11:08:57,031 : INFO : topic diff=0.167986, rho=0.577350 + 2021-01-24 11:08:57,032 : INFO : ensemble contains 9 models and 160 topics now + 2021-01-24 11:08:57,038 : INFO : ensemble contains 10 models and 169 topics now + 2021-01-24 11:08:57,045 : INFO : asymmetric distance matrix is outdated due to add_model + 2021-01-24 11:08:57,045 : INFO : generating a 180 x 180 asymmetric distance matrix... + 2021-01-24 11:08:58,136 : INFO : fitting the clustering model, using 5 for min_samples + 2021-01-24 11:08:58,170 : INFO : generating stable topics, using 3 for min_cores + 2021-01-24 11:08:58,171 : INFO : found 26 clusters + 2021-01-24 11:08:58,175 : INFO : found 1 stable topics + 2021-01-24 11:08:58,175 : INFO : generating classic gensim model representation based on results from the ensemble + 2021-01-24 11:08:58,176 : INFO : using symmetric alpha at 1.0 + 2021-01-24 11:08:58,176 : INFO : using symmetric eta at 1.0 + 2021-01-24 11:08:58,180 : INFO : using serial LDA version on this node + 2021-01-24 11:08:58,183 : INFO : running online (multi-pass) LDA training, 1 topics, 0 passes over the supplied corpus of 1701 documents, updating model once every 1701 documents, evaluating perplexity every 1701 documents, iterating 50x with a convergence threshold of 0.001000 + 2021-01-24 11:08:58,183 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy 180 1 @@ -424,9 +424,9 @@ Afterwards the number and quality of stable topics might be different depending .. rst-class:: sphx-glr-timing - **Total running time of the script:** ( 11 minutes 53.863 seconds) + **Total running time of the script:** ( 6 minutes 48.976 seconds) -**Estimated memory usage:** 1625 MB +**Estimated memory usage:** 1614 MB .. 
_sphx_glr_download_auto_examples_tutorials_run_ensemblelda.py: diff --git a/docs/src/auto_examples/tutorials/sg_execution_times.rst b/docs/src/auto_examples/tutorials/sg_execution_times.rst index 7003c2957e..268c3d8887 100644 --- a/docs/src/auto_examples/tutorials/sg_execution_times.rst +++ b/docs/src/auto_examples/tutorials/sg_execution_times.rst @@ -5,10 +5,10 @@ Computation times ================= -**11:26.674** total execution time for **auto_examples_tutorials** files: +**06:48.976** total execution time for **auto_examples_tutorials** files: +-------------------------------------------------------------------------------------+-----------+-----------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` (``run_word2vec.py``) | 11:26.674 | 7177.5 MB | +| :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` (``run_ensemblelda.py``) | 06:48.976 | 1614.4 MB | +-------------------------------------------------------------------------------------+-----------+-----------+ | :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` (``run_annoy.py``) | 00:00.000 | 0.0 MB | +-------------------------------------------------------------------------------------+-----------+-----------+ @@ -20,3 +20,5 @@ Computation times +-------------------------------------------------------------------------------------+-----------+-----------+ | :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` (``run_wmd.py``) | 00:00.000 | 0.0 MB | +-------------------------------------------------------------------------------------+-----------+-----------+ +| :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` (``run_word2vec.py``) | 00:00.000 | 0.0 MB | ++-------------------------------------------------------------------------------------+-----------+-----------+ From f7bc8b20d8bafd51065c3090fe6f68912c542ef1 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 24 Jan 2021 11:29:56 +0100 Subject: [PATCH 127/166] idk --- .../tutorials/run_ensemblelda_codeobj.pickle | Bin 8641 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 docs/src/auto_examples/tutorials/run_ensemblelda_codeobj.pickle diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda_codeobj.pickle b/docs/src/auto_examples/tutorials/run_ensemblelda_codeobj.pickle deleted file mode 100644 index 91585af924ab6004df91e11df745eacc51ffe39e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8641 zcmd5>ONbm*6dlccI@2@B%p{Xfi;B2OR7x@%6*n<4N+2IdBH}{YH&t)Cs#0Cm)W^vf zln9C%3IbZe@Ac!09t9t*c?y73WPgaw9_ndp*eeZtW z?HhCN-+o{w|Ce^n!1J&fJAof%Y4>7i;SgI-J7%1{mmR;99ZMHIJRG9fc?E~ri`lVP z;nS>-hB!-CpASOo0*=oKfpmTtScyj#eJ{RhL@^#3+aTuSI7^p`#eQT5A^dlih-7KU ziTb98q9}VO0d{+#irC(p?&XY`-KAiAc`~L@VR!9=L=vYfmJ`LuH*p_~x{AXHwp8fc z=Ycy>xI_H~JCZ*F?&tvf zkux;RARGlDGAsdst?g4+Gkf`M>KKitB-jEwUnn8Fd{`*xTt+YBa{;o*?v|(>N$hyU z9r_1umV%OqHw5v*c{Cb1{($iAm+qW_HcdjU+{0>^D$=!|5Mqm``7Lg#Ktt*}P6{1! 
zmmvglzqGJlfOO_(1#{v%)R zuk!G!Q}R|FNY`?c0C2kG1%m-3Pvh)`XD*5WsVrHygpS!<971M+;z_c9x1d~s0)&T< zbD9;x4+sjS9flG{bk9(ta~2}$g_Sla_1KFH2JpygH(jPBrC5c^#v)6n(FI#TrHJ{c z*zBU84Et2&#Crd6vCTDRwSg?F0E!K-OB#Pa?VFw($2FPflWsu!T~3 z`m`1z@qbphTRF?3WrP+;`P$xFtOe3(8u(}&RI>alQOlHOM)!yApxPGbYe77PNGk3Rq9N5z2I;QnK*CMtG@E9T1_OBx-)mp%m}^vPN4{22}X{|FWq7F1UEr| z-@5T@+<*$klB?W>IwK^57ZurE4PbXw;hyFWk|2IiRFr6EIK*-2n0@{hg%b^(YY;L; z#4Pw03PZF*&UX$<*9iXVy1m>GQiYE5|N=K;<+z;Qn&$4%bGG=>k!HBS-% z1E;L_6BtW(VSLvxfknC=kI-Ef-aWu8pSlq})w_>2f=PNdC;+D&G;$1&Jfpb_J6nmL z&jrY-%0mYCF-6a84g(n$!&b2ElW*kp)~)=S_lWakmXL^f#;gaEj*_mER2NDQZac9Z zBr)7VG=OP^XAB*mU%?)og7lfy{UGeawY7u%1+3VAhI5K6Cbm6ovHFKHs(UH7ma$jo zLgxD2OsX#^dMi@k_{7;w$FH?ZsxQ?UA>C$Ek=@;->Z;uG9cdK(I9(Kz0}*4|7=opy zrc*Z7=V&yoCIlK*^5^w@&!ia4!lggx69M%7%8OL2gB2_;^PaK!h+sTni<2BrA|2`H^hk@t7f)qn7o~pTizX;Z z%a{D(^h7c;=KIb7HLn*u#5=CTb{|K0+x>;8jcaaCuH#N)0_XZ;uDMCpp&1k@DN znY+moo;(&0u-P%Z(j60AvgHg!bshgInWGFv4(!Tk$TreM+3iqC*tz(r*x-K(W%rb7 zk|YJBbp@v|rH-Y#F0wA54$Uj%I^Wg2KQi!+G|?LGw4UO$Jp={&5^wl>Iq7QNd=;~0 z`s+9E6~r?N!2g3Ol>1O_Ht!bxwL#PgMel#~W_rBN2&u?CsmSi`o9Ptg#) Date: Wed, 5 May 2021 21:15:44 +0200 Subject: [PATCH 128/166] review --- docs/src/gallery/tutorials/run_ensemblelda.py | 2 +- gensim/corpora/opinosiscorpus.py | 4 +- gensim/downloader.py | 1 - gensim/models/ensemblelda.py | 39 +++++++------------ gensim/test/test_ensemblelda.py | 2 +- 5 files changed, 20 insertions(+), 28 deletions(-) diff --git a/docs/src/gallery/tutorials/run_ensemblelda.py b/docs/src/gallery/tutorials/run_ensemblelda.py index 086f65fc44..3519cf7b9f 100644 --- a/docs/src/gallery/tutorials/run_ensemblelda.py +++ b/docs/src/gallery/tutorials/run_ensemblelda.py @@ -109,7 +109,7 @@ import numpy as np shape = ensemble.asymmetric_distance_matrix.shape -without_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0],dtype=bool)].reshape(shape[0],-1) +without_diagonal = ensemble.asymmetric_distance_matrix[~np.eye(shape[0], dtype=bool)].reshape(shape[0], -1) print(without_diagonal.min(), without_diagonal.mean(), without_diagonal.max()) ensemble.recluster(eps=0.09, min_samples=2, min_cores=2) diff --git a/gensim/corpora/opinosiscorpus.py b/gensim/corpora/opinosiscorpus.py index 1a68ec670c..eaee6f035a 100644 --- a/gensim/corpora/opinosiscorpus.py +++ b/gensim/corpora/opinosiscorpus.py @@ -1,7 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Author: Tobias B +# Author: Tobias B +# Copyright (C) 2021 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """Creates a corpus and dictionary from the Opinosis dataset. diff --git a/gensim/downloader.py b/gensim/downloader.py index 47061ec793..aac4d0edcf 100644 --- a/gensim/downloader.py +++ b/gensim/downloader.py @@ -357,7 +357,6 @@ def _download(name): If md5sum on client and in repo are different. """ - print('_download') url_load_file = "{base}/{fname}/__init__.py".format(base=DOWNLOAD_BASE_URL, fname=name) data_folder_dir = os.path.join(BASE_DIR, name) data_folder_dir_tmp = data_folder_dir + '_tmp' diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 417b43e441..6682db3bbd 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -3,20 +3,22 @@ # # Authors: Tobias Brigl , Alex Salles , # Alex Loosley , Data Reply Munich -# +# Copyright (C) 2021 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. 
-Extracts reliable topics that are consistently learned across multiple LDA models. eLDA has the added benefit that -the user does not need to know the exact number of topics the topic model should extract ahead of time +Extracts reliable topics that are consistently learned across multiple LDA models, for higher reproducibility +and less noise. eLDA has the added benefit that the user does not need to know the exact number of topics the topic +model should extract ahead of time. -For more details read our paper (https://www.hip70890b.de/machine_learning/ensemble_LDA/). +For more details read `our thesis `_. Usage examples -------------- -Train an ensemble of LdaModels using a Gensim corpus +Train an ensemble of LdaModels using a Gensim corpus: .. sourcecode:: pycon >>> from gensim.test.utils import common_corpus, common_dictionary >>> >>> # Create a corpus from a list of texts >>> # common_dictionary = Dictionary(common_texts) >>> # common_corpus = [common_dictionary.doc2bow(text) for text in common_texts] >>> >>> # Train the model on the corpus. corpus has to be provided as a >>> # keyword argument, as they are passed through to the children. >>> elda = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=10, num_models=4) -Save a model to disk, or reload a pre-trained model +Save a model to disk, or reload a pre-trained model: .. sourcecode:: pycon >>> from gensim.test.utils import datapath, get_tmpfile >>> from gensim.models import EnsembleLda >>> >>> # Save model to disk. >>> temp_file = datapath("model") >>> elda.save(temp_file) >>> >>> # Load a potentially pretrained model from disk. >>> elda = EnsembleLda.load(temp_file) -Query, the model using new, unseen documents +Query the model using new, unseen documents: .. sourcecode:: pycon >>> # Create a new corpus, made of previously unseen documents. >>> other_texts = [ ... ['computer', 'time', 'graph'], ... ['survey', 'response', 'eps'], ... ['human', 'system', 'computer'] ... ] >>> other_corpus = [common_dictionary.doc2bow(text) for text in other_texts] >>> >>> unseen_doc = other_corpus[0] >>> vector = elda[unseen_doc] # get topic probability distribution for a document -Increase the ensemble size by adding a new model. Make sure it uses the same dictionary +Increase the ensemble size by adding a new model. Make sure it uses the same dictionary: .. sourcecode:: pycon >>> elda.add_model(elda2) >>> >>> vector = elda[unseen_doc] To optimize the ensemble for your specific case, the children can be clustered again using -different hyperparameters +different hyperparameters: .. sourcecode:: pycon >>> elda.recluster(eps=0.2) -References ---------- -.. [1] REHUREK, Radim and Sojka, PETR, 2010, Software framework for topic modelling with large corpora. In : THE LREC - 2010 WORKSHOP ON NEW CHALLENGES FOR NLP FRAMEWORKS [online]. Msida : University of Malta. 2010. p. 45-50. - Available from: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.695.4595 - -.. [2] BRIGL, Tobias, 2019, Extracting Reliable Topics using Ensemble Latent Dirichlet Allocation [Bachelor Thesis]. - Technische Hochschule Ingolstadt. Munich: Data Reply GmbH. Available from: - https://www.hip70890b.de/machine_learning/ensemble_LDA/ - Citation -------- -At the moment, there is no paper associated to ensemble LDA but we are working on publicly releasing Tobi Brigl's -bachelor thesis on the topic. In the meantime, please include a mention of us (Brigl, T.; Salles, A.; Loosley, A) and -a link to this file (https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/ensemblelda.py) if this work -is presented, further developed, or used as the basis for a published result. +BRIGL, Tobias, 2019, Extracting Reliable Topics using Ensemble Latent Dirichlet Allocation [Bachelor Thesis]. +Technische Hochschule Ingolstadt. Munich: Data Reply GmbH. Available from: +https://www.sezanzeb.de/machine_learning/ensemble_LDA/ Other Notes ----------- @@ -118,7 +109,7 @@ class EnsembleLda(SaveLoad): """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. Extracts reliable topics that are consistently learned across multiple LDA models. 
eLDA has the added benefit that - the user does not need to know the exact number of topics the topic model should extract ahead of time [2]. + the user does not need to know the exact number of topics the topic model should extract ahead of time. """ diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 01c1ffa085..8d021d3658 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Author: Tobias B +# Author: Tobias B """ Automated tests for checking the EnsembleLda Class From c39848caf23a6dc479b4d34207a7c474d629ea61 Mon Sep 17 00:00:00 2001 From: aloosley Date: Wed, 5 May 2021 21:18:03 +0200 Subject: [PATCH 129/166] stream documents instead of loading all into memory --- docs/src/gallery/tutorials/run_ensemblelda.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/src/gallery/tutorials/run_ensemblelda.py b/docs/src/gallery/tutorials/run_ensemblelda.py index 086f65fc44..d593f88860 100644 --- a/docs/src/gallery/tutorials/run_ensemblelda.py +++ b/docs/src/gallery/tutorials/run_ensemblelda.py @@ -31,9 +31,12 @@ lemmatizer = WordNetLemmatizer() docs = api.load('text8') -docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs] -dictionary = Dictionary(docs) + +dictionary = Dictionary() +for doc in docs: + dictionary.add_documents([[lemmatizer.lemmatize(token) for token in doc]]) dictionary.filter_extremes(no_below=20, no_above=0.5) + corpus = [dictionary.doc2bow(doc) for doc in docs] ############################################################################### From 05bd52e1b2a5adc214d99025606d1234fe22e64e Mon Sep 17 00:00:00 2001 From: aloosley Date: Wed, 5 May 2021 21:33:58 +0200 Subject: [PATCH 130/166] streaming docs instead of loading all into memory --- docs/src/gallery/tutorials/run_ensemblelda.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/src/gallery/tutorials/run_ensemblelda.py b/docs/src/gallery/tutorials/run_ensemblelda.py index 086f65fc44..d593f88860 100644 --- a/docs/src/gallery/tutorials/run_ensemblelda.py +++ b/docs/src/gallery/tutorials/run_ensemblelda.py @@ -31,9 +31,12 @@ lemmatizer = WordNetLemmatizer() docs = api.load('text8') -docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs] -dictionary = Dictionary(docs) + +dictionary = Dictionary() +for doc in docs: + dictionary.add_documents([[lemmatizer.lemmatize(token) for token in doc]]) dictionary.filter_extremes(no_below=20, no_above=0.5) + corpus = [dictionary.doc2bow(doc) for doc in docs] ############################################################################### From 115cd1568f5762cd52a6e04b1b63556853fe27b6 Mon Sep 17 00:00:00 2001 From: aloosley Date: Wed, 5 May 2021 22:40:57 +0200 Subject: [PATCH 131/166] .format replaced with f-string --- gensim/models/ensemblelda.py | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 6682db3bbd..dece967915 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -252,7 +252,7 @@ def __init__( if "passes" in gensim_kw_args and gensim_kw_args["passes"] <= 0: return - logger.info("generating {} topic models...".format(num_models)) + logger.info(f"generating {num_models} topic models...") if ensemble_workers > 1: self._generate_topic_models_multiproc(num_models, ensemble_workers) @@ -281,15 +281,14 @@ def 
get_topic_model_class(self): del self.topic_model_class_string except ModuleNotFoundError: logger.error( - 'Could not import the "{}" module in order to provide the "{}" class as ' - '"topic_model_class" attribute. {}' - .format(self.topic_model_class_string, self.topic_model_class_string, instruction) + f'Could not import the "{self.topic_model_class_string}" module in order to provide the ' + f'"{self.topic_model_class_string}" class as "topic_model_class" attribute. {instruction}' ) except AttributeError: logger.error( - 'Could not import the "{}" class from the "{}" module in order to set the ' - '"topic_model_class" attribute. {}' - .format(self.topic_model_class_string, self.topic_model_module_string, instruction) + f'Could not import the "{self.topic_model_class_string}" class from the ' + f'"{self.topic_model_module_string}" module in order to set the "topic_model_class" attribute. ' + f'{instruction}' ) return self.topic_model_class @@ -476,7 +475,7 @@ def add_model(self, target, num_new_models=None): # unknown else: - raise ValueError("target is of unknown type or a list of unknown types: {}".format(type(target[0]))) + raise ValueError(f"target is of unknown type or a list of unknown types: {type(target[0])}") # new models were added, increase num_models # if the user didn't provide a custon numer to use @@ -508,7 +507,7 @@ def add_model(self, target, num_new_models=None): # unknown else: - raise ValueError("target is of unknown type or a list of unknown types: {}".format(type(target[0]))) + raise ValueError(f"target is of unknown type or a list of unknown types: {type(target[0])}") # in this case, len(self.tms) should # always match self.num_models @@ -519,13 +518,12 @@ def add_model(self, target, num_new_models=None): ) self.num_models = len(self.tms) - logger.info("ensemble contains {} models and {} topics now".format(self.num_models, len(self.ttda))) + logger.info(f"ensemble contains {self.num_models} models and {len(self.ttda)} topics now") if self.ttda.shape[1] != ttda.shape[1]: raise ValueError( - "target ttda dimensions do not match. Topics must be {} but was {} elements large".format( - self.ttda.shape[-1], ttda.shape[-1], - ) + f"target ttda dimensions do not match. 
Topics must be {self.ttda.shape[-1]} but was {ttda.shape[-1]} " + f"elements large" ) self.ttda = np.append(self.ttda, ttda, axis=0) @@ -545,7 +543,7 @@ def _teardown(self, pipes, processes, i): index of the process that could not be started """ - logger.error("could not start process {}".format(i)) + logger.error(f"could not start process {i}") for pipe in pipes: pipe[1].close() @@ -654,7 +652,7 @@ def _generate_topic_models(self, num_models, random_states=None, pipe=None): """ if pipe is not None: - logger.info("spawned worker to generate {} topic models".format(num_models)) + logger.info(f"spawned worker to generate {num_models} topic models") if random_states is None: random_states = [self.random_state.randint(self._MAX_RANDOM_STATE) for _ in range(num_models)] @@ -697,7 +695,7 @@ def _generate_topic_models(self, num_models, random_states=None, pipe=None): def _asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pipe, threshold, method): """Worker that computes the distance to all other nodes from a chunk of nodes.""" - logger.info("spawned worker to generate {} rows of the asymmetric distance matrix".format(n_ttdas)) + logger.info(f"spawned worker to generate {n_ttdas} rows of the asymmetric distance matrix") # the chunk of ttda that's going to be calculated: ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas] distance_chunk = self._calculate_asymmetric_distance_matrix_chunk( @@ -724,7 +722,7 @@ def _generate_asymmetric_distance_matrix(self): if threshold is None: threshold = {"mass": 0.95, "rank": 0.11}[method] - logger.info("generating a {} x {} asymmetric distance matrix...".format(len(self.ttda), len(self.ttda))) + logger.info(f"generating a {len(self.ttda)} x {len(self.ttda)} asymmetric distance matrix...") # singlecore if workers is not None and workers <= 1: @@ -819,7 +817,7 @@ def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, s # the worker might not have received a ttda because it was chunked up too much if method not in ["mass", "rank"]: - raise ValueError("method {} unknown".format(method)) + raise ValueError(f"method {method} unknown") # select masking method: def mass_masking(a): @@ -866,7 +864,7 @@ def rank_masking(a): distances[ttd1_idx][ttd2_idx] = distance percent = round(100 * avg_mask_size / ttda1.shape[0] / ttda1.shape[1], 1) - logger.info('the given threshold of {} covered on average {}% of tokens'.format(threshold, percent)) + logger.info(f'the given threshold of {threshold} covered on average {percent}% of tokens') return distances From f56b60de45dd63b256275578186fd5fd47672f46 Mon Sep 17 00:00:00 2001 From: aloosley Date: Wed, 5 May 2021 22:45:45 +0200 Subject: [PATCH 132/166] docstring --- gensim/models/ensemblelda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index dece967915..79a6fc32fc 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -531,7 +531,7 @@ def add_model(self, target, num_new_models=None): self.asymmetric_distance_matrix_outdated = True def _teardown(self, pipes, processes, i): - """close pipes and terminate processes + """Close pipes and terminate processes. Parameters ---------- @@ -806,7 +806,7 @@ def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, s Returns ------- - 2D Numpy.numpy.ndarray of floats + 2D numpy.ndarray of floats Asymmetric distance matrix of size ``len(ttda1)`` by ``len(ttda2)``. 
""" From 28739ce58d752bd22dd6f7b0dfe401fff86da920 Mon Sep 17 00:00:00 2001 From: aloosley Date: Wed, 5 May 2021 22:56:50 +0200 Subject: [PATCH 133/166] tuple variable expansion --- gensim/models/ensemblelda.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 79a6fc32fc..af52fc7ce5 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -545,9 +545,9 @@ def _teardown(self, pipes, processes, i): """ logger.error(f"could not start process {i}") - for pipe in pipes: - pipe[1].close() - pipe[0].close() + for parent_conn, child_conn in pipes: + child_conn.close() + parent_conn.close() for process in processes: if process.is_alive(): @@ -621,9 +621,9 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers): # aggregate results # will also block until workers are finished - for pipe in pipes: - answer = pipe[0].recv() # [0], because that is the parentConn - pipe[0].close() + for parent_conn, _ in pipes: + answer = parent_conn.recv() # [0], because that is the parentConn + parent_conn.close() # this does basically the same as the _generate_topic_models function (concatenate all the ttdas): if not self.memory_friendly_ttda: self.tms += answer @@ -771,9 +771,9 @@ def _generate_asymmetric_distance_matrix(self): distances = [] # note, that the following loop maintains order in how the ttda will be concatenated # which is very important. Ordering in ttda has to be the same as when using only one process - for pipe in pipes: - answer = pipe[0].recv() # [0], because that is the parentConn - pipe[0].close() # child conn will be closed from inside the worker + for parent_conn, _ in pipes: + answer = parent_conn.recv() # [0], because that is the parentConn + parent_conn.close() # child conn will be closed from inside the worker # this does basically the same as the _generate_topic_models function (concatenate all the ttdas): distances.append(answer[1]) From 336997576e15398a75f4fab28b329817a88c0ae2 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 5 May 2021 23:44:21 +0200 Subject: [PATCH 134/166] removed duplicate string in module docstring --- gensim/models/ensemblelda.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index af52fc7ce5..a6492ccd5b 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -9,12 +9,10 @@ """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. -Extracts reliable topics that are consistently learned across multiple LDA models, for higher reproducibility +Extract reliable topics that are consistently learned across multiple LDA models, for higher reproducibility and less noise. eLDA has the added benefit that the user does not need to know the exact number of topics the topic model should extract ahead of time. -For more details read `our thesis `_. 
- Usage examples -------------- From ed91a3fb434529ccba8ce0cc815d57d381eb7a13 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 5 May 2021 23:45:47 +0200 Subject: [PATCH 135/166] removed obsolete parent connection comment --- gensim/models/ensemblelda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index a6492ccd5b..04be03a165 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -620,7 +620,7 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers): # aggregate results # will also block until workers are finished for parent_conn, _ in pipes: - answer = parent_conn.recv() # [0], because that is the parentConn + answer = parent_conn.recv() parent_conn.close() # this does basically the same as the _generate_topic_models function (concatenate all the ttdas): if not self.memory_friendly_ttda: @@ -770,7 +770,7 @@ def _generate_asymmetric_distance_matrix(self): # note, that the following loop maintains order in how the ttda will be concatenated # which is very important. Ordering in ttda has to be the same as when using only one process for parent_conn, _ in pipes: - answer = parent_conn.recv() # [0], because that is the parentConn + answer = parent_conn.recv() parent_conn.close() # child conn will be closed from inside the worker # this does basically the same as the _generate_topic_models function (concatenate all the ttdas): distances.append(answer[1]) From 42ac44936e4f2ca4e00335f3b4a03dfe8c510d03 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 5 May 2021 23:54:06 +0200 Subject: [PATCH 136/166] reliable -> stable --- gensim/models/ensemblelda.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 04be03a165..dc79d696a0 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -9,7 +9,7 @@ """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. -Extract reliable topics that are consistently learned across multiple LDA models, for higher reproducibility +Extract stable topics that are consistently learned across multiple LDA models, for higher reproducibility and less noise. eLDA has the added benefit that the user does not need to know the exact number of topics the topic model should extract ahead of time. @@ -82,11 +82,6 @@ Technische Hochschule Ingolstadt. Munich: Data Reply GmbH. Available from: https://www.sezanzeb.de/machine_learning/ensemble_LDA/ -Other Notes ------------ -The adjectives stable and reliable (topics) are used somewhat interchangeably throughout the doc strings and -comments. - """ import logging import os @@ -106,7 +101,7 @@ class EnsembleLda(SaveLoad): """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. - Extracts reliable topics that are consistently learned accross multiple LDA models. eLDA has the added benefit that + Extracts stable topics that are consistently learned accross multiple LDA models. eLDA has the added benefit that the user does not need to know the exact number of topics the topic model should extract ahead of time. 
""" @@ -1102,10 +1097,10 @@ def recluster(self, eps=0.1, min_samples=None, min_cores=None): # Run CBDBSCAN to get topic clusters: self._generate_topic_clusters(eps, min_samples) - # Interpret the results of CBDBSCAN to identify reliable topics: + # Interpret the results of CBDBSCAN to identify stable topics: self._generate_stable_topics(min_cores) - # Create gensim LdaModel representation of topic model with reliable topics (can be used for inference): + # Create gensim LdaModel representation of topic model with stable topics (can be used for inference): self.generate_gensim_representation() # GENSIM API @@ -1188,8 +1183,7 @@ class CBDBSCAN: 6. (e.g. Scan candidate T_5 with respect to parent T_3 that has parent_neighbours T_5) The CB step has the effect that it enforces cluster compactness and allows the model to avoid creating clusters for - unreliable topics made of a composition of multiple reliable topics (something that occurs often LDA models that is - one cause of unreliable topics). + unstable topics made of a composition of multiple stable topics. """ From bbd6c2ff022712d01f8953d69014b36664a26108 Mon Sep 17 00:00:00 2001 From: aloosley Date: Mon, 31 May 2021 20:24:54 +0200 Subject: [PATCH 137/166] topic model intro v1 --- gensim/models/ensemblelda.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index dc79d696a0..1b78621bc1 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -7,11 +7,16 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. +"""Ensemble Latent Dirichlet Allocation (eLDA), an algorithm for extracting reliable topics. -Extract stable topics that are consistently learned across multiple LDA models, for higher reproducibility -and less noise. eLDA has the added benefit that the user does not need to know the exact number of topics the topic -model should extract ahead of time. +In topic modeling, the aim is to find a set of topics that represents global structure of a corpus of documents. One +issue that occur with topics extracted from an NMF or LDA model is reproducibility. That is, if the topic model is +trained repeatedly allowing only the random seed to change, would the same (or similar) topic representation be reliably +learned. + +Ensemble LDA addresses the issue by training an ensemble of topic models and throwing out topics that do not reoccur +across the ensemble. In this regard, the topics extracted are reliable and there is the added benefit over many topic +models that the user does not need to know the exact number of topics ahead of time. Usage examples -------------- From c032b20b72a2648a3bf8cf9922266047188e76de Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 31 May 2021 21:11:00 +0200 Subject: [PATCH 138/166] topic model intro v2 --- gensim/models/ensemblelda.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 1b78621bc1..5f170747e9 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -9,14 +9,14 @@ """Ensemble Latent Dirichlet Allocation (eLDA), an algorithm for extracting reliable topics. -In topic modeling, the aim is to find a set of topics that represents global structure of a corpus of documents. One -issue that occur with topics extracted from an NMF or LDA model is reproducibility. 
That is, if the topic model is +The aim of topic modelling is to find a set of topics that represent the global structure of a corpus of documents. One +issue that occurs with topics extracted from an NMF or LDA model is reproducibility. That is, if the topic model is trained repeatedly allowing only the random seed to change, would the same (or similar) topic representation be reliably -learned. +learned. Unreliable topics are undesireable because they are not a good representation of the corpus. Ensemble LDA addresses the issue by training an ensemble of topic models and throwing out topics that do not reoccur -across the ensemble. In this regard, the topics extracted are reliable and there is the added benefit over many topic -models that the user does not need to know the exact number of topics ahead of time. +across the ensemble. In this regard, the topics extracted are more reliable and there is the added benefit over many +topic models that the user does not need to know the exact number of topics ahead of time. Usage examples -------------- From 8c99b51cc1a1eb14b589bc37664b84db3041f07b Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 31 May 2021 21:31:35 +0200 Subject: [PATCH 139/166] topic model intro v3 --- gensim/models/ensemblelda.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 5f170747e9..14e67eeeeb 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -18,6 +18,12 @@ across the ensemble. In this regard, the topics extracted are more reliable and there is the added benefit over many topic models that the user does not need to know the exact number of topics ahead of time. +For more information see the link in the :ref:`Citation` below, watch our Machine Learning Prague 2019 talk `Solving +the Text Labeling challenge with EnsembleLDA and Active Learning +`_, +or view our `Machine Learning Summer School poster +`_. + Usage examples -------------- From 41e3591c050a56731a6561c36190eacc8a0c0702 Mon Sep 17 00:00:00 2001 From: aloosley Date: Mon, 31 May 2021 22:22:04 +0200 Subject: [PATCH 140/166] elda introduction with references --- docs/src/auto_examples/index.rst | 34 ++++++++++++++++---------------- gensim/models/ensemblelda.py | 5 +++-- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/docs/src/auto_examples/index.rst b/docs/src/auto_examples/index.rst index d3dd2291be..05643de00c 100644 --- a/docs/src/auto_examples/index.rst +++ b/docs/src/auto_examples/index.rst @@ -71,7 +71,7 @@ Understanding this functionality is vital for using gensim effectively. .. raw:: html -
+
.. only:: html @@ -92,7 +92,7 @@ Understanding this functionality is vital for using gensim effectively. .. raw:: html -
+
.. only:: html @@ -169,14 +169,14 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png - :alt: Ensemble LDA + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png + :alt: FastText Model - :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` + :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` .. raw:: html @@ -186,18 +186,18 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_ensemblelda + /auto_examples/tutorials/run_fasttext .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png - :alt: FastText Model + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png + :alt: Ensemble LDA - :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` + :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` .. raw:: html @@ -207,11 +207,11 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_fasttext + /auto_examples/tutorials/run_ensemblelda .. raw:: html -
+
.. only:: html @@ -309,7 +309,7 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. raw:: html -
+
.. only:: html @@ -330,7 +330,7 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. raw:: html -
+
 .. only:: html
 
@@ -447,13 +447,13 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from
 
     .. container:: sphx-glr-download sphx-glr-download-python
 
-      :download:`Download all examples in Python source code: auto_examples_python.zip `
+      :download:`Download all examples in Python source code: auto_examples_python.zip `
 
     .. container:: sphx-glr-download sphx-glr-download-jupyter
 
-      :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip `
+      :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip `
 
 .. only:: html
 
diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index 14e67eeeeb..318cccf05a 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -18,8 +18,7 @@
 across the ensemble. In this regard, the topics extracted are more reliable and there is the added benefit over many
 topic models that the user does not need to know the exact number of topics ahead of time.
 
-For more information see the link in the :ref:`Citation` below, watch our Machine Learning Prague 2019 talk `Solving
-the Text Labeling challenge with EnsembleLDA and Active Learning
-`_,
+For more information, see the :ref:`citation section <Citation>` below, watch our `Machine Learning Prague 2019 talk
+`_,
 or view our `Machine Learning Summer School poster
 `_.
 
@@ -86,6 +86,8 @@
     >>> elda.recluster(eps=0.2)
 
+.. _Citation:
+
 Citation
 --------
 
From 3ed4522972d81783ccfc736a7550743d14c57453 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Wed, 23 Jun 2021 23:05:14 +0200
Subject: [PATCH 141/166] Update gensim/corpora/opinosiscorpus.py

Co-authored-by: Michael Penkov

---
 gensim/corpora/opinosiscorpus.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/corpora/opinosiscorpus.py b/gensim/corpora/opinosiscorpus.py
index eaee6f035a..b4e25731ce 100644
--- a/gensim/corpora/opinosiscorpus.py
+++ b/gensim/corpora/opinosiscorpus.py
@@ -22,7 +22,7 @@
 from gensim.parsing.preprocessing import STOPWORDS
 
 
-class OpinosisCorpus():
+class OpinosisCorpus:
     """Creates a corpus and dictionary from the Opinosis dataset.
 
     http://kavita-ganesan.com/opinosis-opinion-dataset/
 
From c1d2d5762b782670d9574fddf00ec72f8d9855f8 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Wed, 23 Jun 2021 23:20:13 +0200
Subject: [PATCH 142/166] Update gensim/models/ensemblelda.py

Co-authored-by: Michael Penkov

---
 gensim/models/ensemblelda.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index 318cccf05a..80b64a0193 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -135,6 +135,8 @@
             Examples:
                 * 'ldamulticore' (default, recommended)
                 * 'lda'
+                * ldamodel.LdaModel
+                * ldamulticore.LdaMulticore
         ensemble_workers : int, optional
             Spawns that many processes and distributes the models from the ensemble to those as evenly as possible.
             num_models should be a multiple of ensemble_workers.
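For context, the two calling conventions described by the docstring lines added above look roughly as follows.
This is a minimal, illustrative sketch only: it reuses the ``common_corpus`` and ``common_dictionary`` fixtures
from ``gensim.test.utils`` (which the test suite further below also imports), and it assumes the string shorthands
map to the corresponding classes as the docstring suggests.

.. sourcecode:: pycon

    >>> from gensim.models import EnsembleLda, LdaModel
    >>> from gensim.test.utils import common_corpus, common_dictionary
    >>>
    >>> # string shorthand selecting the kind of topic model to train
    >>> elda = EnsembleLda(
    ...     corpus=common_corpus, id2word=common_dictionary,
    ...     num_topics=2, num_models=4, topic_model_class='lda',
    ... )
    >>>
    >>> # or pass the class object directly, as the tests do
    >>> elda = EnsembleLda(
    ...     corpus=common_corpus, id2word=common_dictionary,
    ...     num_topics=2, num_models=4, topic_model_class=LdaModel,
    ... )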
From 128c9dc25b864dd3eadbb0e7f5cff150db74200d Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 23 Jun 2021 23:20:40 +0200 Subject: [PATCH 143/166] update docstrings --- gensim/downloader.py | 4 ---- gensim/models/ensemblelda.py | 7 +++++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/gensim/downloader.py b/gensim/downloader.py index aac4d0edcf..8a4395440e 100644 --- a/gensim/downloader.py +++ b/gensim/downloader.py @@ -372,8 +372,6 @@ def _download(name): fname = "{f}.gz_0{p}".format(f=name, p=part) dst_path = os.path.join(tmp_dir, fname) - print('downloading', url_data) - print() urllib.urlretrieve( url_data, dst_path, reporthook=partial(_progress, part=part, total_parts=total_parts) @@ -395,8 +393,6 @@ def _download(name): url_data = "{base}/{fname}/{fname}.gz".format(base=DOWNLOAD_BASE_URL, fname=name) fname = "{fname}.gz".format(fname=name) dst_path = os.path.join(tmp_dir, fname) - print('downloading', url_data, 'to', dst_path, data_folder_dir) - print() urllib.urlretrieve(url_data, dst_path, reporthook=_progress) if _calculate_md5_checksum(dst_path) == _get_checksum(name): sys.stdout.write("\n") diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 318cccf05a..ade756bcf1 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -298,7 +298,6 @@ def get_topic_model_class(self): return self.topic_model_class def save(self, *args, **kwargs): - """See :meth:`gensim.utils.SaveLoad.save`.""" if self.get_topic_model_class() is not None: self.topic_model_module_string = self.topic_model_class.__module__ self.topic_model_class_string = self.topic_model_class.__name__ @@ -308,7 +307,11 @@ def save(self, *args, **kwargs): save.__doc__ = SaveLoad.save.__doc__ def convert_to_memory_friendly(self): - """Remove the stored gensim models and only keep their ttdas.""" + """Remove the stored gensim models and only keep their ttdas. + + This frees up memory, but you won't have access to the individual models anymore if you intended to use them + outside of the ensemble. + """ self.tms = [] self.memory_friendly_ttda = True From e07906a5523c0c35e60eeb52ca1d3cafef6fc6f4 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 23 Jun 2021 23:25:10 +0200 Subject: [PATCH 144/166] static functions --- gensim/models/ensemblelda.py | 260 ++++++++++++++++++----------------- 1 file changed, 133 insertions(+), 127 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index f8ea578fb2..8f70de146a 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -110,6 +110,131 @@ logger = logging.getLogger(__name__) +def _is_valid_core(topic): + """Check if the topic is a valid core. + + Parameters + ---------- + topic : {'is_core', 'valid_parents', 'label'} + topic to validate + + """ + return topic["is_core"] and (topic["valid_parents"] == {topic["label"]}) + + +def _remove_from_all_sets(label, clusters): + """Remove a label from every set in "neighboring_labels" for each core in ``clusters``.""" + for cluster in clusters: + for neighboring_labels_set in cluster["neighboring_labels"]: + if label in neighboring_labels_set: + neighboring_labels_set.remove(label) + + +def _contains_isolated_cores(label, cluster, min_cores): + """Check if the cluster has at least ``min_cores`` of cores that belong to no other cluster.""" + return sum(map(lambda x: x == {label}, cluster["neighboring_labels"])) >= min_cores + + +def _aggregate_topics(grouped_by_labels): + """Aggregate the labeled topics to a list of clusters. 
+
+    Parameters
+    ----------
+    grouped_by_labels : dict of (int, list of {'is_core', 'num_neighboring_labels', 'neighboring_labels'})
+        The return value of _group_by_labels. A mapping of the label to a list of each topic which belongs to the
+        label.
+
+    Returns
+    -------
+    list of {'max_num_neighboring_labels', 'neighboring_labels', 'label'}
+        max_num_neighboring_labels is the max number of parent labels among each topic of a given cluster. label
+        refers to the label identifier of the cluster. neighboring_labels is a concatenated list of the
+        neighboring_labels sets of each topic. It's sorted by max_num_neighboring_labels in descending
+        order. There is one single element for each cluster.
+
+    """
+    clusters = []
+
+    for label, group in grouped_by_labels.items():
+        max_num_neighboring_labels = 0
+        neighboring_labels = []  # will be a list of sets
+
+        for topic in group:
+            max_num_neighboring_labels = max(topic["num_neighboring_labels"], max_num_neighboring_labels)
+            neighboring_labels.append(topic["neighboring_labels"])
+
+        neighboring_labels = [x for x in neighboring_labels if len(x) > 0]
+
+        clusters.append({
+            "max_num_neighboring_labels": max_num_neighboring_labels,
+            "neighboring_labels": neighboring_labels,
+            "label": label,
+            "num_cores": len([topic for topic in group if topic["is_core"]]),
+        })
+
+    logger.info("found %s clusters", len(clusters))
+
+    return clusters
+
+
+def _group_by_labels(results):
+    """Group all the learned cores by their label, which was assigned in the cluster_model.
+
+    Parameters
+    ----------
+    results : {list of {'is_core', 'neighboring_labels', 'label'}}
+        After calling .fit on a CBDBSCAN model, the results can be retrieved from it by accessing the .results
+        member, which can be used as the argument to this function. It's a list of infos gathered during
+        the clustering step and each element in the list corresponds to a single topic.
+
+    Returns
+    -------
+    dict of (int, list of {'is_core', 'num_neighboring_labels', 'neighboring_labels'})
+        A mapping of the label to a list of topics that belong to that particular label. Also adds
+        a new member to each topic called num_neighboring_labels, which is the number of
+        neighboring_labels of that topic.
+
+    """
+    grouped_by_labels = {}
+    for topic in results:
+        if topic["is_core"]:
+            topic = topic.copy()
+
+            # counts how many different labels a core has as parents
+            topic["num_neighboring_labels"] = len(topic["neighboring_labels"])
+
+            label = topic["label"]
+            if label not in grouped_by_labels:
+                grouped_by_labels[label] = []
+            grouped_by_labels[label].append(topic)
+    return grouped_by_labels
+
+
+def _teardown(pipes, processes, i):
+    """Close pipes and terminate processes.
+
+    Parameters
+    ----------
+    pipes : {list of :class:`multiprocessing.Pipe`}
+        list of pipes that the processes use to communicate with the parent
+    processes : {list of :class:`multiprocessing.Process`}
+        list of worker processes
+    i : int
+        index of the process that could not be started
+
+    """
+    logger.error(f"could not start process {i}")
+
+    for parent_conn, child_conn in pipes:
+        child_conn.close()
+        parent_conn.close()
+
+    for process in processes:
+        if process.is_alive():
+            process.terminate()
+        del process
+
+
 class EnsembleLda(SaveLoad):
     """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble.
@@ -540,30 +665,6 @@ def add_model(self, target, num_new_models=None): # tell recluster that the distance matrix needs to be regenerated self.asymmetric_distance_matrix_outdated = True - def _teardown(self, pipes, processes, i): - """Close pipes and terminate processes. - - Parameters - ---------- - pipes : {list of :class:`multiprocessing.Pipe`} - list of pipes that the processes use to communicate with the parent - processes : {list of :class:`multiprocessing.Process`} - list of worker processes - i : int - index of the process that could not be started - - """ - logger.error(f"could not start process {i}") - - for parent_conn, child_conn in pipes: - child_conn.close() - parent_conn.close() - - for process in processes: - if process.is_alive(): - process.terminate() - del process - def _generate_topic_models_multiproc(self, num_models, ensemble_workers): """Generate the topic models to form the ensemble in a multiprocessed way. @@ -626,7 +727,7 @@ def _generate_topic_models_multiproc(self, num_models, ensemble_workers): num_models_unhandled -= num_subprocess_models except ProcessError: - self._teardown(pipes, processes, i) + _teardown(pipes, processes, i) raise ProcessError # aggregate results @@ -775,7 +876,7 @@ def _generate_asymmetric_distance_matrix(self): process.start() except ProcessError: - self._teardown(pipes, processes, i) + _teardown(pipes, processes, i) raise ProcessError distances = [] @@ -901,101 +1002,6 @@ def _generate_topic_clusters(self, eps=0.1, min_samples=None): self.cluster_model = CBDBSCAN(eps=eps, min_samples=min_samples) self.cluster_model.fit(self.asymmetric_distance_matrix) - def _is_valid_core(self, topic): - """Check if the topic is a valid core. - - Parameters - ---------- - topic : {'is_core', 'valid_parents', 'label'} - topic to validate - - """ - return topic["is_core"] and (topic["valid_parents"] == {topic["label"]}) - - def _group_by_labels(self, results): - """Group all the learned cores by their label, which was assigned in the cluster_model. - - Parameters - ---------- - results : {list of {'is_core', 'neighboring_labels', 'label'}} - After calling .fit on a CBDBSCAN model, the results can be retrieved from it by accessing the .results - member, which can be used as the argument to this function. It's a list of infos gathered during - the clustering step and each element in the list corresponds to a single topic. - - Returns - ------- - dict of (int, list of {'is_core', 'num_neighboring_labels', 'neighboring_labels'}) - A mapping of the label to a list of topics that belong to that particular label. Also adds - a new member to each topic called num_neighboring_labels, which is the number of - neighboring_labels of that topic. - - """ - grouped_by_labels = {} - for topic in results: - if topic["is_core"]: - topic = topic.copy() - - # counts how many different labels a core has as parents - topic["num_neighboring_labels"] = len(topic["neighboring_labels"]) - - label = topic["label"] - if label not in grouped_by_labels: - grouped_by_labels[label] = [] - grouped_by_labels[label].append(topic) - return grouped_by_labels - - def _aggregate_topics(self, grouped_by_labels): - """Aggregate the labeled topics to a list of clusters. - - Parameters - ---------- - grouped_by_labels : dict of (int, list of {'is_core', 'num_neighboring_labels', 'neighboring_labels'}) - The return value of _group_by_labels. A mapping of the label to a list of each topic which belongs to the - label. 
- - Returns - ------- - list of {'max_num_neighboring_labels', 'neighboring_labels', 'label'} - max_num_neighboring_labels is the max number of parent labels among each topic of a given cluster. label - refers to the label identifier of the cluster. neighboring_labels is a concatenated list of the - neighboring_labels sets of each topic. Its sorted by max_num_neighboring_labels in descending - order. There is one single element for each cluster. - - """ - clusters = [] - - for label, group in grouped_by_labels.items(): - max_num_neighboring_labels = 0 - neighboring_labels = [] # will be a list of sets - - for topic in group: - max_num_neighboring_labels = max(topic["num_neighboring_labels"], max_num_neighboring_labels) - neighboring_labels.append(topic["neighboring_labels"]) - - neighboring_labels = [x for x in neighboring_labels if len(x) > 0] - - clusters.append({ - "max_num_neighboring_labels": max_num_neighboring_labels, - "neighboring_labels": neighboring_labels, - "label": label, - "num_cores": len([topic for topic in group if topic["is_core"]]), - }) - - logger.info("found %s clusters", len(clusters)) - - return clusters - - def _remove_from_all_sets(self, label, clusters): - """Remove a label from every set in "neighboring_labels" for each core in ``clusters``.""" - for cluster in clusters: - for neighboring_labels_set in cluster["neighboring_labels"]: - if label in neighboring_labels_set: - neighboring_labels_set.remove(label) - - def _contains_isolated_cores(self, label, cluster, min_cores): - """Check if the cluster has at least ``min_cores`` of cores that belong to no other cluster.""" - return sum(map(lambda x: x == {label}, cluster["neighboring_labels"])) >= min_cores - def _generate_stable_topics(self, min_cores=None): """Generate stable topics out of the clusters. @@ -1028,9 +1034,9 @@ def _generate_stable_topics(self, min_cores=None): results = self.cluster_model.results - grouped_by_labels = self._group_by_labels(results) + grouped_by_labels = _group_by_labels(results) - clusters = self._aggregate_topics(grouped_by_labels) + clusters = _aggregate_topics(grouped_by_labels) # Clusters with noisy invalid neighbors may have a harder time being marked as stable, so start with the # easy ones and potentially already remove some noise by also sorting smaller clusters to the front. @@ -1049,17 +1055,17 @@ def _generate_stable_topics(self, min_cores=None): cluster["is_valid"] = None if cluster["num_cores"] < min_cores: cluster["is_valid"] = False - self._remove_from_all_sets(cluster["label"], sorted_clusters) + _remove_from_all_sets(cluster["label"], sorted_clusters) # now that invalid clusters are removed, check which clusters contain enough cores that don't belong to any # other cluster. 
for cluster in [cluster for cluster in sorted_clusters if cluster["is_valid"] is None]: label = cluster["label"] - if self._contains_isolated_cores(label, cluster, min_cores): + if _contains_isolated_cores(label, cluster, min_cores): cluster["is_valid"] = True else: cluster["is_valid"] = False - self._remove_from_all_sets(label, sorted_clusters) + _remove_from_all_sets(label, sorted_clusters) # list of all the label numbers that are valid valid_labels = np.array([cluster["label"] for cluster in sorted_clusters if cluster["is_valid"]]) @@ -1068,7 +1074,7 @@ def _generate_stable_topics(self, min_cores=None): topic["valid_parents"] = {label for label in topic["neighboring_labels"] if label in valid_labels} # keeping only VALID cores - valid_core_mask = np.vectorize(self._is_valid_core)(results) + valid_core_mask = np.vectorize(_is_valid_core)(results) valid_topics = self.ttda[valid_core_mask] topic_labels = np.array([topic["label"] for topic in results])[valid_core_mask] unique_labels = np.unique(topic_labels) From 8647aea9a30b1089dbd9c687846d8584684b37a0 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 23 Jun 2021 23:26:39 +0200 Subject: [PATCH 145/166] module-level constant --- gensim/models/ensemblelda.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 8f70de146a..7e0fe76f59 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -109,6 +109,10 @@ logger = logging.getLogger(__name__) +# _COSINE_DISTANCE_CALCULATION_THRESHOLD is used so that cosine distance calculations can be sped up by skipping +# distance calculations for highly masked topic-term distributions +_COSINE_DISTANCE_CALCULATION_THRESHOLD = 0.05 + def _is_valid_core(topic): """Check if the topic is a valid core. @@ -319,10 +323,6 @@ def __init__( # nps max random state of 2**32 - 1 is too large for windows: self._MAX_RANDOM_STATE = np.iinfo(np.int32).max - # _COSINE_DISTANCE_CALCULATION_THRESHOLD is used so that cosine distance calculations can be sped up by skipping - # distance calculations for highly masked topic-term distributions - self._COSINE_DISTANCE_CALCULATION_THRESHOLD = 0.05 - if "id2word" not in gensim_kw_args: gensim_kw_args["id2word"] = None if "corpus" not in gensim_kw_args: @@ -967,7 +967,7 @@ def rank_masking(a): # Smart distance calculation avoids calculating cosine distance for highly masked topic-term # distributions that will have distance values near 1. 
- if ttd2_masked.sum() <= self._COSINE_DISTANCE_CALCULATION_THRESHOLD: + if ttd2_masked.sum() <= _COSINE_DISTANCE_CALCULATION_THRESHOLD: distance = 1 else: distance = cosine(ttd1_masked, ttd2_masked) From 5109799d1432c99767bc0beda51709394042ffe5 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 23 Jun 2021 23:28:35 +0200 Subject: [PATCH 146/166] assert and no return --- gensim/models/ensemblelda.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 7e0fe76f59..aa610412eb 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -840,7 +840,6 @@ def _generate_asymmetric_distance_matrix(self): self.asymmetric_distance_matrix = self._calculate_asymmetric_distance_matrix_chunk( ttda1=self.ttda, ttda2=self.ttda, threshold=threshold, start_index=0, method=method, ) - return self.asymmetric_distance_matrix # else, if workers > 1 use multiprocessing # best performance on 2-core machine: 2 workers @@ -894,8 +893,6 @@ def _generate_asymmetric_distance_matrix(self): self.asymmetric_distance_matrix = np.concatenate(distances) - return self.asymmetric_distance_matrix - def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, start_index, method): """Calculate an (asymmetric) distance from each topic in ``ttda1`` to each topic in ``ttda2``. @@ -1140,7 +1137,7 @@ def get_topics(self): """ return self.stable_topics - def _has_gensim_representation(self): + def _ensure_gensim_representation(self): """Check if stable topics and the internal gensim representation exist. Raise an error if not.""" if self.classic_model_representation is None: if len(self.stable_topics) == 0: @@ -1150,22 +1147,22 @@ def _has_gensim_representation(self): def __getitem__(self, i): """See :meth:`gensim.models.LdaModel.__getitem__`.""" - self._has_gensim_representation() + self._ensure_gensim_representation() return self.classic_model_representation[i] def inference(self, *posargs, **kwargs): """See :meth:`gensim.models.LdaModel.inference`.""" - self._has_gensim_representation() + self._ensure_gensim_representation() return self.classic_model_representation.inference(*posargs, **kwargs) def log_perplexity(self, *posargs, **kwargs): """See :meth:`gensim.models.LdaModel.log_perplexity`.""" - self._has_gensim_representation() + self._ensure_gensim_representation() return self.classic_model_representation.log_perplexity(*posargs, **kwargs) def print_topics(self, *posargs, **kwargs): """See :meth:`gensim.models.LdaModel.print_topics`.""" - self._has_gensim_representation() + self._ensure_gensim_representation() return self.classic_model_representation.print_topics(*posargs, **kwargs) @property From 0dd9f29c76cbd16b7a1f78631b9873d39fa6d185 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 23 Jun 2021 23:30:54 +0200 Subject: [PATCH 147/166] static _calculate_asymmetric_distance_matrix_chunk --- gensim/models/ensemblelda.py | 171 ++++++++++++++++++----------------- 1 file changed, 86 insertions(+), 85 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index aa610412eb..126db88622 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -239,6 +239,90 @@ def _teardown(pipes, processes, i): del process +def _calculate_asymmetric_distance_matrix_chunk(ttda1, ttda2, threshold, start_index, method): + """Calculate an (asymmetric) distance from each topic in ``ttda1`` to each topic in ``ttda2``. 
+
+    Parameters
+    ----------
+    ttda1 and ttda2: 2D arrays of floats
+        Two ttda matrices that are going to be used for distance calculation. Each row in ttda corresponds to one
+        topic. Each cell in the resulting matrix corresponds to the distance between a topic pair.
+    threshold : float, optional
+        threshold defaults to: ``{"mass": 0.95, "rank": 0.11}``, depending on the selected method
+    start_index : int
+        this function might be used in multiprocessing, so start_index has to be set as ttda1 is a chunk of the
+        complete ttda in that case. start_index would be 0 if ``ttda1 == self.ttda``. When self.ttda is split into
+        two pieces, each 100 ttdas long, then start_index should be 100. default is 0
+    method : {'mass', 'rank'}, optional
+        method can be "mass" for the original masking method or "rank" for a faster masking method that selects
+        by rank of largest elements in the topic term distribution, to determine which tokens are relevant for the
+        topic.
+
+    Returns
+    -------
+    2D numpy.ndarray of floats
+        Asymmetric distance matrix of size ``len(ttda1)`` by ``len(ttda2)``.
+
+    """
+    # initialize the distance matrix. ndarray is faster than zeros
+    distances = np.ndarray((len(ttda1), len(ttda2)))
+
+    if ttda1.shape[0] > 0 and ttda2.shape[0] > 0:
+        # the worker might not have received a ttda because it was chunked up too much
+
+        if method not in ["mass", "rank"]:
+            raise ValueError(f"method {method} unknown")
+
+        # select masking method:
+        def mass_masking(a):
+            """Original masking method. Returns a new binary mask."""
+            sorted_a = np.sort(a)[::-1]
+            largest_mass = sorted_a.cumsum() < threshold
+            smallest_valid = sorted_a[largest_mass][-1]
+            return a >= smallest_valid
+
+        def rank_masking(a):
+            """Faster masking method. Returns a new binary mask."""
+            return a > np.sort(a)[::-1][int(len(a) * threshold)]
+
+        create_mask = {"mass": mass_masking, "rank": rank_masking}[method]
+
+        # some help to find a better threshold by useful log messages
+        avg_mask_size = 0
+
+        # now iterate over each topic
+        for ttd1_idx, ttd1 in enumerate(ttda1):
+            # create mask from ttd1 that removes noise from a and keeps the largest terms
+            mask = create_mask(ttd1)
+            ttd1_masked = ttd1[mask]
+
+            avg_mask_size += mask.sum()
+
+            # now look at every possible pair for topic a:
+            for ttd2_idx, ttd2 in enumerate(ttda2):
+                # distance to itself is 0
+                if ttd1_idx + start_index == ttd2_idx:
+                    distances[ttd1_idx][ttd2_idx] = 0
+                    continue
+
+                # now mask b based on a, which will force the shape of a onto b
+                ttd2_masked = ttd2[mask]
+
+                # Smart distance calculation avoids calculating cosine distance for highly masked topic-term
+                # distributions that will have distance values near 1.
+                if ttd2_masked.sum() <= _COSINE_DISTANCE_CALCULATION_THRESHOLD:
+                    distance = 1
+                else:
+                    distance = cosine(ttd1_masked, ttd2_masked)
+
+                distances[ttd1_idx][ttd2_idx] = distance
+
+        percent = round(100 * avg_mask_size / ttda1.shape[0] / ttda1.shape[1], 1)
+        logger.info(f'the given threshold of {threshold} covered on average {percent}% of tokens')
+
+    return distances
+
+
 class EnsembleLda(SaveLoad):
     """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble.
@@ -809,7 +893,7 @@ def _asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pip logger.info(f"spawned worker to generate {n_ttdas} rows of the asymmetric distance matrix") # the chunk of ttda that's going to be calculated: ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas] - distance_chunk = self._calculate_asymmetric_distance_matrix_chunk( + distance_chunk = _calculate_asymmetric_distance_matrix_chunk( ttda1=ttda1, ttda2=self.ttda, threshold=threshold, start_index=ttdas_sent, method=method, ) pipe.send((worker_id, distance_chunk)) # remember that this code is inside the workers memory @@ -837,7 +921,7 @@ def _generate_asymmetric_distance_matrix(self): # singlecore if workers is not None and workers <= 1: - self.asymmetric_distance_matrix = self._calculate_asymmetric_distance_matrix_chunk( + self.asymmetric_distance_matrix = _calculate_asymmetric_distance_matrix_chunk( ttda1=self.ttda, ttda2=self.ttda, threshold=threshold, start_index=0, method=method, ) @@ -893,89 +977,6 @@ def _generate_asymmetric_distance_matrix(self): self.asymmetric_distance_matrix = np.concatenate(distances) - def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, threshold, start_index, method): - """Calculate an (asymmetric) distance from each topic in ``ttda1`` to each topic in ``ttda2``. - - Parameters - ---------- - ttda1 and ttda2: 2D arrays of floats - Two ttda matrices that are going to be used for distance calculation. Each row in ttda corresponds to one - topic. Each cell in the resulting matrix corresponds to the distance between a topic pair. - threshold : float, optional - threshold defaults to: ``{"mass": 0.95, "rank": 0.11}``, depending on the selected method - start_index : int - this function might be used in multiprocessing, so start_index has to be set as ttda1 is a chunk of the - complete ttda in that case. start_index would be 0 if ``ttda1 == self.ttda``. When self.ttda is split into - two pieces, each 100 ttdas long, then start_index should be be 100. default is 0 - method : {'mass', 'rank}, optional - method can be "mass" for the original masking method or "rank" for a faster masking method that selects - by rank of largest elements in the topic term distribution, to determine which tokens are relevant for the - topic. - - Returns - ------- - 2D numpy.ndarray of floats - Asymmetric distance matrix of size ``len(ttda1)`` by ``len(ttda2)``. - - """ - # initialize the distance matrix. ndarray is faster than zeros - distances = np.ndarray((len(ttda1), len(ttda2))) - - if ttda1.shape[0] > 0 and ttda2.shape[0] > 0: - # the worker might not have received a ttda because it was chunked up too much - - if method not in ["mass", "rank"]: - raise ValueError(f"method {method} unknown") - - # select masking method: - def mass_masking(a): - """Original masking method. Returns a new binary mask.""" - sorted_a = np.sort(a)[::-1] - largest_mass = sorted_a.cumsum() < threshold - smallest_valid = sorted_a[largest_mass][-1] - return a >= smallest_valid - - def rank_masking(a): - """Faster masking method. 
Returns a new binary mask.""" - return a > np.sort(a)[::-1][int(len(a) * threshold)] - - create_mask = {"mass": mass_masking, "rank": rank_masking}[method] - - # some help to find a better threshold by useful log messages - avg_mask_size = 0 - - # now iterate over each topic - for ttd1_idx, ttd1 in enumerate(ttda1): - # create mask from ttd1 that removes noise from a and keeps the largest terms - mask = create_mask(ttd1) - ttd1_masked = ttd1[mask] - - avg_mask_size += mask.sum() - - # now look at every possible pair for topic a: - for ttd2_idx, ttd2 in enumerate(ttda2): - # distance to itself is 0 - if ttd1_idx + start_index == ttd2_idx: - distances[ttd1_idx][ttd2_idx] = 0 - continue - - # now mask b based on a, which will force the shape of a onto b - ttd2_masked = ttd2[mask] - - # Smart distance calculation avoids calculating cosine distance for highly masked topic-term - # distributions that will have distance values near 1. - if ttd2_masked.sum() <= _COSINE_DISTANCE_CALCULATION_THRESHOLD: - distance = 1 - else: - distance = cosine(ttd1_masked, ttd2_masked) - - distances[ttd1_idx][ttd2_idx] = distance - - percent = round(100 * avg_mask_size / ttda1.shape[0] / ttda1.shape[1], 1) - logger.info(f'the given threshold of {threshold} covered on average {percent}% of tokens') - - return distances - def _generate_topic_clusters(self, eps=0.1, min_samples=None): """Run the CBDBSCAN algorithm on all the detected topics and label them with label-indices. From 78a16b4855d46d520639ecf3056ee800b60008cc Mon Sep 17 00:00:00 2001 From: aloosley Date: Wed, 30 Jun 2021 21:09:04 +0200 Subject: [PATCH 148/166] better variable names and data types --- gensim/models/ensemblelda.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 126db88622..17654fc676 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -181,12 +181,13 @@ def _aggregate_topics(grouped_by_labels): return clusters -def _group_by_labels(results): +def _group_by_labels(cbdbscan_topics): """Group all the learned cores by their label, which was assigned in the cluster_model. Parameters ---------- - results : {list of {'is_core', 'neighboring_labels', 'label'}} + cbdbscan_topics : {list of {'is_core', 'neighboring_labels', 'label'}} + A list of topic data resulting from fitting a :class:`~CBDBSCAN` object. After calling .fit on a CBDBSCAN model, the results can be retrieved from it by accessing the .results member, which can be used as the argument to this function. It's a list of infos gathered during the clustering step and each element in the list corresponds to a single topic. 
@@ -200,7 +201,7 @@ def _group_by_labels(results): """ grouped_by_labels = {} - for topic in results: + for topic in cbdbscan_topics: if topic["is_core"]: topic = topic.copy() @@ -1030,9 +1031,9 @@ def _generate_stable_topics(self, min_cores=None): else: logger.info("generating stable topics") - results = self.cluster_model.results + cbdbscan_topics = self.cluster_model.results - grouped_by_labels = _group_by_labels(results) + grouped_by_labels = _group_by_labels(cbdbscan_topics) clusters = _aggregate_topics(grouped_by_labels) @@ -1066,15 +1067,15 @@ def _generate_stable_topics(self, min_cores=None): _remove_from_all_sets(label, sorted_clusters) # list of all the label numbers that are valid - valid_labels = np.array([cluster["label"] for cluster in sorted_clusters if cluster["is_valid"]]) + valid_labels = {cluster["label"] for cluster in sorted_clusters if cluster["is_valid"]} - for topic in results: + for topic in cbdbscan_topics: topic["valid_parents"] = {label for label in topic["neighboring_labels"] if label in valid_labels} # keeping only VALID cores - valid_core_mask = np.vectorize(_is_valid_core)(results) + valid_core_mask = np.vectorize(_is_valid_core)(cbdbscan_topics) valid_topics = self.ttda[valid_core_mask] - topic_labels = np.array([topic["label"] for topic in results])[valid_core_mask] + topic_labels = np.array([topic["label"] for topic in cbdbscan_topics])[valid_core_mask] unique_labels = np.unique(topic_labels) num_stable_topics = len(unique_labels) From 8c10975814ff6bfe905f5038592d018c8ffc5bf4 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 30 Jun 2021 21:45:00 +0200 Subject: [PATCH 149/166] refactoring --- gensim/models/ensemblelda.py | 110 +++++++++++++++++--------------- gensim/test/test_ensemblelda.py | 4 +- 2 files changed, 62 insertions(+), 52 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 17654fc676..71c9874c2e 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -115,15 +115,15 @@ def _is_valid_core(topic): - """Check if the topic is a valid core. + """Check if the topic is a valid core, i.e. no neighboring valid cluster is overlapping with it. Parameters ---------- - topic : {'is_core', 'valid_parents', 'label'} + topic : {'is_core', 'valid_neighboring_labels', 'label'} topic to validate """ - return topic["is_core"] and (topic["valid_parents"] == {topic["label"]}) + return topic["is_core"] and (topic["valid_neighboring_labels"] == {topic["label"]}) def _remove_from_all_sets(label, clusters): @@ -201,6 +201,7 @@ def _group_by_labels(cbdbscan_topics): """ grouped_by_labels = {} + for topic in cbdbscan_topics: if topic["is_core"]: topic = topic.copy() @@ -212,6 +213,7 @@ def _group_by_labels(cbdbscan_topics): if label not in grouped_by_labels: grouped_by_labels[label] = [] grouped_by_labels[label].append(topic) + return grouped_by_labels @@ -240,6 +242,19 @@ def _teardown(pipes, processes, i): del process +def mass_masking(a, threshold): + """Original masking method. Returns a new binary mask.""" + sorted_a = np.sort(a)[::-1] + largest_mass = sorted_a.cumsum() < threshold + smallest_valid = sorted_a[largest_mass][-1] + return a >= smallest_valid + + +def rank_masking(a, threshold): + """Faster masking method. Returns a new binary mask.""" + return a > np.sort(a)[::-1][int(len(a) * threshold)] + + def _calculate_asymmetric_distance_matrix_chunk(ttda1, ttda2, threshold, start_index, method): """Calculate an (asymmetric) distance from each topic in ``ttda1`` to each topic in ``ttda2``. 
@@ -275,17 +290,6 @@ def _calculate_asymmetric_distance_matrix_chunk(ttda1, ttda2, threshold, start_i raise ValueError(f"method {method} unknown") # select masking method: - def mass_masking(a): - """Original masking method. Returns a new binary mask.""" - sorted_a = np.sort(a)[::-1] - largest_mass = sorted_a.cumsum() < threshold - smallest_valid = sorted_a[largest_mass][-1] - return a >= smallest_valid - - def rank_masking(a): - """Faster masking method. Returns a new binary mask.""" - return a > np.sort(a)[::-1][int(len(a) * threshold)] - create_mask = {"mass": mass_masking, "rank": rank_masking}[method] # some help to find a better threshold by useful log messages @@ -294,7 +298,7 @@ def rank_masking(a): # now iterate over each topic for ttd1_idx, ttd1 in enumerate(ttda1): # create mask from ttd1 that removes noise from a and keeps the largest terms - mask = create_mask(ttd1) + mask = create_mask(ttd1, threshold) ttd1_masked = ttd1[mask] avg_mask_size += mask.sum() @@ -324,6 +328,40 @@ def rank_masking(a): return distances +def _validate_clusters(clusters, min_cores): + """Check which clusters from the cbdbscan step are significant enough. is_valid is set accordingly.""" + # Clusters with noisy invalid neighbors may have a harder time being marked as stable, so start with the + # easy ones and potentially already remove some noise by also sorting smaller clusters to the front. + # This clears up the clusters a bit before checking the ones with many neighbors. + sorted_clusters = sorted( + clusters, + key=lambda cluster: ( + cluster["max_num_neighboring_labels"], + cluster["num_cores"], + cluster["label"], + ), + reverse=False, + ) + + for cluster in sorted_clusters: + cluster["is_valid"] = None + if cluster["num_cores"] < min_cores: + cluster["is_valid"] = False + _remove_from_all_sets(cluster["label"], sorted_clusters) + + # now that invalid clusters are removed, check which clusters contain enough cores that don't belong to any + # other cluster. + for cluster in [cluster for cluster in sorted_clusters if cluster["is_valid"] is None]: + label = cluster["label"] + if _contains_isolated_cores(label, cluster, min_cores): + cluster["is_valid"] = True + else: + cluster["is_valid"] = False + _remove_from_all_sets(label, sorted_clusters) + + return [cluster for cluster in sorted_clusters if cluster["is_valid"]] + + class EnsembleLda(SaveLoad): """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. @@ -1034,43 +1072,15 @@ def _generate_stable_topics(self, min_cores=None): cbdbscan_topics = self.cluster_model.results grouped_by_labels = _group_by_labels(cbdbscan_topics) - clusters = _aggregate_topics(grouped_by_labels) - - # Clusters with noisy invalid neighbors may have a harder time being marked as stable, so start with the - # easy ones and potentially already remove some noise by also sorting smaller clusters to the front. - # This clears up the clusters a bit before checking the ones with many neighbors. - sorted_clusters = sorted( - clusters, - key=lambda cluster: ( - cluster["max_num_neighboring_labels"], - cluster["num_cores"], - cluster["label"], - ), - reverse=False, - ) - - for cluster in sorted_clusters: - cluster["is_valid"] = None - if cluster["num_cores"] < min_cores: - cluster["is_valid"] = False - _remove_from_all_sets(cluster["label"], sorted_clusters) - - # now that invalid clusters are removed, check which clusters contain enough cores that don't belong to any - # other cluster. 
- for cluster in [cluster for cluster in sorted_clusters if cluster["is_valid"] is None]: - label = cluster["label"] - if _contains_isolated_cores(label, cluster, min_cores): - cluster["is_valid"] = True - else: - cluster["is_valid"] = False - _remove_from_all_sets(label, sorted_clusters) - - # list of all the label numbers that are valid - valid_labels = {cluster["label"] for cluster in sorted_clusters if cluster["is_valid"]} + valid_clusters = _validate_clusters(clusters, min_cores) + valid_cluster_labels = {cluster["label"] for cluster in valid_clusters} for topic in cbdbscan_topics: - topic["valid_parents"] = {label for label in topic["neighboring_labels"] if label in valid_labels} + topic["valid_neighboring_labels"] = { + label for label in topic["neighboring_labels"] + if label in valid_cluster_labels + } # keeping only VALID cores valid_core_mask = np.vectorize(_is_valid_core)(cbdbscan_topics) @@ -1087,7 +1097,7 @@ def _generate_stable_topics(self, min_cores=None): topics_of_cluster = np.array([topic for t, topic in enumerate(valid_topics) if topic_labels[t] == label]) stable_topics[l] = topics_of_cluster.mean(axis=0) - self.sorted_clusters = sorted_clusters + self.valid_clusters = valid_clusters self.stable_topics = stable_topics logger.info("found %s stable topics", len(stable_topics)) diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 8d021d3658..4288f9ef98 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -84,7 +84,7 @@ def test_clustering(self): reference = EnsembleLda.load(datapath('ensemblelda')) cluster_model_results = deepcopy(reference.cluster_model.results) - sorted_clusters = deepcopy(reference.sorted_clusters) + valid_clusters = deepcopy(reference.valid_clusters) stable_topics = deepcopy(reference.get_topics()) # continue training with the distance matrix of the pretrained reference and see if @@ -93,7 +93,7 @@ def test_clustering(self): reference.recluster() self.assert_cluster_results_equal(reference.cluster_model.results, cluster_model_results) - self.assertEqual(reference.sorted_clusters, sorted_clusters) + self.assertEqual(reference.valid_clusters, valid_clusters) np.testing.assert_allclose(reference.get_topics(), stable_topics, rtol=rtol) def test_not_trained(self): From dbb758121d8177d67bb362c6669782d4d52c4ebe Mon Sep 17 00:00:00 2001 From: aloosley Date: Wed, 30 Jun 2021 21:26:23 +0200 Subject: [PATCH 150/166] pythonic varnames + pytest style asserts --- gensim/models/ensemblelda.py | 4 +- gensim/test/test_ensemblelda.py | 347 ++++++++++++++++---------------- 2 files changed, 177 insertions(+), 174 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 17654fc676..e480ba07a2 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -327,7 +327,7 @@ def rank_masking(a): class EnsembleLda(SaveLoad): """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble. - Extracts stable topics that are consistently learned accross multiple LDA models. eLDA has the added benefit that + Extracts stable topics that are consistently learned across multiple LDA models. eLDA has the added benefit that the user does not need to know the exact number of topics the topic model should extract ahead of time. 
""" @@ -454,7 +454,7 @@ def __init__( self.sstats_sum = 0 self.eta = None self.tms = [] - # initialize empty topic term distribution array + # initialize empty 2D topic term distribution array (ttda) (number of topics x number of terms) self.ttda = np.empty((0, len(gensim_kw_args["id2word"]))) self.asymmetric_distance_matrix_outdated = True diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 8d021d3658..44afb0e4a6 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -14,15 +14,17 @@ import numpy as np from copy import deepcopy +import pytest + from gensim.models import EnsembleLda, LdaMulticore, LdaModel from gensim.test.utils import datapath, get_tmpfile, common_corpus, common_dictionary -num_topics = 2 -num_models = 4 -passes = 50 +NUM_TOPICS = 2 +NUM_MODELS = 4 +PASSES = 50 # windows tests fail due to the required assertion precision being too high -rtol = 1e-04 if os.name == 'nt' else 1e-05 +RTOL = 1e-04 if os.name == 'nt' else 1e-05 class TestModel(unittest.TestCase): @@ -31,49 +33,49 @@ def setUp(self): # the topics are equal random_state = 0 - self.eLDA = EnsembleLda( - corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - passes=passes, num_models=num_models, random_state=random_state, + self.elda = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, num_topics=NUM_TOPICS, + passes=PASSES, num_models=NUM_MODELS, random_state=random_state, topic_model_class=LdaModel, ) - self.eLDA_mu = EnsembleLda( - corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - passes=passes, num_models=num_models, random_state=random_state, + self.elda_mem_unfriendly = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, num_topics=NUM_TOPICS, + passes=PASSES, num_models=NUM_MODELS, random_state=random_state, memory_friendly_ttda=False, topic_model_class=LdaModel, ) - def check_ttda(self, ensemble): - """tests the integrity of the ttda of any ensemble""" - self.assertGreater(len(ensemble.ttda), 0) - a = ensemble.ttda.sum(axis=1) - b = np.ones(len(ensemble.ttda)).astype(np.float32) - np.testing.assert_allclose(a, b, rtol=1e-04) + def assert_ttda_is_valid(self, ensemble): + """Check that ttda has one or more topic and that term probabilities add to one.""" + assert len(ensemble.ttda) > 0 + sum_over_terms = ensemble.ttda.sum(axis=1) + expected_sum_over_terms = np.ones(len(ensemble.ttda)).astype(np.float32) + np.testing.assert_allclose(sum_over_terms, expected_sum_over_terms, rtol=1e-04) - def test_eLDA(self): + def test_elda(self): # given that the random_state doesn't change, it should # always be 2 detected topics in this setup. 
- self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) - self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) - self.check_ttda(self.eLDA) + assert self.elda.stable_topics.shape[1] == len(common_dictionary) + assert len(self.elda.ttda) == NUM_MODELS * NUM_TOPICS + self.assert_ttda_is_valid(self.elda) # reclustering shouldn't change anything without # added models or different parameters - self.eLDA.recluster() + self.elda.recluster() - self.assertEqual(self.eLDA.stable_topics.shape[1], len(common_dictionary)) - self.assertEqual(len(self.eLDA.ttda), num_models * num_topics) - self.check_ttda(self.eLDA) + assert self.elda.stable_topics.shape[1] == len(common_dictionary) + assert len(self.elda.ttda) == NUM_MODELS * NUM_TOPICS + self.assert_ttda_is_valid(self.elda) # compare with a pre-trained reference model reference = EnsembleLda.load(datapath('ensemblelda')) - np.testing.assert_allclose(self.eLDA.ttda, reference.ttda, rtol=rtol) + np.testing.assert_allclose(self.elda.ttda, reference.ttda, rtol=RTOL) # small values in the distance matrix tend to vary quite a bit around 2%, # so use some absolute tolerance measurement to check if the matrix is at least # close to the target. atol = reference.asymmetric_distance_matrix.max() * 1e-05 np.testing.assert_allclose( - self.eLDA.asymmetric_distance_matrix, + self.elda.asymmetric_distance_matrix, reference.asymmetric_distance_matrix, atol=atol, ) @@ -93,54 +95,54 @@ def test_clustering(self): reference.recluster() self.assert_cluster_results_equal(reference.cluster_model.results, cluster_model_results) - self.assertEqual(reference.sorted_clusters, sorted_clusters) - np.testing.assert_allclose(reference.get_topics(), stable_topics, rtol=rtol) + assert reference.sorted_clusters == sorted_clusters + np.testing.assert_allclose(reference.get_topics(), stable_topics, rtol=RTOL) def test_not_trained(self): # should not throw errors and no training should happen # 0 passes - eLDA = EnsembleLda( - corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - passes=0, num_models=num_models, random_state=0, + elda = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, num_topics=NUM_TOPICS, + passes=0, num_models=NUM_MODELS, random_state=0, ) - self.assertEqual(len(eLDA.ttda), 0) + assert len(elda.ttda) == 0 # no corpus - eLDA = EnsembleLda( - id2word=common_dictionary, num_topics=num_topics, - passes=passes, num_models=num_models, random_state=0, + elda = EnsembleLda( + id2word=common_dictionary, num_topics=NUM_TOPICS, + passes=PASSES, num_models=NUM_MODELS, random_state=0, ) - self.assertEqual(len(eLDA.ttda), 0) + assert len(elda.ttda) == 0 # 0 iterations - eLDA = EnsembleLda( - corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - iterations=0, num_models=num_models, random_state=0, + elda = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, num_topics=NUM_TOPICS, + iterations=0, num_models=NUM_MODELS, random_state=0, ) - self.assertEqual(len(eLDA.ttda), 0) + assert len(elda.ttda) == 0 # 0 models - eLDA = EnsembleLda( - corpus=common_corpus, id2word=common_dictionary, num_topics=num_topics, - passes=passes, num_models=0, random_state=0 + elda = EnsembleLda( + corpus=common_corpus, id2word=common_dictionary, num_topics=NUM_TOPICS, + passes=PASSES, num_models=0, random_state=0 ) - self.assertEqual(len(eLDA.ttda), 0) + assert len(elda.ttda) == 0 - def test_memory_unfriendly(self): - # at this point, self.eLDA_mu and self.eLDA are already trained + def 
test_mem_unfriendly(self): + # at this point, self.elda_mem_unfriendly and self.eLDA are already trained # in the setUpClass function # both should be 100% similar (but floats cannot # be compared that easily, so check for threshold) - self.assertEqual(len(self.eLDA_mu.tms), num_models) - np.testing.assert_allclose(self.eLDA.ttda, self.eLDA_mu.ttda, rtol=rtol) - np.testing.assert_allclose(self.eLDA.get_topics(), self.eLDA_mu.get_topics(), rtol=rtol) - self.check_ttda(self.eLDA_mu) + assert len(self.elda_mem_unfriendly.tms) == NUM_MODELS + np.testing.assert_allclose(self.elda.ttda, self.elda_mem_unfriendly.ttda, rtol=RTOL) + np.testing.assert_allclose(self.elda.get_topics(), self.elda_mem_unfriendly.get_topics(), rtol=RTOL) + self.assert_ttda_is_valid(self.elda_mem_unfriendly) def test_generate_gensim_rep(self): - gensimModel = self.eLDA.generate_gensim_representation() - topics = gensimModel.get_topics() - np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=rtol) + gensim_model = self.elda.generate_gensim_representation() + topics = gensim_model.get_topics() + np.testing.assert_allclose(self.elda.get_topics(), topics, rtol=RTOL) def assert_cluster_results_equal(self, a, b): """compares important attributes of the cluster results""" @@ -155,38 +157,38 @@ def assert_cluster_results_equal(self, a, b): def test_persisting(self): fname = get_tmpfile('gensim_models_ensemblelda') - self.eLDA.save(fname) - loaded_eLDA = EnsembleLda.load(fname) + self.elda.save(fname) + loaded_elda = EnsembleLda.load(fname) # storing the ensemble without memory_friendy_ttda - self.eLDA_mu.save(fname) - loaded_eLDA_mu = EnsembleLda.load(fname) + self.elda_mem_unfriendly.save(fname) + loaded_elda_mu = EnsembleLda.load(fname) # topic_model_class will be lazy loaded and should be None first - assert loaded_eLDA.topic_model_class is None + assert loaded_elda.topic_model_class is None # was it stored and loaded correctly? # memory friendly. 
- loaded_eLDA_representation = loaded_eLDA.generate_gensim_representation() + loaded_elda_representation = loaded_elda.generate_gensim_representation() # generating the representation also lazily loads the topic_model_class - assert loaded_eLDA.topic_model_class == LdaModel + assert loaded_elda.topic_model_class == LdaModel - topics = loaded_eLDA_representation.get_topics() - ttda = loaded_eLDA.ttda - amatrix = loaded_eLDA.asymmetric_distance_matrix - np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=rtol) - np.testing.assert_allclose(self.eLDA.ttda, ttda, rtol=rtol) - np.testing.assert_allclose(self.eLDA.asymmetric_distance_matrix, amatrix, rtol=rtol) + topics = loaded_elda_representation.get_topics() + ttda = loaded_elda.ttda + amatrix = loaded_elda.asymmetric_distance_matrix + np.testing.assert_allclose(self.elda.get_topics(), topics, rtol=RTOL) + np.testing.assert_allclose(self.elda.ttda, ttda, rtol=RTOL) + np.testing.assert_allclose(self.elda.asymmetric_distance_matrix, amatrix, rtol=RTOL) - a = self.eLDA.cluster_model.results - b = loaded_eLDA.cluster_model.results + a = self.elda.cluster_model.results + b = loaded_elda.cluster_model.results self.assert_cluster_results_equal(a, b) # memory unfriendly - loaded_eLDA_mu_representation = loaded_eLDA_mu.generate_gensim_representation() - topics = loaded_eLDA_mu_representation.get_topics() - np.testing.assert_allclose(self.eLDA.get_topics(), topics, rtol=rtol) + loaded_elda_mem_unfriendly_representation = loaded_elda_mu.generate_gensim_representation() + topics = loaded_elda_mem_unfriendly_representation.get_topics() + np.testing.assert_allclose(self.elda.get_topics(), topics, rtol=RTOL) def test_multiprocessing(self): # same configuration @@ -198,40 +200,40 @@ def test_multiprocessing(self): workers = 3 # memory friendly. contains List of topic word distributions - eLDA_multi = EnsembleLda( + elda_multiprocessing = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, topic_model_class=LdaModel, - num_topics=num_topics, passes=passes, num_models=num_models, + num_topics=NUM_TOPICS, passes=PASSES, num_models=NUM_MODELS, random_state=random_state, ensemble_workers=workers, distance_workers=workers, ) # memory unfriendly. 
contains List of models - eLDA_multi_mu = EnsembleLda( + elda_multiprocessing_mem_unfriendly = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, topic_model_class=LdaModel, - num_topics=num_topics, passes=passes, num_models=num_models, + num_topics=NUM_TOPICS, passes=PASSES, num_models=NUM_MODELS, random_state=random_state, ensemble_workers=workers, distance_workers=workers, memory_friendly_ttda=False, ) - np.testing.assert_allclose(self.eLDA.get_topics(), eLDA_multi.get_topics(), rtol=rtol) - np.testing.assert_allclose(self.eLDA_mu.get_topics(), eLDA_multi_mu.get_topics(), rtol=rtol) + np.testing.assert_allclose(self.elda.get_topics(), elda_multiprocessing.get_topics(), rtol=RTOL) + np.testing.assert_allclose(self.elda_mem_unfriendly.get_topics(), elda_multiprocessing_mem_unfriendly.get_topics(), rtol=RTOL) def test_add_models_to_empty(self): ensemble = EnsembleLda(id2word=common_dictionary, num_models=0) - ensemble.add_model(self.eLDA.ttda[0:1]) - ensemble.add_model(self.eLDA.ttda[1:]) + ensemble.add_model(self.elda.ttda[0:1]) + ensemble.add_model(self.elda.ttda[1:]) ensemble.recluster() - np.testing.assert_allclose(ensemble.get_topics(), self.eLDA.get_topics(), rtol=rtol) + np.testing.assert_allclose(ensemble.get_topics(), self.elda.get_topics(), rtol=RTOL) # persisting an ensemble that is entirely built from existing ttdas fname = get_tmpfile('gensim_models_ensemblelda') ensemble.save(fname) loaded_ensemble = EnsembleLda.load(fname) - np.testing.assert_allclose(loaded_ensemble.get_topics(), self.eLDA.get_topics(), rtol=rtol) + np.testing.assert_allclose(loaded_ensemble.get_topics(), self.elda.get_topics(), rtol=RTOL) self.test_inference(loaded_ensemble) def test_add_models(self): # same configuration - num_models = self.eLDA.num_models + num_models = self.elda.num_models # make sure countings and sizes after adding are correct # create new models and add other models to them. @@ -244,7 +246,7 @@ def test_add_models(self): num_new_topics = 3 # 1. 
memory friendly - eLDA_base = EnsembleLda( + elda_base = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=1, num_models=num_new_models, iterations=1, random_state=0, topic_model_class=LdaMulticore, @@ -252,47 +254,47 @@ def test_add_models(self): ) # 1.1 ttda - a = len(eLDA_base.ttda) - b = eLDA_base.num_models - eLDA_base.add_model(self.eLDA.ttda) - self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda)) - self.assertEqual(eLDA_base.num_models, b + 1) # defaults to 1 for one ttda matrix + a = len(elda_base.ttda) + b = elda_base.num_models + elda_base.add_model(self.elda.ttda) + assert len(elda_base.ttda) == a + len(self.elda.ttda) + assert elda_base.num_models == b + 1 # defaults to 1 for one ttda matrix # 1.2 an ensemble - a = len(eLDA_base.ttda) - b = eLDA_base.num_models - eLDA_base.add_model(self.eLDA, 5) - self.assertEqual(len(eLDA_base.ttda), a + len(self.eLDA.ttda)) - self.assertEqual(eLDA_base.num_models, b + 5) + a = len(elda_base.ttda) + b = elda_base.num_models + elda_base.add_model(self.elda, 5) + assert len(elda_base.ttda) == a + len(self.elda.ttda) + assert elda_base.num_models == b + 5 # 1.3 a list of ensembles - a = len(eLDA_base.ttda) - b = eLDA_base.num_models + a = len(elda_base.ttda) + b = elda_base.num_models # it should be totally legit to add a memory unfriendly object to a memory friendly one - eLDA_base.add_model([self.eLDA, self.eLDA_mu]) - self.assertEqual(len(eLDA_base.ttda), a + 2 * len(self.eLDA.ttda)) - self.assertEqual(eLDA_base.num_models, b + 2 * num_models) + elda_base.add_model([self.elda, self.elda_mem_unfriendly]) + assert len(elda_base.ttda) == a + 2 * len(self.elda.ttda) + assert elda_base.num_models == b + 2 * num_models # 1.4 a single gensim model - model = self.eLDA.classic_model_representation + model = self.elda.classic_model_representation - a = len(eLDA_base.ttda) - b = eLDA_base.num_models - eLDA_base.add_model(model) - self.assertEqual(len(eLDA_base.ttda), a + len(model.get_topics())) - self.assertEqual(eLDA_base.num_models, b + 1) + a = len(elda_base.ttda) + b = elda_base.num_models + elda_base.add_model(model) + assert len(elda_base.ttda) == a + len(model.get_topics()) + assert elda_base.num_models == b + 1 # 1.5 a list gensim models - a = len(eLDA_base.ttda) - b = eLDA_base.num_models - eLDA_base.add_model([model, model]) - self.assertEqual(len(eLDA_base.ttda), a + 2 * len(model.get_topics())) - self.assertEqual(eLDA_base.num_models, b + 2) + a = len(elda_base.ttda) + b = elda_base.num_models + elda_base.add_model([model, model]) + assert len(elda_base.ttda) == a + 2 * len(model.get_topics()) + assert elda_base.num_models == b + 2 - self.check_ttda(eLDA_base) + self.assert_ttda_is_valid(elda_base) # 2. 
memory unfriendly - eLDA_base_mu = EnsembleLda( + elda_base_mem_unfriendly = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=1, num_models=num_new_models, iterations=1, random_state=0, topic_model_class=LdaMulticore, @@ -300,44 +302,45 @@ def test_add_models(self): ) # 2.1 a single ensemble - a = len(eLDA_base_mu.tms) - b = eLDA_base_mu.num_models - eLDA_base_mu.add_model(self.eLDA_mu) - self.assertEqual(len(eLDA_base_mu.tms), a + num_models) - self.assertEqual(eLDA_base_mu.num_models, b + num_models) + a = len(elda_base_mem_unfriendly.tms) + b = elda_base_mem_unfriendly.num_models + elda_base_mem_unfriendly.add_model(self.elda_mem_unfriendly) + assert len(elda_base_mem_unfriendly.tms) == a + num_models + assert elda_base_mem_unfriendly.num_models == b + num_models # 2.2 a list of ensembles - a = len(eLDA_base_mu.tms) - b = eLDA_base_mu.num_models - eLDA_base_mu.add_model([self.eLDA_mu, self.eLDA_mu]) - self.assertEqual(len(eLDA_base_mu.tms), a + 2 * num_models) - self.assertEqual(eLDA_base_mu.num_models, b + 2 * num_models) + a = len(elda_base_mem_unfriendly.tms) + b = elda_base_mem_unfriendly.num_models + elda_base_mem_unfriendly.add_model([self.elda_mem_unfriendly, self.elda_mem_unfriendly]) + assert len(elda_base_mem_unfriendly.tms) == a + 2 * num_models + assert elda_base_mem_unfriendly.num_models == b + 2 * num_models # 2.3 a single gensim model - a = len(eLDA_base_mu.tms) - b = eLDA_base_mu.num_models - eLDA_base_mu.add_model(self.eLDA_mu.tms[0]) - self.assertEqual(len(eLDA_base_mu.tms), a + 1) - self.assertEqual(eLDA_base_mu.num_models, b + 1) + a = len(elda_base_mem_unfriendly.tms) + b = elda_base_mem_unfriendly.num_models + elda_base_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms[0]) + assert len(elda_base_mem_unfriendly.tms) == a + 1 + assert elda_base_mem_unfriendly.num_models == b + 1 # 2.4 a list of gensim models - a = len(eLDA_base_mu.tms) - b = eLDA_base_mu.num_models - eLDA_base_mu.add_model(self.eLDA_mu.tms) - self.assertEqual(len(eLDA_base_mu.tms), a + num_models) - self.assertEqual(eLDA_base_mu.num_models, b + num_models) + a = len(elda_base_mem_unfriendly.tms) + b = elda_base_mem_unfriendly.num_models + elda_base_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms) + assert len(elda_base_mem_unfriendly.tms) == a + num_models + assert elda_base_mem_unfriendly.num_models == b + num_models # 2.5 topic term distributions should throw errors, because the # actual models are needed for the memory unfriendly ensemble - a = len(eLDA_base_mu.tms) - b = eLDA_base_mu.num_models - self.assertRaises(ValueError, lambda: eLDA_base_mu.add_model(self.eLDA_mu.tms[0].get_topics())) + a = len(elda_base_mem_unfriendly.tms) + b = elda_base_mem_unfriendly.num_models + with pytest.raises(ValueError): + elda_base_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms[0].get_topics()) # remains unchanged - self.assertEqual(len(eLDA_base_mu.tms), a) - self.assertEqual(eLDA_base_mu.num_models, b) + assert len(elda_base_mem_unfriendly.tms) == a + assert elda_base_mem_unfriendly.num_models == b - self.assertEqual(eLDA_base_mu.num_models, len(eLDA_base_mu.tms)) - self.check_ttda(eLDA_base_mu) + assert elda_base_mem_unfriendly.num_models == len(elda_base_mem_unfriendly.tms) + self.assert_ttda_is_valid(elda_base_mem_unfriendly) def test_add_and_recluster(self): # See if after adding a model, the model still makes sense @@ -345,73 +348,73 @@ def test_add_and_recluster(self): num_new_topics = 3 # train - new_eLDA = EnsembleLda( + elda = 
EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=10, num_models=num_new_models, iterations=30, random_state=1, topic_model_class='lda', distance_workers=4, ) - new_eLDA_mu = EnsembleLda( + elda_mem_unfriendly = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=10, num_models=num_new_models, iterations=30, random_state=1, topic_model_class=LdaModel, distance_workers=4, memory_friendly_ttda=False, ) # both should be similar - np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=rtol) - np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=rtol) + np.testing.assert_allclose(elda.ttda, elda_mem_unfriendly.ttda, rtol=RTOL) + np.testing.assert_allclose(elda.get_topics(), elda_mem_unfriendly.get_topics(), rtol=RTOL) # and every next step applied to both should result in similar results # 1. adding to ttda and tms - new_eLDA.add_model(self.eLDA) - new_eLDA_mu.add_model(self.eLDA_mu) - np.testing.assert_allclose(new_eLDA.ttda, new_eLDA_mu.ttda, rtol=rtol) - self.assertEqual(len(new_eLDA.ttda), len(self.eLDA.ttda) + num_new_models * num_new_topics) - self.assertEqual(len(new_eLDA_mu.ttda), len(self.eLDA_mu.ttda) + num_new_models * num_new_topics) - self.assertEqual(len(new_eLDA_mu.tms), num_models + num_new_models) - self.check_ttda(new_eLDA) - self.check_ttda(new_eLDA_mu) + elda.add_model(self.elda) + elda_mem_unfriendly.add_model(self.elda_mem_unfriendly) + np.testing.assert_allclose(elda.ttda, elda_mem_unfriendly.ttda, rtol=RTOL) + assert len(elda.ttda) == len(self.elda.ttda) + num_new_models * num_new_topics + assert len(elda_mem_unfriendly.ttda) == len(self.elda_mem_unfriendly.ttda) + num_new_models * num_new_topics + assert len(elda_mem_unfriendly.tms) == NUM_MODELS + num_new_models + self.assert_ttda_is_valid(elda) + self.assert_ttda_is_valid(elda_mem_unfriendly) # 2. distance matrix - new_eLDA._generate_asymmetric_distance_matrix() - new_eLDA_mu._generate_asymmetric_distance_matrix() + elda._generate_asymmetric_distance_matrix() + elda_mem_unfriendly._generate_asymmetric_distance_matrix() np.testing.assert_allclose( - new_eLDA.asymmetric_distance_matrix, - new_eLDA_mu.asymmetric_distance_matrix, + elda.asymmetric_distance_matrix, + elda_mem_unfriendly.asymmetric_distance_matrix, ) # 3. CBDBSCAN results - new_eLDA._generate_topic_clusters() - new_eLDA_mu._generate_topic_clusters() - a = new_eLDA.cluster_model.results - b = new_eLDA_mu.cluster_model.results + elda._generate_topic_clusters() + elda_mem_unfriendly._generate_topic_clusters() + a = elda.cluster_model.results + b = elda_mem_unfriendly.cluster_model.results self.assert_cluster_results_equal(a, b) # 4. 
finally, the stable topics - new_eLDA._generate_stable_topics() - new_eLDA_mu._generate_stable_topics() + elda._generate_stable_topics() + elda_mem_unfriendly._generate_stable_topics() np.testing.assert_allclose( - new_eLDA.get_topics(), - new_eLDA_mu.get_topics(), + elda.get_topics(), + elda_mem_unfriendly.get_topics(), ) - new_eLDA.generate_gensim_representation() - new_eLDA_mu.generate_gensim_representation() + elda.generate_gensim_representation() + elda_mem_unfriendly.generate_gensim_representation() # same random state, hence topics should be still similar - np.testing.assert_allclose(new_eLDA.get_topics(), new_eLDA_mu.get_topics(), rtol=rtol) + np.testing.assert_allclose(elda.get_topics(), elda_mem_unfriendly.get_topics(), rtol=RTOL) - def test_inference(self, eLDA=None): - if eLDA is None: - eLDA = self.eLDA + def test_inference(self, elda=None): + if elda is None: + elda = self.elda # get the most likely token id from topic 0 - max_id = np.argmax(eLDA.get_topics()[0, :]) - self.assertGreater(eLDA.classic_model_representation.iterations, 0) + max_id = np.argmax(elda.get_topics()[0, :]) + assert elda.classic_model_representation.iterations > 0 # topic 0 should be dominant in the inference. # the difference between the probabilities should be significant and larger than 0.3 - infered = eLDA[[(max_id, 1)]] - self.assertGreater(infered[0][1] - 0.3, infered[1][1]) + inferred = elda[[(max_id, 1)]] + assert inferred[0][1] - 0.3 > inferred[1][1] if __name__ == '__main__': From 06cc33a49882506087383b16f1c438c277915431 Mon Sep 17 00:00:00 2001 From: aloosley Date: Wed, 30 Jun 2021 22:24:22 +0200 Subject: [PATCH 151/166] better var names --- gensim/test/test_ensemblelda.py | 113 ++++++++++++++++---------------- 1 file changed, 55 insertions(+), 58 deletions(-) diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 38ada1caaa..98720ea7d1 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -232,9 +232,6 @@ def test_add_models_to_empty(self): self.test_inference(loaded_ensemble) def test_add_models(self): - # same configuration - num_models = self.elda.num_models - # make sure countings and sizes after adding are correct # create new models and add other models to them. @@ -246,7 +243,7 @@ def test_add_models(self): num_new_topics = 3 # 1. 
memory friendly - elda_base = EnsembleLda( + elda = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=1, num_models=num_new_models, iterations=1, random_state=0, topic_model_class=LdaMulticore, @@ -254,47 +251,47 @@ def test_add_models(self): ) # 1.1 ttda - a = len(elda_base.ttda) - b = elda_base.num_models - elda_base.add_model(self.elda.ttda) - assert len(elda_base.ttda) == a + len(self.elda.ttda) - assert elda_base.num_models == b + 1 # defaults to 1 for one ttda matrix + num_topics_before_add_model = len(elda.ttda) + num_models_before_add_model = elda.num_models + elda.add_model(self.elda.ttda) + assert len(elda.ttda) == num_topics_before_add_model + len(self.elda.ttda) + assert elda.num_models == num_models_before_add_model + 1 # defaults to 1 for one ttda matrix # 1.2 an ensemble - a = len(elda_base.ttda) - b = elda_base.num_models - elda_base.add_model(self.elda, 5) - assert len(elda_base.ttda) == a + len(self.elda.ttda) - assert elda_base.num_models == b + 5 + num_topics_before_add_model = len(elda.ttda) + num_models_before_add_model = elda.num_models + elda.add_model(self.elda, 5) + assert len(elda.ttda) == num_topics_before_add_model + len(self.elda.ttda) + assert elda.num_models == num_models_before_add_model + 5 # 1.3 a list of ensembles - a = len(elda_base.ttda) - b = elda_base.num_models + num_topics_before_add_model = len(elda.ttda) + num_models_before_add_model = elda.num_models # it should be totally legit to add a memory unfriendly object to a memory friendly one - elda_base.add_model([self.elda, self.elda_mem_unfriendly]) - assert len(elda_base.ttda) == a + 2 * len(self.elda.ttda) - assert elda_base.num_models == b + 2 * num_models + elda.add_model([self.elda, self.elda_mem_unfriendly]) + assert len(elda.ttda) == num_topics_before_add_model + 2 * len(self.elda.ttda) + assert elda.num_models == num_models_before_add_model + 2 * NUM_MODELS # 1.4 a single gensim model model = self.elda.classic_model_representation - a = len(elda_base.ttda) - b = elda_base.num_models - elda_base.add_model(model) - assert len(elda_base.ttda) == a + len(model.get_topics()) - assert elda_base.num_models == b + 1 + num_topics_before_add_model = len(elda.ttda) + num_models_before_add_model = elda.num_models + elda.add_model(model) + assert len(elda.ttda) == num_topics_before_add_model + len(model.get_topics()) + assert elda.num_models == num_models_before_add_model + 1 # 1.5 a list gensim models - a = len(elda_base.ttda) - b = elda_base.num_models - elda_base.add_model([model, model]) - assert len(elda_base.ttda) == a + 2 * len(model.get_topics()) - assert elda_base.num_models == b + 2 + num_topics_before_add_model = len(elda.ttda) + num_models_before_add_model = elda.num_models + elda.add_model([model, model]) + assert len(elda.ttda) == num_topics_before_add_model + 2 * len(model.get_topics()) + assert elda.num_models == num_models_before_add_model + 2 - self.assert_ttda_is_valid(elda_base) + self.assert_ttda_is_valid(elda) # 2. 
memory unfriendly - elda_base_mem_unfriendly = EnsembleLda( + elda_mem_unfriendly = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=1, num_models=num_new_models, iterations=1, random_state=0, topic_model_class=LdaMulticore, @@ -302,45 +299,45 @@ def test_add_models(self): ) # 2.1 a single ensemble - a = len(elda_base_mem_unfriendly.tms) - b = elda_base_mem_unfriendly.num_models - elda_base_mem_unfriendly.add_model(self.elda_mem_unfriendly) - assert len(elda_base_mem_unfriendly.tms) == a + num_models - assert elda_base_mem_unfriendly.num_models == b + num_models + num_topics_before_add_model = len(elda_mem_unfriendly.tms) + num_models_before_add_model = elda_mem_unfriendly.num_models + elda_mem_unfriendly.add_model(self.elda_mem_unfriendly) + assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + NUM_MODELS + assert elda_mem_unfriendly.num_models == num_models_before_add_model + NUM_MODELS # 2.2 a list of ensembles - a = len(elda_base_mem_unfriendly.tms) - b = elda_base_mem_unfriendly.num_models - elda_base_mem_unfriendly.add_model([self.elda_mem_unfriendly, self.elda_mem_unfriendly]) - assert len(elda_base_mem_unfriendly.tms) == a + 2 * num_models - assert elda_base_mem_unfriendly.num_models == b + 2 * num_models + num_topics_before_add_model = len(elda_mem_unfriendly.tms) + num_models_before_add_model = elda_mem_unfriendly.num_models + elda_mem_unfriendly.add_model([self.elda_mem_unfriendly, self.elda_mem_unfriendly]) + assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + 2 * NUM_MODELS + assert elda_mem_unfriendly.num_models == num_models_before_add_model + 2 * NUM_MODELS # 2.3 a single gensim model - a = len(elda_base_mem_unfriendly.tms) - b = elda_base_mem_unfriendly.num_models - elda_base_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms[0]) - assert len(elda_base_mem_unfriendly.tms) == a + 1 - assert elda_base_mem_unfriendly.num_models == b + 1 + num_topics_before_add_model = len(elda_mem_unfriendly.tms) + num_models_before_add_model = elda_mem_unfriendly.num_models + elda_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms[0]) + assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + 1 + assert elda_mem_unfriendly.num_models == num_models_before_add_model + 1 # 2.4 a list of gensim models - a = len(elda_base_mem_unfriendly.tms) - b = elda_base_mem_unfriendly.num_models - elda_base_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms) - assert len(elda_base_mem_unfriendly.tms) == a + num_models - assert elda_base_mem_unfriendly.num_models == b + num_models + num_topics_before_add_model = len(elda_mem_unfriendly.tms) + num_models_before_add_model = elda_mem_unfriendly.num_models + elda_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms) + assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + NUM_MODELS + assert elda_mem_unfriendly.num_models == num_models_before_add_model + NUM_MODELS # 2.5 topic term distributions should throw errors, because the # actual models are needed for the memory unfriendly ensemble - a = len(elda_base_mem_unfriendly.tms) - b = elda_base_mem_unfriendly.num_models + num_topics_before_add_model = len(elda_mem_unfriendly.tms) + num_models_before_add_model = elda_mem_unfriendly.num_models with pytest.raises(ValueError): - elda_base_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms[0].get_topics()) + elda_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms[0].get_topics()) # remains unchanged - assert len(elda_base_mem_unfriendly.tms) == a - 
assert elda_base_mem_unfriendly.num_models == b + assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + assert elda_mem_unfriendly.num_models == num_models_before_add_model - assert elda_base_mem_unfriendly.num_models == len(elda_base_mem_unfriendly.tms) - self.assert_ttda_is_valid(elda_base_mem_unfriendly) + assert elda_mem_unfriendly.num_models == len(elda_mem_unfriendly.tms) + self.assert_ttda_is_valid(elda_mem_unfriendly) def test_add_and_recluster(self): # See if after adding a model, the model still makes sense From ec4b487a2fcc44661e7ea2cfbfe3286279853fa1 Mon Sep 17 00:00:00 2001 From: aloosley Date: Wed, 30 Jun 2021 22:24:22 +0200 Subject: [PATCH 152/166] better var names --- gensim/test/test_ensemblelda.py | 134 ++++++++++++++++---------------- 1 file changed, 65 insertions(+), 69 deletions(-) diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 38ada1caaa..1c0004f51b 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -27,7 +27,7 @@ RTOL = 1e-04 if os.name == 'nt' else 1e-05 -class TestModel(unittest.TestCase): +class TestEnsembleLda(unittest.TestCase): def setUp(self): # same configuration for each model to make sure # the topics are equal @@ -130,10 +130,9 @@ def test_not_trained(self): assert len(elda.ttda) == 0 def test_mem_unfriendly(self): - # at this point, self.elda_mem_unfriendly and self.eLDA are already trained - # in the setUpClass function - # both should be 100% similar (but floats cannot - # be compared that easily, so check for threshold) + # at this point, self.elda_mem_unfriendly and self.elda are already trained + # in the setUp function and the topics extracted should be equal up to floating + # point variations between the two implementations. assert len(self.elda_mem_unfriendly.tms) == NUM_MODELS np.testing.assert_allclose(self.elda.ttda, self.elda_mem_unfriendly.ttda, rtol=RTOL) np.testing.assert_allclose(self.elda.get_topics(), self.elda_mem_unfriendly.get_topics(), rtol=RTOL) @@ -144,15 +143,15 @@ def test_generate_gensim_rep(self): topics = gensim_model.get_topics() np.testing.assert_allclose(self.elda.get_topics(), topics, rtol=RTOL) - def assert_cluster_results_equal(self, a, b): - """compares important attributes of the cluster results""" + def assert_cluster_results_equal(self, cluster_model_results_1, cluster_model_results_2): + """Assert important attributes of the cluster results""" np.testing.assert_array_equal( - [row["label"] for row in a], - [row["label"] for row in b], + [row["label"] for row in cluster_model_results_1], + [row["label"] for row in cluster_model_results_2], ) np.testing.assert_array_equal( - [row["is_core"] for row in a], - [row["is_core"] for row in b], + [row["is_core"] for row in cluster_model_results_1], + [row["is_core"] for row in cluster_model_results_2], ) def test_persisting(self): @@ -232,9 +231,6 @@ def test_add_models_to_empty(self): self.test_inference(loaded_ensemble) def test_add_models(self): - # same configuration - num_models = self.elda.num_models - # make sure countings and sizes after adding are correct # create new models and add other models to them. @@ -246,7 +242,7 @@ def test_add_models(self): num_new_topics = 3 # 1. 
memory friendly - elda_base = EnsembleLda( + elda = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=1, num_models=num_new_models, iterations=1, random_state=0, topic_model_class=LdaMulticore, @@ -254,47 +250,47 @@ def test_add_models(self): ) # 1.1 ttda - a = len(elda_base.ttda) - b = elda_base.num_models - elda_base.add_model(self.elda.ttda) - assert len(elda_base.ttda) == a + len(self.elda.ttda) - assert elda_base.num_models == b + 1 # defaults to 1 for one ttda matrix + num_topics_before_add_model = len(elda.ttda) + num_models_before_add_model = elda.num_models + elda.add_model(self.elda.ttda) + assert len(elda.ttda) == num_topics_before_add_model + len(self.elda.ttda) + assert elda.num_models == num_models_before_add_model + 1 # defaults to 1 for one ttda matrix # 1.2 an ensemble - a = len(elda_base.ttda) - b = elda_base.num_models - elda_base.add_model(self.elda, 5) - assert len(elda_base.ttda) == a + len(self.elda.ttda) - assert elda_base.num_models == b + 5 + num_topics_before_add_model = len(elda.ttda) + num_models_before_add_model = elda.num_models + elda.add_model(self.elda, 5) + assert len(elda.ttda) == num_topics_before_add_model + len(self.elda.ttda) + assert elda.num_models == num_models_before_add_model + 5 # 1.3 a list of ensembles - a = len(elda_base.ttda) - b = elda_base.num_models + num_topics_before_add_model = len(elda.ttda) + num_models_before_add_model = elda.num_models # it should be totally legit to add a memory unfriendly object to a memory friendly one - elda_base.add_model([self.elda, self.elda_mem_unfriendly]) - assert len(elda_base.ttda) == a + 2 * len(self.elda.ttda) - assert elda_base.num_models == b + 2 * num_models + elda.add_model([self.elda, self.elda_mem_unfriendly]) + assert len(elda.ttda) == num_topics_before_add_model + 2 * len(self.elda.ttda) + assert elda.num_models == num_models_before_add_model + 2 * NUM_MODELS # 1.4 a single gensim model model = self.elda.classic_model_representation - a = len(elda_base.ttda) - b = elda_base.num_models - elda_base.add_model(model) - assert len(elda_base.ttda) == a + len(model.get_topics()) - assert elda_base.num_models == b + 1 + num_topics_before_add_model = len(elda.ttda) + num_models_before_add_model = elda.num_models + elda.add_model(model) + assert len(elda.ttda) == num_topics_before_add_model + len(model.get_topics()) + assert elda.num_models == num_models_before_add_model + 1 # 1.5 a list gensim models - a = len(elda_base.ttda) - b = elda_base.num_models - elda_base.add_model([model, model]) - assert len(elda_base.ttda) == a + 2 * len(model.get_topics()) - assert elda_base.num_models == b + 2 + num_topics_before_add_model = len(elda.ttda) + num_models_before_add_model = elda.num_models + elda.add_model([model, model]) + assert len(elda.ttda) == num_topics_before_add_model + 2 * len(model.get_topics()) + assert elda.num_models == num_models_before_add_model + 2 - self.assert_ttda_is_valid(elda_base) + self.assert_ttda_is_valid(elda) # 2. 
memory unfriendly - elda_base_mem_unfriendly = EnsembleLda( + elda_mem_unfriendly = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=1, num_models=num_new_models, iterations=1, random_state=0, topic_model_class=LdaMulticore, @@ -302,45 +298,45 @@ def test_add_models(self): ) # 2.1 a single ensemble - a = len(elda_base_mem_unfriendly.tms) - b = elda_base_mem_unfriendly.num_models - elda_base_mem_unfriendly.add_model(self.elda_mem_unfriendly) - assert len(elda_base_mem_unfriendly.tms) == a + num_models - assert elda_base_mem_unfriendly.num_models == b + num_models + num_topics_before_add_model = len(elda_mem_unfriendly.tms) + num_models_before_add_model = elda_mem_unfriendly.num_models + elda_mem_unfriendly.add_model(self.elda_mem_unfriendly) + assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + NUM_MODELS + assert elda_mem_unfriendly.num_models == num_models_before_add_model + NUM_MODELS # 2.2 a list of ensembles - a = len(elda_base_mem_unfriendly.tms) - b = elda_base_mem_unfriendly.num_models - elda_base_mem_unfriendly.add_model([self.elda_mem_unfriendly, self.elda_mem_unfriendly]) - assert len(elda_base_mem_unfriendly.tms) == a + 2 * num_models - assert elda_base_mem_unfriendly.num_models == b + 2 * num_models + num_topics_before_add_model = len(elda_mem_unfriendly.tms) + num_models_before_add_model = elda_mem_unfriendly.num_models + elda_mem_unfriendly.add_model([self.elda_mem_unfriendly, self.elda_mem_unfriendly]) + assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + 2 * NUM_MODELS + assert elda_mem_unfriendly.num_models == num_models_before_add_model + 2 * NUM_MODELS # 2.3 a single gensim model - a = len(elda_base_mem_unfriendly.tms) - b = elda_base_mem_unfriendly.num_models - elda_base_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms[0]) - assert len(elda_base_mem_unfriendly.tms) == a + 1 - assert elda_base_mem_unfriendly.num_models == b + 1 + num_topics_before_add_model = len(elda_mem_unfriendly.tms) + num_models_before_add_model = elda_mem_unfriendly.num_models + elda_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms[0]) + assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + 1 + assert elda_mem_unfriendly.num_models == num_models_before_add_model + 1 # 2.4 a list of gensim models - a = len(elda_base_mem_unfriendly.tms) - b = elda_base_mem_unfriendly.num_models - elda_base_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms) - assert len(elda_base_mem_unfriendly.tms) == a + num_models - assert elda_base_mem_unfriendly.num_models == b + num_models + num_topics_before_add_model = len(elda_mem_unfriendly.tms) + num_models_before_add_model = elda_mem_unfriendly.num_models + elda_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms) + assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + NUM_MODELS + assert elda_mem_unfriendly.num_models == num_models_before_add_model + NUM_MODELS # 2.5 topic term distributions should throw errors, because the # actual models are needed for the memory unfriendly ensemble - a = len(elda_base_mem_unfriendly.tms) - b = elda_base_mem_unfriendly.num_models + num_topics_before_add_model = len(elda_mem_unfriendly.tms) + num_models_before_add_model = elda_mem_unfriendly.num_models with pytest.raises(ValueError): - elda_base_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms[0].get_topics()) + elda_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms[0].get_topics()) # remains unchanged - assert len(elda_base_mem_unfriendly.tms) == a - 
assert elda_base_mem_unfriendly.num_models == b + assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + assert elda_mem_unfriendly.num_models == num_models_before_add_model - assert elda_base_mem_unfriendly.num_models == len(elda_base_mem_unfriendly.tms) - self.assert_ttda_is_valid(elda_base_mem_unfriendly) + assert elda_mem_unfriendly.num_models == len(elda_mem_unfriendly.tms) + self.assert_ttda_is_valid(elda_mem_unfriendly) def test_add_and_recluster(self): # See if after adding a model, the model still makes sense From 531ac6aa77a4e63f1f960dd655c7dc26d094ea29 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Wed, 30 Jun 2021 23:15:31 +0200 Subject: [PATCH 153/166] simplified some function calls to use attributes instead of parameters --- gensim/models/ensemblelda.py | 172 +++++++++++++++++------------------ 1 file changed, 81 insertions(+), 91 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 71c9874c2e..53dcbb99d4 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -242,90 +242,23 @@ def _teardown(pipes, processes, i): del process -def mass_masking(a, threshold): +def mass_masking(a, threshold=None): """Original masking method. Returns a new binary mask.""" + if threshold is None: + threshold = 0.95 + sorted_a = np.sort(a)[::-1] largest_mass = sorted_a.cumsum() < threshold smallest_valid = sorted_a[largest_mass][-1] return a >= smallest_valid -def rank_masking(a, threshold): +def rank_masking(a, threshold=None): """Faster masking method. Returns a new binary mask.""" - return a > np.sort(a)[::-1][int(len(a) * threshold)] - - -def _calculate_asymmetric_distance_matrix_chunk(ttda1, ttda2, threshold, start_index, method): - """Calculate an (asymmetric) distance from each topic in ``ttda1`` to each topic in ``ttda2``. - - Parameters - ---------- - ttda1 and ttda2: 2D arrays of floats - Two ttda matrices that are going to be used for distance calculation. Each row in ttda corresponds to one - topic. Each cell in the resulting matrix corresponds to the distance between a topic pair. - threshold : float, optional - threshold defaults to: ``{"mass": 0.95, "rank": 0.11}``, depending on the selected method - start_index : int - this function might be used in multiprocessing, so start_index has to be set as ttda1 is a chunk of the - complete ttda in that case. start_index would be 0 if ``ttda1 == self.ttda``. When self.ttda is split into - two pieces, each 100 ttdas long, then start_index should be be 100. default is 0 - method : {'mass', 'rank}, optional - method can be "mass" for the original masking method or "rank" for a faster masking method that selects - by rank of largest elements in the topic term distribution, to determine which tokens are relevant for the - topic. - - Returns - ------- - 2D numpy.ndarray of floats - Asymmetric distance matrix of size ``len(ttda1)`` by ``len(ttda2)``. - - """ - # initialize the distance matrix. 
ndarray is faster than zeros - distances = np.ndarray((len(ttda1), len(ttda2))) - - if ttda1.shape[0] > 0 and ttda2.shape[0] > 0: - # the worker might not have received a ttda because it was chunked up too much - - if method not in ["mass", "rank"]: - raise ValueError(f"method {method} unknown") - - # select masking method: - create_mask = {"mass": mass_masking, "rank": rank_masking}[method] - - # some help to find a better threshold by useful log messages - avg_mask_size = 0 - - # now iterate over each topic - for ttd1_idx, ttd1 in enumerate(ttda1): - # create mask from ttd1 that removes noise from a and keeps the largest terms - mask = create_mask(ttd1, threshold) - ttd1_masked = ttd1[mask] - - avg_mask_size += mask.sum() - - # now look at every possible pair for topic a: - for ttd2_idx, ttd2 in enumerate(ttda2): - # distance to itself is 0 - if ttd1_idx + start_index == ttd2_idx: - distances[ttd1_idx][ttd2_idx] = 0 - continue - - # now mask b based on a, which will force the shape of a onto b - ttd2_masked = ttd2[mask] - - # Smart distance calculation avoids calculating cosine distance for highly masked topic-term - # distributions that will have distance values near 1. - if ttd2_masked.sum() <= _COSINE_DISTANCE_CALCULATION_THRESHOLD: - distance = 1 - else: - distance = cosine(ttd1_masked, ttd2_masked) - - distances[ttd1_idx][ttd2_idx] = distance - - percent = round(100 * avg_mask_size / ttda1.shape[0] / ttda1.shape[1], 1) - logger.info(f'the given threshold of {threshold} covered on average {percent}% of tokens') + if threshold is None: + threshold = 0.11 - return distances + return a > np.sort(a)[::-1][int(len(a) * threshold)] def _validate_clusters(clusters, min_cores): @@ -374,7 +307,7 @@ def __init__( self, topic_model_class="ldamulticore", num_models=3, min_cores=None, # default value from _generate_stable_topics() epsilon=0.1, ensemble_workers=1, memory_friendly_ttda=True, - min_samples=None, masking_method="mass", masking_threshold=None, + min_samples=None, masking_method=mass_masking, masking_threshold=None, distance_workers=1, random_state=None, **gensim_kw_args, ): """Create and train a new EnsembleLda model. @@ -416,8 +349,9 @@ def __init__( If False, any topic term matrix can be suplied to add_model. min_samples : int, optional Required int of nearby topics for a topic to be considered as 'core' in the CBDBSCAN clustering. - masking_method : str, optional - Choose one of "mass" (default) or "rank" (percentile, faster). + masking_method : function, optional + Choose one of :meth:`~gensim.models.ensemblelda.mass_masking` (default) or + :meth:`~gensim.models.ensemblelda.rank_masking` (percentile, faster). For clustering, distances between topic-term distributions are asymmetric. In particular, the distance (technically a divergence) from distribution A to B is more of a measure of if A is contained in B. At a @@ -429,7 +363,6 @@ def __init__( 2. rank: forms mask by taking the top ranked terms (by mass) until the 'masking_threshold' is reached. For example, a ranking threshold of 0.11 means the top 0.11 terms by weight are used to form a mask. - masking_threshold : float, optional Default: None, which uses ``0.95`` for "mass", and ``0.11`` for masking_method "rank". 
In general, too small a mask threshold leads to inaccurate calculations (no signal) and too big a mask leads to noisy
@@ -750,7 +683,8 @@ def add_model(self, target, num_new_models=None):
                 raise ValueError(
                     'ttda arrays cannot be added to ensembles, for which memory_friendly_ttda=False, '
                     'you can call convert_to_memory_friendly, but it will discard the stored gensim '
-                    'models and only keep the relevant topic term distributions from them.')
+                    'models and only keep the relevant topic term distributions from them.'
+                )

         # 2. list of ensembles
         elif isinstance(target[0], type(self)):
@@ -783,6 +717,7 @@ def add_model(self, target, num_new_models=None):
                 f"target ttda dimensions do not match. Topics must be {self.ttda.shape[-1]} but was {ttda.shape[-1]} "
                 f"elements large"
             )
+
         self.ttda = np.append(self.ttda, ttda, axis=0)

         # tell recluster that the distance matrix needs to be regenerated
@@ -927,13 +862,73 @@ def _generate_topic_models(self, num_models, random_states=None, pipe=None):

         pipe.close()

-    def _asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pipe, threshold, method):
+    def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, start_index):
+        """Calculate an (asymmetric) distance from each topic in ``ttda1`` to each topic in ``ttda2``.
+
+        Parameters
+        ----------
+        ttda1 and ttda2: 2D arrays of floats
+            Two ttda matrices that are going to be used for distance calculation. Each row in ttda corresponds to one
+            topic. Each cell in the resulting matrix corresponds to the distance between a topic pair.
+        start_index : int
+            this function might be used in multiprocessing, so start_index has to be set as ttda1 is a chunk of the
+            complete ttda in that case. start_index would be 0 if ``ttda1 == self.ttda``. When self.ttda is split into
+            two pieces, each 100 ttdas long, then start_index should be 100. default is 0
+
+        Returns
+        -------
+        2D numpy.ndarray of floats
+            Asymmetric distance matrix of size ``len(ttda1)`` by ``len(ttda2)``.
+
+        """
+        # initialize the distance matrix. ndarray is faster than zeros
+        distances = np.ndarray((len(ttda1), len(ttda2)))
+
+        if ttda1.shape[0] > 0 and ttda2.shape[0] > 0:
+            # the worker might not have received a ttda because it was chunked up too much
+
+            # some help to find a better threshold by useful log messages
+            avg_mask_size = 0
+
+            # now iterate over each topic
+            for ttd1_idx, ttd1 in enumerate(ttda1):
+                # create mask from ttd1 that removes noise from it and keeps the largest terms
+                mask = self.masking_method(ttd1, self.masking_threshold)
+                ttd1_masked = ttd1[mask]
+
+                avg_mask_size += mask.sum()
+
+                # now look at every possible pair for ttd1:
+                for ttd2_idx, ttd2 in enumerate(ttda2):
+                    # distance to itself is 0
+                    if ttd1_idx + start_index == ttd2_idx:
+                        distances[ttd1_idx][ttd2_idx] = 0
+                        continue
+
+                    # now mask ttd2 based on ttd1, which will force the shape of ttd1 onto ttd2
+                    ttd2_masked = ttd2[mask]
+
+                    # Smart distance calculation avoids calculating cosine distance for highly masked topic-term
+                    # distributions that will have distance values near 1.
+ if ttd2_masked.sum() <= _COSINE_DISTANCE_CALCULATION_THRESHOLD: + distance = 1 + else: + distance = cosine(ttd1_masked, ttd2_masked) + + distances[ttd1_idx][ttd2_idx] = distance + + percent = round(100 * avg_mask_size / ttda1.shape[0] / ttda1.shape[1], 1) + logger.info(f'the given threshold of {self.masking_threshold} covered on average {percent}% of tokens') + + return distances + + def _asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pipe): """Worker that computes the distance to all other nodes from a chunk of nodes.""" logger.info(f"spawned worker to generate {n_ttdas} rows of the asymmetric distance matrix") # the chunk of ttda that's going to be calculated: ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas] - distance_chunk = _calculate_asymmetric_distance_matrix_chunk( - ttda1=ttda1, ttda2=self.ttda, threshold=threshold, start_index=ttdas_sent, method=method, + distance_chunk = self._calculate_asymmetric_distance_matrix_chunk( + ttda1=ttda1, ttda2=self.ttda, start_index=ttdas_sent ) pipe.send((worker_id, distance_chunk)) # remember that this code is inside the workers memory pipe.close() @@ -947,21 +942,16 @@ def _generate_asymmetric_distance_matrix(self): """ workers = self.distance_workers - threshold = self.masking_threshold - method = self.masking_method # matrix is up to date afterwards self.asymmetric_distance_matrix_outdated = False - if threshold is None: - threshold = {"mass": 0.95, "rank": 0.11}[method] - logger.info(f"generating a {len(self.ttda)} x {len(self.ttda)} asymmetric distance matrix...") # singlecore if workers is not None and workers <= 1: - self.asymmetric_distance_matrix = _calculate_asymmetric_distance_matrix_chunk( - ttda1=self.ttda, ttda2=self.ttda, threshold=threshold, start_index=0, method=method, + self.asymmetric_distance_matrix = self._calculate_asymmetric_distance_matrix_chunk( + ttda1=self.ttda, ttda2=self.ttda, start_index=0, ) # else, if workers > 1 use multiprocessing @@ -989,7 +979,7 @@ def _generate_asymmetric_distance_matrix(self): process = Process( target=self._asymmetric_distance_matrix_worker, - args=(i, ttdas_sent, n_ttdas, child_conn, threshold, method), + args=(i, ttdas_sent, n_ttdas, child_conn), ) ttdas_sent += n_ttdas From 0311d94cd24206980fb88411068a4234881e9dbb Mon Sep 17 00:00:00 2001 From: aloosley Date: Wed, 30 Jun 2021 23:17:02 +0200 Subject: [PATCH 154/166] sort key function --- gensim/models/ensemblelda.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 1b01faccf7..007443edfe 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -136,7 +136,7 @@ def _remove_from_all_sets(label, clusters): def _contains_isolated_cores(label, cluster, min_cores): """Check if the cluster has at least ``min_cores`` of cores that belong to no other cluster.""" - return sum(map(lambda x: x == {label}, cluster["neighboring_labels"])) >= min_cores + return sum(map(lambda neighboring_labels: neighboring_labels == {label}, cluster["neighboring_labels"])) >= min_cores def _aggregate_topics(grouped_by_labels): @@ -333,15 +333,10 @@ def _validate_clusters(clusters, min_cores): # Clusters with noisy invalid neighbors may have a harder time being marked as stable, so start with the # easy ones and potentially already remove some noise by also sorting smaller clusters to the front. # This clears up the clusters a bit before checking the ones with many neighbors. 
-    sorted_clusters = sorted(
-        clusters,
-        key=lambda cluster: (
-            cluster["max_num_neighboring_labels"],
-            cluster["num_cores"],
-            cluster["label"],
-        ),
-        reverse=False,
-    )
+    def _cluster_sort_key(cluster):
+        return (cluster["max_num_neighboring_labels"], cluster["num_cores"], cluster["label"])
+
+    sorted_clusters = sorted(clusters, key=_cluster_sort_key, reverse=False)

     for cluster in sorted_clusters:
         cluster["is_valid"] = None

From 9617e8f2e746eb5e6dfef9fd6fd6e630181f1d72 Mon Sep 17 00:00:00 2001
From: aloosley
Date: Thu, 1 Jul 2021 00:18:15 +0200
Subject: [PATCH 155/166] more efficient tests with better case names

---
 gensim/test/test_ensemblelda.py | 326 +++++++++++++++++----------------
 1 file changed, 174 insertions(+), 152 deletions(-)

diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index 1c0004f51b..6cd18ab550 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -22,145 +22,154 @@
 NUM_TOPICS = 2
 NUM_MODELS = 4
 PASSES = 50
+RANDOM_STATE = 0

 # windows tests fail due to the required assertion precision being too high
 RTOL = 1e-04 if os.name == 'nt' else 1e-05


 class TestEnsembleLda(unittest.TestCase):
-    def setUp(self):
-        # same configuration for each model to make sure
-        # the topics are equal
-        random_state = 0
-
-        self.elda = EnsembleLda(
+    def get_elda(self):
+        return EnsembleLda(
             corpus=common_corpus, id2word=common_dictionary, num_topics=NUM_TOPICS,
-            passes=PASSES, num_models=NUM_MODELS, random_state=random_state,
+            passes=PASSES, num_models=NUM_MODELS, random_state=RANDOM_STATE,
             topic_model_class=LdaModel,
         )

-        self.elda_mem_unfriendly = EnsembleLda(
+    def get_elda_mem_unfriendly(self):
+        return EnsembleLda(
             corpus=common_corpus, id2word=common_dictionary, num_topics=NUM_TOPICS,
-            passes=PASSES, num_models=NUM_MODELS, random_state=random_state,
+            passes=PASSES, num_models=NUM_MODELS, random_state=RANDOM_STATE,
             memory_friendly_ttda=False, topic_model_class=LdaModel,
         )

     def assert_ttda_is_valid(self, elda):
         """Check that ttda has one or more topics and that term probabilities add to one."""
         assert len(elda.ttda) > 0
         sum_over_terms = elda.ttda.sum(axis=1)
         expected_sum_over_terms = np.ones(len(elda.ttda)).astype(np.float32)
         np.testing.assert_allclose(sum_over_terms, expected_sum_over_terms, rtol=1e-04)

     def test_elda(self):
+        elda = self.get_elda()
+
         # given that the random_state doesn't change, it should
         # always be 2 detected topics in this setup.
- assert self.elda.stable_topics.shape[1] == len(common_dictionary) - assert len(self.elda.ttda) == NUM_MODELS * NUM_TOPICS - self.assert_ttda_is_valid(self.elda) - - # reclustering shouldn't change anything without - # added models or different parameters - self.elda.recluster() + assert elda.stable_topics.shape[1] == len(common_dictionary) + assert len(elda.ttda) == NUM_MODELS * NUM_TOPICS + self.assert_ttda_is_valid(elda) - assert self.elda.stable_topics.shape[1] == len(common_dictionary) - assert len(self.elda.ttda) == NUM_MODELS * NUM_TOPICS - self.assert_ttda_is_valid(self.elda) + def test_backwards_compatibility_with_persisted_model(self): + elda = self.get_elda() # compare with a pre-trained reference model - reference = EnsembleLda.load(datapath('ensemblelda')) - np.testing.assert_allclose(self.elda.ttda, reference.ttda, rtol=RTOL) - # small values in the distance matrix tend to vary quite a bit around 2%, - # so use some absolute tolerance measurement to check if the matrix is at least - # close to the target. - atol = reference.asymmetric_distance_matrix.max() * 1e-05 + loaded_elda = EnsembleLda.load(datapath('ensemblelda')) + np.testing.assert_allclose(elda.ttda, loaded_elda.ttda, rtol=RTOL) + atol = loaded_elda.asymmetric_distance_matrix.max() * 1e-05 np.testing.assert_allclose( - self.elda.asymmetric_distance_matrix, - reference.asymmetric_distance_matrix, atol=atol, + elda.asymmetric_distance_matrix, + loaded_elda.asymmetric_distance_matrix, atol=atol, ) - def test_clustering(self): + def test_recluster(self): # the following test is quite specific to the current implementation and not part of any api, # but it makes improving those sections of the code easier as long as sorted_clusters and the # cluster_model results are supposed to stay the same. Potentially this test will deprecate. - reference = EnsembleLda.load(datapath('ensemblelda')) - cluster_model_results = deepcopy(reference.cluster_model.results) - valid_clusters = deepcopy(reference.valid_clusters) - stable_topics = deepcopy(reference.get_topics()) + elda = EnsembleLda.load(datapath('ensemblelda')) + loaded_cluster_model_results = deepcopy(elda.cluster_model.results) + loaded_valid_clusters = deepcopy(elda.valid_clusters) + loaded_stable_topics = deepcopy(elda.get_topics()) # continue training with the distance matrix of the pretrained reference and see if # the generated clusters match. 
- reference.asymmetric_distance_matrix_outdated = True - reference.recluster() + elda.asymmetric_distance_matrix_outdated = True + elda.recluster() - self.assert_cluster_results_equal(reference.cluster_model.results, cluster_model_results) - assert reference.valid_clusters == valid_clusters - np.testing.assert_allclose(reference.get_topics(), stable_topics, rtol=RTOL) + self.assert_clustering_results_equal(elda.cluster_model.results, loaded_cluster_model_results) + assert elda.valid_clusters == loaded_valid_clusters + np.testing.assert_allclose(elda.get_topics(), loaded_stable_topics, rtol=RTOL) - def test_not_trained(self): - # should not throw errors and no training should happen + def test_recluster_does_nothing_when_stable_topics_already_found(self): + elda = self.get_elda() - # 0 passes + # reclustering shouldn't change anything without + # added models or different parameters + elda.recluster() + + assert elda.stable_topics.shape[1] == len(common_dictionary) + assert len(elda.ttda) == NUM_MODELS * NUM_TOPICS + self.assert_ttda_is_valid(elda) + + def test_not_trained_given_zero_passes(self): elda = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=NUM_TOPICS, - passes=0, num_models=NUM_MODELS, random_state=0, + passes=0, num_models=NUM_MODELS, random_state=RANDOM_STATE, ) assert len(elda.ttda) == 0 - # no corpus + def test_not_trained_given_no_corpus(self): elda = EnsembleLda( id2word=common_dictionary, num_topics=NUM_TOPICS, - passes=PASSES, num_models=NUM_MODELS, random_state=0, + passes=PASSES, num_models=NUM_MODELS, random_state=RANDOM_STATE, ) assert len(elda.ttda) == 0 - # 0 iterations + def test_not_trained_given_zero_iterations(self): elda = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=NUM_TOPICS, - iterations=0, num_models=NUM_MODELS, random_state=0, + iterations=0, num_models=NUM_MODELS, random_state=RANDOM_STATE, ) assert len(elda.ttda) == 0 - # 0 models + def test_not_trained_given_zero_models(self): elda = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=NUM_TOPICS, - passes=PASSES, num_models=0, random_state=0 + passes=PASSES, num_models=0, random_state=RANDOM_STATE ) assert len(elda.ttda) == 0 def test_mem_unfriendly(self): - # at this point, self.elda_mem_unfriendly and self.elda are already trained - # in the setUp function and the topics extracted should be equal up to floating - # point variations between the two implementations. 
- assert len(self.elda_mem_unfriendly.tms) == NUM_MODELS - np.testing.assert_allclose(self.elda.ttda, self.elda_mem_unfriendly.ttda, rtol=RTOL) - np.testing.assert_allclose(self.elda.get_topics(), self.elda_mem_unfriendly.get_topics(), rtol=RTOL) - self.assert_ttda_is_valid(self.elda_mem_unfriendly) - - def test_generate_gensim_rep(self): - gensim_model = self.elda.generate_gensim_representation() + # elda_mem_unfriendly and self.elda should have topics that are + # the same up to floating point variations caused by the two different + # implementations + + elda = self.get_elda() + elda_mem_unfriendly = self.get_elda_mem_unfriendly() + + assert len(elda_mem_unfriendly.tms) == NUM_MODELS + np.testing.assert_allclose(elda.ttda, elda_mem_unfriendly.ttda, rtol=RTOL) + np.testing.assert_allclose(elda.get_topics(), elda_mem_unfriendly.get_topics(), rtol=RTOL) + self.assert_ttda_is_valid(elda_mem_unfriendly) + + def test_generate_gensim_representation(self): + elda = self.get_elda() + + gensim_model = elda.generate_gensim_representation() topics = gensim_model.get_topics() - np.testing.assert_allclose(self.elda.get_topics(), topics, rtol=RTOL) + np.testing.assert_allclose(elda.get_topics(), topics, rtol=RTOL) - def assert_cluster_results_equal(self, cluster_model_results_1, cluster_model_results_2): + def assert_clustering_results_equal(self, clustering_results_1, clustering_results_2): """Assert important attributes of the cluster results""" np.testing.assert_array_equal( - [row["label"] for row in cluster_model_results_1], - [row["label"] for row in cluster_model_results_2], + [element["label"] for element in clustering_results_1], + [element["label"] for element in clustering_results_2], ) np.testing.assert_array_equal( - [row["is_core"] for row in cluster_model_results_1], - [row["is_core"] for row in cluster_model_results_2], + [element["is_core"] for element in clustering_results_1], + [element["is_core"] for element in clustering_results_2], ) def test_persisting(self): + elda = self.get_elda() + elda_mem_unfriendly = self.get_elda_mem_unfriendly() + fname = get_tmpfile('gensim_models_ensemblelda') - self.elda.save(fname) + elda.save(fname) loaded_elda = EnsembleLda.load(fname) # storing the ensemble without memory_friendy_ttda - self.elda_mem_unfriendly.save(fname) - loaded_elda_mu = EnsembleLda.load(fname) + elda_mem_unfriendly.save(fname) + loaded_elda_mem_unfriendly = EnsembleLda.load(fname) # topic_model_class will be lazy loaded and should be None first assert loaded_elda.topic_model_class is None @@ -175,23 +184,23 @@ def test_persisting(self): topics = loaded_elda_representation.get_topics() ttda = loaded_elda.ttda amatrix = loaded_elda.asymmetric_distance_matrix - np.testing.assert_allclose(self.elda.get_topics(), topics, rtol=RTOL) - np.testing.assert_allclose(self.elda.ttda, ttda, rtol=RTOL) - np.testing.assert_allclose(self.elda.asymmetric_distance_matrix, amatrix, rtol=RTOL) + np.testing.assert_allclose(elda.get_topics(), topics, rtol=RTOL) + np.testing.assert_allclose(elda.ttda, ttda, rtol=RTOL) + np.testing.assert_allclose(elda.asymmetric_distance_matrix, amatrix, rtol=RTOL) - a = self.elda.cluster_model.results - b = loaded_elda.cluster_model.results + expected_clustering_results = elda.cluster_model.results + loaded_clustering_results = loaded_elda.cluster_model.results - self.assert_cluster_results_equal(a, b) + self.assert_clustering_results_equal(expected_clustering_results, loaded_clustering_results) # memory unfriendly - loaded_elda_mem_unfriendly_representation = 
loaded_elda_mu.generate_gensim_representation() + loaded_elda_mem_unfriendly_representation = loaded_elda_mem_unfriendly.generate_gensim_representation() topics = loaded_elda_mem_unfriendly_representation.get_topics() - np.testing.assert_allclose(self.elda.get_topics(), topics, rtol=RTOL) + np.testing.assert_allclose(elda.get_topics(), topics, rtol=RTOL) def test_multiprocessing(self): # same configuration - random_state = 0 + random_state = RANDOM_STATE # use 3 processes for the ensemble and the distance, # so that the 4 models and 8 topics cannot be distributed @@ -199,6 +208,7 @@ def test_multiprocessing(self): workers = 3 # memory friendly. contains List of topic word distributions + elda = self.get_elda() elda_multiprocessing = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, topic_model_class=LdaModel, num_topics=NUM_TOPICS, passes=PASSES, num_models=NUM_MODELS, @@ -206,6 +216,7 @@ def test_multiprocessing(self): ) # memory unfriendly. contains List of models + elda_mem_unfriendly = self.get_elda_mem_unfriendly() elda_multiprocessing_mem_unfriendly = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, topic_model_class=LdaModel, num_topics=NUM_TOPICS, passes=PASSES, num_models=NUM_MODELS, @@ -213,21 +224,23 @@ def test_multiprocessing(self): memory_friendly_ttda=False, ) - np.testing.assert_allclose(self.elda.get_topics(), elda_multiprocessing.get_topics(), rtol=RTOL) - np.testing.assert_allclose(self.elda_mem_unfriendly.get_topics(), elda_multiprocessing_mem_unfriendly.get_topics(), rtol=RTOL) + np.testing.assert_allclose(elda.get_topics(), elda_multiprocessing.get_topics(), rtol=RTOL) + np.testing.assert_allclose(elda_mem_unfriendly.get_topics(), elda_multiprocessing_mem_unfriendly.get_topics(), rtol=RTOL) def test_add_models_to_empty(self): + elda = self.get_elda() + ensemble = EnsembleLda(id2word=common_dictionary, num_models=0) - ensemble.add_model(self.elda.ttda[0:1]) - ensemble.add_model(self.elda.ttda[1:]) + ensemble.add_model(elda.ttda[0:1]) + ensemble.add_model(elda.ttda[1:]) ensemble.recluster() - np.testing.assert_allclose(ensemble.get_topics(), self.elda.get_topics(), rtol=RTOL) + np.testing.assert_allclose(ensemble.get_topics(), elda.get_topics(), rtol=RTOL) # persisting an ensemble that is entirely built from existing ttdas fname = get_tmpfile('gensim_models_ensemblelda') ensemble.save(fname) loaded_ensemble = EnsembleLda.load(fname) - np.testing.assert_allclose(loaded_ensemble.get_topics(), self.elda.get_topics(), rtol=RTOL) + np.testing.assert_allclose(loaded_ensemble.get_topics(), elda.get_topics(), rtol=RTOL) self.test_inference(loaded_ensemble) def test_add_models(self): @@ -242,86 +255,88 @@ def test_add_models(self): num_new_topics = 3 # 1. 
memory friendly - elda = EnsembleLda( + base_elda = self.get_elda() + cumulative_elda = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=1, num_models=num_new_models, - iterations=1, random_state=0, topic_model_class=LdaMulticore, + iterations=1, random_state=RANDOM_STATE, topic_model_class=LdaMulticore, workers=3, ensemble_workers=2, ) # 1.1 ttda - num_topics_before_add_model = len(elda.ttda) - num_models_before_add_model = elda.num_models - elda.add_model(self.elda.ttda) - assert len(elda.ttda) == num_topics_before_add_model + len(self.elda.ttda) - assert elda.num_models == num_models_before_add_model + 1 # defaults to 1 for one ttda matrix + num_topics_before_add_model = len(cumulative_elda.ttda) + num_models_before_add_model = cumulative_elda.num_models + cumulative_elda.add_model(base_elda.ttda) + assert len(cumulative_elda.ttda) == num_topics_before_add_model + len(base_elda.ttda) + assert cumulative_elda.num_models == num_models_before_add_model + 1 # defaults to 1 for one ttda matrix # 1.2 an ensemble - num_topics_before_add_model = len(elda.ttda) - num_models_before_add_model = elda.num_models - elda.add_model(self.elda, 5) - assert len(elda.ttda) == num_topics_before_add_model + len(self.elda.ttda) - assert elda.num_models == num_models_before_add_model + 5 + num_topics_before_add_model = len(cumulative_elda.ttda) + num_models_before_add_model = cumulative_elda.num_models + cumulative_elda.add_model(base_elda, 5) + assert len(cumulative_elda.ttda) == num_topics_before_add_model + len(base_elda.ttda) + assert cumulative_elda.num_models == num_models_before_add_model + 5 # 1.3 a list of ensembles - num_topics_before_add_model = len(elda.ttda) - num_models_before_add_model = elda.num_models + num_topics_before_add_model = len(cumulative_elda.ttda) + num_models_before_add_model = cumulative_elda.num_models # it should be totally legit to add a memory unfriendly object to a memory friendly one - elda.add_model([self.elda, self.elda_mem_unfriendly]) - assert len(elda.ttda) == num_topics_before_add_model + 2 * len(self.elda.ttda) - assert elda.num_models == num_models_before_add_model + 2 * NUM_MODELS + base_elda_mem_unfriendly = self.get_elda_mem_unfriendly() + cumulative_elda.add_model([base_elda, base_elda_mem_unfriendly]) + assert len(cumulative_elda.ttda) == num_topics_before_add_model + 2 * len(base_elda.ttda) + assert cumulative_elda.num_models == num_models_before_add_model + 2 * NUM_MODELS # 1.4 a single gensim model - model = self.elda.classic_model_representation + model = base_elda.classic_model_representation - num_topics_before_add_model = len(elda.ttda) - num_models_before_add_model = elda.num_models - elda.add_model(model) - assert len(elda.ttda) == num_topics_before_add_model + len(model.get_topics()) - assert elda.num_models == num_models_before_add_model + 1 + num_topics_before_add_model = len(cumulative_elda.ttda) + num_models_before_add_model = cumulative_elda.num_models + cumulative_elda.add_model(model) + assert len(cumulative_elda.ttda) == num_topics_before_add_model + len(model.get_topics()) + assert cumulative_elda.num_models == num_models_before_add_model + 1 # 1.5 a list gensim models - num_topics_before_add_model = len(elda.ttda) - num_models_before_add_model = elda.num_models - elda.add_model([model, model]) - assert len(elda.ttda) == num_topics_before_add_model + 2 * len(model.get_topics()) - assert elda.num_models == num_models_before_add_model + 2 + num_topics_before_add_model = len(cumulative_elda.ttda) + 
num_models_before_add_model = cumulative_elda.num_models + cumulative_elda.add_model([model, model]) + assert len(cumulative_elda.ttda) == num_topics_before_add_model + 2 * len(model.get_topics()) + assert cumulative_elda.num_models == num_models_before_add_model + 2 - self.assert_ttda_is_valid(elda) + self.assert_ttda_is_valid(cumulative_elda) # 2. memory unfriendly elda_mem_unfriendly = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=1, num_models=num_new_models, - iterations=1, random_state=0, topic_model_class=LdaMulticore, + iterations=1, random_state=RANDOM_STATE, topic_model_class=LdaMulticore, workers=3, ensemble_workers=2, memory_friendly_ttda=False, ) # 2.1 a single ensemble num_topics_before_add_model = len(elda_mem_unfriendly.tms) num_models_before_add_model = elda_mem_unfriendly.num_models - elda_mem_unfriendly.add_model(self.elda_mem_unfriendly) + elda_mem_unfriendly.add_model(base_elda_mem_unfriendly) assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + NUM_MODELS assert elda_mem_unfriendly.num_models == num_models_before_add_model + NUM_MODELS # 2.2 a list of ensembles num_topics_before_add_model = len(elda_mem_unfriendly.tms) num_models_before_add_model = elda_mem_unfriendly.num_models - elda_mem_unfriendly.add_model([self.elda_mem_unfriendly, self.elda_mem_unfriendly]) + elda_mem_unfriendly.add_model([base_elda_mem_unfriendly, base_elda_mem_unfriendly]) assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + 2 * NUM_MODELS assert elda_mem_unfriendly.num_models == num_models_before_add_model + 2 * NUM_MODELS # 2.3 a single gensim model num_topics_before_add_model = len(elda_mem_unfriendly.tms) num_models_before_add_model = elda_mem_unfriendly.num_models - elda_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms[0]) + elda_mem_unfriendly.add_model(base_elda_mem_unfriendly.tms[0]) assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + 1 assert elda_mem_unfriendly.num_models == num_models_before_add_model + 1 # 2.4 a list of gensim models num_topics_before_add_model = len(elda_mem_unfriendly.tms) num_models_before_add_model = elda_mem_unfriendly.num_models - elda_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms) + elda_mem_unfriendly.add_model(base_elda_mem_unfriendly.tms) assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model + NUM_MODELS assert elda_mem_unfriendly.num_models == num_models_before_add_model + NUM_MODELS @@ -330,7 +345,7 @@ def test_add_models(self): num_topics_before_add_model = len(elda_mem_unfriendly.tms) num_models_before_add_model = elda_mem_unfriendly.num_models with pytest.raises(ValueError): - elda_mem_unfriendly.add_model(self.elda_mem_unfriendly.tms[0].get_topics()) + elda_mem_unfriendly.add_model(base_elda_mem_unfriendly.tms[0].get_topics()) # remains unchanged assert len(elda_mem_unfriendly.tms) == num_topics_before_add_model assert elda_mem_unfriendly.num_models == num_models_before_add_model @@ -342,67 +357,74 @@ def test_add_and_recluster(self): # See if after adding a model, the model still makes sense num_new_models = 3 num_new_topics = 3 + random_state = 1 - # train - elda = EnsembleLda( + # train models two sets of models (mem friendly and unfriendly) + elda_1 = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=10, num_models=num_new_models, - iterations=30, random_state=1, topic_model_class='lda', + iterations=30, random_state=random_state, topic_model_class='lda', distance_workers=4, ) 
- elda_mem_unfriendly = EnsembleLda( + elda_mem_unfriendly_1 = EnsembleLda( corpus=common_corpus, id2word=common_dictionary, num_topics=num_new_topics, passes=10, num_models=num_new_models, - iterations=30, random_state=1, topic_model_class=LdaModel, + iterations=30, random_state=random_state, topic_model_class=LdaModel, distance_workers=4, memory_friendly_ttda=False, ) + elda_2 = self.get_elda() + elda_mem_unfriendly_2 = self.get_elda_mem_unfriendly() + assert elda_1.random_state != elda_2.random_state + assert elda_mem_unfriendly_1.random_state != elda_mem_unfriendly_2.random_state + # both should be similar - np.testing.assert_allclose(elda.ttda, elda_mem_unfriendly.ttda, rtol=RTOL) - np.testing.assert_allclose(elda.get_topics(), elda_mem_unfriendly.get_topics(), rtol=RTOL) + np.testing.assert_allclose(elda_1.ttda, elda_mem_unfriendly_1.ttda, rtol=RTOL) + np.testing.assert_allclose(elda_1.get_topics(), elda_mem_unfriendly_1.get_topics(), rtol=RTOL) # and every next step applied to both should result in similar results # 1. adding to ttda and tms - elda.add_model(self.elda) - elda_mem_unfriendly.add_model(self.elda_mem_unfriendly) - np.testing.assert_allclose(elda.ttda, elda_mem_unfriendly.ttda, rtol=RTOL) - assert len(elda.ttda) == len(self.elda.ttda) + num_new_models * num_new_topics - assert len(elda_mem_unfriendly.ttda) == len(self.elda_mem_unfriendly.ttda) + num_new_models * num_new_topics - assert len(elda_mem_unfriendly.tms) == NUM_MODELS + num_new_models - self.assert_ttda_is_valid(elda) - self.assert_ttda_is_valid(elda_mem_unfriendly) + elda_1.add_model(elda_2) + elda_mem_unfriendly_1.add_model(elda_mem_unfriendly_2) + + np.testing.assert_allclose(elda_1.ttda, elda_mem_unfriendly_1.ttda, rtol=RTOL) + assert len(elda_1.ttda) == len(elda_2.ttda) + num_new_models * num_new_topics + assert len(elda_mem_unfriendly_1.ttda) == len(elda_mem_unfriendly_2.ttda) + num_new_models * num_new_topics + assert len(elda_mem_unfriendly_1.tms) == NUM_MODELS + num_new_models + self.assert_ttda_is_valid(elda_1) + self.assert_ttda_is_valid(elda_mem_unfriendly_1) # 2. distance matrix - elda._generate_asymmetric_distance_matrix() - elda_mem_unfriendly._generate_asymmetric_distance_matrix() + elda_1._generate_asymmetric_distance_matrix() + elda_mem_unfriendly_1._generate_asymmetric_distance_matrix() np.testing.assert_allclose( - elda.asymmetric_distance_matrix, - elda_mem_unfriendly.asymmetric_distance_matrix, + elda_1.asymmetric_distance_matrix, + elda_mem_unfriendly_1.asymmetric_distance_matrix, ) # 3. CBDBSCAN results - elda._generate_topic_clusters() - elda_mem_unfriendly._generate_topic_clusters() - a = elda.cluster_model.results - b = elda_mem_unfriendly.cluster_model.results - self.assert_cluster_results_equal(a, b) + elda_1._generate_topic_clusters() + elda_mem_unfriendly_1._generate_topic_clusters() + clustering_results = elda_1.cluster_model.results + mem_unfriendly_clustering_results = elda_mem_unfriendly_1.cluster_model.results + self.assert_clustering_results_equal(clustering_results, mem_unfriendly_clustering_results) # 4. 
finally, the stable topics
-        elda._generate_stable_topics()
-        elda_mem_unfriendly._generate_stable_topics()
+        elda_1._generate_stable_topics()
+        elda_mem_unfriendly_1._generate_stable_topics()

         np.testing.assert_allclose(
-            elda.get_topics(),
-            elda_mem_unfriendly.get_topics(),
+            elda_1.get_topics(),
+            elda_mem_unfriendly_1.get_topics(),
         )

-        elda.generate_gensim_representation()
-        elda_mem_unfriendly.generate_gensim_representation()
+        elda_1.generate_gensim_representation()
+        elda_mem_unfriendly_1.generate_gensim_representation()

         # same random state, hence topics should still be similar
-        np.testing.assert_allclose(elda.get_topics(), elda_mem_unfriendly.get_topics(), rtol=RTOL)
+        np.testing.assert_allclose(elda_1.get_topics(), elda_mem_unfriendly_1.get_topics(), rtol=RTOL)

     def test_inference(self, elda=None):
         if elda is None:
-            elda = self.elda
+            elda = self.get_elda()

         # get the most likely token id from topic 0
         max_id = np.argmax(elda.get_topics()[0, :])

From 07f514827dff0d99d6434f66bfed0aa91c5377e8 Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Sun, 4 Jul 2021 22:18:20 +0200
Subject: [PATCH 156/166] new reference model

---
 gensim/test/test_data/ensemblelda | Bin 10082 -> 10643 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
index 127d00888c3c5184ca18d49e7117e0d31352d7c9..9c746b200f18efbf64f9c5b3e514a0abd7456a45 100644
GIT binary patch
delta 7306
[base85-encoded binary delta omitted]
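PATCH 156 above only swaps the pickled reference model that the tests load via EnsembleLda.load(datapath('ensemblelda')). A minimal sketch of how such a fixture could be rebuilt; this is hypothetical, since the actual regeneration script is not part of this series, and it assumes the same constants as gensim/test/test_ensemblelda.py:

    from gensim.models import EnsembleLda, LdaModel
    from gensim.test.utils import common_corpus, common_dictionary

    # same configuration as get_elda() in the test suite, so that the
    # assertions against the persisted model keep holding
    elda = EnsembleLda(
        corpus=common_corpus, id2word=common_dictionary, num_topics=2,
        passes=50, num_models=4, random_state=0, topic_model_class=LdaModel,
    )
    elda.save('gensim/test/test_data/ensemblelda')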
z#c3`cxPNeU!D+#?F51c9?y|GeBad#BeJy*vP=`=q@*cm5fF zSo%kMMcJgA1(M>&(|7bUXJ1d0aT_hKd>s;ctf^|^1i#W~?YB_}W^%4&olq87yXe1X zBxQSdUQSN_yR-9QTKBPa*@L%f4ja?t7jLLDFPz)PZe796B(mQ|s*`>l^r|`TkJIBG zolEjvzM)a?qg?XvgLRB?c=z1*M#!rN??0K$;f%XhwLgAr*9YsYx|~&oM;>Up`#1BG z``HhDn5o`H*DW^gto`C1 z@=bQ&(bbV2E1t-@%!9Ld7F)u+IUDAr$zSiCsqz~&_T3uG3v(CE8hpQaZCIye&SS?d zW%6@9H`!UiqYR^FSL7{QhKJlKt8~by(hLwR6gFQsF0eUU=Dv8vkV+kMnr4@?Ii$Y+wCQ>Fknh@3B=vZMS25hkzz@oVw)g$a_9E z3qK(fhhN={v|XRd3>_7mXg0I@c4jqm_WJ#L%Z`o5EbfmVvc_Wb@pAOA7pH69;m#=~ z12yyWD;zJsKD&A3+JX&odz)+CTXL-@Y@AVdX3481^TUhW#+w}7;qWwJtlP4XTf6HL zu_3oQ{DqfXpQ#VFj+oE2Q~kBA%zK=ou{gTs5KXIGc;?==%qwXV!z=U3MYFeP*X574 ze3~&mxJ=!3*Ex${;NG#Jx~tB$jIrd!nVAE@$e)tCZKappft-zVPZxEtZl2q1 zEnBs8+Jn|pvpMVc%)E2|qj}L6^XAlm^!ADk$Lnes8S$*NAoYW*E)~xEh91TL3ZKlp zl#V@r<@08N<)@~QK1)2>D{cxaLq=?kj&$;|oV3P;zP+DQtcO9{th`-qO&-%K-EI1$ z-D`Riy zUmbLL+W;#|bFbs0Rxf_~2XQH3js7Ru(2bX?W(9n>Q!_f$QIo#@BycVn*;X?Pi(l{c z%ig-y$&Cq5wfncTONL*Xbm1=(q5Js9eSh`Ysn_)?MOptMv%e~ecGxWQ7Z|6Q9g;&N_&ipFDJejanq22ioj z%y{tB>kB^)u&&Lg&+z?C|Csy4ne&ntK5(Y*!k2B0vO1GDy6rw)o-po7_O?;8?vCZw zPw2j@VMa33Y~MTV^B9wBUbxyfg%uxAHDt?WkXvUo!nV!NXVT8;v5n^s_D`2DTEL$A zG*-&UH?25w!YA{2eR$b$L{oFp8PmIw`uvL0Lr|0w*QDND`iAfu_yMgx(_HaY?(yE) z>sG%3D-S1KefjB&CBDqWP*Z%dAas#HanGfwsQrcg>_z;pcUcR|+I(vWldkRhd+%!T z5gXnsj-0ypx8sWH@~<9^cZV*IZ}OcfZo6(+n(wn~g-gBjgBu^6f1RC@rzx7wDtvr? z)14iCPye}aqg|I7?UiS8U*103l~4cj4wwG2er==wwZjf&7T?OzP&8qR-`&yWDOCx~ zDPeJSgX8}AW#YXv+NN)7+}&A6BVOR+7vIocE^7Da|LD_HYgzM!lmq#H6sM`~uI#4s z%QNwf?Pblv&L?F-11s}BRiBimjnu5LiFELO!TK`CanG8fkylD5m42DYxwYiZ-mXcZ zp`VHU&mWE2Tbt6-<7eIQ_MFfAw|2U{uCcnWIxvgcV) zZamnBT2cK9Zt8q4Pmw}^{c3Q4foI}{cq-zJ8pDQ|Tt$F`8V;yoa6m2j?xy8}L-6Sh zhr=*3a5#c-z|mgJF<1t`spGJ>0*5+SD!}0ce3iqY9u^F6I0?y5kA~&oWfR~w2eiYE z^Bo3NNt10u0^EhwOn(lz$G{!&;rMtw7=z^xL6+jMBJ9XR?IpYgWcPzB#Y8DMU#1}Y zV!BQ;0q(=9NY@hvoeaEJ&;tfO0QbcG@o5+y){%!(G&B_kOBu8uH!6tDjSq?riHYLE zZdGC^ct~~>HDp<8^}XvR!IZd62p*w^aE1VnQEj)As}XpTkJ@WrJK1Z%)1R{a85wj_ z#FR|-L_K|&jlDm%X!FN_>K2xv%%{+PWeb_7je0?wn0W`bn1K5u6;~gT^u52VJ?z)>t@Zyt_s$!jiP?SbbPi(x`@=ld*V#cyHOPR6K#h-2;AmPB$cArhm zS*}+cd#;d*C_}$1>@j;N+kN?mXQhgqc#V!O>+PPS_@4g Date: Sun, 4 Jul 2021 22:18:53 +0200 Subject: [PATCH 157/166] updated opinosis example --- docs/notebooks/ensemble_lda_with_opinosis.ipynb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/notebooks/ensemble_lda_with_opinosis.ipynb b/docs/notebooks/ensemble_lda_with_opinosis.ipynb index cf96d3d74a..354a2eeff8 100644 --- a/docs/notebooks/ensemble_lda_with_opinosis.ipynb +++ b/docs/notebooks/ensemble_lda_with_opinosis.ipynb @@ -10,6 +10,7 @@ "source": [ "import logging\n", "from gensim.models import EnsembleLda, LdaMulticore\n", + "from gensim.models.ensemblelda import rank_masking\n", "from gensim.corpora import OpinosisCorpus\n", "import os" ] @@ -130,7 +131,7 @@ "elda = EnsembleLda(\n", " corpus=opinosis.corpus, id2word=opinosis.id2word, num_models=128, num_topics=20,\n", " passes=20, iterations=100, ensemble_workers=3, distance_workers=4,\n", - " topic_model_class='ldamulticore', masking_method='rank',\n", + " topic_model_class='ldamulticore', masking_method=rank_masking,\n", ")\n", "pretty_print_topics()" ] @@ -169,7 +170,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.3" + "version": "3.9.5" } }, "nbformat": 4, From 1e5108ccb6f301d8622d763ab8eaab01ecdaf887 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Sun, 4 Jul 2021 22:34:37 +0200 Subject: [PATCH 158/166] tox --- gensim/models/ensemblelda.py | 2 +- gensim/test/test_ensemblelda.py | 12 ++++++++++-- 2 files changed, 11 
insertions(+), 3 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 48cac7ab6d..856fa2cc26 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -136,7 +136,7 @@ def _remove_from_all_sets(label, clusters): def _contains_isolated_cores(label, cluster, min_cores): """Check if the cluster has at least ``min_cores`` of cores that belong to no other cluster.""" - return sum(map(lambda neighboring_labels: neighboring_labels == {label}, cluster["neighboring_labels"])) >= min_cores + return sum([neighboring_labels == {label} for neighboring_labels in cluster["neighboring_labels"]]) >= min_cores def _aggregate_topics(grouped_by_labels): diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py index 6cd18ab550..5cdccbfc73 100644 --- a/gensim/test/test_ensemblelda.py +++ b/gensim/test/test_ensemblelda.py @@ -224,8 +224,16 @@ def test_multiprocessing(self): memory_friendly_ttda=False, ) - np.testing.assert_allclose(elda.get_topics(), elda_multiprocessing.get_topics(), rtol=RTOL) - np.testing.assert_allclose(elda_mem_unfriendly.get_topics(), elda_multiprocessing_mem_unfriendly.get_topics(), rtol=RTOL) + np.testing.assert_allclose( + elda.get_topics(), + elda_multiprocessing.get_topics(), + rtol=RTOL + ) + np.testing.assert_allclose( + elda_mem_unfriendly.get_topics(), + elda_multiprocessing_mem_unfriendly.get_topics(), + rtol=RTOL + ) def test_add_models_to_empty(self): elda = self.get_elda() From 9ac34390e5597baa00c92819d94c91434ed96926 Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 5 Jul 2021 23:37:22 +0200 Subject: [PATCH 159/166] using dataclasses --- gensim/models/ensemblelda.py | 133 +++++++++++++++++------------- gensim/test/test_data/ensemblelda | Bin 10643 -> 10825 bytes gensim/test/test_ensemblelda.py | 8 +- 3 files changed, 80 insertions(+), 61 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 856fa2cc26..28684c247c 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -99,14 +99,17 @@ import os from multiprocessing import Process, Pipe, ProcessError import importlib +from typing import Set, Optional, List import numpy as np from scipy.spatial.distance import cosine +from dataclasses import dataclass, replace from gensim import utils from gensim.models import ldamodel, ldamulticore, basemodel from gensim.utils import SaveLoad + logger = logging.getLogger(__name__) # _COSINE_DISTANCE_CALCULATION_THRESHOLD is used so that cosine distance calculations can be sped up by skipping @@ -114,29 +117,47 @@ _COSINE_DISTANCE_CALCULATION_THRESHOLD = 0.05 +@dataclass +class Topic: + is_core: bool # if the topic has enough neighbors + neighboring_labels: Set[int] # which other clusters are close by + neighboring_topic_indices: Set[int] # which other topics are close by + label: Optional[int] # to which cluster this topic belongs + num_neighboring_labels: int # how many different labels a core has as parents + valid_neighboring_labels: Set[int] # A set of labels of close by clusters that are large enough + + +@dataclass +class Cluster: + max_num_neighboring_labels: int # the max number of parent labels among each topic of a given cluster + neighboring_labels: List[Set[int]] # a concatenated list of the neighboring_labels sets of each topic + label: int # the unique identifier of the cluster + num_cores: int # how many topics in the cluster are cores + + def _is_valid_core(topic): """Check if the topic is a valid core, i.e. 
no neighboring valid cluster is overlapping with it. Parameters ---------- - topic : {'is_core', 'valid_neighboring_labels', 'label'} + topic : Topic topic to validate """ - return topic["is_core"] and (topic["valid_neighboring_labels"] == {topic["label"]}) + return topic.is_core and (topic.valid_neighboring_labels == {topic.label}) def _remove_from_all_sets(label, clusters): """Remove a label from every set in "neighboring_labels" for each core in ``clusters``.""" for cluster in clusters: - for neighboring_labels_set in cluster["neighboring_labels"]: + for neighboring_labels_set in cluster.neighboring_labels: if label in neighboring_labels_set: neighboring_labels_set.remove(label) def _contains_isolated_cores(label, cluster, min_cores): """Check if the cluster has at least ``min_cores`` of cores that belong to no other cluster.""" - return sum([neighboring_labels == {label} for neighboring_labels in cluster["neighboring_labels"]]) >= min_cores + return sum([neighboring_labels == {label} for neighboring_labels in cluster.neighboring_labels]) >= min_cores def _aggregate_topics(grouped_by_labels): @@ -144,37 +165,34 @@ def _aggregate_topics(grouped_by_labels): Parameters ---------- - grouped_by_labels : dict of (int, list of {'is_core', 'num_neighboring_labels', 'neighboring_labels'}) + grouped_by_labels : dict of (int, list of :class:`Topic`) The return value of _group_by_labels. A mapping of the label to a list of each topic which belongs to the label. Returns ------- - list of {'max_num_neighboring_labels', 'neighboring_labels', 'label'} - max_num_neighboring_labels is the max number of parent labels among each topic of a given cluster. label - refers to the label identifier of the cluster. neighboring_labels is a concatenated list of the - neighboring_labels sets of each topic. Its sorted by max_num_neighboring_labels in descending - order. There is one single element for each cluster. + list of :class:`Cluster` + It is sorted by max_num_neighboring_labels in descending order. There is one single element for each cluster. """ clusters = [] - for label, group in grouped_by_labels.items(): + for label, topics in grouped_by_labels.items(): max_num_neighboring_labels = 0 neighboring_labels = [] # will be a list of sets - for topic in group: - max_num_neighboring_labels = max(topic["num_neighboring_labels"], max_num_neighboring_labels) - neighboring_labels.append(topic["neighboring_labels"]) + for topic in topics: + max_num_neighboring_labels = max(topic.num_neighboring_labels, max_num_neighboring_labels) + neighboring_labels.append(topic.neighboring_labels) neighboring_labels = [x for x in neighboring_labels if len(x) > 0] - clusters.append({ - "max_num_neighboring_labels": max_num_neighboring_labels, - "neighboring_labels": neighboring_labels, - "label": label, - "num_cores": len([topic for topic in group if topic["is_core"]]), - }) + clusters.append(Cluster( + max_num_neighboring_labels=max_num_neighboring_labels, + neighboring_labels=neighboring_labels, + label=label, + num_cores=len([topic for topic in topics if topic.is_core]), + )) logger.info("found %s clusters", len(clusters)) @@ -186,15 +204,15 @@ def _group_by_labels(cbdbscan_topics): Parameters ---------- - cbdbscan_topics : {list of {'is_core', 'neighboring_labels', 'label'}} + cbdbscan_topics : list of :class:`Topic` A list of topic data resulting from fitting a :class:`~CBDBSCAN` object. 
After calling .fit on a CBDBSCAN model, the results can be retrieved from it by accessing the .results - member, which can be used as the argument to this function. It's a list of infos gathered during + member, which can be used as the argument to this function. It is a list of infos gathered during the clustering step and each element in the list corresponds to a single topic. Returns ------- - dict of (int, list of {'is_core', 'num_neighboring_labels', 'neighboring_labels'}) + dict of (int, list of :class:`Topic`) A mapping of the label to a list of topics that belong to that particular label. Also adds a new member to each topic called num_neighboring_labels, which is the number of neighboring_labels of that topic. @@ -203,13 +221,10 @@ def _group_by_labels(cbdbscan_topics): grouped_by_labels = {} for topic in cbdbscan_topics: - if topic["is_core"]: - topic = topic.copy() - - # counts how many different labels a core has as parents - topic["num_neighboring_labels"] = len(topic["neighboring_labels"]) + if topic.is_core: + topic.num_neighboring_labels = len(topic.neighboring_labels) - label = topic["label"] + label = topic.label if label not in grouped_by_labels: grouped_by_labels[label] = [] grouped_by_labels[label].append(topic) @@ -267,27 +282,27 @@ def _validate_clusters(clusters, min_cores): # easy ones and potentially already remove some noise by also sorting smaller clusters to the front. # This clears up the clusters a bit before checking the ones with many neighbors. def _cluster_sort_key(cluster): - return (cluster["max_num_neighboring_labels"], cluster["num_cores"], cluster["label"]) + return cluster.max_num_neighboring_labels, cluster.num_cores, cluster.label sorted_clusters = sorted(clusters, key=_cluster_sort_key, reverse=False) for cluster in sorted_clusters: - cluster["is_valid"] = None - if cluster["num_cores"] < min_cores: - cluster["is_valid"] = False - _remove_from_all_sets(cluster["label"], sorted_clusters) + cluster.is_valid = None + if cluster.num_cores < min_cores: + cluster.is_valid = False + _remove_from_all_sets(cluster.label, sorted_clusters) # now that invalid clusters are removed, check which clusters contain enough cores that don't belong to any # other cluster. - for cluster in [cluster for cluster in sorted_clusters if cluster["is_valid"] is None]: - label = cluster["label"] + for cluster in [cluster for cluster in sorted_clusters if cluster.is_valid is None]: + label = cluster.label if _contains_isolated_cores(label, cluster, min_cores): - cluster["is_valid"] = True + cluster.is_valid = True else: - cluster["is_valid"] = False + cluster.is_valid = False _remove_from_all_sets(label, sorted_clusters) - return [cluster for cluster in sorted_clusters if cluster["is_valid"]] + return [cluster for cluster in sorted_clusters if cluster.is_valid] class EnsembleLda(SaveLoad): @@ -631,7 +646,7 @@ def add_model(self, target, num_new_models=None): # If it has, append. # Be flexible. 
Can be a single element or a list of elements - # make sure it's a numpy array + # make sure it is a numpy array if not isinstance(target, (np.ndarray, list)): target = np.array([target]) else: @@ -1059,18 +1074,18 @@ def _generate_stable_topics(self, min_cores=None): grouped_by_labels = _group_by_labels(cbdbscan_topics) clusters = _aggregate_topics(grouped_by_labels) valid_clusters = _validate_clusters(clusters, min_cores) - valid_cluster_labels = {cluster["label"] for cluster in valid_clusters} + valid_cluster_labels = {cluster.label for cluster in valid_clusters} for topic in cbdbscan_topics: - topic["valid_neighboring_labels"] = { - label for label in topic["neighboring_labels"] + topic.valid_neighboring_labels = { + label for label in topic.neighboring_labels if label in valid_cluster_labels } # keeping only VALID cores valid_core_mask = np.vectorize(_is_valid_core)(cbdbscan_topics) valid_topics = self.ttda[valid_core_mask] - topic_labels = np.array([topic["label"] for topic in cbdbscan_topics])[valid_core_mask] + topic_labels = np.array([topic.label for topic in cbdbscan_topics])[valid_core_mask] unique_labels = np.unique(topic_labels) num_stable_topics = len(unique_labels) @@ -1222,12 +1237,16 @@ def fit(self, amatrix): """Apply the algorithm to an asymmetric distance matrix.""" self.next_label = 0 - topic_clustering_results = [{ - "is_core": False, - "neighboring_labels": set(), - "neighboring_topic_indices": set(), - "label": None, - } for _ in range(len(amatrix))] + topic_clustering_results = [ + Topic( + is_core=False, + neighboring_labels=set(), + neighboring_topic_indices=set(), + label=None, + num_neighboring_labels=0, + valid_neighboring_labels=set() + ) for i in range(len(amatrix)) + ] amatrix_copy = amatrix.copy() @@ -1266,7 +1285,7 @@ def scan_topic(topic_index, current_label=None, parent_neighbors=None): # This also takes neighbor indices that already are identified as core in count. if num_neighboring_topics >= self.min_samples: # This topic is a core! - topic_clustering_results[topic_index]["is_core"] = True + topic_clustering_results[topic_index].is_core = True # if current_label is none, then this is the first core # of a new cluster (hence next_label is used) @@ -1287,23 +1306,23 @@ def scan_topic(topic_index, current_label=None, parent_neighbors=None): current_label = self.next_label self.next_label += 1 - topic_clustering_results[topic_index]["label"] = current_label + topic_clustering_results[topic_index].label = current_label for neighboring_topic_index in neighboring_topic_indices: - if topic_clustering_results[neighboring_topic_index]["label"] is None: + if topic_clustering_results[neighboring_topic_index].label is None: ordered_min_similarity.remove(neighboring_topic_index) # try to extend the cluster into the direction of the neighbor scan_topic(neighboring_topic_index, current_label, neighboring_topic_indices + [topic_index]) - topic_clustering_results[neighboring_topic_index]["neighboring_topic_indices"].add(topic_index) - topic_clustering_results[neighboring_topic_index]["neighboring_labels"].add(current_label) + topic_clustering_results[neighboring_topic_index].neighboring_topic_indices.add(topic_index) + topic_clustering_results[neighboring_topic_index].neighboring_labels.add(current_label) else: # this topic is not a core! 
if current_label is None:
-                    topic_clustering_results[topic_index]["label"] = -1
+                    topic_clustering_results[topic_index].label = -1
                 else:
-                    topic_clustering_results[topic_index]["label"] = current_label
+                    topic_clustering_results[topic_index].label = current_label

         # elements are going to be removed from that array in scan_topic, do until it is empty
         while len(ordered_min_similarity) != 0:

diff --git a/gensim/test/test_data/ensemblelda b/gensim/test/test_data/ensemblelda
index 9c746b200f18efbf64f9c5b3e514a0abd7456a45..cf396af29cb6122e2e198c9a783ba118d40ebd1b 100644
GIT binary patch
delta 3330
[base85-encoded binary delta omitted]
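The test updates below exercise the attribute access introduced by the Topic dataclass. A short usage sketch; the field values here are made up for illustration, in the model they are filled in by CBDBSCAN.fit:

    from gensim.models.ensemblelda import Topic

    topic = Topic(
        is_core=True,
        neighboring_labels={0, 1},
        neighboring_topic_indices={2, 5},
        label=0,
        num_neighboring_labels=2,
        valid_neighboring_labels=set(),
    )
    # attribute access replaces the former dict lookups like topic["label"]
    assert topic.label == 0 and topic.is_core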
diff --git a/gensim/test/test_ensemblelda.py b/gensim/test/test_ensemblelda.py
index 5cdccbfc73..ad574108f9 100644
--- a/gensim/test/test_ensemblelda.py
+++ b/gensim/test/test_ensemblelda.py
@@ -152,12 +152,12 @@ def test_generate_gensim_representation(self):
     def assert_clustering_results_equal(self, clustering_results_1, clustering_results_2):
         """Assert important attributes of the cluster results"""
         np.testing.assert_array_equal(
-            [element["label"] for element in clustering_results_1],
-            [element["label"] for element in clustering_results_2],
+            [element.label for element in clustering_results_1],
+            [element.label for element in clustering_results_2],
         )
         np.testing.assert_array_equal(
-            [element["is_core"] for element in clustering_results_1],
-            [element["is_core"] for element in clustering_results_2],
+            [element.is_core for element in clustering_results_1],
+            [element.is_core for element in clustering_results_2],
         )

     def test_persisting(self):

From 773ce173d1bad31b98c0047f579689b5c623fb7e Mon Sep 17 00:00:00 2001
From: sezanzeb
Date: Mon, 5 Jul 2021 23:43:51 +0200
Subject: [PATCH 160/166] updated type syntax for docstring

---
gensim/models/ensemblelda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 28684c247c..c415c7bdfd 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -140,7 +140,7 @@ def _is_valid_core(topic): Parameters ---------- - topic : Topic + topic : :class:`Topic` topic to validate """ From 4d674f98f65ff958f1afde0daedec2b8962e4e0d Mon Sep 17 00:00:00 2001 From: sezanzeb Date: Mon, 5 Jul 2021 23:47:35 +0200 Subject: [PATCH 161/166] unused import --- gensim/models/ensemblelda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index c415c7bdfd..88c37845ef 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -103,7 +103,7 @@ import numpy as np from scipy.spatial.distance import cosine -from dataclasses import dataclass, replace +from dataclasses import dataclass from gensim import utils from gensim.models import ldamodel, ldamulticore, basemodel From c35fb01bf5406754b569c74c09874ee10ee91eee Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 18 Jul 2021 20:49:23 +0900 Subject: [PATCH 162/166] update sbt install step --- .github/workflows/tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 8e3ad48871..41a608ef90 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -39,7 +39,8 @@ jobs: # - name: Update sbt run: | - echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list + echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | sudo tee /etc/apt/sources.list.d/sbt.list + echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | sudo tee /etc/apt/sources.list.d/sbt_old.list curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | sudo apt-key add sudo apt-get update -y sudo apt-get install -y sbt From 71b33ddfb9ad4e510fe39894fc82efc5ffef5788 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 18 Jul 2021 22:28:37 +0900 Subject: [PATCH 163/166] minor refactoring Decoupled multiprocessing code from EnsembleLda class. This reduces the length of the class by several hundred lines, making it slightly easier to understand. Added _generate_topic_models_worker function to clarify distinction between single-process and multi-process code. 
Fixed flake8 problem (l is an ambiguous variable name) Adjusted _teardown function (removed i parameter, it's only for logs) Moved _MAX_RANDOM_STATE to module level --- gensim/models/ensemblelda.py | 591 ++++++++++++++++++----------------- 1 file changed, 311 insertions(+), 280 deletions(-) diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 88c37845ef..9fcb486144 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -116,6 +116,9 @@ # distance calculations for highly masked topic-term distributions _COSINE_DISTANCE_CALCULATION_THRESHOLD = 0.05 +# nps max random state of 2**32 - 1 is too large for windows +_MAX_RANDOM_STATE = np.iinfo(np.int32).max + @dataclass class Topic: @@ -241,12 +244,7 @@ def _teardown(pipes, processes, i): list of pipes that the processes use to communicate with the parent processes : {list of :class:`multiprocessing.Process`} list of worker processes - i : int - index of the process that could not be started - """ - logger.error(f"could not start process {i}") - for parent_conn, child_conn in pipes: child_conn.close() parent_conn.close() @@ -305,6 +303,292 @@ def _cluster_sort_key(cluster): return [cluster for cluster in sorted_clusters if cluster.is_valid] +def _generate_topic_models_multiproc(ensemble, num_models, ensemble_workers): + """Generate the topic models to form the ensemble in a multiprocessed way. + + Depending on the used topic model this can result in a speedup. + + Parameters + ---------- + ensemble: EnsembleLda + the ensemble + num_models : int + how many models to train in the ensemble + ensemble_workers : int + into how many processes to split the models will be set to max(workers, num_models), to avoid workers that + are supposed to train 0 models. + + to get maximum performance, set to the number of your cores, if non-parallelized models are being used in + the ensemble (LdaModel). + + For LdaMulticore, the performance gain is small and gets larger for a significantly smaller corpus. + In that case, ensemble_workers=2 can be used. + + """ + # the way random_states is handled needs to prevent getting different results when multiprocessing is on, + # or getting the same results in every lda children. so it is solved by generating a list of state seeds before + # multiprocessing is started. + random_states = [ensemble.random_state.randint(_MAX_RANDOM_STATE) for _ in range(num_models)] + + # each worker has to work on at least one model. + # Don't spawn idle workers: + workers = min(ensemble_workers, num_models) + + # create worker processes: + # from what I know this is basically forking with a jump to a target function in each child + # so modifying the ensemble object will not modify the one in the parent because of no shared memory + processes = [] + pipes = [] + num_models_unhandled = num_models # how many more models need to be trained by workers? + + for i in range(workers): + parent_conn, child_conn = Pipe() + num_subprocess_models = 0 + if i == workers - 1: # i is a index, hence -1 + # is this the last worker that needs to be created? 
+            # then task that worker with all the remaining models
+            num_subprocess_models = num_models_unhandled
+        else:
+            num_subprocess_models = int(num_models_unhandled / (workers - i))
+
+        # get the chunk from the random states that is meant to be for those models
+        random_states_for_worker = random_states[-num_models_unhandled:][:num_subprocess_models]
+
+        args = (ensemble, num_subprocess_models, random_states_for_worker, child_conn)
+        try:
+            process = Process(target=_generate_topic_models_worker, args=args)
+            processes.append(process)
+            pipes.append((parent_conn, child_conn))
+            process.start()
+            num_models_unhandled -= num_subprocess_models
+        except ProcessError:
+            logger.error(f"could not start process {i}")
+            _teardown(pipes, processes)
+            raise ProcessError
+
+    # aggregate results
+    # will also block until workers are finished
+    for parent_conn, _ in pipes:
+        answer = parent_conn.recv()
+        parent_conn.close()
+        # this does basically the same as the _generate_topic_models function (concatenate all the ttdas):
+        if not ensemble.memory_friendly_ttda:
+            ensemble.tms += answer
+            ttda = np.concatenate([m.get_topics() for m in answer])
+        else:
+            ttda = answer
+        ensemble.ttda = np.concatenate([ensemble.ttda, ttda])
+
+    for process in processes:
+        process.terminate()
+
+
+def _generate_topic_models(ensemble, num_models, random_states=None):
+    """Train the topic models that form the ensemble.
+
+    Parameters
+    ----------
+    ensemble: EnsembleLda
+        the ensemble
+    num_models : int
+        number of models to be generated
+    random_states : list
+        list of numbers or np.random.RandomState objects. Will be autogenerated based on the ensemble's
+        RandomState if None (default).
+    """
+    if random_states is None:
+        random_states = [ensemble.random_state.randint(_MAX_RANDOM_STATE) for _ in range(num_models)]
+
+    assert len(random_states) == num_models
+
+    kwargs = ensemble.gensim_kw_args.copy()
+
+    tm = None  # remember one of the topic models from the following
+    # loop, in order to collect some properties from it afterwards.
+
+    for i in range(num_models):
+        kwargs["random_state"] = random_states[i]
+
+        tm = ensemble.get_topic_model_class()(**kwargs)
+
+        # adds the lambda (that is the unnormalized get_topics) to ttda, which is
+        # a list of all those lambdas
+        ensemble.ttda = np.concatenate([ensemble.ttda, tm.get_topics()])
+
+        # only saves the model if it is not "memory friendly"
+        if not ensemble.memory_friendly_ttda:
+            ensemble.tms += [tm]
+
+    # use one of the tms to get some info that will be needed later
+    ensemble.sstats_sum = tm.state.sstats.sum()
+    ensemble.eta = tm.eta
+
+
+def _generate_topic_models_worker(ensemble, num_models, random_states, pipe):
+    #
+    # Same as _generate_topic_models, but runs in a separate subprocess, and
+    # sends the updated ensemble state to the parent process via a pipe.
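+    #
+    # A hedged illustration of the parent/worker protocol used here (the
+    # names mirror this module; `n` and `seeds` are placeholder values):
+    #
+    #     parent_conn, child_conn = Pipe()
+    #     args = (ensemble, n, seeds, child_conn)
+    #     Process(target=_generate_topic_models_worker, args=args).start()
+    #     ttda_chunk = parent_conn.recv()  # blocks until the worker sends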
+    #
+    logger.info(f"spawned worker to generate {num_models} topic models")
+
+    _generate_topic_models(ensemble=ensemble, num_models=num_models, random_states=random_states)
+
+    # send the ttda that is in the child/worker's version of the memory into the pipe;
+    # it is available after _generate_topic_models has been called in the worker
+    if ensemble.memory_friendly_ttda:
+        # remember that this code is inside the worker process's memory,
+        # so ensemble.ttda is the ttda of only a chunk of models
+        pipe.send(ensemble.ttda)
+    else:
+        pipe.send(ensemble.tms)
+
+    pipe.close()
+
+
+def _calculate_asymmetric_distance_matrix_chunk(
+    ttda1,
+    ttda2,
+    start_index,
+    masking_method,
+    masking_threshold,
+):
+    """Calculate an (asymmetric) distance from each topic in ``ttda1`` to each topic in ``ttda2``.
+
+    Parameters
+    ----------
+    ttda1 and ttda2: 2D arrays of floats
+        Two ttda matrices that are going to be used for distance calculation. Each row in ttda corresponds to one
+        topic. Each cell in the resulting matrix corresponds to the distance between a topic pair.
+    start_index : int
+        this function might be used in multiprocessing, so start_index has to be set as ttda1 is a chunk of the
+        complete ttda in that case. start_index would be 0 if ``ttda1`` is the complete ttda. When the complete
+        ttda is split into two pieces, each 100 ttdas long, then start_index of the second piece should be 100.
+        default is 0
+    masking_method: function
+        creates a boolean mask for one topic-term row, given the row and masking_threshold
+    masking_threshold: float
+        threshold that is passed through to masking_method
+
+    Returns
+    -------
+    2D numpy.ndarray of floats
+        Asymmetric distance matrix of size ``len(ttda1)`` by ``len(ttda2)``.
+
+    """
+    # initialize the distance matrix. ndarray is faster than zeros
+    distances = np.ndarray((len(ttda1), len(ttda2)))
+
+    if ttda1.shape[0] > 0 and ttda2.shape[0] > 0:
+        # the worker might not have received a ttda because it was chunked up too much
+
+        # some help to find a better threshold by useful log messages
+        avg_mask_size = 0
+
+        # now iterate over each topic
+        for ttd1_idx, ttd1 in enumerate(ttda1):
+            # create mask from ttd1 that removes noise from a and keeps the largest terms
+            mask = masking_method(ttd1, masking_threshold)
+            ttd1_masked = ttd1[mask]
+
+            avg_mask_size += mask.sum()
+
+            # now look at every possible pair for topic a:
+            for ttd2_idx, ttd2 in enumerate(ttda2):
+                # distance to itself is 0
+                if ttd1_idx + start_index == ttd2_idx:
+                    distances[ttd1_idx][ttd2_idx] = 0
+                    continue
+
+                # now mask b based on a, which will force the shape of a onto b
+                ttd2_masked = ttd2[mask]
+
+                # Smart distance calculation avoids calculating cosine distance for highly masked topic-term
+                # distributions that will have distance values near 1.
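+                # (hedged numeric illustration: with the threshold at 0.05,
+                # a ttd2 whose masked terms sum to e.g. 0.03 has almost no
+                # probability mass on topic a's largest terms, so its
+                # distance is set to 1 without computing the cosine)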
+                if ttd2_masked.sum() <= _COSINE_DISTANCE_CALCULATION_THRESHOLD:
+                    distance = 1
+                else:
+                    distance = cosine(ttd1_masked, ttd2_masked)
+
+                distances[ttd1_idx][ttd2_idx] = distance
+
+        percent = round(100 * avg_mask_size / ttda1.shape[0] / ttda1.shape[1], 1)
+        logger.info(f'the given threshold of {masking_threshold} covered on average {percent}% of tokens')
+
+    return distances
+
+
+def _asymmetric_distance_matrix_worker(
+    worker_id,
+    entire_ttda,
+    ttdas_sent,
+    n_ttdas,
+    masking_method,
+    masking_threshold,
+    pipe,
+):
+    """Worker that computes the distance to all other nodes from a chunk of nodes."""
+    logger.info(f"spawned worker {worker_id} to generate {n_ttdas} rows of the asymmetric distance matrix")
+    # the chunk of ttda that's going to be calculated:
+    ttda1 = entire_ttda[ttdas_sent:ttdas_sent + n_ttdas]
+    distance_chunk = _calculate_asymmetric_distance_matrix_chunk(
+        ttda1=ttda1,
+        ttda2=entire_ttda,
+        start_index=ttdas_sent,
+        masking_method=masking_method,
+        masking_threshold=masking_threshold,
+    )
+    pipe.send((worker_id, distance_chunk))  # remember that this code is inside the worker's memory
+    pipe.close()
+
+
+def _calculate_asymmetric_distance_matrix_multiproc(
+    workers,
+    entire_ttda,
+    masking_method,
+    masking_threshold,
+):
+    processes = []
+    pipes = []
+    ttdas_sent = 0
+
+    for i in range(workers):
+        try:
+            parent_conn, child_conn = Pipe()
+
+            # Load Balancing, for example if there are 9 ttdas and 4 workers, the load will be balanced 2, 2, 2, 3.
+            n_ttdas = 0
+            if i == workers - 1:  # i is an index, hence -1
+                # is this the last worker that needs to be created?
+                # then task that worker with all the remaining models
+                n_ttdas = len(entire_ttda) - ttdas_sent
+            else:
+                n_ttdas = int((len(entire_ttda) - ttdas_sent) / (workers - i))
+
+            args = (i, entire_ttda, ttdas_sent, n_ttdas, masking_method, masking_threshold, child_conn)
+            process = Process(target=_asymmetric_distance_matrix_worker, args=args)
+            ttdas_sent += n_ttdas
+
+            processes.append(process)
+            pipes.append((parent_conn, child_conn))
+            process.start()
+        except ProcessError:
+            logger.error(f"could not start process {i}")
+            _teardown(pipes, processes)
+            raise ProcessError
+
+    distances = []
+    # note that the following loop maintains the order in which the distance chunks will be concatenated,
+    # which is very important. The ordering has to be the same as when using only one process
+    for parent_conn, _ in pipes:
+        worker_id, distance_chunk = parent_conn.recv()
+        parent_conn.close()  # child conn will be closed from inside the worker
+        # collect each worker's distance chunk; they are concatenated in order below
+        distances.append(distance_chunk)
+
+    for process in processes:
+        process.terminate()
+
+    return np.concatenate(distances)
+
+
 class EnsembleLda(SaveLoad):
     """Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble.
@@ -384,10 +668,6 @@ def __init__(
             Parameters for each gensim model (e.g. :py:class:`gensim.models.LdaModel`) in the ensemble.
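
        Examples
        --------
        A minimal sketch (assumes ``corpus`` and ``dictionary`` already
        exist; both are forwarded through ``gensim_kw_args``)::

            elda = EnsembleLda(
                corpus=corpus, id2word=dictionary,
                num_models=8, ensemble_workers=2,
            )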
""" - # INTERNAL PARAMETERS - # Set random state - # nps max random state of 2**32 - 1 is too large for windows: - self._MAX_RANDOM_STATE = np.iinfo(np.int32).max if "id2word" not in gensim_kw_args: gensim_kw_args["id2word"] = None @@ -450,13 +730,12 @@ def __init__( if "passes" in gensim_kw_args and gensim_kw_args["passes"] <= 0: return - logger.info(f"generating {num_models} topic models...") + logger.info(f"generating {num_models} topic models using {ensemble_workers} workers") if ensemble_workers > 1: - self._generate_topic_models_multiproc(num_models, ensemble_workers) + _generate_topic_models_multiproc(self, num_models, ensemble_workers) else: - # singlecore - self._generate_topic_models(num_models) + _generate_topic_models(self, num_models) self._generate_asymmetric_distance_matrix() self._generate_topic_clusters(epsilon, min_samples) @@ -733,216 +1012,6 @@ def add_model(self, target, num_new_models=None): # tell recluster that the distance matrix needs to be regenerated self.asymmetric_distance_matrix_outdated = True - def _generate_topic_models_multiproc(self, num_models, ensemble_workers): - """Generate the topic models to form the ensemble in a multiprocessed way. - - Depending on the used topic model this can result in a speedup. - - Parameters - ---------- - num_models : int - how many models to train in the ensemble - ensemble_workers : int - into how many processes to split the models will be set to max(workers, num_models), to avoid workers that - are supposed to train 0 models. - - to get maximum performance, set to the number of your cores, if non-parallelized models are being used in - the ensemble (LdaModel). - - For LdaMulticore, the performance gain is small and gets larger for a significantly smaller corpus. - In that case, ensemble_workers=2 can be used. - - """ - # the way random_states is handled needs to prevent getting different results when multiprocessing is on, - # or getting the same results in every lda children. so it is solved by generating a list of state seeds before - # multiprocessing is started. - random_states = [self.random_state.randint(self._MAX_RANDOM_STATE) for _ in range(num_models)] - - # each worker has to work on at least one model. - # Don't spawn idle workers: - workers = min(ensemble_workers, num_models) - - # create worker processes: - # from what I know this is basically forking with a jump to a target function in each child - # so modifying the ensemble object will not modify the one in the parent because of no shared memory - processes = [] - pipes = [] - num_models_unhandled = num_models # how many more models need to be trained by workers? - - for i in range(workers): - parent_conn, child_conn = Pipe() - num_subprocess_models = 0 - if i == workers - 1: # i is a index, hence -1 - # is this the last worker that needs to be created? 
-                # then task that worker with all the remaining models
-                num_subprocess_models = num_models_unhandled
-            else:
-                num_subprocess_models = int(num_models_unhandled / (workers - i))
-
-            # get the chunk from the random states that is meant to be for those models
-            random_states_for_worker = random_states[-num_models_unhandled:][:num_subprocess_models]
-
-            try:
-                process = Process(
-                    target=self._generate_topic_models,
-                    args=(num_subprocess_models, random_states_for_worker, child_conn),
-                )
-
-                processes.append(process)
-                pipes.append((parent_conn, child_conn))
-                process.start()
-
-                num_models_unhandled -= num_subprocess_models
-
-            except ProcessError:
-                _teardown(pipes, processes, i)
-                raise ProcessError
-
-        # aggregate results
-        # will also block until workers are finished
-        for parent_conn, _ in pipes:
-            answer = parent_conn.recv()
-            parent_conn.close()
-            # this does basically the same as the _generate_topic_models function (concatenate all the ttdas):
-            if not self.memory_friendly_ttda:
-                self.tms += answer
-                ttda = np.concatenate([model.get_topics() for model in answer])
-            else:
-                ttda = answer
-            self.ttda = np.concatenate([self.ttda, ttda])
-
-        # end all processes
-        for process in processes:
-            process.terminate()
-
-    def _generate_topic_models(self, num_models, random_states=None, pipe=None):
-        """Train the topic models that form the ensemble.
-
-        Parameters
-        ----------
-        num_models : int
-            number of models to be generated
-        random_states : list
-            list of numbers or np.random.RandomState objects. Will be autogenerated based on the ensembles
-            RandomState if None (default).
-        pipe : multiprocessing.pipe
-            Default None. If provided, will send the trained models over this pipe. If memory friendly, it will only
-            send the ttda.
-
-        """
-        if pipe is not None:
-            logger.info(f"spawned worker to generate {num_models} topic models")
-
-        if random_states is None:
-            random_states = [self.random_state.randint(self._MAX_RANDOM_STATE) for _ in range(num_models)]
-
-        assert len(random_states) == num_models
-
-        kwargs = self.gensim_kw_args.copy()
-
-        tm = None  # remember one of the topic models from the following
-        # loop, in order to collect some properties from it afterwards.
-
-        for i in range(num_models):
-            kwargs["random_state"] = random_states[i]
-
-            tm = self.get_topic_model_class()(**kwargs)
-
-            # adds the lambda (that is the unnormalized get_topics) to ttda, which is
-            # a list of all those lambdas
-            self.ttda = np.concatenate([self.ttda, tm.get_topics()])
-
-            # only saves the model if it is not "memory friendly"
-            if not self.memory_friendly_ttda:
-                self.tms += [tm]
-
-        # use one of the tms to get some info that will be needed later
-        self.sstats_sum = tm.state.sstats.sum()
-        self.eta = tm.eta
-
-        if pipe is not None:
-            # send the ttda that is in the child/workers version of the memory into the pipe
-            # available, after _generate_topic_models has been called in the worker
-            if self.memory_friendly_ttda:
-                # remember that this code is inside the worker processes memory,
-                # so self.ttda is the ttda of only a chunk of models
-                pipe.send(self.ttda)
-            else:
-                pipe.send(self.tms)
-
-            pipe.close()
-
-    def _calculate_asymmetric_distance_matrix_chunk(self, ttda1, ttda2, start_index):
-        """Calculate an (asymmetric) distance from each topic in ``ttda1`` to each topic in ``ttda2``.
-
-        Parameters
-        ----------
-        ttda1 and ttda2: 2D arrays of floats
-            Two ttda matrices that are going to be used for distance calculation. Each row in ttda corresponds to one
-            topic. Each cell in the resulting matrix corresponds to the distance between a topic pair.
-        start_index : int
-            this function might be used in multiprocessing, so start_index has to be set as ttda1 is a chunk of the
-            complete ttda in that case. start_index would be 0 if ``ttda1 == self.ttda``. When self.ttda is split into
-            two pieces, each 100 ttdas long, then start_index should be be 100. default is 0
-
-        Returns
-        -------
-        2D numpy.ndarray of floats
-            Asymmetric distance matrix of size ``len(ttda1)`` by ``len(ttda2)``.
-
-        """
-        # initialize the distance matrix. ndarray is faster than zeros
-        distances = np.ndarray((len(ttda1), len(ttda2)))
-
-        if ttda1.shape[0] > 0 and ttda2.shape[0] > 0:
-            # the worker might not have received a ttda because it was chunked up too much
-
-            # some help to find a better threshold by useful log messages
-            avg_mask_size = 0
-
-            # now iterate over each topic
-            for ttd1_idx, ttd1 in enumerate(ttda1):
-                # create mask from ttd1 that removes noise from a and keeps the largest terms
-                mask = self.masking_method(ttd1, self.masking_threshold)
-                ttd1_masked = ttd1[mask]
-
-                avg_mask_size += mask.sum()
-
-                # now look at every possible pair for topic a:
-                for ttd2_idx, ttd2 in enumerate(ttda2):
-                    # distance to itself is 0
-                    if ttd1_idx + start_index == ttd2_idx:
-                        distances[ttd1_idx][ttd2_idx] = 0
-                        continue
-
-                    # now mask b based on a, which will force the shape of a onto b
-                    ttd2_masked = ttd2[mask]
-
-                    # Smart distance calculation avoids calculating cosine distance for highly masked topic-term
-                    # distributions that will have distance values near 1.
-                    if ttd2_masked.sum() <= _COSINE_DISTANCE_CALCULATION_THRESHOLD:
-                        distance = 1
-                    else:
-                        distance = cosine(ttd1_masked, ttd2_masked)
-
-                    distances[ttd1_idx][ttd2_idx] = distance
-
-            percent = round(100 * avg_mask_size / ttda1.shape[0] / ttda1.shape[1], 1)
-            logger.info(f'the given threshold of {self.masking_threshold} covered on average {percent}% of tokens')
-
-        return distances
-
-    def _asymmetric_distance_matrix_worker(self, worker_id, ttdas_sent, n_ttdas, pipe):
-        """Worker that computes the distance to all other nodes from a chunk of nodes."""
-        logger.info(f"spawned worker to generate {n_ttdas} rows of the asymmetric distance matrix")
-        # the chunk of ttda that's going to be calculated:
-        ttda1 = self.ttda[ttdas_sent:ttdas_sent + n_ttdas]
-        distance_chunk = self._calculate_asymmetric_distance_matrix_chunk(
-            ttda1=ttda1, ttda2=self.ttda, start_index=ttdas_sent
-        )
-        pipe.send((worker_id, distance_chunk))  # remember that this code is inside the workers memory
-        pipe.close()
-
     def _generate_asymmetric_distance_matrix(self):
         """Calculate the pairwise distance matrix for all the ttdas from the ensemble.
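For reference while reading the next hunk, a minimal standalone sketch of
the masked distance computed above (the toy arrays and the simple
mass-style mask are assumptions; only `cosine` and the 0.05 threshold come
from this module):

    import numpy as np
    from scipy.spatial.distance import cosine

    ttd1 = np.array([0.50, 0.30, 0.15, 0.05])  # topic a, term weights
    ttd2 = np.array([0.05, 0.10, 0.42, 0.43])  # topic b
    mask = ttd1 >= 0.15                        # keep topic a's largest terms
    if ttd2[mask].sum() <= 0.05:               # b has ~no mass on a's terms
        distance = 1                           # skip the cosine computation
    else:
        distance = cosine(ttd1[mask], ttd2[mask])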
@@ -958,63 +1027,25 @@
         logger.info(f"generating a {len(self.ttda)} x {len(self.ttda)} asymmetric distance matrix...")

-        # singlecore
         if workers is not None and workers <= 1:
-            self.asymmetric_distance_matrix = self._calculate_asymmetric_distance_matrix_chunk(
-                ttda1=self.ttda, ttda2=self.ttda, start_index=0,
+            self.asymmetric_distance_matrix = _calculate_asymmetric_distance_matrix_chunk(
+                ttda1=self.ttda,
+                ttda2=self.ttda,
+                start_index=0,
+                masking_method=self.masking_method,
+                masking_threshold=self.masking_threshold,
+            )
+        else:
+            # best performance on 2-core machine: 2 workers
+            if workers is None:
+                workers = os.cpu_count()
+
+            self.asymmetric_distance_matrix = _calculate_asymmetric_distance_matrix_multiproc(
+                workers=workers,
+                entire_ttda=self.ttda,
+                masking_method=self.masking_method,
+                masking_threshold=self.masking_threshold,
             )
-
-        # else, if workers > 1 use multiprocessing
-        # best performance on 2-core machine: 2 workers
-        if workers is None:
-            workers = os.cpu_count()
-
-        # create worker processes:
-        processes = []
-        pipes = []
-        ttdas_sent = 0
-
-        for i in range(workers):
-            try:
-                parent_conn, child_conn = Pipe()
-
-                # Load Balancing, for example if there are 9 ttdas and 4 workers, the load will be balanced 2, 2, 2, 3.
-                n_ttdas = 0
-                if i == workers - 1:  # i is a index, hence -1
-                    # is this the last worker that needs to be created?
-                    # then task that worker with all the remaining models
-                    n_ttdas = len(self.ttda) - ttdas_sent
-                else:
-                    n_ttdas = int((len(self.ttda) - ttdas_sent) / (workers - i))
-
-                process = Process(
-                    target=self._asymmetric_distance_matrix_worker,
-                    args=(i, ttdas_sent, n_ttdas, child_conn),
-                )
-                ttdas_sent += n_ttdas
-
-                processes.append(process)
-                pipes.append((parent_conn, child_conn))
-                process.start()
-
-            except ProcessError:
-                _teardown(pipes, processes, i)
-                raise ProcessError
-
-        distances = []
-        # note, that the following loop maintains order in how the ttda will be concatenated
-        # which is very important. Ordering in ttda has to be the same as when using only one process
-        for parent_conn, _ in pipes:
-            answer = parent_conn.recv()
-            parent_conn.close()  # child conn will be closed from inside the worker
-            # this does basically the same as the _generate_topic_models function (concatenate all the ttdas):
-            distances.append(answer[1])
-
-        # end all processes
-        for process in processes:
-            process.terminate()
-
-        self.asymmetric_distance_matrix = np.concatenate(distances)

     def _generate_topic_clusters(self, eps=0.1, min_samples=None):
         """Run the CBDBSCAN algorithm on all the detected topics and label them with label-indices.
@@ -1092,10 +1123,10 @@ def _generate_stable_topics(self, min_cores=None):
         stable_topics = np.empty((num_stable_topics, len(self.id2word)))

         # for each cluster
-        for l, label in enumerate(unique_labels):
+        for label_index, label in enumerate(unique_labels):
             # mean of all the topics that are of that cluster
             topics_of_cluster = np.array([topic for t, topic in enumerate(valid_topics) if topic_labels[t] == label])
-            stable_topics[l] = topics_of_cluster.mean(axis=0)
+            stable_topics[label_index] = topics_of_cluster.mean(axis=0)

         self.valid_clusters = valid_clusters
         self.stable_topics = stable_topics

From 444c1909dddcda5bce77ca205a12f9e66f48d102 Mon Sep 17 00:00:00 2001
From: Michael Penkov
Date: Sun, 18 Jul 2021 22:39:22 +0900
Subject: [PATCH 164/166] roll back change to docs/src/Makefile

---
 docs/src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/Makefile b/docs/src/Makefile
index d4ae6caa5f..30cbe5a869 100644
--- a/docs/src/Makefile
+++ b/docs/src/Makefile
@@ -41,7 +41,7 @@ html:
 	@echo "Build finished. The HTML pages are in ../"

 upload:
-	scp -r _build/html/* rr:public_html/gensim_4.0.0/
+	scp -r _build/html/* rr:public_html/gensim/

 dirhtml:
 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml

From f00aca8645fe6f4b3bcd8ad0fdcfd0b92a4fcf37 Mon Sep 17 00:00:00 2001
From: Michael Penkov
Date: Sun, 18 Jul 2021 23:39:21 +0900
Subject: [PATCH 165/166] re-raise caught exception instead of raising a new one

We don't want to hide the details of the problem.

---
 gensim/models/ensemblelda.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index 9fcb486144..a4b6eccb9c 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -364,7 +364,7 @@ def _generate_topic_models_multiproc(ensemble, num_models, ensemble_workers):
         except ProcessError:
             logger.error(f"could not start process {i}")
             _teardown(pipes, processes)
-            raise ProcessError
+            raise

     # aggregate results
     # will also block until workers are finished
@@ -572,7 +572,7 @@ def _calculate_asymmetric_distance_matrix_multiproc(
         except ProcessError:
             logger.error(f"could not start process {i}")
             _teardown(pipes, processes)
-            raise ProcessError
+            raise

     distances = []
     # note that the following loop maintains the order in which the distance chunks will be concatenated,

From cac68195bd0a4a462025dcad3784e3d26bf03614 Mon Sep 17 00:00:00 2001
From: Michael Penkov
Date: Thu, 22 Jul 2021 21:31:47 +0900
Subject: [PATCH 166/166] add docstring

---
 gensim/models/ensemblelda.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py
index a4b6eccb9c..39d7e06620 100644
--- a/gensim/models/ensemblelda.py
+++ b/gensim/models/ensemblelda.py
@@ -425,6 +425,9 @@ def _generate_topic_models(ensemble, num_models, random_states=None):

 def _generate_topic_models_worker(ensemble, num_models, random_states, pipe):
+    """Wrapper for _generate_topic_models to write the results into a pipe.
+
+    This is intended to be used inside a subprocess."""
     #
     # Same as _generate_topic_models, but runs in a separate subprocess, and
     # sends the updated ensemble state to the parent process via a pipe.
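A closing illustration of the re-raise idiom from patch 165 (a standalone,
hedged demo with toy names, not gensim code):

    def start_worker():
        raise ValueError("details worth keeping")

    try:
        try:
            start_worker()
        except ValueError:
            print("tearing down")  # akin to _teardown(pipes, processes)
            raise                  # bare raise keeps the original message and traceback
    except ValueError as error:
        print(error)               # -> details worth keeping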